cae*_*ras 7 javascript sorting
我有一个随机排序的正则表达式数组,如下所示:
let patterns = [
/foo+ba+r/,
/foo/,
/foo+bar/,
/foobar/,
/m[eo]{4,}w/,
/boo/,
/fooo*/,
/meow/
]
Run Code Online (Sandbox Code Playgroud)
我不确定这是否可能,但我想编写一个算法,将正则表达式从最不贪婪到最贪婪排序,如下所示:
[
/foo/,
/boo/,
/fooo*/,
/meow/,
/foobar/,
/foo+bar/,
/m[eo]{4,}w/,
/foo+ba+r/
]
Run Code Online (Sandbox Code Playgroud)
我想这样的排序可以这样实现:
patterns.sort((p1, p2) => { return p1.greediness() - p2.greediness() });
Run Code Online (Sandbox Code Playgroud)
但 RegExp 类中并不存在名为 greediness 的方法。
理想情况下,greediness 方法将返回该正则表达式可能匹配的最少字符数。例如:
/foo/.greediness() == 3
/boo/.greediness() == 3
/fooo*/.greediness() == 3
/meow/.greediness() == 4
/foobar/.greediness() == 6
/foo+bar/.greediness() == 6
/m[eo]{4,}w/.greediness() == 6
/foo+ba+r/.greediness() == 6
Run Code Online (Sandbox Code Playgroud)
您对这个问题的解决方案是什么?
这确实是一个非常困难的问题,需要能够解析正则表达式(如果减少正则表达式的规则,使得唯一允许的输入是普通字符和正则表达式特殊字符()[]|*+?,那么构造这样的解析器就不会太难了)。但考虑到你有这样一个解析器,这将是我的方法:
让我们以正则表达式为例m[eo]{2,}w。转换为 NFA,其中适当的边标记为边上方的权重,导致状态转换的字符标记为边下方,我们得到:
如果一条边由长度为 3 的元素数组定义,其中包含[from-state, to-state, weight],则上述有向图的边数组将是:
const edges = [
[0, 1, 1],
[1, 2, 0],
[1, 3, 0],
[2, 4, 1],
[3, 5, 1],
[4, 6, 0],
[5, 6, 0],
[6, 7, 0],
[6, 8, 0],
[7, 9, 1],
[8, 10, 1],
[9, 11, 0],
[10, 11, 0],
[11, 6, 0],
[11, 12, 1]
];
Run Code Online (Sandbox Code Playgroud)
应用 Dijkstra 算法获取从状态 0 到状态 12 的最短路径会产生长度为 4 的路径:
0 -> 1 -> 3 -> 5 -> 6 -> 8 -> 10 -> 11 -> 12
因此,正则表达式识别的最短字符串将为 4。
因此,现在您需要做的就是找到或编写 JavaScript 正则表达式到 NFA 算法和 Dijkstra 算法。
更新
如果您正在创建自己的正则表达式解析器,那么您实际上可以绕过创建 NFA 和 Dijkstra 算法并计算长度。以下并不声称是完整的解析器。例如,它不支持命名组,并且只识别基本的“东西”。
/*
Grammar (my own extended BNF notation) where [token(s)]? denotes an optional token or tokens
and [token(s)]+ denotes one or more of these tokens.
E -> F E'
E' -> '|' E
F -> [SINGLE_CHAR FOLLOWER | '(' ['?:']? E ')' FOLLOWER]+ | epsilon
SINGLE_CHAR -> CHAR | '[' BRACKET_CHARS ']'
FOLLOWER -> EXPRESSION_FOLLOWER NON_GREEDY
BRACKET_CHARS -> CHAR BRACKET_CHARS | epsilon
EXPRESSION_FOLLOWER -> '*' | '+' | '?' | '{' number [',' [number]?]? '}' | epsilon
NON_GREEDY -> '?' | epsilon
*/
const EOF = 0;   // token signalling the end of the pattern source
const CHAR = 1;  // token for one literal character; the character itself is in current_char
let current_char;          // the character behind the most recent CHAR token
let current_token = null;  // lookahead token currently inspected by the parser
let tokenizer;             // generator instance produced by lexer()

function* lexer(s) {
// Tokenizes the regex source string `s`.
// Special characters yield themselves as the token; every other character
// yields CHAR (with the actual character kept in `current_char`); a final
// EOF token is yielded when the input is exhausted.
// Anchors (^, $) and word boundaries (\b, \B) are skipped entirely because
// they match zero characters. Back references (\1 .. \9) are unsupported.
const specials = '?*+{}[](),|';
let pos = 0;
while (pos < s.length) {
current_char = s[pos++];
if (specials.indexOf(current_char) !== -1) {
// special characters are their own tokens
yield current_char;
continue;
}
if (current_char === '\\') {
if (pos >= s.length) {
throw 'invalid escape sequence'; // lone trailing backslash
}
current_char = s[pos++];
if (current_char >= '0' && current_char <= '9') {
throw 'unsupported back reference';
}
if (current_char === 'b' || current_char === 'B') {
continue; // word boundaries contribute no length
}
}
else if (current_char === '^' || current_char === '$') {
continue; // anchors contribute no length
}
yield CHAR; // the actual character is current_char
}
yield EOF;
}
function FOLLOWER() {
// Parses an optional quantifier following an atom and returns the minimum
// number of times the atom must occur (the "multiplier"):
//   ? or *            -> 0
//   +                 -> 1
//   {n} / {n,} / {n,m} -> n
//   no quantifier     -> 1
// A trailing '?' (non-greedy marker) is consumed but never changes the
// minimum length.
if (current_token === '?' || current_token === '*' || current_token === '+') {
const l = current_token === '+' ? 1 : 0;
current_token = tokenizer.next().value;
if (current_token === '?') { // non-greedy
current_token = tokenizer.next().value;
}
return l;
}
if (current_token === '{') {
current_token = tokenizer.next().value;
let s = '';
// Fixed: the original loop also stopped on EOF and then carried on
// parsing, silently accepting an unterminated quantifier like /a{2,/ .
while (current_token !== '}') {
if (current_token === EOF) {
throw 'syntax error'; // unterminated {...} quantifier
}
s += current_char;
current_token = tokenizer.next().value;
}
current_token = tokenizer.next().value; // consume '}'
// Fixed: per the grammar (FOLLOWER -> EXPRESSION_FOLLOWER NON_GREEDY)
// a non-greedy marker may follow {...} too, e.g. /a{2,}? / — the
// original rejected it with a syntax error.
if (current_token === '?') { // non-greedy
current_token = tokenizer.next().value;
}
// Fixed: (\d)+ captured only the last digit; capture the whole number
// and parse it with an explicit radix.
const matches = s.match(/^(\d+)(,\d*)?$/);
if (matches === null) {
throw 'syntax error'; // fixed typo: was 'synatx error'
}
return parseInt(matches[1], 10); // the minimum repetition count
}
return 1; // no quantifier: the atom occurs exactly once
}
function F() {
// Parses a concatenation of atoms and returns the minimum number of
// characters it can match. An atom is a single character (CHAR), a
// character class '[...]' or a group '(...)', each optionally followed
// by a quantifier (handled by FOLLOWER). Epsilon (the empty
// concatenation) yields 0.
let l = 0;
// Fixed: the original condition tested `current_char` for '(' and '[';
// `current_char` is stale after EOF, so e.g. /\(/ re-entered the loop at
// EOF and threw a spurious syntax error. The token stream must be
// inspected via `current_token`.
while (current_token === CHAR || current_token === '(' || current_token === '[') {
if (current_token === CHAR || current_token === '[') {
if (current_token === CHAR) {
current_token = tokenizer.next().value;
}
else {
// character class
current_token = tokenizer.next().value;
if (current_token === ']') {
// empty [] — counted as contributing 0 characters
// (NOTE(review): in JavaScript [] actually never matches)
current_token = tokenizer.next().value;
FOLLOWER();
continue;
}
// skip the class contents; their exact characters are irrelevant,
// a class always matches exactly one character
while (current_token !== ']' && current_token !== EOF) {
current_token = tokenizer.next().value;
}
if (current_token !== ']') {
throw 'syntax error'; // unterminated character class
}
current_token = tokenizer.next().value;
}
// one character, repeated at least `multiplier` times
l += FOLLOWER();
}
else { // current_token === '('
current_token = tokenizer.next().value;
if (current_token === '?') { // only non-capturing '(?:' is supported
current_token = tokenizer.next().value;
if (current_token !== CHAR || current_char !== ':') {
throw 'syntax error';
}
current_token = tokenizer.next().value;
}
const this_l = E();
if (current_token !== ')') {
throw 'syntax error'; // fixed typo: was 'synatx error'
}
current_token = tokenizer.next().value;
l += this_l * FOLLOWER();
}
}
return l;
}
function E() {
// E -> F E' : alternation. The minimum length of an alternation is the
// smallest minimum length among its branches.
let best = F();
while (current_token === '|') {
current_token = tokenizer.next().value;
best = Math.min(best, F());
}
return best;
}
function parse(s) {
// Entry point: returns the minimum number of characters the regex source
// string `s` can match, or throws on a syntax error / unsupported feature.
tokenizer = lexer(s);
current_token = tokenizer.next().value;
const minLength = E();
if (current_token !== EOF) {
// leftover tokens mean the grammar did not consume the whole input
throw 'syntax error';
}
return minLength;
}
// Sample patterns exercising the parser's edge cases: the empty regex,
// an empty character class, an alternation whose second branch matches
// the empty string, anchors, capturing/non-capturing groups, and \b
// word boundaries (all zero-width constructs count as length 0).
let patterns = [
new RegExp(''), // minimum length 0
/(?:)[]()/, // 0 — empty group, empty class, empty group
/abc|$/, // 0 — the '$' branch matches the empty string
/^foo+ba+r$/, // 6 — anchors add nothing
/foo+ba+r/, // 6
/foo/, // 3
/foo+bar/, // 6
/foobar/, // 6
/m([eo]{4,})w/, // 6
/m(?:[eo]{4,})w/, // 6
/boo/, // 3
/fooo*/, // 3
/meow/, // 4
/\b\d+\b/ // 1 — \b is zero-width, \d+ needs at least one digit
];
// Expose the parser as the `greediness()` method the question asks for.
// NOTE(review): extending a native prototype is generally discouraged;
// a free function `greediness(regex)` would be the safer API.
RegExp.prototype.greediness = function () {
return parse(this.source);
};
// Interactive demo (browser only — uses prompt()): keep asking until the
// user enters a regex the parser accepts, then report its greediness.
// Fixed typo in the user-facing message: 'wihout' -> 'without'.
let prompt_msg = 'Try your own regex without the / delimiters:';
while (true) {
const regex = prompt(prompt_msg);
try {
console.log(`You entered '${regex}' and its greediness is ${new RegExp(regex).greediness()}.`);
break;
}
catch (error) {
// show the parser's complaint and re-prompt
prompt_msg = `Your input resulted in ${error}. Try again:`;
}
}
// Print the greediness of every sample pattern.
console.log('\nSome tests:\n\n');
patterns.forEach((pattern) => {
console.log(`pattern = ${pattern.source}, greediness = ${pattern.greediness()}`);
});