Determining the shortest possible match of a regular expression

Asked by cae*_*ras (javascript, sorting)

I have a randomly ordered array of regular expressions, like this:

let patterns = [
    /foo+ba+r/,
    /foo/,
    /foo+bar/,
    /foobar/,
    /m[eo]{4,}w/,
    /boo/,
    /fooo*/,
    /meow/
]

I'm not sure whether this is even possible, but I'd like to write an algorithm that sorts the regular expressions from least greedy to greediest, like this:

[
    /foo/,
    /boo/,
    /fooo*/,
    /meow/,
    /foobar/,
    /foo+bar/,
    /m[eo]{4,}w/,
    /foo+ba+r/
]

I imagine the sort could be implemented like this:

patterns.sort((p1, p2) => p1.greediness() - p2.greediness());

But no method called greediness exists on the RegExp class.

Ideally, the greediness method would return the minimum number of characters the pattern can possibly match. I.e.:

/foo/.greediness() == 3
/boo/.greediness() == 3
/fooo*/.greediness() == 3
/meow/.greediness() == 4
/foobar/.greediness() == 6
/foo+bar/.greediness() == 6
/m[eo]{4,}w/.greediness() == 6
/foo+ba+r/.greediness() == 6

How would you solve this problem?

Answered by Boo*_*boo

This really is quite a difficult problem, because it requires being able to parse regular expressions (if you restrict the rules so that the only allowed input is "ordinary" characters plus the special characters ()[]|*+?, then building such a parser is not too hard). But assuming you have such a parser, this would be my approach:

  1. Convert the regular expression into a nondeterministic finite automaton (NFA). This step needs a good regex parser, but once you have one, constructing the NFA is fairly straightforward. Ideally, of course, you would find an existing regex-to-NFA implementation.
  2. Build a directed, weighted graph representation of the NFA, giving edges that represent character transitions a weight of 1 and edges that represent epsilon transitions a weight of 0.
  3. Use Dijkstra's algorithm to find the shortest path from the NFA's initial state to its final state.

Take the regular expression m[eo]{2,}w as an example. Converting it to an NFA, with each edge's weight written above the edge and the character that causes the state transition written below it, we get:

[NFA diagram for m[eo]{2,}w: edge weights above the edges, transition characters below]

If an edge is defined as an array of length 3 of the form [from-state, to-state, weight], then the edge array for the directed graph above is:

const edges = [
    [0, 1, 1],
    [1, 2, 0],
    [1, 3, 0],
    [2, 4, 1],
    [3, 5, 1],
    [4, 6, 0],
    [5, 6, 0],
    [6, 7, 0],
    [6, 8, 0],
    [7, 9, 1],
    [8, 10, 1],
    [9, 11, 0],
    [10, 11, 0],
    [11, 6, 0],
    [11, 12, 1]
];

Applying Dijkstra's algorithm to find the shortest path from state 0 to state 12 yields a path of length 4:

0 -> 1 -> 3 -> 5 -> 6 -> 8 -> 10 -> 11 -> 12

So the shortest string recognized by this regular expression has length 4.

So all that is left is to find or write a JavaScript regex-to-NFA conversion and an implementation of Dijkstra's algorithm.
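To make step 3 concrete, here is a minimal Dijkstra sketch over the edges array above; shortestPath is just an illustrative helper name, not part of any library:

function shortestPath(edges, source, target) {
    // Build an adjacency list keyed by state number.
    const adj = new Map();
    for (const [from, to, weight] of edges) {
        if (!adj.has(from)) adj.set(from, []);
        if (!adj.has(to)) adj.set(to, []);
        adj.get(from).push([to, weight]);
    }

    // Tentative distances: Infinity everywhere except the source.
    const dist = new Map([...adj.keys()].map(state => [state, Infinity]));
    dist.set(source, 0);
    const unvisited = new Set(adj.keys());

    while (unvisited.size > 0) {
        // Pick the unvisited state with the smallest tentative distance
        // (a linear scan is fine for graphs this small).
        let current = null;
        for (const state of unvisited) {
            if (current === null || dist.get(state) < dist.get(current)) {
                current = state;
            }
        }
        if (dist.get(current) === Infinity) break; // remaining states are unreachable
        unvisited.delete(current);

        // Relax all outgoing edges of the chosen state.
        for (const [neighbor, weight] of adj.get(current)) {
            const candidate = dist.get(current) + weight;
            if (candidate < dist.get(neighbor)) {
                dist.set(neighbor, candidate);
            }
        }
    }
    return dist.get(target);
}

console.log(shortestPath(edges, 0, 12)); // 4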

Update

If you are writing your own regex parser anyway, you can actually skip building the NFA and running Dijkstra's algorithm and compute the length directly while parsing. The following does not claim to be a complete parser; for example, it does not support named groups and only recognizes the basic constructs.

/*
  Grammar (my own extended BNF notation) where [token(s)]? denotes an optional token or tokens
  and [token(s)]+ denotes one or more of these tokens.

  E -> F E'
  E' -> '|' E
  F -> [SINGLE_CHAR FOLLOWER | '(' ['?:']? E ')' FOLLOWER]+ | epsilon
  SINGLE_CHAR -> CHAR | '[' BRACKET_CHARS ']'
  FOLLOWER -> EXPRESSION_FOLLOWER NON_GREEDY
  BRACKET_CHARS -> CHAR BRACKET_CHARS | epsilon
  EXPRESSION_FOLLOWER -> '*' | '+' | '?' | '{' number [',' [number]? ]? '}' | epsilon
  NON_GREEDY -> '?' | epsilon
  */

  const EOF = 0;
  const CHAR = 1;

  let current_char;
  let current_token = null;
  let tokenizer;

  function* lexer(s) {
      // Produce next token:
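      // Operators are yielded as themselves; any other character is yielded as
      // CHAR (the character itself stays in current_char). \b, \B, ^ and $ are
      // skipped because they match zero characters; back references are rejected.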
      const single_character_tokens = '?*+{}[](),|';

      const l = s.length;
      let i = 0;
      while (i < l) {
          current_char = s[i++];
          if (single_character_tokens.indexOf(current_char) != -1) {
              // the current character is the token to yield:
              yield current_char;
          }
          else {
              if (current_char == '\\') {
                  if (i < l) {
                      current_char = s[i++];
                      if (current_char >= '0' && current_char <= '9') {
                          throw 'unsupported back reference';
                      }
                      if (current_char == 'b' || current_char == 'B') {
                          continue; // does not contribute to the length
                      }
                  }
                  else {
                      throw 'invalid escape sequence';
                  }
              }
              else if (current_char == '^' || current_char == '$') {
                  continue; // does not contribute to length
              }
              yield CHAR; // the actual character is current_char
          }
      }
      yield EOF;
  }

  function FOLLOWER() {
      // return a multiplier
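      // ('*' and '?' contribute a minimum of 0 repetitions, '+' contributes 1,
      // '{m,...}' contributes m, and no quantifier contributes 1.)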

      if (current_token === '?' || current_token === '*' || current_token === '+') {
          const l = current_token === '+' ? 1 : 0;
          current_token = tokenizer.next().value;
          if (current_token === '?') { // non-greedy
              current_token = tokenizer.next().value;
          }
          return l;
      }

      if (current_token === '{') {
          current_token = tokenizer.next().value;
          let s = '';
          while (current_token !== '}') {
              if (current_token === EOF) {
                  throw 'syntax error';
              }
              s += current_char;
              current_token = tokenizer.next().value;
          }
          current_token = tokenizer.next().value;
          if (current_token === '?') { // non-greedy quantifier, e.g. {2,}?
              current_token = tokenizer.next().value;
          }
          const matches = s.match(/^(\d+)(,\d*)?$/);
          if (matches === null) {
              throw 'syntax error';
          }
          return parseInt(matches[1]);
      }

      return 1;
  }

  function F() {
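      // Sum the minimum lengths of a sequence of factors (single characters,
      // bracket expressions and groups), each multiplied by its quantifier's
      // minimum repetition count.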
      let l = 0;

      while (current_token === CHAR || current_token === '(' || current_token === '[') {
          if (current_token === CHAR || current_token === '[') {
              if (current_token == CHAR) {
                  current_token = tokenizer.next().value;
              }
              else {
                  current_token = tokenizer.next().value;
                  if (current_token == ']') {
                      current_token = tokenizer.next().value;
                      // empty []
                      FOLLOWER();
                      continue;
                  }
                  while (current_token != ']' && current_token != EOF) {
                      current_token = tokenizer.next().value;
                  }
                  if (current_token !== ']') {
                      throw 'syntax error';
                  }
                  current_token = tokenizer.next().value;
              }
              const multiplier = FOLLOWER();
              l += multiplier;
          }
          else if (current_token === '(') {
              current_token = tokenizer.next().value;
              if (current_token === '?') { // non-capturing group
                  current_token = tokenizer.next().value;
                  if (current_token !== CHAR || current_char !== ':') {
                      throw 'syntax error';
                  }
                  current_token = tokenizer.next().value;
              }
              const this_l = E();
              if (current_token !== ')') {
                  throw 'syntax error';
              }
              current_token = tokenizer.next().value;
              const multiplier = FOLLOWER();
              l += this_l * multiplier;
          }
          else {
              throw 'syntax error';
          }
      }

      return l;
  }

  function E() {
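      // An alternation's minimum length is the smallest minimum among its
      // alternatives.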
      let min_l = F();
      while (current_token === '|') {
          current_token = tokenizer.next().value;
          const l = F();
          if (l < min_l) {
              min_l = l;
          }
      }
      return min_l;
  }

  function parse(s) {
      tokenizer = lexer(s);
      current_token = tokenizer.next().value;
      const l = E();
      if (current_token !== EOF) {
          throw 'syntax error';
      }
      return l;
  }

  let patterns = [
    new RegExp(''),
    /(?:)[]()/,
    /abc|$/,
    /^foo+ba+r$/,
    /foo+ba+r/,
    /foo/,
    /foo+bar/,
    /foobar/,
    /m([eo]{4,})w/,
    /m(?:[eo]{4,})w/,
    /boo/,
    /fooo*/,
    /meow/,
    /\b\d+\b/
  ];


  RegExp.prototype.greediness = function () {
    return parse(this.source);
  };


  let prompt_msg = 'Try your own regex without the / delimiters:';
  while (true) {
      const regex = prompt(prompt_msg);
      try {
          console.log(`You entered '${regex}' and its greediness is ${new RegExp(regex).greediness()}.`);
          break;
      }
      catch (error) {
          prompt_msg = `Your input resulted in ${error}. Try again:`;
      }
  }

  console.log('\nSome tests:\n\n');
  for (const pattern of patterns) {
      console.log(`pattern = ${pattern.source}, greediness = ${pattern.greediness()}`);
  }
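With the greediness method attached, the comparator from the question can be used directly to sort from the shortest possible match to the longest (a small usage sketch reusing the patterns array defined above):

// Sort from least greedy (shortest possible match) to greediest.
patterns.sort((p1, p2) => p1.greediness() - p2.greediness());

for (const pattern of patterns) {
    console.log(`${pattern.source} -> minimum match length ${pattern.greediness()}`);
}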