简单字符串压缩:删除连续的重复子字符串

Question

简单字符串压缩:删除连续的重复子字符串

Sid*_*dre 7 string algorithm

最近在亚马逊的采访中我被问到了这个问题.

给定一个字符串,从中删除连续的重复子字符串.如果有多个连续的交叉子串,请删除其中最大的子串.

为清楚起见,这里有一些例子:

输入:aabcccddeaaa
输出:( abcdea 压缩连续重复的字符)
输入:abababcdeee
输出:( abcde压缩连续重复的子串)
输入:ababcdabcd
输出:ababcd

(你可以压缩' ab'或' abcd',但随着' abcd'的长度变大,你更喜欢压缩较大的那个.)

我无法想出一个有效的实现,任何人都知道一个很好的方法吗？

由于这是一个面试问题,请不要使用复杂的库函数.

Answer 1

Ami*_*mit 1

没有正则表达式...这种递归方法有效：

var cases = ['aabcccddeaaa', 'abababcdeee', 'ababcdabcd'];

function compress(str) {
  var len, sub, i, n;

  // if str is shorter than 2, can't be any repeating substrings
  if(str.length < 2)
    return str;

  // max meaningful length is str.length/2 (truncated to integer)
  for(len = (str.length / 2) | 0; len > 0; len--) {
    // search for a repeating substring of "len" length starting at index i
    for(i = 0; i + (len * 2) <= str.length; i++) {
      sub = str.substr(i, len);
      // if such a substring exists...
      if(str.indexOf(sub, i + len) == i + len) {
        // "count" how many occurences (advance index till beyond repeats)
        for(n = i + len * 2; str.indexOf(sub, n) == n; n += len);
        // return a string composed of the compressed part before the match +
        // the substring + the compressed part beyond the match
        return compress(str.substr(0, i)) + sub + compress(str.substr(n));
      }
    }
  }

  // if nothing found, return original string
  return str;
}

alert(JSON.stringify(cases.map(compress)));

Run Code Online (Sandbox Code Playgroud)

在关于算法复杂性的评论中进行了长时间的争论之后，我决定进行一些重构并使用自我实现的startsWith函数，并利用它来计算内部操作（复杂性......）。

我借此机会通过将字符串分配最小化来进一步优化，因此现在递归适用于整个字符串+开始/结束索引。

下面的代码生成一个输出，其中包括输入字符串、结果、n^2（用于 O(n^2) 比较）和实际内部操作计数。我添加了一些边缘情况来展示它的性能。我找不到导致 n^2 计数的输入，它们都在下面。

var cases = ['aabcccddeaaa', 'abababcdeee', 'ababcdabcd',
             'aabaaabaab', '1', '111222', '123456789', '1234567899'];

var innerCount;

function startsWith(str, start, subStart, subLen) {
  var subEnd = subStart + subLen - 1;
  while(subStart <= subEnd) {
    innerCount++;
    if(str[start++] != str[subStart++])
      return false;
  }
  return true;
}

function doCompress(str, maxLen, minIndex, maxIndex) {
  var len, sub, i, n;

  // if str is shorter than 2, can't be any repeating substrings
  if(maxIndex - minIndex + 1 < 2)
    return str.substring(minIndex, maxIndex + 1);

  for(len = maxLen; len > 0; len--) {
    // search for a repeating substring of "len" length starting at index i
    for(i = minIndex; i + (len * 2) <= maxIndex + 1; i++) {
      // if such a substring exists...
      if(startsWith(str, i + len, i, len)) {
        // "count" how many occurences (advance index till beyond repeats)
        for(n = i + len * 2; (n + len <= maxIndex + 1) && startsWith(str, n, i, len); n += len);
        // return a string composed of the compressed part before the match +
        // the substring + the compressed part beyond the match
        return (i > minIndex ? doCompress(str, len - 1, minIndex, i - 1) : '') +
          str.substr(i, len) +
          (n < maxIndex ? doCompress(str, len, n, maxIndex) : '');
      }
    }
  }

  // if nothing found, return original string
  return str.substring(minIndex, maxIndex + 1);
}

function compress(str) {
  innerCount = 0;
  // max meaningful length is str.length/2 (truncated to integer)
  return {
    source: str,
    result: doCompress(str, (str.length / 2) | 0, 0, str.length - 1),
    'n^2': str.length*str.length,
    innerCount: innerCount};
}

alert(JSON.stringify(cases.map(compress), null, '\t'));

Run Code Online (Sandbox Code Playgroud)

该解决方案的时间复杂度为 O(n^2)。

@Amit，我没有对你投反对票。当我面试时，我看到一个候选人在函数中使用类似indexOf的东西，其中查找子字符串是关键，我会要求候选人实现它。这些函数隐藏了相当多的复杂性。实现indexOf的方法有很多种，简单的实现可能是二次的（O(m*n)），除非候选人使用特定的东西，如KMP或Boyer-Moore（或者知道JS如何在内部实现它并提到复杂性）。 (4认同)
嗯。按照这个论证，快速排序将是 O(n)，不是吗？ (2认同)

归档时间：	10 年，3 月前
查看次数：	1589 次
最近记录：	10 年，3 月前