gio*_*o79 2 javascript string split nlp n-gram
我想把一段自然语言文本拆分成单词对(二元组)、三元组、四元组,依此类推。
到目前为止,我已经弄清楚了如何拆分成单词对。我想我还需要一个额外的循环来控制每组包含的单词数。
下面是生成单词对的代码:
// Build every pair of adjacent words (bigrams) from the sample sentence and print them.
// `var` is kept on purpose: the follow-up snippet redeclares `words` with `var`.
var test = "I love you so much, but Joe said \"he doesn't\"!";
var words = test.split(" ");
// Every word except the last one starts a pair with its right-hand neighbour.
var two_words = words
  .slice(0, -1)
  .map((word, position) => word + ' ' + words[position + 1]);
console.log(two_words);
// Here is what I am trying
// Collect every n-gram of sizes 2..split_length from `test`, smallest size first.
// `var` is kept because `words` and `i` are already declared with `var` by the
// pair snippet above; `let`/`const` would be a redeclaration error in one script.
var words = test.split(" ");
var split_words = [];
var split_length = 5;
for (var l = 2; l <= split_length; l++) {
  // The last window of size l starts at index words.length - l.
  for (var i = 0; i <= words.length - l; i++) {
    // Take exactly l words from the *input* array and join them with spaces.
    // (Reading from the output array, starting from an undefined accumulator,
    // and looping while c <= l were the bugs in the first attempt.)
    split_words.push(words.slice(i, i + l).join(' '));
  }
}
console.log(split_words);
添加预期输出……(n-gram 数组),大致像这样:
// 2grams
"I love"
"love you"
"you so"
"so much,"
"much, but"
"but Joe"
"Joe said"
"said "he"
""he doesn't"!"
//3grams
"I love you"
"love you so"
"you so much,"
"so much, but"
//and on and on
这称为“n-gram”,可以在现代 JavaScript 中使用生成器来完成,如下所示:
/**
 * Lazily yields every run of `n` consecutive items from an iterable.
 *
 * @param {Iterable<*>} a - Items to window over, e.g. an array of words.
 * @param {number} n - Window size. Nothing is yielded if the buffer never
 *   reaches exactly this length (e.g. `a` has fewer than `n` items).
 * @yields {Array<*>} A fresh array holding the next `n` consecutive items.
 */
function* ngrams(a, n) {
    const buf = [];
    for (const x of a) {
        buf.push(x);
        if (buf.length === n) {
            // Yield a copy: buf is mutated by shift() right after, so handing
            // out the buffer itself would make collected results
            // (e.g. [...ngrams(a, n)]) all alias one shrinking array.
            yield [...buf];
            buf.shift();
        }
    }
}
var test = "The quick brown fox jumps over the lazy dog";
for (let g of ngrams(test.split(' '), 3))
    console.log(g.join(' '))
另一个更简洁且可能更快的选择:
/**
 * Returns every run of `n` consecutive items of `a` as an array of arrays.
 *
 * @param {Array<*>} a - Items to window over, e.g. an array of words.
 * @param {number} n - Window size.
 * @returns {Array<Array<*>>} The `a.length - n + 1` windows in order, or an
 *   empty array when `n` is below 1 or larger than `a.length`.
 */
let ngrams = (a, n) =>
    // The window count is computed explicitly: slicing with `1 - n` as the end
    // index collapses to an empty range for n === 1 and drops every unigram.
    n >= 1 && n <= a.length
        ? Array.from({ length: a.length - n + 1 }, (_, i) => a.slice(i, i + n))
        : [];