chr*_*ude 10 javascript regex split
我有一些随机字符串,例如:Hello, my name is john..我希望将字符串拆分成这样的数组:Hello, ,, , my, name, is, john, .,.我试过了str.split(/[^\w\s]|_/g),但似乎没有用.有任何想法吗?
Rob*_*sch 19
在任意一段连续的非单词字符(即不属于 A-Z、a-z、0-9 和下划线的字符)处拆分 str。
var words=str.split(/\W+/); // assumes str neither begins nor ends with a non-word character (whitespace, punctuation, ...); otherwise the result has an empty string at that end
Run Code Online (Sandbox Code Playgroud)
或者,假设您的目标语言是英语,您可以使用以下方法从字符串中提取所有在语义上有用的值(即对字符串进行"标记化"/分词):
// Sample sentence fed to the tokenizer.
var str=[
  'Here\'s a (good, bad, indifferent, ...) ',
  'example sentence to be used in this test ',
  'of English language "token-extraction".'
].join('');

// JavaScript regexes do not support POSIX character classes, so this is a
// hand-rolled stand-in for [:punct:]: every character of the literal below,
// each one prefixed with a backslash. It is wrapped in [ ] where it is used.
var punct='[!"#$%&\'()*+,\\-./:;<=>?@[]^_`{|}~]'.split('').map(function(ch) {
  return '\\'+ch;
}).join('');

// Tokenizer: optional leading whitespace (discarded) followed by one captured
// token. The alternatives are tried in order, so the multi-character forms
// must be listed before the single punctuation character.
var re=new RegExp(
  '\\s*('+[
    '\\.{3}',             // ellipsis
    '\\w+\\-\\w+',        // hyphenated words
    '\\w+\'(?:\\w+)?',    // compound words (apostrophe, optional suffix)
    '\\w+',               // other words
    '['+punct+']'         // a single punctuation character
  ].join('|')+')'
);
// grep(ary[,filt]) - filters an array
// note: could use jQuery.grep() or Array.prototype.filter() instead
// @param {Array} ary array of members to filter
// @param {Function} [filt] function to test truthiness of member;
// if omitted (or not a function), the member's own truthiness is used
// @returns {Array} new array of all members of ary, in order, for which the
// filter result is truthy; ary itself is not modified
function grep(ary,filt) {
  var result=[];
  // typeof yields the lowercase string 'function'; decide once, outside the loop
  var hasFilter=(typeof filt === 'function');
  // increment in the loop's update clause so that index 0 is visited and
  // the loop never reads past the last element
  for(var i=0,len=ary.length;i<len;i++) {
    var member=ary[i];
    if(hasFilter ? filt(member) : member) {
      result.push(member);
    }
  }
  return result;
}
// split() on a regex with a capture group returns the captured tokens interleaved
// with the text found between matches; grep() without a filter keeps only the
// truthy entries of that array.
var tokens=grep( str.split(re) ); // note: filter function omitted since truthiness is all we need to test for
Run Code Online (Sandbox Code Playgroud)
产生:
tokens=[
'Here\'s',
'a',
'(',
'good',
',',
'bad',
',',
'indifferent',
',',
'...',
')',
'example',
'sentence',
'to',
'be',
'used',
'in',
'this',
'test',
'of',
'English',
'language',
'"',
'token-extraction',
'"',
'.'
]
Run Code Online (Sandbox Code Playgroud)
编辑
也可作为Github Gist使用
试试这个(我不确定这是不是你想要的):
// 1) surround every character that is neither a word character nor whitespace
//    (and every underscore) with spaces -- the callback's first argument,
//    named $1 here, is the whole match;
// 2) collapse each run of spaces into a single space;
// 3) split on single spaces.
str.replace(/[^\w\s]|_/g, function ($1) { return ' ' + $1 + ' ';}).replace(/[ ]+/g, ' ').split(' ');
Run Code Online (Sandbox Code Playgroud)
尝试:
// split at each single underscore or non-word character; because the separator
// is inside a capture group, split() also returns the separators themselves
str.split(/([_\W])/)
Run Code Online (Sandbox Code Playgroud)
这会按任何非字母数字字符(\W)以及任何下划线来拆分字符串。它使用捕获括号,使用于拆分的分隔符本身也包含在最终结果中。
| 归档时间: |
|
| 查看次数: |
18543 次 |
| 最近记录: |