charCodeAt函数返回字符(character)的Unicode代码.但我想得到字节数组.我知道,如果charcode超过127,那么该字符将存储在两个或更多字节中.
// Collect the numeric UTF-16 code unit found at every index of `str`.
// Note these are code units, not UTF-8 bytes: values may exceed 255.
var arr = [];
var i = 0;
while (i < str.length) {
    arr.push(str.charCodeAt(i));
    i += 1;
}
Run Code Online (Sandbox Code Playgroud)
Jon*_*oni 59
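在UTF-8中编码Unicode的逻辑基本上是:码点小于U+0080用1个字节;小于U+0800用2个字节;基本多文种平面(BMP)中其余的码点用3个字节;U+10000及以上的码点(在UTF-16中表示为代理对)用4个字节.多字节序列的首字节分别以110、1110或11110开头,后续字节均以10开头.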
这是我之前编写的、用于将JavaScript的UTF-16字符串编码为UTF-8的函数:
/**
 * Encodes a JavaScript string (a sequence of UTF-16 code units) as UTF-8.
 *
 * Well-formed surrogate pairs are combined into a single code point and
 * emitted as a 4-byte sequence. An unpaired surrogate (a high surrogate not
 * followed by a low one, or a low surrogate on its own) cannot be represented
 * in UTF-8, so it is replaced by U+FFFD (bytes EF BF BD) without consuming
 * the following code unit.
 *
 * @param {string} str - The string to encode.
 * @returns {number[]} The UTF-8 encoding as an array of byte values (0-255).
 */
function toUTF8Array(str) {
    var utf8 = [];
    for (var i = 0; i < str.length; i++) {
        var charcode = str.charCodeAt(i);
        if (charcode < 0x80) {
            // 1 byte: 0xxxxxxx
            utf8.push(charcode);
        } else if (charcode < 0x800) {
            // 2 bytes: 110xxxxx 10xxxxxx
            utf8.push(0xc0 | (charcode >> 6),
                      0x80 | (charcode & 0x3f));
        } else if (charcode < 0xd800 || charcode >= 0xe000) {
            // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx (BMP, outside surrogates)
            utf8.push(0xe0 | (charcode >> 12),
                      0x80 | ((charcode >> 6) & 0x3f),
                      0x80 | (charcode & 0x3f));
        } else {
            // Surrogate range: only a high surrogate (D800-DBFF) immediately
            // followed by a low surrogate (DC00-DFFF) forms a valid pair.
            var next = (i + 1 < str.length) ? str.charCodeAt(i + 1) : 0;
            var isPair = charcode < 0xdc00 && next >= 0xdc00 && next < 0xe000;
            if (isPair) {
                i++;
                // UTF-16 encodes 0x10000-0x10FFFF by subtracting 0x10000 and
                // splitting the remaining 20 bits into two 10-bit halves.
                charcode = 0x10000 + (((charcode & 0x3ff) << 10)
                                      | (next & 0x3ff));
                // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                utf8.push(0xf0 | (charcode >> 18),
                          0x80 | ((charcode >> 12) & 0x3f),
                          0x80 | ((charcode >> 6) & 0x3f),
                          0x80 | (charcode & 0x3f));
            } else {
                // Unpaired surrogate: emit U+FFFD REPLACEMENT CHARACTER and
                // leave the next code unit to be encoded on its own.
                utf8.push(0xef, 0xbf, 0xbd);
            }
        }
    }
    return utf8;
}
Run Code Online (Sandbox Code Playgroud)
Jon*_*ski 37
JavaScript的字符串(String)以UTF-16存储.要获得UTF-8,您必须自己转换字符串.
一种方法是将encodeURIComponent()(它输出经URL编码的UTF-8字节)与unescape()结合使用,如ecmanaut所述.
// encodeURIComponent() percent-encodes the UTF-8 bytes of `str`; unescape()
// then maps each %XX escape to a single character with that code, so `utf8`
// holds one character per UTF-8 byte.
var utf8 = unescape(encodeURIComponent(str));
// Read the byte values back out of that "binary string".
var arr = [];
var i = 0;
while (i < utf8.length) {
    arr.push(utf8.charCodeAt(i));
    i += 1;
}
Run Code Online (Sandbox Code Playgroud)
bry*_*ryc 11
新的Encoding API似乎可以让您轻松编码和解码UTF-8(使用类型化数组):
// TextEncoder always produces UTF-8 and its constructor takes no arguments,
// so no encoding label is passed (a label would simply be ignored).
// encode() returns a Uint8Array of UTF-8 bytes.
var encoded = new TextEncoder().encode("???? ??? ?????");
// TextDecoder does accept a label; decode the bytes back into a string.
var decoded = new TextDecoder("utf-8").decode(encoded);
console.log(encoded, decoded);
Run Code Online (Sandbox Code Playgroud)
浏览器支持也不错,但Microsoft Edge目前不支持它.有一个polyfill应该在IE11和Edge中工作.
API也支持许多不同的编码.我使用它来解码/编码PS2存储卡中的日文文本(Shift-JIS):
// Wrap `textbuffer` in a Uint8Array and decode its bytes as Shift-JIS text.
// NOTE(review): `textbuffer` is defined elsewhere; presumably an ArrayBuffer
// or an array of byte values - confirm against the caller.
new TextDecoder("shift-jis").decode(new Uint8Array(textbuffer))
Run Code Online (Sandbox Code Playgroud)
Google Closure库具有转换为UTF-8和字节数组的功能.如果您不想使用整个库,可以从此处复制功能.为完整起见,将字符串转换为UTF-8字节数组的代码为:
/**
 * Converts a JavaScript string into an array of UTF-8 byte values.
 *
 * A high surrogate immediately followed by a low surrogate is combined into
 * one code point and written as four bytes. Any other code unit of 0x800 or
 * above, including an unpaired surrogate, is written as three bytes.
 *
 * @param {string} str The string to convert.
 * @return {!Array<number>} The UTF-8 bytes of the string, one number per byte.
 */
goog.crypt.stringToUtf8ByteArray = function(str) {
    // TODO(user): Use native implementations if/when available
    var bytes = [];
    var index = 0;
    while (index < str.length) {
        var unit = str.charCodeAt(index);
        index += 1;

        if (unit < 0x80) {
            // Single byte.
            bytes.push(unit);
            continue;
        }

        if (unit < 0x800) {
            // Two bytes.
            bytes.push(0xC0 | (unit >> 6), 0x80 | (unit & 0x3F));
            continue;
        }

        // `index` now points at the code unit after `unit`.
        var isLead = (unit & 0xFC00) === 0xD800;
        if (isLead && index < str.length &&
            (str.charCodeAt(index) & 0xFC00) === 0xDC00) {
            // Surrogate pair: merge both halves and consume the trail unit.
            var trail = str.charCodeAt(index);
            index += 1;
            var codePoint = 0x10000 + ((unit & 0x03FF) << 10) + (trail & 0x03FF);
            bytes.push(
                0xF0 | (codePoint >> 18),
                0x80 | ((codePoint >> 12) & 0x3F),
                0x80 | ((codePoint >> 6) & 0x3F),
                0x80 | (codePoint & 0x3F));
            continue;
        }

        // Three bytes.
        bytes.push(
            0xE0 | (unit >> 12),
            0x80 | ((unit >> 6) & 0x3F),
            0x80 | (unit & 0x3F));
    }
    return bytes;
};
Run Code Online (Sandbox Code Playgroud)
假设问题是以DOMString作为输入,而目标是得到一个数组,该数组在被解释为字符串时(例如写入磁盘上的文件)是UTF-8编码的:
现在几乎所有现代浏览器都支持Typed Arrays,如果没有列出这种方法,那就太可惜了:
先用DOMString创建一个Blob,再用FileReader的.readAsArrayBuffer()方法将该Blob转换为ArrayBuffer.例:
// Create a Blob with an Euro-char (U+20AC); the string part is stored as
// UTF-8 bytes inside the Blob.
var b = new Blob(['€']);
var fr = new FileReader();
// Runs once the read started below has finished; fr.result is then the
// ArrayBuffer holding the Blob's bytes.
fr.onload = function() {
    // Declared locally: assigning to an undeclared `ua` would create an
    // implicit global and throws a ReferenceError in strict mode / modules.
    var ua = new Uint8Array(fr.result);
    // This will log "3|226|130|172"
    //                  E2  82  AC
    // In UTF-16, it would be only 2 bytes long
    console.log(
        fr.result.byteLength + '|' +
        ua[0] + '|' +
        ua[1] + '|' +
        ua[2] + ''
    );
};
// Start the asynchronous read of the Blob into an ArrayBuffer.
fr.readAsArrayBuffer(b);
Run Code Online (Sandbox Code Playgroud)
在JSFiddle上玩它.我还没有对此进行基准测试,但我可以想象这对于大型DOMStrings作为输入是有效的.
| 归档时间: |
|
| 查看次数: |
96588 次 |
| 最近记录: |