bra*_*ipt 84 javascript encoding utf-8
我正在使用Javascript window.atob()
函数来解码base64编码的字符串(特别是GitHub API中的base64编码内容).问题是我得到了ASCII编码的字符(â¢
而不是™
).如何正确处理传入的base64编码流,以便将其解码为utf-8?
bra*_*ipt 218
有一篇关于Mozilla的MDN文档的精彩文章正好描述了这个问题:
"Unicode问题"因为
DOMString
s是16位编码的字符串,所以在大多数浏览window.btoa
Unicode字符串的浏览器中都会导致Character Out Of Range exception
if字符超出8位字节(0x00~0xFF)的范围.有两种方法可以解决这个问题:
- 第一个是逃避整个字符串(使用UTF-8,请参阅
encodeURIComponent
),然后对其进行编码;- 第二个是将UTF-16
DOMString
转换为UTF-8字符数组,然后对其进行编码.
关于以前解决方案的说明:MDN文章最初建议使用unescape
和escape
解决Character Out Of Range
异常问题,但它们已被弃用.这里的一些其他答案已经建议解决这个问题,decodeURIComponent
并且encodeURIComponent
已经证明这是不可靠和不可预测的.此答案的最新更新使用现代JavaScript函数来提高速度和现代化代码.
如果你想节省一些时间,你也可以考虑使用一个库:
function b64EncodeUnicode(str) {
// first we use encodeURIComponent to get percent-encoded UTF-8,
// then we convert the percent encodings into raw bytes which
// can be fed into btoa.
return btoa(encodeURIComponent(str).replace(/%([0-9A-F]{2})/g,
function toSolidBytes(match, p1) {
return String.fromCharCode('0x' + p1);
}));
}
b64EncodeUnicode('? à la mode'); // "4pyTIMOgIGxhIG1vZGU="
b64EncodeUnicode('\n'); // "Cg=="
Run Code Online (Sandbox Code Playgroud)
function b64DecodeUnicode(str) {
// Going backwards: from bytestream, to percent-encoding, to original string.
return decodeURIComponent(atob(str).split('').map(function(c) {
return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
}).join(''));
}
b64DecodeUnicode('4pyTIMOgIGxhIG1vZGU='); // "? à la mode"
b64DecodeUnicode('Cg=='); // "\n"
Run Code Online (Sandbox Code Playgroud)
以下是直接来自MDN的当前建议,以及@ MA-Maddin的一些额外TypeScript兼容性:
// Encoding UTF8 ? base64
function b64EncodeUnicode(str) {
return btoa(encodeURIComponent(str).replace(/%([0-9A-F]{2})/g, function(match, p1) {
return String.fromCharCode(parseInt(p1, 16))
}))
}
b64EncodeUnicode('? à la mode') // "4pyTIMOgIGxhIG1vZGU="
b64EncodeUnicode('\n') // "Cg=="
// Decoding base64 ? UTF8
function b64DecodeUnicode(str) {
return decodeURIComponent(Array.prototype.map.call(atob(str), function(c) {
return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2)
}).join(''))
}
b64DecodeUnicode('4pyTIMOgIGxhIG1vZGU=') // "? à la mode"
b64DecodeUnicode('Cg==') // "\n"
Run Code Online (Sandbox Code Playgroud)
这用过escape
和unescape
(现在已经弃用了,虽然这仍适用于所有现代浏览器):
function utf8_to_b64( str ) {
return window.btoa(unescape(encodeURIComponent( str )));
}
function b64_to_utf8( str ) {
return decodeURIComponent(escape(window.atob( str )));
}
// Usage:
utf8_to_b64('? à la mode'); // "4pyTIMOgIGxhIG1vZGU="
b64_to_utf8('4pyTIMOgIGxhIG1vZGU='); // "? à la mode"
Run Code Online (Sandbox Code Playgroud)
最后一件事:我在调用GitHub API时第一次遇到这个问题.为了得到这个在(移动)Safari中无法正确,其实我不得不从以base64源中去除所有的空白工作之前,我甚至可以解码源.这是否仍然与2017年有关,我不知道:
function b64_to_utf8( str ) {
str = str.replace(/\s/g, '');
return decodeURIComponent(escape(window.atob( str )));
}
Run Code Online (Sandbox Code Playgroud)
Ted*_*sen 17
事情会改变的.该逃逸/ UNESCAPE方法已被弃用.
您可以在对Base64进行编码之前对字符串进行URI编码.请注意,这不会产生Base64编码的UTF8,而是产生Base64编码的URL编码数据.双方必须就同一编码达成一致.
请参阅此处的工作示例:http://codepen.io/anon/pen/PZgbPW
// encode string
var base64 = window.btoa(encodeURIComponent('€ ?? æøåÆØÅ'));
// decode string
var str = decodeURIComponent(window.atob(tmp));
// str is now === '€ ?? æøåÆØÅ'
Run Code Online (Sandbox Code Playgroud)
对于OP的问题,第三方库如js-base64应该可以解决问题.
小智 16
对我有用的完整文章:https : //developer.mozilla.org/en-US/docs/Web/JavaScript/Base64_encoding_and_decoding
我们从 Unicode/UTF-8 编码的部分是
function utf8_to_b64( str ) {
return window.btoa(unescape(encodeURIComponent( str )));
}
function b64_to_utf8( str ) {
return decodeURIComponent(escape(window.atob( str )));
}
// Usage:
utf8_to_b64('? à la mode'); // "4pyTIMOgIGxhIG1vZGU="
b64_to_utf8('4pyTIMOgIGxhIG1vZGU='); // "? à la mode"
Run Code Online (Sandbox Code Playgroud)
这是当今最常用的方法之一。
小智 14
将 base64 解码为 UTF8 字符串
以下是@brandonscript 当前投票最多的答案
function b64DecodeUnicode(str) {
// Going backwards: from bytestream, to percent-encoding, to original string.
return decodeURIComponent(atob(str).split('').map(function(c) {
return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
}).join(''));
}
Run Code Online (Sandbox Code Playgroud)
上面的代码可以工作,但速度很慢。如果您的输入是一个非常大的 base64 字符串,例如 base64 html 文档的 30,000 个字符。这将需要大量计算。
这是我的答案,使用内置 TextDecoder,对于大输入,比上述代码快近 10 倍。
function decodeBase64(base64) {
const text = atob(base64);
const length = text.length;
const bytes = new Uint8Array(length);
for (let i = 0; i < length; i++) {
bytes[i] = text.charCodeAt(i);
}
const decoder = new TextDecoder(); // default is utf-8
return decoder.decode(bytes);
}
Run Code Online (Sandbox Code Playgroud)
Ric*_*lli 10
如果将字符串视为字节更多是你的事情,你可以使用以下函数
function u_atob(ascii) {
return Uint8Array.from(atob(ascii), c => c.charCodeAt(0));
}
function u_btoa(buffer) {
var binary = [];
var bytes = new Uint8Array(buffer);
for (var i = 0, il = bytes.byteLength; i < il; i++) {
binary.push(String.fromCharCode(bytes[i]));
}
return btoa(binary.join(''));
}
// example, it works also with astral plane characters such as ''
var encodedString = new TextEncoder().encode('?');
var base64String = u_btoa(encodedString);
console.log('?' === new TextDecoder().decode(u_atob(base64String)))
Run Code Online (Sandbox Code Playgroud)
这是Mozilla 开发资源中描述的 2018 年更新的解决方案
从 Unicode 编码到 B64
function b64EncodeUnicode(str) {
// first we use encodeURIComponent to get percent-encoded UTF-8,
// then we convert the percent encodings into raw bytes which
// can be fed into btoa.
return btoa(encodeURIComponent(str).replace(/%([0-9A-F]{2})/g,
function toSolidBytes(match, p1) {
return String.fromCharCode('0x' + p1);
}));
}
b64EncodeUnicode('? à la mode'); // "4pyTIMOgIGxhIG1vZGU="
b64EncodeUnicode('\n'); // "Cg=="
Run Code Online (Sandbox Code Playgroud)
从 B64 解码为 Unicode
function b64DecodeUnicode(str) {
// Going backwards: from bytestream, to percent-encoding, to original string.
return decodeURIComponent(atob(str).split('').map(function(c) {
return '%' + ('00' + c.charCodeAt(0).toString(16)).slice(-2);
}).join(''));
}
b64DecodeUnicode('4pyTIMOgIGxhIG1vZGU='); // "? à la mode"
b64DecodeUnicode('Cg=='); // "\n"
Run Code Online (Sandbox Code Playgroud)
我假设人们可能想要一种生成广泛使用的 base64 URI 的解决方案。请访问data:text/plain;charset=utf-8;base64,4pi44pi54pi64pi74pi84pi+4pi/
查看演示(复制数据uri,打开新标签页,将数据URI粘贴到地址栏中,然后按回车进入该页面)。尽管这个 URI 是 base64 编码的,浏览器仍然能够识别高代码点并正确解码它们。缩小后的编码器+解码器为 1058 字节(+Gzip?589 字节)
!function(e){"use strict";function h(b){var a=b.charCodeAt(0);if(55296<=a&&56319>=a)if(b=b.charCodeAt(1),b===b&&56320<=b&&57343>=b){if(a=1024*(a-55296)+b-56320+65536,65535<a)return d(240|a>>>18,128|a>>>12&63,128|a>>>6&63,128|a&63)}else return d(239,191,189);return 127>=a?inputString:2047>=a?d(192|a>>>6,128|a&63):d(224|a>>>12,128|a>>>6&63,128|a&63)}function k(b){var a=b.charCodeAt(0)<<24,f=l(~a),c=0,e=b.length,g="";if(5>f&&e>=f){a=a<<f>>>24+f;for(c=1;c<f;++c)a=a<<6|b.charCodeAt(c)&63;65535>=a?g+=d(a):1114111>=a?(a-=65536,g+=d((a>>10)+55296,(a&1023)+56320)):c=0}for(;c<e;++c)g+="\ufffd";return g}var m=Math.log,n=Math.LN2,l=Math.clz32||function(b){return 31-m(b>>>0)/n|0},d=String.fromCharCode,p=atob,q=btoa;e.btoaUTF8=function(b,a){return q((a?"\u00ef\u00bb\u00bf":"")+b.replace(/[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g,h))};e.atobUTF8=function(b,a){a||"\u00ef\u00bb\u00bf"!==b.substring(0,3)||(b=b.substring(3));return p(b).replace(/[\xc0-\xff][\x80-\xbf]*/g,k)}}(""+void 0==typeof global?""+void 0==typeof self?this:self:global)
Run Code Online (Sandbox Code Playgroud)
下面是用于生成它的源代码。
var fromCharCode = String.fromCharCode;
var btoaUTF8 = (function(btoa, replacer){"use strict";
return function(inputString, BOMit){
return btoa((BOMit ? "\xEF\xBB\xBF" : "") + inputString.replace(
/[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g, replacer
));
}
})(btoa, function(nonAsciiChars){"use strict";
// make the UTF string into a binary UTF-8 encoded string
var point = nonAsciiChars.charCodeAt(0);
if (point >= 0xD800 && point <= 0xDBFF) {
var nextcode = nonAsciiChars.charCodeAt(1);
if (nextcode !== nextcode) // NaN because string is 1 code point long
return fromCharCode(0xef/*11101111*/, 0xbf/*10111111*/, 0xbd/*10111101*/);
// https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
if (nextcode >= 0xDC00 && nextcode <= 0xDFFF) {
point = (point - 0xD800) * 0x400 + nextcode - 0xDC00 + 0x10000;
if (point > 0xffff)
return fromCharCode(
(0x1e/*0b11110*/<<3) | (point>>>18),
(0x2/*0b10*/<<6) | ((point>>>12)&0x3f/*0b00111111*/),
(0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
(0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
);
} else return fromCharCode(0xef, 0xbf, 0xbd);
}
if (point <= 0x007f) return nonAsciiChars;
else if (point <= 0x07ff) {
return fromCharCode((0x6<<5)|(point>>>6), (0x2<<6)|(point&0x3f));
} else return fromCharCode(
(0xe/*0b1110*/<<4) | (point>>>12),
(0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
(0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
);
});
Run Code Online (Sandbox Code Playgroud)
然后,要解码 base64 数据,HTTP 将数据作为数据 URI 获取或使用下面的函数。
var clz32 = Math.clz32 || (function(log, LN2){"use strict";
return function(x) {return 31 - log(x >>> 0) / LN2 | 0};
})(Math.log, Math.LN2);
var fromCharCode = String.fromCharCode;
var atobUTF8 = (function(atob, replacer){"use strict";
return function(inputString, keepBOM){
inputString = atob(inputString);
if (!keepBOM && inputString.substring(0,3) === "\xEF\xBB\xBF")
inputString = inputString.substring(3); // eradicate UTF-8 BOM
// 0xc0 => 0b11000000; 0xff => 0b11111111; 0xc0-0xff => 0b11xxxxxx
// 0x80 => 0b10000000; 0xbf => 0b10111111; 0x80-0xbf => 0b10xxxxxx
return inputString.replace(/[\xc0-\xff][\x80-\xbf]*/g, replacer);
}
})(atob, function(encoded){"use strict";
var codePoint = encoded.charCodeAt(0) << 24;
var leadingOnes = clz32(~codePoint);
var endPos = 0, stringLen = encoded.length;
var result = "";
if (leadingOnes < 5 && stringLen >= leadingOnes) {
codePoint = (codePoint<<leadingOnes)>>>(24+leadingOnes);
for (endPos = 1; endPos < leadingOnes; ++endPos)
codePoint = (codePoint<<6) | (encoded.charCodeAt(endPos)&0x3f/*0b00111111*/);
if (codePoint <= 0xFFFF) { // BMP code point
result += fromCharCode(codePoint);
} else if (codePoint <= 0x10FFFF) {
// https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
codePoint -= 0x10000;
result += fromCharCode(
(codePoint >> 10) + 0xD800, // highSurrogate
(codePoint & 0x3ff) + 0xDC00 // lowSurrogate
);
} else endPos = 0; // to fill it in with INVALIDs
}
for (; endPos < stringLen; ++endPos) result += "\ufffd"; // replacement character
return result;
});
Run Code Online (Sandbox Code Playgroud)
更标准的好处是这个编码器和这个解码器应用更广泛,因为它们可以用作正确显示的有效 URL。观察。
!function(e){"use strict";function h(b){var a=b.charCodeAt(0);if(55296<=a&&56319>=a)if(b=b.charCodeAt(1),b===b&&56320<=b&&57343>=b){if(a=1024*(a-55296)+b-56320+65536,65535<a)return d(240|a>>>18,128|a>>>12&63,128|a>>>6&63,128|a&63)}else return d(239,191,189);return 127>=a?inputString:2047>=a?d(192|a>>>6,128|a&63):d(224|a>>>12,128|a>>>6&63,128|a&63)}function k(b){var a=b.charCodeAt(0)<<24,f=l(~a),c=0,e=b.length,g="";if(5>f&&e>=f){a=a<<f>>>24+f;for(c=1;c<f;++c)a=a<<6|b.charCodeAt(c)&63;65535>=a?g+=d(a):1114111>=a?(a-=65536,g+=d((a>>10)+55296,(a&1023)+56320)):c=0}for(;c<e;++c)g+="\ufffd";return g}var m=Math.log,n=Math.LN2,l=Math.clz32||function(b){return 31-m(b>>>0)/n|0},d=String.fromCharCode,p=atob,q=btoa;e.btoaUTF8=function(b,a){return q((a?"\u00ef\u00bb\u00bf":"")+b.replace(/[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g,h))};e.atobUTF8=function(b,a){a||"\u00ef\u00bb\u00bf"!==b.substring(0,3)||(b=b.substring(3));return p(b).replace(/[\xc0-\xff][\x80-\xbf]*/g,k)}}(""+void 0==typeof global?""+void 0==typeof self?this:self:global)
Run Code Online (Sandbox Code Playgroud)
var fromCharCode = String.fromCharCode;
var btoaUTF8 = (function(btoa, replacer){"use strict";
return function(inputString, BOMit){
return btoa((BOMit ? "\xEF\xBB\xBF" : "") + inputString.replace(
/[\x80-\uD7ff\uDC00-\uFFFF]|[\uD800-\uDBFF][\uDC00-\uDFFF]?/g, replacer
));
}
})(btoa, function(nonAsciiChars){"use strict";
// make the UTF string into a binary UTF-8 encoded string
var point = nonAsciiChars.charCodeAt(0);
if (point >= 0xD800 && point <= 0xDBFF) {
var nextcode = nonAsciiChars.charCodeAt(1);
if (nextcode !== nextcode) // NaN because string is 1 code point long
return fromCharCode(0xef/*11101111*/, 0xbf/*10111111*/, 0xbd/*10111101*/);
// https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
if (nextcode >= 0xDC00 && nextcode <= 0xDFFF) {
point = (point - 0xD800) * 0x400 + nextcode - 0xDC00 + 0x10000;
if (point > 0xffff)
return fromCharCode(
(0x1e/*0b11110*/<<3) | (point>>>18),
(0x2/*0b10*/<<6) | ((point>>>12)&0x3f/*0b00111111*/),
(0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
(0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
);
} else return fromCharCode(0xef, 0xbf, 0xbd);
}
if (point <= 0x007f) return nonAsciiChars;
else if (point <= 0x07ff) {
return fromCharCode((0x6<<5)|(point>>>6), (0x2<<6)|(point&0x3f));
} else return fromCharCode(
(0xe/*0b1110*/<<4) | (point>>>12),
(0x2/*0b10*/<<6) | ((point>>>6)&0x3f/*0b00111111*/),
(0x2/*0b10*/<<6) | (point&0x3f/*0b00111111*/)
);
});
Run Code Online (Sandbox Code Playgroud)
var clz32 = Math.clz32 || (function(log, LN2){"use strict";
return function(x) {return 31 - log(x >>> 0) / LN2 | 0};
})(Math.log, Math.LN2);
var fromCharCode = String.fromCharCode;
var atobUTF8 = (function(atob, replacer){"use strict";
return function(inputString, keepBOM){
inputString = atob(inputString);
if (!keepBOM && inputString.substring(0,3) === "\xEF\xBB\xBF")
inputString = inputString.substring(3); // eradicate UTF-8 BOM
// 0xc0 => 0b11000000; 0xff => 0b11111111; 0xc0-0xff => 0b11xxxxxx
// 0x80 => 0b10000000; 0xbf => 0b10111111; 0x80-0xbf => 0b10xxxxxx
return inputString.replace(/[\xc0-\xff][\x80-\xbf]*/g, replacer);
}
})(atob, function(encoded){"use strict";
var codePoint = encoded.charCodeAt(0) << 24;
var leadingOnes = clz32(~codePoint);
var endPos = 0, stringLen = encoded.length;
var result = "";
if (leadingOnes < 5 && stringLen >= leadingOnes) {
codePoint = (codePoint<<leadingOnes)>>>(24+leadingOnes);
for (endPos = 1; endPos < leadingOnes; ++endPos)
codePoint = (codePoint<<6) | (encoded.charCodeAt(endPos)&0x3f/*0b00111111*/);
if (codePoint <= 0xFFFF) { // BMP code point
result += fromCharCode(codePoint);
} else if (codePoint <= 0x10FFFF) {
// https://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
codePoint -= 0x10000;
result += fromCharCode(
(codePoint >> 10) + 0xD800, // highSurrogate
(codePoint & 0x3ff) + 0xDC00 // lowSurrogate
);
} else endPos = 0; // to fill it in with INVALIDs
}
for (; endPos < stringLen; ++endPos) result += "\ufffd"; // replacement character
return result;
});
Run Code Online (Sandbox Code Playgroud)
除了非常规范之外,上面的代码片段也非常快。上面的代码片段不是必须在各种形式之间多次转换数据的间接连续链(例如在 Riccardo Galli 的响应中),而是尽可能直接地执行。它String.prototype.replace
在编码时只使用一个简单的快速调用来处理数据,在解码时只使用一个来解码数据。另一个优点是(尤其是对于大字符串),String.prototype.replace
允许浏览器自动处理调整字符串大小的底层内存管理,从而显着提升性能,尤其是在 Chrome 和 Firefox 等大量优化的常青浏览器中String.prototype.replace
. 最后,锦上添花的是拉丁文的 excl?s?v?用户,不包含任何高于 0x7f 的代码点的字符串处理起来特别快,因为替换算法不会修改该字符串。
我在https://github.com/anonyco/BestBase64EncoderDecoder/为这个解决方案创建了一个 github 存储库
归档时间: |
|
查看次数: |
124468 次 |
最近记录: |