使用Javascript解码UTF-8

Question

使用Javascript解码UTF-8

Jar*_*son 44 javascript unicode utf8-decode xhtml-transitional

我的XHTML网页中有一段Javascript,它传递的是UTF-8编码的字符串.它需要继续传递UTF-8版本,同时还要对其进行解码.如何解码UTF-8字符串以便显示？

<script type="text/javascript">
// <![CDATA[
/**
 * Render the received user name into the element with id "userId".
 *
 * The name is shown twice: once labelled "Encoded" and once labelled
 * "Decoded". No decoding is performed yet -- the "decoded" value is simply a
 * copy of the received one; this is the placeholder the question asks about.
 *
 * @param {string} usernameSent - The user name as delivered to the page.
 */
function updateUser(usernameSent){
    // Current value: GrÃƒÂ¶ÃƒÂŸe
    var rawName = usernameSent;
    // Decode to: Größe (still a plain copy at this point)
    var shownName = rawName;

    var markup = 'Encoded: ' + rawName + '<br />Decoded: ' + shownName;

    var target = document.getElementById('userId');
    target.innerHTML = markup;
}
// ]]>
</script>

Run Code Online (Sandbox Code Playgroud)

Answer 1

Cpn*_*nch 117

回答原始问题:这是你如何在javascript中解码utf-8:

http://ecmanaut.blogspot.ca/2006/07/encoding-decoding-utf8-in-javascript.html

特别,

/**
 * Convert a JavaScript string into a "binary string" whose characters each
 * carry one byte (0-255) of the UTF-8 encoding of the input.
 *
 * @param {string} s - Text to encode.
 * @returns {string} One character per UTF-8 byte.
 */
function encode_utf8(s) {
  // encodeURIComponent emits the UTF-8 bytes as %XX escapes ...
  var percentEncoded = encodeURIComponent(s);
  // ... and unescape turns every %XX back into a single character.
  var byteString = unescape(percentEncoded);
  return byteString;
}

/**
 * Interpret a "binary string" (one character per byte) as UTF-8 and return
 * the decoded text.
 *
 * @param {string} s - String whose characters hold UTF-8 byte values.
 * @returns {string} The decoded text.
 * @throws {URIError} If the bytes do not form valid UTF-8.
 */
function decode_utf8(s) {
  // escape renders each byte-valued character as a %XX escape ...
  var percentEncoded = escape(s);
  // ... which decodeURIComponent then reads back as UTF-8.
  var text = decodeURIComponent(percentEncoded);
  return text;
}

Run Code Online (Sandbox Code Playgroud)

我只是在我的代码中使用它,它完美无缺.

这对我有用.但要知道escape方法已经被弃用了.我们使用的是TypeScript,默认情况下它不存在.那么替代escape的最佳选择是什么？在这种情况下,encodeURI和encodeURIComponent不能替代escape,因为它们产生的输出不同. (3认同)
当已弃用的功能实际上有用时，防止其被删除的最佳方法是继续使用它，而不是避免使用它。浏览器供应商使用使用情况统计数据来确定何时删除某个功能。 (2认同)

Answer 2

Alb*_*ert 21

这应该工作:

// http://www.onicos.com/staff/iz/amuse/javascript/expert/utf.txt

/* utf.js - UTF-8 <=> UTF-16 convertion
 *
 * Copyright (C) 1999 Masanao Izumo <iz@onicos.co.jp>
 * Version: 1.0
 * LastModified: Dec 25 1999
 * This library is free.  You can redistribute it and/or modify it.
 */

/**
 * Decode an array of UTF-8 bytes into a JavaScript (UTF-16) string.
 *
 * Handles 1-, 2-, 3- and 4-byte sequences. Code points above U+FFFF
 * (4-byte sequences) are emitted as a UTF-16 surrogate pair. Bytes that
 * cannot start a sequence (stray continuation bytes, 0xF8-0xFF) and 4-byte
 * sequences outside U+10000..U+10FFFF are skipped silently.
 *
 * @param {Array<number>|Uint8Array} array - UTF-8 bytes (values 0-255).
 * @returns {string} The decoded text.
 */
function Utf8ArrayToStr(array) {
    var out, i, len, c;
    var char2, char3, char4, codePoint;

    out = "";
    len = array.length;
    i = 0;
    while (i < len) {
        c = array[i++];
        // The top four bits of the lead byte determine the sequence length.
        switch (c >> 4) {
            case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
                // 0xxxxxxx
                out += String.fromCharCode(c);
                break;
            case 12: case 13:
                // 110x xxxx   10xx xxxx
                char2 = array[i++];
                out += String.fromCharCode(((c & 0x1F) << 6) | (char2 & 0x3F));
                break;
            case 14:
                // 1110 xxxx  10xx xxxx  10xx xxxx
                char2 = array[i++];
                char3 = array[i++];
                out += String.fromCharCode(((c & 0x0F) << 12) |
                                           ((char2 & 0x3F) << 6) |
                                           ((char3 & 0x3F) << 0));
                break;
            case 15:
                // 1111 0xxx  10xx xxxx  10xx xxxx  10xx xxxx
                if (c & 0x08) {
                    // 0xF8-0xFF never start a valid UTF-8 sequence.
                    break;
                }
                char2 = array[i++];
                char3 = array[i++];
                char4 = array[i++];
                codePoint = ((c & 0x07) << 18) |
                            ((char2 & 0x3F) << 12) |
                            ((char3 & 0x3F) << 6) |
                            (char4 & 0x3F);
                if (codePoint < 0x10000 || codePoint > 0x10FFFF) {
                    // Overlong or out of the Unicode range: drop it.
                    break;
                }
                // String.fromCharCode takes UTF-16 code units, so split the
                // code point into a high/low surrogate pair.
                codePoint -= 0x10000;
                out += String.fromCharCode(0xD800 + (codePoint >> 10),
                                           0xDC00 + (codePoint & 0x3FF));
                break;
        }
    }

    return out;
}

Run Code Online (Sandbox Code Playgroud)

查看JSFiddle演示.

另请参阅相关问题:此处和此处

Upvote实际了解解码UTF-8是什么. (7认同)
该代码不正确。`fromCharCode` 接受的是UTF-16值，因此您需要先将其转换为UTF-16。 (2认同)

Answer 3

Jon*_*han 13

也许使用textDecoder就足够了。

虽然在 IE 中不支持。

// Create a decoder for UTF-8 input.
var decoder = new TextDecoder('utf-8');

// Decode the payload carried by `message` (supplied by the surrounding
// code) into a string.
var decodedMessage = decoder.decode(message.data);

Run Code Online (Sandbox Code Playgroud)

处理非 UTF8 文本

在这个例子中，我们解码俄语文本“Привет, мир!”，意思是“Hello, world”。在 TextDecoder() 构造函数中，我们指定适用于西里尔文脚本的 Windows-1251 字符编码。

// The label given to the constructor selects the encoding of the input
// bytes; 'windows-1251' is the Cyrillic code page this example is about.
var decoder = new TextDecoder('windows-1251'),
    decodedMessage;

// `message` is supplied by the surrounding code; presumably its `data` is a
// buffer of Windows-1251 bytes -- confirm against the caller.
decodedMessage = decoder.decode(message.data);

Run Code Online (Sandbox Code Playgroud)

此处描述了 TextDecoder 的接口。

从字符串中检索字节数组同样简单：

    // Decoder for the Windows-1251 (Cyrillic) code page.
    let win1251decoder = new TextDecoder('windows-1251');
    // Byte values to decode with that code page.
    let bytes = Uint8Array.of(
        207, 240, 232, 226, 229, 242, 44, 32, 236, 232, 240, 33
    );
    console.log(win1251decoder.decode(bytes)); // Привет, мир!

Run Code Online (Sandbox Code Playgroud)

如果您使用不同的编码，则必须在编码时对其进行补偿。TextEncoder 的构造函数中的参数是此处列出的任何一种有效编码。

@ÁlvaroGonzález 但它可以工作并且可能是标准的（未来的浏览器也需要支持这一点，好吗？） (3认同)
现在这不是实验性的，在所有现代浏览器中都有很好的支持，并且绝对是每个人的正确选择（除非你仍然必须支持 IE） (3认同)

Answer 4

小智 9

这是我用比“UTF-8 编码/解码”更具体的关键词在 Google 搜索后找到的。因此，对于那些正在寻找能在不同编码之间进行转换的库的人来说，就是它了。

https://github.com/inexorabletash/text-encoding

// Encode the string `str` into a byte array with a default-constructed
// TextEncoder.
var uint8array = new TextEncoder().encode(str);
// Decode the byte array back into a string. `encoding` is not defined in this
// snippet; presumably it holds an encoding label such as 'utf-8' -- confirm
// against the caller.
var str = new TextDecoder(encoding).decode(uint8array);

Run Code Online (Sandbox Code Playgroud)

从仓库自述文件粘贴

支持编码规范中的所有编码：

utf-8 ibm866 iso-8859-2 iso-8859-3 iso-8859-4 iso-8859-5 iso-8859-6 iso-8859-7 iso-8859-8 iso-8859-8-i iso-8859-10 iso-8859-13 iso-8859-14 iso-8859-15 iso-8859-16 koi8-r koi8-u macintosh windows-874 windows-1250 windows-1251 windows-1252 windows-1253 windows-1254 windows-1255 windows-1256 windows-1257 windows-1258 x-mac-cyrillic gb18030 hz-gb-2312 big5 euc-jp iso-2022-jp shift_jis euc-kr replacement utf-16be utf-16le x-user-defined

（某些编码可能以其他名称得到支持，例如 ascii、iso-8859-1 等。请参阅编码以了解每种编码的附加标签。）

Answer 5

fak*_*ake 6

@albert的解决方案是我认为最接近的,但它最多只能解析3个字节的utf-8字符

/**
 * Decode a chunk of UTF-8 bytes into a JavaScript (UTF-16) string.
 *
 * Designed for chunked input: when the chunk ends in the middle of a
 * multi-byte sequence, the bytes of that unfinished sequence are returned in
 * `leftovers` so the caller can prepend them to the next chunk.
 *
 * Invalid bytes are ignored: stray continuation bytes, lead bytes 0xF8-0xFF
 * (UTF-8 sequences are at most 4 bytes long) and 4-byte sequences that
 * decode to a value above U+10FFFF are skipped.
 *
 * @param {Array<number>|Uint8Array} array - UTF-8 bytes (values 0-255).
 * @returns {{result: string, leftovers: (Array<number>|Uint8Array)}}
 *   `result` is the decoded text; `leftovers` holds the trailing bytes of an
 *   incomplete sequence (empty array when the chunk ended cleanly).
 */
function utf8ArrayToStr(array) {
  var out = "";
  var len = array.length;
  var i = 0;
  var c, extraLength, leftovers, res, nextChar, count, high, low;

  while (i < len) {
    c = array[i++];

    if (c >> 7 === 0) {
      // 0xxx xxxx
      out += String.fromCharCode(c);
      continue;
    }

    // #### MULTIBYTE ####
    // How many continuation bytes follow this lead byte?
    if (c >> 5 === 0x06) {
      // 110x xxxx
      extraLength = 1;
    } else if (c >> 4 === 0x0e) {
      // 1110 xxxx
      extraLength = 2;
    } else if (c >> 3 === 0x1e) {
      // 1111 0xxx
      extraLength = 3;
    } else {
      // Stray continuation byte (10xx xxxx) or 0xF8-0xFF: not a valid
      // starting byte, skip it.
      continue;
    }

    // Do we have enough bytes in our data?
    if (i + extraLength > len) {
      leftovers = array.slice(i - 1);

      // If there is an invalid byte in the leftovers, the sequence can never
      // be completed; resume decoding from that byte instead.
      for (; i < len; i++) {
        if (array[i] >> 6 !== 0x02) {
          break;
        }
      }
      if (i !== len) {
        continue;
      }

      // All leftover bytes are valid continuation bytes.
      return {result: out, leftovers: leftovers};
    }

    // Strip the length prefix from the lead byte, keeping its payload bits.
    res = c & (0xff >> (extraLength + 2));

    for (count = 0; count < extraLength; count++) {
      nextChar = array[i++];

      // Is this byte a valid continuation byte (10xx xxxx)?
      if (nextChar >> 6 !== 0x02) {
        break;
      }
      res = (res << 6) | (nextChar & 0x3f);
    }

    if (count !== extraLength) {
      // Step back so the offending byte is examined as a new starting byte.
      i--;
      continue;
    }

    if (res > 0x10ffff) {
      // Beyond the Unicode range; it cannot be represented in UTF-16.
      continue;
    }

    if (res <= 0xffff) {
      out += String.fromCharCode(res);
      continue;
    }

    // Supplementary plane: emit a UTF-16 surrogate pair.
    res -= 0x10000;
    high = ((res >> 10) & 0x3ff) + 0xd800;
    low = (res & 0x3ff) + 0xdc00;
    out += String.fromCharCode(high, low);
  }

  return {result: out, leftovers: []};
}

Run Code Online (Sandbox Code Playgroud)

{result: "parsed string", leftovers: [list of invalid bytes at the end]}如果您正在以块的形式解析字符串,则会返回此值.

编辑:修复了@unhammer找到的问题.

Answer 6

Mat*_*oss 6

这是一个能处理所有Unicode代码点(包括高位的4字节值)的解决方案,并且受所有现代浏览器支持(IE以及其他高于5.5版本的浏览器).它使用decodeURIComponent(),但不使用已弃用的escape/unescape函数:

/**
 * Decode an array of UTF-8 bytes into a string by percent-encoding every
 * byte and handing the result to decodeURIComponent().
 *
 * @param {Array<number>|Uint8Array} a - UTF-8 bytes.
 * @returns {string} The decoded text.
 * @throws {URIError} If the bytes are not valid UTF-8.
 */
function utf8_to_str(a) {
    var encoded = '';
    var idx = 0;
    while (idx < a.length) {
        var hex = a[idx].toString(16);
        // Every escape needs two hex digits, so pad single-digit values.
        encoded += (hex.length < 2) ? '%0' + hex : '%' + hex;
        idx += 1;
    }
    return decodeURIComponent(encoded);
}

Run Code Online (Sandbox Code Playgroud)

在GitHub上测试并可用

要从字符串创建UTF-8:

/**
 * Produce the UTF-8 bytes of a string.
 *
 * The string is run through encodeURIComponent(); each %XX escape in the
 * output becomes one byte, and every other character contributes its
 * character code.
 *
 * @param {string} s - Text to encode.
 * @returns {Array<number>} The UTF-8 bytes.
 */
function utf8_from_str(s) {
    var encoded = encodeURIComponent(s);
    var bytes = [];
    var pos = 0;
    while (pos < encoded.length) {
        if (encoded[pos] !== '%') {
            // Unescaped character: its char code is the byte value.
            bytes.push(encoded.charCodeAt(pos));
            pos += 1;
            continue;
        }
        // "%XX": parse the two hex digits after the percent sign.
        bytes.push(parseInt(encoded.slice(pos + 1, pos + 3), 16));
        pos += 3;
    }
    return bytes;
}

Run Code Online (Sandbox Code Playgroud)

在GitHub上测试并可用

Answer 7

小智 6

更新@Albert的答案为表情符号添加条件.

/**
 * Decode an array of UTF-8 bytes into a JavaScript string, including 4-byte
 * sequences (code points above U+FFFF such as emoji).
 *
 * Bytes that cannot start a sequence (stray continuation bytes) are skipped
 * silently, as are 4-byte sequences that decode to a value above U+10FFFF.
 *
 * @param {Array<number>|Uint8Array} array - UTF-8 bytes (values 0-255).
 * @returns {string} The decoded text.
 */
function Utf8ArrayToStr(array) {
    var out, i, len, c;
    var char2, char3, char4, codePoint;

    out = "";
    len = array.length;
    i = 0;
    while (i < len) {
        c = array[i++];
        // The top four bits of the lead byte determine the sequence length.
        switch (c >> 4) {
            case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7:
                // 0xxxxxxx
                out += String.fromCharCode(c);
                break;
            case 12: case 13:
                // 110x xxxx   10xx xxxx
                char2 = array[i++];
                out += String.fromCharCode(((c & 0x1F) << 6) | (char2 & 0x3F));
                break;
            case 14:
                // 1110 xxxx  10xx xxxx  10xx xxxx
                char2 = array[i++];
                char3 = array[i++];
                out += String.fromCharCode(((c & 0x0F) << 12) |
                                           ((char2 & 0x3F) << 6) |
                                           ((char3 & 0x3F) << 0));
                break;
            case 15:
                // 1111 0xxx 10xx xxxx 10xx xxxx 10xx xxxx
                char2 = array[i++];
                char3 = array[i++];
                char4 = array[i++];
                codePoint = ((c & 0x07) << 18) |
                            ((char2 & 0x3F) << 12) |
                            ((char3 & 0x3F) << 6) |
                            (char4 & 0x3F);
                // String.fromCodePoint throws a RangeError above 0x10FFFF,
                // so drop such sequences instead of aborting the decode.
                if (codePoint <= 0x10FFFF) {
                    out += String.fromCodePoint(codePoint);
                }
                break;
        }
    }

    return out;
}

Run Code Online (Sandbox Code Playgroud)

Answer 8

小智 6

// 字符串转 Utf8 ByteBuffer

/**
 * Encode a string as UTF-8 and return the bytes as a Uint8Array.
 *
 * @param {string} str - Text to encode.
 * @returns {Uint8Array} The UTF-8 bytes of `str`.
 */
function strToUTF8(str) {
  // encodeURIComponent yields the UTF-8 bytes as %XX escapes.
  const percentEncoded = encodeURIComponent(str);

  // Collapse every escape into one character whose code is the byte value.
  const byteString = percentEncoded.replace(/%(..)/g, function (match, hexPair) {
    return String.fromCodePoint(parseInt(hexPair, 16));
  });

  // Each character now maps to exactly one byte.
  return Uint8Array.from(byteString, function (ch) {
    return ch.codePointAt(0);
  });
}

Run Code Online (Sandbox Code Playgroud)

// Utf8 ByteArray 转字符串

/**
 * Decode an array of UTF-8 bytes into a string.
 *
 * Every byte is written as a two-digit %XX escape and the resulting string
 * is decoded with decodeURIComponent().
 *
 * @param {Array<number>|Uint8Array} ba - UTF-8 bytes (values 0-255).
 * @returns {string} The decoded text ('' for an empty input).
 * @throws {URIError} If the bytes are not valid UTF-8.
 */
function UTF8toStr(ba){
  return decodeURIComponent(ba.reduce((escaped, byte) => {
    // An escape needs exactly two hex digits, so pad values below 0x10.
    return escaped + '%' + (byte < 16 ? '0' : '') + byte.toString(16);
  }, '')); // the initial value '' belongs to reduce(), not to the callback
}

Run Code Online (Sandbox Code Playgroud)

归档时间：	13 年，1 月前
查看次数：	247462 次
最近记录：	6 年，1 月前