如何在 C++ 中将 UTF-16 编码的字符串转换为 UTF-8

use*_*635 5 c++ string dll utf-8 utf-16

考虑:

STDMETHODIMP CFileSystemAPI::setRRConfig( BSTR config_str, VARIANT* ret )
{
mReportReaderFactory.reset( new sbis::report_reader::ReportReaderFactory() );

USES_CONVERSION;
std::string configuration_str = W2A( config_str );
Run Code Online (Sandbox Code Playgroud)

但是在 config_str 我得到一个 UTF-16 的字符串。如何在这段代码中将其转换为 UTF-8?

And*_*rsK 6

你可以做这样的事情

std::string WstrToUtf8Str(const std::wstring& wstr)
{
  std::string retStr;
  if (!wstr.empty())
  {
    int sizeRequired = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), -1, NULL, 0, NULL, NULL);

    if (sizeRequired > 0)
    {
      std::vector<char> utf8String(sizeRequired);
      int bytesConverted = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(),    
                           -1, &utf8String[0], utf8String.size(), NULL, 
                           NULL);
      if (bytesConverted != 0)
      {
        retStr = &utf8String[0];
      }
      else
      {
        std::stringstream err;
        err << __FUNCTION__ 
            << " std::string WstrToUtf8Str failed to convert wstring '"
            << wstr.c_str() << L"'";
        throw std::runtime_error( err.str() );
      }
    }
  }
  return retStr;
}
Run Code Online (Sandbox Code Playgroud)

您可以将 BSTR 作为 std::wstring 提供给函数


Art*_*oul 6

我实现了 UTF-8 <-> UTF-16 <-> UTF-32 之间转换的两种变体:第一个变体从头开始完整实现所有转换,第二个变体使用标准库的 std::codecvt 和 std::wstring_convert(这两个类从 C++17 开始已被弃用,但仍然存在,并且保证在 C++11/C++14 中可用)。

\n

如果您不喜欢我的代码,那么您可以使用几乎是单头文件的 C++ 库 utfcpp,它应该已经被许多用户充分测试过。

\n

要将 UTF-8 转换为 UTF-16,只需调用 Utf32To16(Utf8To32(str));要将 UTF-16 转换为 UTF-8,则调用 Utf32To8(Utf16To32(str))。或者您可以使用我提供的便捷辅助函数:UtfConv<std::wstring>(std::string("abc")) 将 UTF-8 转换为 UTF-16,UtfConv<std::string>(std::wstring(L"abc")) 将 UTF-16 转换为 UTF-8。UtfConv 实际上可以把任何 UTF 编码的字符串转换为任何其他 UTF 编码的字符串。这些用法以及其他用法的示例请参阅 Test(cs) 宏。

\n

两种变体均符合 C++11 标准。此外,它们还可以在 CLang/GCC/MSVC 编译器中进行编译(请参阅下面的“在线尝试!”链接),并经过测试可以在 Windows/Linux 操作系统中运行。

\n

您必须以 UTF-8 编码将我的两个代码片段保存到文件中,并为 CLang/GCC 提供 -finput-charset=UTF-8 -fexec-charset=UTF-8 选项,为 MSVC 提供 /utf-8 选项。只有当代码中含有非 ASCII 字符的字符串字面量时(就像我为了测试而在代码中所写的那样),才需要以 UTF-8 保存并使用这些选项;如果只使用转换函数本身,则不需要。

\n

包含 <windows.h>、<clocale> 和 <iostream>,以及调用 SetConsoleOutputCP(65001) 和 std::setlocale(LC_ALL, "en_US.UTF-8"),都仅用于测试目的,以便正确设置控制台并向其输出 UTF-8。转换函数本身不需要这些东西。

\n

部分代码并非必需,我指的是与 UtfHelper 相关的结构和函数,它们只是转换的辅助函数,主要是为了跨平台处理 std::wstring 而创建的,因为 wchar_t 在 Linux 上通常是 32 位,在 Windows 上通常是 16 位。只有底层函数 Utf8To32、Utf32To8、Utf16To32 和 Utf32To16 才是转换真正需要的东西。

\n

变体 1 是根据 UTF-8 和 UTF-16 编码的 Wikipedia 描述创建的。

\n

如果您发现错误或任何改进(特别是在变体 1 中),请告诉我,我会修复它们。

\n
\n

变体1

\n

在线尝试一下!

\n
// Variant 1: UTF-8 <-> UTF-16 <-> UTF-32 conversions implemented from scratch,
// followed by a small self-test program that prints round-tripped strings.

#include <string>
#include <iostream>
#include <stdexcept>
#include <type_traits>
#include <cstdint>

#ifdef _WIN32
    #include <windows.h>
#else
    #include <clocale>
#endif

// Throws std::runtime_error naming the failed condition, the line and a message.
#define ASSERT_MSG(cond, msg) { if (!(cond)) throw std::runtime_error("Assertion (" #cond ") failed at line " + std::to_string(__LINE__) + "! Msg: " + std::string(msg)); }
#define ASSERT(cond) ASSERT_MSG(cond, "")

/// Encodes a sequence of 32-bit code points as UTF-8 bytes.
///
/// No range validation is done: values above 0x10FFFF are written using the
/// 5-, 6- and 7-byte extended forms, exactly as selected by the table below.
///
/// @tparam U8StrT  String type with 1-byte characters (default std::string).
/// @param  s       Code points to encode.
/// @return         The encoded bytes.
template <typename U8StrT = std::string>
inline static U8StrT Utf32To8(std::u32string const & s) {
    static_assert(sizeof(typename U8StrT::value_type) == 1, "Char byte-size should be 1 for UTF-8 strings!");
    using VT = typename U8StrT::value_type;
    using u8 = uint8_t;
    U8StrT r;
    for (char32_t const c : s) {
        // Number of bytes needed for this code point.
        size_t const nby = c <= 0x7FU ? 1 : c <= 0x7FFU ? 2 : c <= 0xFFFFU ? 3 : c <= 0x1FFFFFU ? 4 : c <= 0x3FFFFFFU ? 5 : c <= 0x7FFFFFFFU ? 6 : 7;
        if (nby == 1) {
            r.push_back(VT(u8(c)));
            continue;
        }
        // Lead byte: nby high bits set, followed by the top payload bits.
        // For the 7-byte form the shift would be 36, which exceeds the width
        // of a 32-bit value (undefined behaviour); that lead byte carries no
        // payload bits, so use 0 instead of shifting.
        size_t const leadShift = 6 * (nby - 1);
        u8 const leadPayload = leadShift < 32 ? u8(c >> leadShift) : u8(0);
        r.push_back(VT(u8(u8(0xFFU << (8 - nby)) | leadPayload)));
        // Continuation bytes: 10xxxxxx, six payload bits each, high bits first.
        for (size_t i = 1; i < nby; ++i)
            r.push_back(VT(u8(0x80U | (0x3FU & (c >> (6 * (nby - 1 - i)))))));
    }
    return r;
}

/// Decodes UTF-8 bytes into 32-bit code points.
///
/// @tparam U8StrT  String type with 1-byte characters.
/// @param  s       UTF-8 encoded input.
/// @return         The decoded code points.
/// @throws std::runtime_error (via ASSERT) on a bad lead byte, a truncated
///         sequence or a malformed continuation byte.
template <typename U8StrT>
inline static std::u32string Utf8To32(U8StrT const & s) {
    static_assert(sizeof(typename U8StrT::value_type) == 1, "Char byte-size should be 1 for UTF-8 strings!");
    using u8 = uint8_t;
    std::u32string r;
    auto it = reinterpret_cast<u8 const *>(s.c_str());
    auto const end = reinterpret_cast<u8 const *>(s.c_str() + s.length());
    while (it < end) {
        char32_t c = 0;
        if (*it <= 0x7FU) {
            // Single-byte (ASCII) character.
            c = *it;
            ++it;
        } else {
            // A lead byte must start with at least two set bits.
            ASSERT((*it & 0xC0U) == 0xC0U);
            // The count of leading set bits is the sequence length.
            size_t nby = 0;
            for (u8 b = *it; (b & 0x80U) != 0; b <<= 1, ++nby) {(void)0;}
            ASSERT(nby <= 7);
            // 'it < end' holds here, so the difference is positive.
            ASSERT(static_cast<size_t>(end - it) >= nby);
            // Payload bits of the lead byte.
            c = *it & (u8(0xFFU) >> (nby + 1));
            for (size_t i = 1; i < nby; ++i) {
                ASSERT((it[i] & 0xC0U) == 0x80U);
                c = (c << 6) | (it[i] & 0x3FU);
            }
            it += nby;
        }
        r.push_back(c);
    }
    return r;
}


/// Encodes 32-bit code points as UTF-16 code units.
///
/// @tparam U16StrT  String type with 2-byte characters (default std::u16string).
/// @param  s        Code points to encode.
/// @return          The UTF-16 code units; values above 0xFFFF become
///                  surrogate pairs.
/// @throws std::runtime_error (via ASSERT) for values above 0x10FFFF.
template <typename U16StrT = std::u16string>
inline static U16StrT Utf32To16(std::u32string const & s) {
    static_assert(sizeof(typename U16StrT::value_type) == 2, "Char byte-size should be 2 for UTF-16 strings!");
    using VT = typename U16StrT::value_type;
    using u16 = uint16_t;
    U16StrT r;
    for (char32_t c : s) {
        if (c <= 0xFFFFU) {
            r.push_back(VT(c));
        } else {
            ASSERT(c <= 0x10FFFFU);
            c -= 0x10000U;
            r.push_back(VT(u16(0xD800U | ((c >> 10) & 0x3FFU))));  // high surrogate
            r.push_back(VT(u16(0xDC00U | (c & 0x3FFU))));          // low surrogate
        }
    }
    return r;
}

/// Decodes UTF-16 code units into 32-bit code points.
///
/// @tparam U16StrT  String type with 2-byte characters.
/// @param  s        UTF-16 encoded input.
/// @return          The decoded code points.
/// @throws std::runtime_error (via ASSERT/ASSERT_MSG) on a stray low
///         surrogate, a truncated pair or a high surrogate that is not
///         followed by a low surrogate.
template <typename U16StrT>
inline static std::u32string Utf16To32(U16StrT const & s) {
    static_assert(sizeof(typename U16StrT::value_type) == 2, "Char byte-size should be 2 for UTF-16 strings!");
    using u16 = uint16_t;
    std::u32string r;
    auto it = reinterpret_cast<u16 const *>(s.c_str());
    auto const end = reinterpret_cast<u16 const *>(s.c_str() + s.length());
    while (it < end) {
        char32_t c = 0;
        if (*it < 0xD800U || *it > 0xDFFFU) {
            // Not a surrogate: the code unit is the code point.
            c = *it;
            ++it;
        } else if (*it >= 0xDC00U) {
            // A low surrogate may not start a sequence.
            ASSERT_MSG(false, "Unallowed UTF-16 sequence!");
        } else {
            ASSERT(end - it >= 2);
            c = (*it & 0x3FFU) << 10;
            if ((it[1] < 0xDC00U) || (it[1] > 0xDFFFU)) {
                ASSERT_MSG(false, "Unallowed UTF-16 sequence!");
            } else {
                c |= it[1] & 0x3FFU;
                c += 0x10000U;
            }
            it += 2;
        }
        r.push_back(c);
    }
    return r;
}


// Selects the conversion to/from UTF-32 by the character size of StrT
// (1 byte -> UTF-8, 2 bytes -> UTF-16, 4 bytes -> UTF-32).
template <typename StrT, size_t NumBytes = sizeof(typename StrT::value_type)> struct UtfHelper;
template <typename StrT> struct UtfHelper<StrT, 1> {
    inline static std::u32string UtfTo32(StrT const & s) { return Utf8To32(s); }
    inline static StrT UtfFrom32(std::u32string const & s) { return Utf32To8<StrT>(s); }
};
template <typename StrT> struct UtfHelper<StrT, 2> {
    inline static std::u32string UtfTo32(StrT const & s) { return Utf16To32(s); }
    inline static StrT UtfFrom32(std::u32string const & s) { return Utf32To16<StrT>(s); }
};
template <typename StrT> struct UtfHelper<StrT, 4> {
    // 4-byte characters are copied element by element, so no pointer
    // reinterpretation between character types is needed.
    inline static std::u32string UtfTo32(StrT const & s) {
        return std::u32string(s.begin(), s.end());
    }
    inline static StrT UtfFrom32(std::u32string const & s) {
        return StrT(s.begin(), s.end());
    }
};

/// Converts any supported UTF string type to UTF-32.
template <typename StrT> inline static std::u32string UtfTo32(StrT const & s) {
    return UtfHelper<StrT>::UtfTo32(s);
}
/// Converts UTF-32 to the requested UTF string type.
template <typename StrT> inline static StrT UtfFrom32(std::u32string const & s) {
    return UtfHelper<StrT>::UtfFrom32(s);
}
/// Converts between any two supported UTF string types via UTF-32.
template <typename StrToT, typename StrFromT> inline static StrToT UtfConv(StrFromT const & s) {
    return UtfFrom32<StrToT>(UtfTo32(s));
}

// Prints the literal 'cs' after ten different conversion round trips, then
// the byte counts of its UTF-8 and UTF-16 encodings. 'cs' must be a plain
// string literal because it is pasted with the u / U / L prefixes.
#define Test(cs) \
    std::cout << Utf32To8(Utf8To32(std::string(cs))) << ", "; \
    std::cout << Utf32To8(Utf16To32(Utf32To16(Utf8To32(std::string(cs))))) << ", "; \
    std::cout << Utf32To8(Utf16To32(std::u16string(u##cs))) << ", "; \
    std::cout << Utf32To8(std::u32string(U##cs)) << ", "; \
    std::cout << UtfConv<std::string>(UtfConv<std::u16string>(UtfConv<std::u32string>(UtfConv<std::u32string>(UtfConv<std::u16string>(std::string(cs)))))) << ", "; \
    std::cout << UtfConv<std::string>(UtfConv<std::wstring>(UtfConv<std::string>(UtfConv<std::u32string>(UtfConv<std::u32string>(std::string(cs)))))) << ", "; \
    std::cout << UtfFrom32<std::string>(UtfTo32(std::string(cs))) << ", "; \
    std::cout << UtfFrom32<std::string>(UtfTo32(std::u16string(u##cs))) << ", "; \
    std::cout << UtfFrom32<std::string>(UtfTo32(std::wstring(L##cs))) << ", "; \
    std::cout << UtfFrom32<std::string>(UtfTo32(std::u32string(U##cs))) << std::endl; \
    std::cout << "UTF-8 num bytes: " << std::dec << Utf32To8(std::u32string(U##cs)).size() << ", "; \
    std::cout << "UTF-16 num bytes: " << std::dec << (Utf32To16(std::u32string(U##cs)).size() * 2) << std::endl;

int main() {
    // Make the console accept UTF-8 output.
    #ifdef _WIN32
        SetConsoleOutputCP(65001);
    #else
        std::setlocale(LC_ALL, "en_US.UTF-8");
    #endif
    try {
        Test("World");
        // Written as real text, not as \x byte escapes: with the u / U / L
        // prefixes every \x escape would become one whole code unit.
        Test("Привет");
        // NOTE(review): the next two literals are empty in this copy, but the
        // recorded output reports 8/8 and 4/4 bytes for them, so non-BMP
        // characters were probably lost; restore them from the original post.
        Test("");
        Test("");
        return 0;
    } catch (std::exception const & ex) {
        std::cout << "Exception: " << ex.what() << std::endl;
        return -1;
    }
}
Run Code Online (Sandbox Code Playgroud)\n

输出:

\n
World, World, World, World, World, World, World, World, World, World
UTF-8 num bytes: 5, UTF-16 num bytes: 10
Привет, Привет, Привет, Привет, Привет, Привет, Привет, Привет, Привет, Привет
UTF-8 num bytes: 12, UTF-16 num bytes: 12
, , , , , , , , , 
UTF-8 num bytes: 8, UTF-16 num bytes: 8
, , , , , , , , , 
UTF-8 num bytes: 4, UTF-16 num bytes: 4
Run Code Online (Sandbox Code Playgroud)\n
\n

变体2

\n

在线尝试一下!

\n
// Variant 2: UTF-8 <-> UTF-16 <-> UTF-32 conversions built on
// std::wstring_convert / std::codecvt (deprecated since C++17), followed by a
// small self-test program that prints round-tripped strings.

#include <string>
#include <iostream>
#include <stdexcept>
#include <type_traits>
#include <locale>
#include <codecvt>
#include <cstdint>
#include <cstring>

#ifdef _WIN32
    #include <windows.h>
#else
    #include <clocale>
#endif

// Throws std::runtime_error naming the failed condition and the line.
#define ASSERT(cond) { if (!(cond)) throw std::runtime_error("Assertion (" #cond ") failed at line " + std::to_string(__LINE__) + "!"); }

// Workaround for some of MSVC compilers.
#if defined(_MSC_VER) && (!_DLL) && (_MSC_VER >= 1900 /* VS 2015*/) && (_MSC_VER <= 1914 /* VS 2017 */)
std::locale::id std::codecvt<char16_t, char, _Mbstatet>::id;
std::locale::id std::codecvt<char32_t, char, _Mbstatet>::id;
#endif

/// Decodes UTF-8 bytes into 32-bit code points via std::codecvt_utf8.
///
/// @tparam U8StrT  String type with 1-byte characters.
/// @param  s       UTF-8 encoded input.
/// @return         The decoded code points.
template <typename U8StrT>
inline static std::u32string Utf8To32(U8StrT const & s) {
    static_assert(sizeof(typename U8StrT::value_type) == 1, "Char byte-size should be 1 for UTF-8 strings!");
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
    auto const first = reinterpret_cast<char const *>(s.c_str());
    return conv.from_bytes(first, first + s.length());
}

/// Encodes 32-bit code points as UTF-8 bytes via std::codecvt_utf8.
///
/// @tparam U8StrT  String type with 1-byte characters (default std::string).
/// @param  s       Code points to encode.
/// @return         The encoded bytes.
template <typename U8StrT = std::string>
inline static U8StrT Utf32To8(std::u32string const & s) {
    static_assert(sizeof(typename U8StrT::value_type) == 1, "Char byte-size should be 1 for UTF-8 strings!");
    std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> conv;
    std::string const bytes = conv.to_bytes(s.c_str(), s.c_str() + s.length());
    // Element-wise copy into the requested 1-byte character type.
    return U8StrT(bytes.begin(), bytes.end());
}

/// Decodes UTF-16 code units into 32-bit code points via std::codecvt_utf16.
///
/// The code units are handed to the converter as raw bytes and read as
/// little-endian (std::little_endian mode).
/// NOTE(review): this presumably requires a little-endian host — confirm
/// before using on other targets.
///
/// @tparam U16StrT  String type with 2-byte characters.
/// @param  s        UTF-16 encoded input.
/// @return          The decoded code points.
template <typename U16StrT>
inline static std::u32string Utf16To32(U16StrT const & s) {
    static_assert(sizeof(typename U16StrT::value_type) == 2, "Char byte-size should be 2 for UTF-16 strings!");
    std::wstring_convert<std::codecvt_utf16<char32_t, 0x10ffff, std::little_endian>, char32_t> conv;
    auto const first = reinterpret_cast<char const *>(s.c_str());
    auto const last = reinterpret_cast<char const *>(s.c_str() + s.length());
    return conv.from_bytes(first, last);
}

/// Encodes 32-bit code points as UTF-16 code units via std::codecvt_utf16.
///
/// The converter produces little-endian bytes, which are copied into the
/// result string.
/// NOTE(review): as in Utf16To32, this presumably requires a little-endian
/// host — confirm.
///
/// @tparam U16StrT  String type with 2-byte characters (default std::u16string).
/// @param  s        Code points to encode.
/// @return          The UTF-16 code units.
template <typename U16StrT = std::u16string>
inline static U16StrT Utf32To16(std::u32string const & s) {
    static_assert(sizeof(typename U16StrT::value_type) == 2, "Char byte-size should be 2 for UTF-16 strings!");
    using VT = typename U16StrT::value_type;
    std::wstring_convert<std::codecvt_utf16<char32_t, 0x10ffff, std::little_endian>, char32_t> conv;
    std::string const bytes = conv.to_bytes(s.c_str(), s.c_str() + s.length());
    // Copy the bytes with memcpy instead of reading 2-byte characters through
    // a casted pointer into the char buffer.
    U16StrT r(bytes.size() / sizeof(VT), VT());
    if (!r.empty())
        std::memcpy(&r[0], bytes.data(), r.size() * sizeof(VT));
    return r;
}


// Selects the conversion to/from UTF-32 by the character size of StrT
// (1 byte -> UTF-8, 2 bytes -> UTF-16, 4 bytes -> UTF-32).
template <typename StrT, size_t NumBytes = sizeof(typename StrT::value_type)> struct UtfHelper;
template <typename StrT> struct UtfHelper<StrT, 1> {
    inline static std::u32string UtfTo32(StrT const & s) { return Utf8To32(s); }
    inline static StrT UtfFrom32(std::u32string const & s) { return Utf32To8<StrT>(s); }
};
template <typename StrT> struct UtfHelper<StrT, 2> {
    inline static std::u32string UtfTo32(StrT const & s) { return Utf16To32(s); }
    inline static StrT UtfFrom32(std::u32string const & s) { return Utf32To16<StrT>(s); }
};
template <typename StrT> struct UtfHelper<StrT, 4> {
    // 4-byte characters are copied element by element, so no pointer
    // reinterpretation between character types is needed.
    inline static std::u32string UtfTo32(StrT const & s) {
        return std::u32string(s.begin(), s.end());
    }
    inline static StrT UtfFrom32(std::u32string const & s) {
        return StrT(s.begin(), s.end());
    }
};

/// Converts any supported UTF string type to UTF-32.
template <typename StrT> inline static std::u32string UtfTo32(StrT const & s) {
    return UtfHelper<StrT>::UtfTo32(s);
}
/// Converts UTF-32 to the requested UTF string type.
template <typename StrT> inline static StrT UtfFrom32(std::u32string const & s) {
    return UtfHelper<StrT>::UtfFrom32(s);
}
/// Converts between any two supported UTF string types via UTF-32.
template <typename StrToT, typename StrFromT> inline static StrToT UtfConv(StrFromT const & s) {
    return UtfFrom32<StrToT>(UtfTo32(s));
}

// Prints the literal 'cs' after ten different conversion round trips, then
// the byte counts of its UTF-8 and UTF-16 encodings. 'cs' must be a plain
// string literal because it is pasted with the u / U / L prefixes.
#define Test(cs) \
    std::cout << Utf32To8(Utf8To32(std::string(cs))) << ", "; \
    std::cout << Utf32To8(Utf16To32(Utf32To16(Utf8To32(std::string(cs))))) << ", "; \
    std::cout << Utf32To8(Utf16To32(std::u16string(u##cs))) << ", "; \
    std::cout << Utf32To8(std::u32string(U##cs)) << ", "; \
    std::cout << UtfConv<std::string>(UtfConv<std::u16string>(UtfConv<std::u32string>(UtfConv<std::u32string>(UtfConv<std::u16string>(std::string(cs)))))) << ", "; \
    std::cout << UtfConv<std::string>(UtfConv<std::wstring>(UtfConv<std::string>(UtfConv<std::u32string>(UtfConv<std::u32string>(std::string(cs)))))) << ", "; \
    std::cout << UtfFrom32<std::string>(UtfTo32(std::string(cs))) << ", "; \
    std::cout << UtfFrom32<std::string>(UtfTo32(std::u16string(u##cs))) << ", "; \
    std::cout << UtfFrom32<std::string>(UtfTo32(std::wstring(L##cs))) << ", "; \
    std::cout << UtfFrom32<std::string>(UtfTo32(std::u32string(U##cs))) << std::endl; \
    std::cout << "UTF-8 num bytes: " << std::dec << Utf32To8(std::u32string(U##cs)).size() << ", "; \
    std::cout << "UTF-16 num bytes: " << std::dec << (Utf32To16(std::u32string(U##cs)).size() * 2) << std::endl;

int main() {
    // Make the console accept UTF-8 output.
    #ifdef _WIN32
        SetConsoleOutputCP(65001);
    #else
        std::setlocale(LC_ALL, "en_US.UTF-8");
    #endif
    try {
        Test("World");
        // Written as real text, not as \x byte escapes: with the u / U / L
        // prefixes every \x escape would become one whole code unit.
        Test("Привет");
        // NOTE(review): the next two literals are empty in this copy, but the
        // recorded output reports 8/8 and 4/4 bytes for them, so non-BMP
        // characters were probably lost; restore them from the original post.
        Test("");
        Test("");
        return 0;
    } catch (std::exception const & ex) {
        std::cout << "Exception: " << ex.what() << std::endl;
        return -1;
    }
}
Run Code Online (Sandbox Code Playgroud)\n

输出:

\n
World, World, World, World, World, World, World, World, World, World
UTF-8 num bytes: 5, UTF-16 num bytes: 10
Привет, Привет, Привет, Привет, Привет, Привет, Привет, Привет, Привет, Привет
UTF-8 num bytes: 12, UTF-16 num bytes: 12
, , , , , , , , , 
UTF-8 num bytes: 8, UTF-16 num bytes: 8
, , , , , , , , , 
UTF-8 num bytes: 4, UTF-16 num bytes: 4
Run Code Online (Sandbox Code Playgroud)\n


小智 2

如果您使用 C++11,您可以查看以下内容:

http://www.cplusplus.com/reference/codecvt/codecvt_utf8_utf16/