Bootstrap

C++ UTF-8编解码


ICU 内置的编解码数据(converter shared data)声明:
// Declarations only: each name below is a const UConverterSharedData object
// that is defined in some other translation unit (hence `extern`).
// NOTE(review): the names look like one entry per built-in converter
// (UTF-8/16/32, Latin-1, ISO-2022, LMBCS, ...) — confirm against the ICU sources.
extern const UConverterSharedData
    _MBCSData, _Latin1Data,
    _UTF8Data, _UTF16BEData, _UTF16LEData, _UTF32BEData, _UTF32LEData,
    _ISO2022Data, 
    _LMBCSData1,_LMBCSData2, _LMBCSData3, _LMBCSData4, _LMBCSData5, _LMBCSData6,
    _LMBCSData8,_LMBCSData11,_LMBCSData16,_LMBCSData17,_LMBCSData18,_LMBCSData19,
    _HZData,_ISCIIData, _SCSUData, _ASCIIData,
    _UTF7Data, _Bocu1Data, _UTF16Data, _UTF32Data, _CESU8Data, _IMAPData, _CompoundTextData;

U_CDECL_END

Code point 代码点    Byte 1         Byte 2         Byte 3         Byte 4
U+ 0000 .. 007F        0xxxxxxx
U+ 0080 .. 07FF        110xxxxx    10xxxxxx
U+ 0800 .. FFFF        1110xxxx    10xxxxxx    10xxxxxx
U+ 10000 .. 10FFFF    11110xxx    10xxxxxx    10xxxxxx    10xxxxxx

可变长度编码
🚵🏻‍♀️ 由 U+1F6B5 + U+1F3FB + U+200D + U+2640 + U+FE0F 这 5 个代码点组成
🤦🏼‍♂️ 由 5 个代码点(U+1F926 U+1F3FC U+200D U+2642 U+FE0F)组成,但这一事实仅仅是实现细节:
它不应该被拆开,不应该被计为多个字符,文本光标不应该停在它的中间,也不应该被部分选中。

关键代码

// Returns how many bytes a UTF-8 sequence occupies, judged solely from the
// bit pattern of its first byte:
//   0xxxxxxx -> 1,  110xxxxx -> 2,  1110xxxx -> 3,  11110xxx -> 4.
// Any other pattern (a continuation byte 10xxxxxx, or 11111xxx) yields 0.
// Only the prefix bits are inspected; the remaining bits of the lead byte and
// the bytes that follow are not validated here.
std::size_t sequence_length(char8_t lead_byte)
{
    const unsigned bits = lead_byte;

    if ((bits & 0x80u) == 0x00u) {
        return 1;
    }
    if ((bits & 0xe0u) == 0xc0u) {
        return 2;
    }
    if ((bits & 0xf0u) == 0xe0u) {
        return 3;
    }
    if ((bits & 0xf8u) == 0xf0u) {
        return 4;
    }
    return 0;
}

// Appends the UTF-8 encoding of `code_point` to `utf8str`.
//
// Returns UTF_ERROR::INVALID_CODE_POINT (leaving `utf8str` untouched) when
// is_code_point_valid() rejects the value, otherwise UTF_ERROR::OK after
// appending 1 to 4 bytes depending on the magnitude of the code point.
UTF_ERROR encode_next_utf8(const char32_t code_point, std::u8string &utf8str)
{
    if (!is_code_point_valid(code_point))
        return UTF_ERROR::INVALID_CODE_POINT;

    // Lead byte: the high bits of the code point OR-ed with the length marker.
    const auto put_lead = [&](unsigned shift, char32_t marker) {
        utf8str.push_back(static_cast<char8_t>((code_point >> shift) | marker));
    };
    // Continuation byte: six payload bits tagged with the 10xxxxxx prefix.
    const auto put_trail = [&](unsigned shift) {
        utf8str.push_back(static_cast<char8_t>(((code_point >> shift) & 0x3f) | 0x80));
    };

    if (code_point < 0x80) {
        // Single byte: the code point is stored as-is.
        utf8str.push_back(static_cast<char8_t>(code_point));
        return UTF_ERROR::OK;
    }

    if (code_point < 0x800) {
        add_capacity_if_needed(utf8str, 2);
        put_lead(6, 0xc0);
        put_trail(0);
        return UTF_ERROR::OK;
    }

    if (code_point < 0x10000) {
        add_capacity_if_needed(utf8str, 3);
        put_lead(12, 0xe0);
        put_trail(6);
        put_trail(0);
        return UTF_ERROR::OK;
    }

    // Everything else takes four bytes.
    add_capacity_if_needed(utf8str, 4);
    put_lead(18, 0xf0);
    put_trail(12);
    put_trail(6);
    put_trail(0);
    return UTF_ERROR::OK;
}

C++ 标准库

#include <codecvt>
#include <cwchar>
#include <fstream>
#include <locale>
#include <string>
// Convert a multibyte std::string to a std::wstring using the "C.UTF-8" locale.
//
// The conversion is done in memory with the locale's
// std::codecvt<wchar_t, char, std::mbstate_t> facet. Earlier revisions wrote the
// text to a scratch file and read it back through a wide stream, which left a
// file behind in the working directory, was not safe to call concurrently, and
// silently cut the result at the first whitespace character.
//
// @param input  bytes to decode (interpreted according to the "C.UTF-8" locale)
// @return       the decoded wide string; if an invalid or incomplete sequence is
//               met, conversion stops there and the prefix decoded so far is
//               returned
// @throws std::runtime_error  if the "C.UTF-8" locale is not available
//                             (raised by the std::locale constructor)
std::wstring to_wide_string(const std::string &input)
{
    if (input.empty())
        return {};

    using converter_type = std::codecvt<wchar_t, char, std::mbstate_t>;
    const std::locale     sys_loc("C.UTF-8");
    const converter_type &converter = std::use_facet<converter_type>(sys_loc);

    // Every wide character produced consumes at least one input byte, so
    // input.size() elements are always enough room for the output.
    std::wstring   wstr(input.size(), L'\0');
    wchar_t *const to_begin  = wstr.data();
    std::mbstate_t state{};
    const char    *from_next = input.data();
    wchar_t       *to_next   = to_begin;

    // The result code is intentionally not inspected: to_next marks the end of
    // whatever was converted successfully, which is what gets returned.
    converter.in(state,
                 input.data(), input.data() + input.size(), from_next,
                 to_begin, to_begin + wstr.size(), to_next);

    wstr.resize(static_cast<std::size_t>(to_next - to_begin));
    return wstr;
}

// Convert a std::wstring to a multibyte std::string using the "C.UTF-8" locale.
//
// The conversion is done in memory with the locale's
// std::codecvt<wchar_t, char, std::mbstate_t> facet. Earlier revisions wrote the
// text through a wide stream into a scratch file and read the bytes back, which
// left a file behind in the working directory, was not safe to call
// concurrently, and silently cut the result at the first whitespace character.
//
// @param input  wide characters to encode
// @return       the encoded bytes; if a character cannot be converted,
//               conversion stops there and the prefix encoded so far is returned
// @throws std::runtime_error  if the "C.UTF-8" locale is not available
//                             (raised by the std::locale constructor)
std::string to_byte_string(const std::wstring &input)
{
    if (input.empty())
        return {};

    using converter_type = std::codecvt<wchar_t, char, std::mbstate_t>;
    const std::locale     sys_loc("C.UTF-8");
    const converter_type &converter = std::use_facet<converter_type>(sys_loc);

    // max_length() is the facet's upper bound on bytes per wide character, so
    // this buffer can hold the encoding of every input character.
    std::string    str(input.size() * static_cast<std::size_t>(converter.max_length()), '\0');
    char *const    to_begin  = str.data();
    std::mbstate_t state{};
    const wchar_t *from_next = input.data();
    char          *to_next   = to_begin;

    // The result code is intentionally not inspected: to_next marks the end of
    // whatever was converted successfully, which is what gets returned.
    converter.out(state,
                  input.data(), input.data() + input.size(), from_next,
                  to_begin, to_begin + str.size(), to_next);

    str.resize(static_cast<std::size_t>(to_next - to_begin));
    return str;
}

// Decode a UTF-8 byte string into UTF-32 code points via
// std::wstring_convert with the std::codecvt_utf8<char32_t> facet.
// from_bytes() reports a failed conversion by throwing std::range_error,
// since no error string is configured on the converter.
std::u32string to_utf32(std::string str)
{
    using utf8_utf32_converter = std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>;

    utf8_utf32_converter converter;
    return converter.from_bytes(str);
}

// Encode UTF-32 code points as a UTF-8 byte string via
// std::wstring_convert with the std::codecvt_utf8<char32_t> facet.
// to_bytes() reports a failed conversion by throwing std::range_error,
// since no error string is configured on the converter.
std::string to_utf8(std::u32string str32)
{
    using utf8_utf32_converter = std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>;

    utf8_utf32_converter converter;
    return converter.to_bytes(str32);
}

参考

The Absolute Minimum Every Software Developer Must Know About Unicode in 2023 (Still No Excuses!) @ tonsky.me

GitHub - soasis/text: A spicy text library for C++ that has the explicit goal of enabling the entire ecosystem to share in proper forward progress towards a bright Unicode future.

utfcpp-3.2.1.tar.gz · src-openEuler/utf8cpp - Gitee.com

GitHub - nemtrif/utfcpp: UTF-8 with C++ in a Portable Way


创作不易,小小的支持一下吧!

悦读

道可道,非常道;名可名,非常名。 无名,天地之始,有名,万物之母。 故常无欲,以观其妙,常有欲,以观其徼。 此两者,同出而异名,同谓之玄,玄之又玄,众妙之门。

;