/* * Copyright (c) 2021 Huawei Device Co., Ltd. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "unicode_ex.h" #include #include #include #include "utils_log.h" using namespace std; /***************************************UTF8 and UTF16 unicode********************************************** UTF8 Unicode utf8 U + 0000~U + 007F 0??????? U + 0080~U + 07FF 110????? 10?????? U + 0800~U + FFFF 1110???? 10?????? 10?????? U + 10000~U + 10FFFF 11110??? 10?????? 10?????? 10?????? UTF16 Unicode utf16 code U + 000~U + FFFF 2 Byte save, same with Unicode U + 10000~U + 10FFFF 4 Byte save Unicode 0x10000 **************************************UTF8 and UTF16 unicode**********************************************/ namespace OHOS { namespace { constexpr char32_t ONE_BYTE_UTF8 = 0x00000080; constexpr char32_t TWO_BYTES_UTF8 = 0x00000800; constexpr char32_t THREE_BYTES_UTF8 = 0x00010000; constexpr char32_t UNICODE_RESERVED_START = 0x0000D800; constexpr char32_t UNICODE_RESERVED_END = 0x0000DFFF; constexpr char32_t UNICODE_MAX_NUM = 0x0010FFFF; constexpr unsigned int UTF8_OFFSET = 6; constexpr char32_t UTF8_BYTE_MASK = 0x000000BF; constexpr char32_t UTF8_BYTE_MARK = 0x00000080; constexpr char32_t UTF8_FIRST_BYTE_MARK[] = { 0x00000000, 0x00000000, 0x000000C0, 0x000000E0, 0x000000F0 }; } #define UTF8_LENGTH_INVALID 0 #define UTF8_LENGTH_1 1 #define UTF8_LENGTH_2 2 #define UTF8_LENGTH_3 3 #define UTF8_LENGTH_4 4 #define UTF8_LEN_MASK 3 #define UTF8_FIRST_BYTE_INDEX 0 #define UTF8_SECOND_BYTE_INDEX 1 #define UTF8_THIRD_BYTE_INDEX 2 #define UTF8_FORTH_BYTE_INDEX 3 #define UTF8_SHIFT_WIDTH 6 #define STR16_TO_STR8_SHIFT_WIDTH 10 #define UTF16_SHIFT_WIDTH 10 #define UTF32_BYTE_SIZE_1 1 #define UTF32_BYTE_SIZE_2 2 #define UTF32_BYTE_SIZE_3 3 #define UTF32_BYTE_SIZE_4 4 // inner func and dstP is not nullptr void Utf32CodePointToUtf8(uint8_t* dstP, char32_t srcChar, size_t bytes) { dstP += bytes; if (bytes >= UTF32_BYTE_SIZE_4) { *--dstP = static_cast((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); srcChar >>= UTF8_OFFSET; } if (bytes >= UTF32_BYTE_SIZE_3) { *--dstP = static_cast((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); srcChar >>= UTF8_OFFSET; } if (bytes >= UTF32_BYTE_SIZE_2) { *--dstP = static_cast((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK); srcChar >>= UTF8_OFFSET; } if (bytes >= UTF32_BYTE_SIZE_1) { *--dstP = static_cast(srcChar | UTF8_FIRST_BYTE_MARK[bytes]); } } size_t Utf32CodePointUtf8Length(char32_t srcChar) { if (srcChar < ONE_BYTE_UTF8) { return UTF8_LENGTH_1; } else if (srcChar < TWO_BYTES_UTF8) { return UTF8_LENGTH_2; } else if (srcChar < THREE_BYTES_UTF8) { if ((srcChar < UNICODE_RESERVED_START) || (srcChar > UNICODE_RESERVED_END)) { return UTF8_LENGTH_3; } else { // Surrogates are invalid UTF-32 characters. return UTF8_LENGTH_INVALID; } } else if (srcChar <= UNICODE_MAX_NUM) { // Max code point for Unicode is 0x0010FFFF. return UTF8_LENGTH_4; } else { // Invalid UTF-32 character. return UTF8_LENGTH_INVALID; } } // get the length of utf8 from utf16 int Utf16ToUtf8Length(const char16_t* str16, size_t str16Len) { if (str16 == nullptr || str16Len == 0) { return -1; } const char16_t* const str16End = str16 + str16Len; int utf8Len = 0; while (str16 < str16End) { int charLen = 0; if (((*str16 & 0xFC00) == 0xD800) && ((str16 + 1) < str16End) && ((*(str16 + 1) & 0xFC00) == 0xDC00)) { // surrogate pairs are always 4 bytes. charLen = 4; // str16 advance 2 bytes str16 += 2; } else { charLen = Utf32CodePointUtf8Length(static_cast(*str16++)); } if (utf8Len > (INT_MAX - charLen)) { return -1; } utf8Len += charLen; } return utf8Len; } // inner function, utf8Str and utf16Str is not nullptr void StrncpyStr16ToStr8(const char16_t* utf16Str, size_t str16Len, char* utf8Str, size_t str8Len) { const char16_t* curUtf16 = utf16Str; const char16_t* const endUtf16 = utf16Str + str16Len; char* cur = utf8Str; while (curUtf16 < endUtf16) { char32_t utf32; // surrogate pairs if (((*curUtf16 & 0xFC00) == 0xD800) && ((curUtf16 + 1) < endUtf16) && (((*(curUtf16 + 1) & 0xFC00)) == 0xDC00)) { utf32 = (*curUtf16++ - 0xD800) << STR16_TO_STR8_SHIFT_WIDTH; utf32 |= *curUtf16++ - 0xDC00; utf32 += 0x10000; } else { utf32 = static_cast(*curUtf16++); } const size_t len = Utf32CodePointUtf8Length(utf32); if (str8Len < len) { break; } Utf32CodePointToUtf8(reinterpret_cast(cur), utf32, len); cur += len; str8Len -= len; } *cur = '\0'; } // inner function and str16 is not null char* Char16ToChar8(const char16_t* str16, size_t str16Len) { char* str8 = nullptr; int utf8Len = Utf16ToUtf8Length(str16, str16Len); if (utf8Len < 0 || utf8Len >= INT_MAX) { return nullptr; } // Allow for closing '\0' utf8Len += 1; str8 = reinterpret_cast(calloc(utf8Len, sizeof(char))); if (str8 == nullptr) { return nullptr; } StrncpyStr16ToStr8(str16, str16Len, str8, utf8Len); return str8; } bool String16ToString8(const u16string& str16, string& str8) { size_t str16Len = str16.length(); if (str16Len < 1) { return false; } char* str8Temp = Char16ToChar8(str16.c_str(), str16Len); if (str8Temp == nullptr) { UTILS_LOGD("Str16 to str8 failed, because str8Temp is nullptr!"); return false; } str8 = str8Temp; free(str8Temp); str8Temp = nullptr; return true; } /** * return 1-4 by first byte * 1111xxxx : 4 * 1110xxxx : 3 * 110xxxxx : 2 * 10xxxxxx : 1 * 0xxxxxxx : 1 */ static inline size_t Utf8CodePointLen(uint8_t ch) { return ((0xe5000000 >> ((ch >> UTF8_LEN_MASK) & 0x1e)) & UTF8_LEN_MASK) + 1; } static inline void Utf8ShiftAndMask(uint32_t* codePoint, const uint8_t byte) { *codePoint <<= UTF8_SHIFT_WIDTH; *codePoint |= 0x3F & byte; } uint32_t Utf8ToUtf32CodePoint(const char* src, size_t length) { uint32_t unicode = 0; switch (length) { case UTF8_LENGTH_1: return src[UTF8_FIRST_BYTE_INDEX]; case UTF8_LENGTH_2: unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x1f; Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]); return unicode; case UTF8_LENGTH_3: unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x0f; Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]); Utf8ShiftAndMask(&unicode, src[UTF8_THIRD_BYTE_INDEX]); return unicode; case UTF8_LENGTH_4: unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x07; Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]); Utf8ShiftAndMask(&unicode, src[UTF8_THIRD_BYTE_INDEX]); Utf8ShiftAndMask(&unicode, src[UTF8_FORTH_BYTE_INDEX]); return unicode; default: return 0xffff; } } int Utf8ToUtf16Length(const char* str8, size_t str8Len) { const char* const str8end = str8 + str8Len; int utf16len = 0; while (str8 < str8end) { utf16len++; size_t u8charlen = Utf8CodePointLen(*str8); if (str8 + u8charlen - 1 >= str8end) { UTILS_LOGE("Get str16 length failed because str8 unicode is illegal!"); return -1; } uint32_t codepoint = Utf8ToUtf32CodePoint(str8, u8charlen); if (codepoint > 0xFFFF) { utf16len++; // this will be a surrogate pair in utf16 } str8 += u8charlen; } if (str8 != str8end) { UTILS_LOGE("Get str16 length failed because str8length is illegal!"); return -1; } return utf16len; } char16_t* Utf8ToUtf16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len) { if (u16len == 0) { return u16str; } const char* const u8end = utf8Str + u8len; const char* u8cur = utf8Str; const char16_t* const u16end = u16str + u16len; char16_t* u16cur = u16str; while ((u8cur < u8end) && (u16cur < u16end)) { size_t len = Utf8CodePointLen(*u8cur); uint32_t codepoint = Utf8ToUtf32CodePoint(u8cur, len); // Convert the UTF32 codepoint to one or more UTF16 codepoints if (codepoint <= 0xFFFF) { // Single UTF16 character *u16cur++ = static_cast(codepoint); } else { // Multiple UTF16 characters with surrogates codepoint = codepoint - 0x10000; *u16cur++ = static_cast((codepoint >> UTF16_SHIFT_WIDTH) + 0xD800); if (u16cur >= u16end) { // Ooops... not enough room for this surrogate pair. return u16cur - 1; } *u16cur++ = static_cast((codepoint & 0x3FF) + 0xDC00); } u8cur += len; } return u16cur; } void StrncpyStr8ToStr16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len) { char16_t* result = Utf8ToUtf16(utf8Str, u8len, u16str, u16len - 1); *result = 0; return; } // inner function and str8 is not null char16_t* Char8ToChar16(const char* str8, size_t str8Len) { char16_t* str16 = nullptr; int utf16Len = Utf8ToUtf16Length(str8, str8Len); if (utf16Len < 0) { UTILS_LOGE("Get str16 length failed,length is: %{public}d", utf16Len); return nullptr; } // Allow for closing 0 utf16Len = utf16Len + 1; str16 = reinterpret_cast(calloc(utf16Len, sizeof(char16_t))); if (str16 == nullptr) { UTILS_LOGE("Str16 malloc memory failed!"); return nullptr; } StrncpyStr8ToStr16(str8, str8Len, str16, utf16Len); return str16; } bool String8ToString16(const string& str8, u16string& str16) { size_t str8len = str8.length(); if (str8len < 1) { return false; } char16_t* str16Temp = Char8ToChar16(str8.c_str(), str8len); if (str16Temp == nullptr) { UTILS_LOGD("str8 to str16 failed, str16Temp is nullptr!"); return false; } str16 = str16Temp; free(str16Temp); str16Temp = nullptr; return true; } } // namespace OHOS