/**
 * Copyright (c) 2024 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "utf.h"

#include <memory>

namespace OHOS::Ace {

/*
 * MUtf-8
 *
 * U+0000 => C0 80
 *
 * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4      Byte 5      Byte 6
 *    code point   code point   code point
 * 1  7            U+0000       U+007F      0xxxxxxx
 * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
 * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
 * 6  21           U+10000      U+10FFFF    11101101    1010xxxx    10xxxxxx    11101101    1011xxxx    10xxxxxx
 * for U+10000 -- U+10FFFF encodes the following (value - 0x10000)
 */

/*
 * Count the UTF-16 code units needed to hold the decoded form of a mutf8 buffer.
 *
 * mutf8     start of the input bytes; a null pointer yields 0.
 * mutf8Len  number of input bytes.
 *
 * Every decoded value above MAX_U16 counts as two code units, everything else as one.
 */
size_t MUtf8ToUtf16Size(const uint8_t* mutf8, size_t mutf8Len)
{
    if (mutf8 == nullptr) {
        return 0;
    }
    size_t pos = 0;
    size_t res = 0;
    // `<` rather than `!=`: the loop still terminates if a step ever moved past the end.
    while (pos < mutf8Len) {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8 + pos, mutf8Len - pos);
        if (nbytes == 0) {
            // Always make progress, whatever the decoder reports.
            nbytes = 1;
        }
        res += pair > MAX_U16 ? CONST_2 : 1;
        pos += nbytes;
    }
    return res;
}

/*
 * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
 * When fewer than the required bytes are available (maxBytes too small), the first byte
 * is returned as the value with size 1.
 *
 * data      start of the sequence; it is dereferenced unconditionally, so it must point
 *           to at least one readable byte.
 * maxBytes  number of bytes readable from data.
 *
 * Continuation bytes are masked, not validated. For the four-byte form the result packs
 * two UTF-16 code units into one value: lead unit in the upper PAIR_ELEMENT_WIDTH bits,
 * trail unit in the lower bits.
 */
std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t* data, size_t maxBytes)
{
    uint8_t d0 = *data;
    // Single-byte form.
    if ((d0 & MASK1) == 0) {
        return { d0, 1 };
    }

    if (maxBytes < CONST_2) {
        return { d0, 1 };
    }
    uint8_t d1 = *(data + 1); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
    // Two-byte form.
    if ((d0 & MASK2) == 0) {
        return { ((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2 };
    }

    if (maxBytes < CONST_3) {
        return { d0, 1 };
    }
    uint8_t d2 = *(data + CONST_2); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
    // Three-byte form.
    if ((d0 & MASK3) == 0) {
        return { ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
            CONST_3 };
    }

    if (maxBytes < CONST_4) {
        return { d0, 1 };
    }
    uint8_t d3 = *(data + CONST_3); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
    // Four-byte form: assemble the code point, then split it into a lead/trail pair.
    uint32_t codePoint = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
                         ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);

    uint32_t pair = 0;
    pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
    pair <<= PAIR_ELEMENT_WIDTH;
    pair |= (codePoint & MASK_10BIT) + U16_TAIL;

    return { pair, CONST_4 };
}

/*
 * Decode a mutf8 buffer into UTF-16 code units.
 *
 * mutf8In   input bytes; a null pointer yields 0.
 * utf16Out  output buffer; a null pointer yields 0.
 * mutf8Len  number of input bytes.
 * utf16Len  capacity of utf16Out in code units.
 * start     number of leading input bytes to skip before output begins. A sequence that
 *           straddles the skip boundary is skipped as a whole.
 *
 * Returns the number of code units written. Decoding stops as soon as the next character
 * does not fit; a two-unit character is never written partially.
 */
size_t ConvertRegionUtf8ToUtf16(
    const uint8_t* mutf8In, uint16_t* utf16Out, size_t mutf8Len, size_t utf16Len, size_t start)
{
    if (mutf8In == nullptr || utf16Out == nullptr) {
        return 0;
    }
    size_t inPos = 0;
    size_t outPos = 0;
    while (inPos < mutf8Len) {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In + inPos, mutf8Len - inPos);
        auto [pHi, pLo] = SplitUtf16Pair(pair);
        inPos += nbytes;

        if (start > 0) {
            // Clamp at zero: `start` is unsigned, so subtracting a longer sequence would
            // wrap around and cause all remaining input to be skipped.
            start = (start > nbytes) ? (start - nbytes) : 0;
            continue;
        }

        if (pHi != 0) {
            // Need room for both units. outPos never exceeds utf16Len, so the subtraction
            // cannot wrap, even when utf16Len is 0.
            if (utf16Len - outPos < CONST_2) {
                break;
            }
            utf16Out[outPos++] = pHi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        }
        if (outPos >= utf16Len) {
            break;
        }
        utf16Out[outPos++] = pLo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
    }
    return outPos;
}

/* True when ch lies in [DECODE_LEAD_LOW, DECODE_LEAD_HIGH]. */
bool IsUTF16HighSurrogate(uint16_t ch)
{
    return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
}

/* True when ch lies in [DECODE_TRAIL_LOW, DECODE_TRAIL_HIGH]. */
bool IsUTF16LowSurrogate(uint16_t ch)
{
    return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
}

/* Number of UTF-8 bytes used to encode codePoint (one to four). */
size_t UTF8Length(uint32_t codePoint)
{
    if (codePoint <= UTF8_1B_MAX) {
        return UtfLength::ONE;
    }
    if (codePoint <= UTF8_2B_MAX) {
        return UtfLength::TWO;
    }
    if (codePoint <= UTF8_3B_MAX) {
        return UtfLength::THREE;
    }
    return UtfLength::FOUR;
}

/*
 * Encode one code point as UTF-8 into utf8 starting at index.
 *
 * codePoint  value to encode.
 * utf8       output buffer.
 * len        capacity of utf8 in bytes.
 * index      offset of the first byte to write.
 *
 * Returns the number of bytes written, or 0 (writing nothing) when the encoding does not
 * fit between index and len.
 */
size_t EncodeUTF8(uint32_t codePoint, uint8_t* utf8, size_t len, size_t index)
{
    size_t size = UTF8Length(codePoint);
    if (index + size > len) {
        return 0;
    }
    // Fill the trailing bytes from last to first, consuming UTF8_OFFSET bits per byte.
    for (size_t j = size - 1; j > 0; j--) {
        uint8_t cont = ((codePoint | BYTE_MARK) & BYTE_MASK);
        utf8[index + j] = cont;
        codePoint >>= UTF8_OFFSET;
    }
    // The remaining bits go into the first byte together with the length marker.
    utf8[index] = codePoint | FIRST_BYTE_MARK[size];
    return size;
}

/*
 * Decode the code point at utf16[*index], substituting UTF16_REPLACEMENT_CHARACTER for
 * unpaired surrogates.
 *
 * utf16  input code units.
 * len    number of code units readable from utf16.
 * index  in/out position; must be below len. It is advanced by one extra unit only when
 *        a valid surrogate pair is consumed.
 */
uint32_t HandleAndDecodeInvalidUTF16(uint16_t const* utf16, size_t len, size_t* index)
{
    uint16_t first = utf16[*index];
    // A valid surrogate pair should always start with a High Surrogate
    if (IsUTF16LowSurrogate(first)) {
        return UTF16_REPLACEMENT_CHARACTER;
    }
    if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
        // Written as an addition so that len == 0 cannot wrap the comparison.
        if (*index + 1 >= len) {
            // A High surrogate not paired with another surrogate
            return UTF16_REPLACEMENT_CHARACTER;
        }
        uint16_t second = utf16[*index + 1];
        if (!IsUTF16LowSurrogate(second)) {
            // A High surrogate not followed by a low surrogate
            return UTF16_REPLACEMENT_CHARACTER;
        }
        // A valid surrogate pair, decode normally
        (*index)++;
        return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
    }
    // A unicode not fallen into the range of representing by surrogate pair, return as it is
    return first;
}

/*
 * Re-encode a region of UTF-16 code units as UTF-8.
 *
 * utf16In   input code units.
 * utf8Out   output buffer.
 * utf16Len  number of code units to convert, counted from start.
 * utf8Len   capacity of utf8Out in bytes.
 * start     index of the first code unit to convert.
 *
 * Returns the number of bytes written; 0 when either pointer is null or utf8Len is 0.
 * Code points equal to 0 are skipped, and a code point that does not fit in the remaining
 * space contributes no bytes.
 */
size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len,
    size_t start)
{
    if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
        return 0;
    }
    size_t utf8Pos = 0;
    size_t end = start + utf16Len;
    for (size_t i = start; i < end; ++i) {
        // The decoder advances i itself when it consumes a surrogate pair.
        uint32_t codePoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
        if (codePoint == 0) {
            continue;
        }
        utf8Pos += EncodeUTF8(codePoint, utf8Out, utf8Len, utf8Pos);
    }
    return utf8Pos;
}

/*
 * Heuristic check of whether data looks like UTF-8 text.
 *
 * Returns true only when at least one well-formed multi-byte sequence (two to four bytes)
 * is found. Empty input, input that starts with the byte-order-mark pattern tested below,
 * and input without any multi-byte sequence (pure ASCII included) all return false.
 * Zero bytes therefore never influence the result.
 */
bool IsUTF8(std::string& data)
{
    if (data.empty()) {
        return false;
    }

    // Check for UTF-16LE byte order mark (BOM); only the first two bytes are relevant.
    const unsigned char firstByte = data[0];
    if (data.size() >= INDEX_TWO && data[INDEX_ONE] == UTF16LE_ZERO_BYTE &&
        (firstByte == UTF16LE_BOM_FF || firstByte == UTF16LE_BOM_FE)) {
        return false;
    }

    // NOTE(review): the follower test masks with UTF8_HIGH_BIT only; confirm against the
    // constants in utf.h that this is the intended strictness.
    const auto isFollower = [&data](size_t idx) {
        return (data[idx] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER;
    };

    bool hasMultiByteUTF8 = false;
    for (size_t i = 0; i < data.size(); ++i) {
        unsigned char c = data[i];
        if ((c & UTF8_HIGH_BIT) == 0) {
            // ASCII byte (zero bytes included): nothing to record.
            continue;
        }
        // High bit is set, indicating a non-ASCII character
        if ((c & UTF8_TWO_BYTE_MASK) == UTF8_TWO_BYTE_PATTERN && i + INDEX_ONE < data.size() &&
            isFollower(i + INDEX_ONE)) {
            // Two-byte UTF-8 character
            hasMultiByteUTF8 = true;
            i += INDEX_ONE; // Skip the next byte
        } else if ((c & UTF8_THREE_BYTE_MASK) == UTF8_THREE_BYTE_PATTERN && i + INDEX_TWO < data.size() &&
                   isFollower(i + INDEX_ONE) && isFollower(i + INDEX_TWO)) {
            // Three-byte UTF-8 character
            hasMultiByteUTF8 = true;
            i += INDEX_TWO; // Skip the next two bytes
        } else if ((c & UTF8_FOUR_BYTE_MASK) == UTF8_FOUR_BYTE_PATTERN && i + INDEX_THREE < data.size() &&
                   isFollower(i + INDEX_ONE) && isFollower(i + INDEX_TWO) && isFollower(i + INDEX_THREE)) {
            // Four-byte UTF-8 character
            hasMultiByteUTF8 = true;
            i += INDEX_THREE; // Skip the next three bytes
        }
    }

    // Multi-byte sequences found: likely UTF-8. Otherwise the data is pure ASCII, likely
    // UTF-16LE, or too short to tell, and is not treated as UTF-8.
    return hasMultiByteUTF8;
}

/*
 * Sanitize str in place when it looks like UTF-8 (see IsUTF8): the bytes are decoded to
 * UTF-16 and encoded back into the same buffer, which replaces unpaired surrogates with
 * the replacement character. Strings that do not look like UTF-8 are left untouched.
 */
void ConvertIllegalStr(std::string& str)
{
    if (!IsUTF8(str)) {
        return;
    }
    auto* buf8 = reinterpret_cast<uint8_t*>(str.data());
    const size_t utf8Len = str.size();
    const size_t utf16Len = MUtf8ToUtf16Size(buf8, utf8Len);
    std::unique_ptr<uint16_t[]> buf16 = std::make_unique<uint16_t[]>(utf16Len);
    const size_t resultLen = ConvertRegionUtf8ToUtf16(buf8, buf16.get(), utf8Len, utf16Len, 0);
    if (resultLen != utf16Len) {
        // Decoding did not produce the expected number of units; keep the string as is.
        return;
    }
    // The encoder is capped at utf8Len bytes, so the result is never longer than the
    // original. Drop any stale tail bytes when it is shorter.
    const size_t written = DebuggerConvertRegionUtf16ToUtf8(buf16.get(), buf8, utf16Len, utf8Len, 0);
    str.resize(written);
}

} // namespace OHOS::Ace