1 /**
2 * Copyright (c) 2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "utf.h"
17 #include <memory>
18
19 namespace OHOS::Ace {
20
/*
 * MUtf-8
 *
 * U+0000 => C0 80
 *
 * N  Bits for    First       Last        Byte 1    Byte 2    Byte 3    Byte 4    Byte 5    Byte 6
 *    code point  code point  code point
 * 1  7           U+0000      U+007F      0xxxxxxx
 * 2  11          U+0080      U+07FF      110xxxxx  10xxxxxx
 * 3  16          U+0800      U+FFFF      1110xxxx  10xxxxxx  10xxxxxx
 * 6  21          U+10000     U+10FFFF    11101101  1010xxxx  10xxxxxx  11101101  1011xxxx  10xxxxxx
 * For U+10000 -- U+10FFFF the six-byte form encodes (value - 0x10000).
 */
34
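/*
 * ConvertMUtf8ToUtf16Pair converts one mutf8 sequence to a utf16 pair and returns
 * [utf16 code unit(s), number of mutf8 bytes consumed].
 * If the sequence is truncated (fewer bytes remain than its lead byte requires),
 * the first byte is returned unchanged with a size of 1.
 */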
MUtf8ToUtf16Size(const uint8_t * mutf8,size_t mutf8Len)39 size_t MUtf8ToUtf16Size(const uint8_t* mutf8, size_t mutf8Len)
40 {
41 size_t pos = 0;
42 size_t res = 0;
43 while (pos != mutf8Len) {
44 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8Len - pos);
45 if (nbytes == 0) {
46 nbytes = 1;
47 }
48 res += pair > MAX_U16 ? CONST_2 : 1;
49 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
50 pos += nbytes;
51 }
52 return res;
53 }
54
ConvertMUtf8ToUtf16Pair(const uint8_t * data,size_t maxBytes)55 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t* data, size_t maxBytes)
56 {
57 uint8_t d0 = *data;
58 if ((d0 & MASK1) == 0) {
59 return { d0, 1 };
60 }
61
62 if (maxBytes < CONST_2) {
63 return { d0, 1 };
64 }
65 uint8_t d1 = *(data + 1);
66 if ((d0 & MASK2) == 0) {
67 return { ((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2 };
68 }
69
70 if (maxBytes < CONST_3) {
71 return { d0, 1 };
72 }
73 uint8_t d2 = *(data + CONST_2);
74 if ((d0 & MASK3) == 0) {
75 return { ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
76 CONST_3 };
77 }
78
79 if (maxBytes < CONST_4) {
80 return { d0, 1 };
81 }
82 uint8_t d3 = *(data + CONST_3);
83 uint32_t codePoint = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
84 ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
85
86 uint32_t pair = 0;
87 pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
88 pair <<= PAIR_ELEMENT_WIDTH;
89 pair |= (codePoint & MASK_10BIT) + U16_TAIL;
90
91 return { pair, CONST_4 };
92 }
93
ConvertRegionUtf8ToUtf16(const uint8_t * mutf8In,uint16_t * utf16Out,size_t mutf8Len,size_t utf16Len,size_t start)94 size_t ConvertRegionUtf8ToUtf16(
95 const uint8_t* mutf8In, uint16_t* utf16Out, size_t mutf8Len, size_t utf16Len, size_t start)
96 {
97 size_t inPos = 0;
98 size_t outPos = 0;
99 while (inPos < mutf8Len) {
100 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
101 auto [pHi, pLo] = SplitUtf16Pair(pair);
102
103 mutf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
104 inPos += nbytes;
105 if (start > 0) {
106 start -= nbytes;
107 continue;
108 }
109
110 if (pHi != 0) {
111 if (outPos++ >= utf16Len - 1) { // check for place for two uint16
112 --outPos;
113 break;
114 }
115 *utf16Out++ = pHi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
116 }
117 if (outPos++ >= utf16Len) {
118 --outPos;
119 break;
120 }
121 *utf16Out++ = pLo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
122 }
123 return outPos;
124 }
125
IsUTF16HighSurrogate(uint16_t ch)126 bool IsUTF16HighSurrogate(uint16_t ch)
127 {
128 return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
129 }
130
IsUTF16LowSurrogate(uint16_t ch)131 bool IsUTF16LowSurrogate(uint16_t ch)
132 {
133 return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
134 }
135
UTF8Length(uint32_t codePoint)136 size_t UTF8Length(uint32_t codePoint)
137 {
138 if (codePoint <= UTF8_1B_MAX) {
139 return UtfLength::ONE;
140 }
141 if (codePoint <= UTF8_2B_MAX) {
142 return UtfLength::TWO;
143 }
144 if (codePoint <= UTF8_3B_MAX) {
145 return UtfLength::THREE;
146 }
147 return UtfLength::FOUR;
148 }
149
// Methods for encoding Unicode code points as UTF-8
EncodeUTF8(uint32_t codePoint,uint8_t * utf8,size_t len,size_t index)151 size_t EncodeUTF8(uint32_t codePoint, uint8_t* utf8, size_t len, size_t index)
152 {
153 size_t size = UTF8Length(codePoint);
154 if (index + size > len) {
155 return 0;
156 }
157 for (size_t j = size - 1; j > 0; j--) {
158 uint8_t cont = ((codePoint | BYTE_MARK) & BYTE_MASK);
159 utf8[index + j] = cont;
160 codePoint >>= UTF8_OFFSET;
161 }
162 utf8[index] = codePoint | FIRST_BYTE_MARK[size];
163 return size;
164 }
165
HandleAndDecodeInvalidUTF16(uint16_t const * utf16,size_t len,size_t * index)166 uint32_t HandleAndDecodeInvalidUTF16(uint16_t const* utf16, size_t len, size_t* index)
167 {
168 uint16_t first = utf16[*index];
169 // A valid surrogate pair should always start with a High Surrogate
170 if (IsUTF16LowSurrogate(first)) {
171 return UTF16_REPLACEMENT_CHARACTER;
172 }
173 if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
174 if (*index == len - 1) {
175 // A High surrogate not paired with another surrogate
176 return UTF16_REPLACEMENT_CHARACTER;
177 }
178 uint16_t second = utf16[*index + 1];
179 if (!IsUTF16LowSurrogate(second)) {
180 // A High surrogate not followed by a low surrogate
181 return UTF16_REPLACEMENT_CHARACTER;
182 }
183 // A valid surrogate pair, decode normally
184 (*index)++;
185 return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
186 }
187 // A unicode not fallen into the range of representing by surrogate pair, return as it is
188 return first;
189 }
190
DebuggerConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start)191 size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len,
192 size_t start)
193 {
194 if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
195 return 0;
196 }
197 size_t utf8Pos = 0;
198 size_t end = start + utf16Len;
199 for (size_t i = start; i < end; ++i) {
200 uint32_t codePoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
201 if (codePoint == 0) {
202 continue;
203 }
204 utf8Pos += EncodeUTF8(codePoint, utf8Out, utf8Len, utf8Pos);
205 }
206 return utf8Pos;
207 }
208
IsUTF8(std::string & data)209 bool IsUTF8(std::string& data)
210 {
211 if (data.empty()) {
212 return false;
213 }
214
215 bool hasZeroByte = false;
216 bool hasMultiByteUTF8 = false;
217
218 for (size_t i = 0; i < data.size(); ++i) {
219 unsigned char c = data[i];
220
221 // Check for UTF-16LE byte order mark (BOM)
222 if (i == 0 && data.size() >= INDEX_TWO && data[INDEX_ONE] == UTF16LE_ZERO_BYTE &&
223 (c == UTF16LE_BOM_FF || c == UTF16LE_BOM_FE)) {
224 return false;
225 }
226
227 // Check for zero bytes, which are common in UTF-16LE
228 if (c == UTF16LE_ZERO_BYTE) {
229 hasZeroByte = true;
230 }
231
232 // Check for multi-byte UTF-8 sequences
233 if ((c & UTF8_HIGH_BIT) != 0) { // High bit is set, indicating a non-ASCII character
234 if ((c & UTF8_TWO_BYTE_MASK) == UTF8_TWO_BYTE_PATTERN && i + INDEX_ONE < data.size() &&
235 (data[i + INDEX_ONE ] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) {
236 // Two-byte UTF-8 character
237 hasMultiByteUTF8 = true;
238 i += INDEX_ONE; // Skip the next byte
239 } else if ((c & UTF8_THREE_BYTE_MASK) == UTF8_THREE_BYTE_PATTERN && i + INDEX_TWO < data.size() &&
240 (data[i + INDEX_ONE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER &&
241 (data[i + INDEX_TWO] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) {
242 // Three-byte UTF-8 character
243 hasMultiByteUTF8 = true;
244 i += INDEX_TWO; // Skip the next two bytes
245 } else if ((c & UTF8_FOUR_BYTE_MASK) == UTF8_FOUR_BYTE_PATTERN && i + INDEX_THREE < data.size() &&
246 (data[i + INDEX_ONE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER &&
247 (data[i + INDEX_TWO] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER &&
248 (data[i + INDEX_THREE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) {
249 // Four-byte UTF-8 character
250 hasMultiByteUTF8 = true;
251 i += INDEX_THREE; // Skip the next three bytes
252 }
253 }
254 }
255
256 if (hasZeroByte && !hasMultiByteUTF8) {
257 // If we found zero bytes and no multi-byte UTF-8 sequences, it's likely UTF-16LE
258 return false;
259 } else if (hasMultiByteUTF8) {
260 // If we found multi-byte UTF-8 sequences, it's likely UTF-8
261 return true;
262 } else {
263 // If all characters are ASCII, it's either pure ASCII or we don't have enough data to determine the encoding
264 return false;
265 }
266 }
267
ConvertIllegalStr(std::string & str)268 void ConvertIllegalStr(std::string& str)
269 {
270 if (IsUTF8(str)) {
271 uint8_t* buf8 = reinterpret_cast<uint8_t*>(const_cast<char*>(str.c_str()));
272 size_t utf8Len = str.size();
273 auto utf16Len = MUtf8ToUtf16Size(buf8, utf8Len);
274 std::unique_ptr<uint16_t[]> buf16 = std::make_unique<uint16_t[]>(utf16Len);
275 auto resultLen = ConvertRegionUtf8ToUtf16(buf8, buf16.get(), utf8Len, utf16Len, 0);
276 if (resultLen == utf16Len) {
277 DebuggerConvertRegionUtf16ToUtf8(buf16.get(), buf8, utf16Len, utf8Len, 0);
278 }
279 }
280 }
281
282 } // namespace OHOS::Ace
283