1 /**
2  * Copyright (c) 2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "utf.h"
17 #include <memory>
18 
19 namespace OHOS::Ace {
20 
21 /*
22  * MUtf-8
23  *
24  * U+0000 => C0 80
25  *
26  * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4      Byte 5      Byte 6
27  *    code point   code point   code point
28  * 1  7            U+0000       U+007F      0xxxxxxx
29  * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
30  * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
31  * 6  21           U+10000      U+10FFFF    11101101    1010xxxx    10xxxxxx    11101101    1011xxxx    10xxxxxx
32  * for U+10000 -- U+10FFFF encodes the following (value - 0x10000)
33  */
34 
35 /*
36  * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
37  * In case of invalid sequence return first byte of it.
38  */
MUtf8ToUtf16Size(const uint8_t * mutf8,size_t mutf8Len)39 size_t MUtf8ToUtf16Size(const uint8_t* mutf8, size_t mutf8Len)
40 {
41     size_t pos = 0;
42     size_t res = 0;
43     while (pos != mutf8Len) {
44         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8Len - pos);
45         if (nbytes == 0) {
46             nbytes = 1;
47         }
48         res += pair > MAX_U16 ? CONST_2 : 1;
49         mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
50         pos += nbytes;
51     }
52     return res;
53 }
54 
ConvertMUtf8ToUtf16Pair(const uint8_t * data,size_t maxBytes)55 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t* data, size_t maxBytes)
56 {
57     uint8_t d0 = *data;
58     if ((d0 & MASK1) == 0) {
59         return { d0, 1 };
60     }
61 
62     if (maxBytes < CONST_2) {
63         return { d0, 1 };
64     }
65     uint8_t d1 = *(data + 1);
66     if ((d0 & MASK2) == 0) {
67         return { ((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2 };
68     }
69 
70     if (maxBytes < CONST_3) {
71         return { d0, 1 };
72     }
73     uint8_t d2 = *(data + CONST_2);
74     if ((d0 & MASK3) == 0) {
75         return { ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
76             CONST_3 };
77     }
78 
79     if (maxBytes < CONST_4) {
80         return { d0, 1 };
81     }
82     uint8_t d3 = *(data + CONST_3);
83     uint32_t codePoint = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
84                          ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
85 
86     uint32_t pair = 0;
87     pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
88     pair <<= PAIR_ELEMENT_WIDTH;
89     pair |= (codePoint & MASK_10BIT) + U16_TAIL;
90 
91     return { pair, CONST_4 };
92 }
93 
ConvertRegionUtf8ToUtf16(const uint8_t * mutf8In,uint16_t * utf16Out,size_t mutf8Len,size_t utf16Len,size_t start)94 size_t ConvertRegionUtf8ToUtf16(
95     const uint8_t* mutf8In, uint16_t* utf16Out, size_t mutf8Len, size_t utf16Len, size_t start)
96 {
97     size_t inPos = 0;
98     size_t outPos = 0;
99     while (inPos < mutf8Len) {
100         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
101         auto [pHi, pLo] = SplitUtf16Pair(pair);
102 
103         mutf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
104         inPos += nbytes;
105         if (start > 0) {
106             start -= nbytes;
107             continue;
108         }
109 
110         if (pHi != 0) {
111             if (outPos++ >= utf16Len - 1) { // check for place for two uint16
112                 --outPos;
113                 break;
114             }
115             *utf16Out++ = pHi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
116         }
117         if (outPos++ >= utf16Len) {
118             --outPos;
119             break;
120         }
121         *utf16Out++ = pLo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
122     }
123     return outPos;
124 }
125 
IsUTF16HighSurrogate(uint16_t ch)126 bool IsUTF16HighSurrogate(uint16_t ch)
127 {
128     return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
129 }
130 
IsUTF16LowSurrogate(uint16_t ch)131 bool IsUTF16LowSurrogate(uint16_t ch)
132 {
133     return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
134 }
135 
UTF8Length(uint32_t codePoint)136 size_t UTF8Length(uint32_t codePoint)
137 {
138     if (codePoint <= UTF8_1B_MAX) {
139         return UtfLength::ONE;
140     }
141     if (codePoint <= UTF8_2B_MAX) {
142         return UtfLength::TWO;
143     }
144     if (codePoint <= UTF8_3B_MAX) {
145         return UtfLength::THREE;
146     }
147     return UtfLength::FOUR;
148 }
149 
// Helpers for decoding UTF-16 input and encoding code points as UTF-8
EncodeUTF8(uint32_t codePoint,uint8_t * utf8,size_t len,size_t index)151 size_t EncodeUTF8(uint32_t codePoint, uint8_t* utf8, size_t len, size_t index)
152 {
153     size_t size = UTF8Length(codePoint);
154     if (index + size > len) {
155         return 0;
156     }
157     for (size_t j = size - 1; j > 0; j--) {
158         uint8_t cont = ((codePoint | BYTE_MARK) & BYTE_MASK);
159         utf8[index + j] = cont;
160         codePoint >>= UTF8_OFFSET;
161     }
162     utf8[index] = codePoint | FIRST_BYTE_MARK[size];
163     return size;
164 }
165 
HandleAndDecodeInvalidUTF16(uint16_t const * utf16,size_t len,size_t * index)166 uint32_t HandleAndDecodeInvalidUTF16(uint16_t const* utf16, size_t len, size_t* index)
167 {
168     uint16_t first = utf16[*index];
169     // A valid surrogate pair should always start with a High Surrogate
170     if (IsUTF16LowSurrogate(first)) {
171         return UTF16_REPLACEMENT_CHARACTER;
172     }
173     if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
174         if (*index == len - 1) {
175             // A High surrogate not paired with another surrogate
176             return UTF16_REPLACEMENT_CHARACTER;
177         }
178         uint16_t second = utf16[*index + 1];
179         if (!IsUTF16LowSurrogate(second)) {
180             // A High surrogate not followed by a low surrogate
181             return UTF16_REPLACEMENT_CHARACTER;
182         }
183         // A valid surrogate pair, decode normally
184         (*index)++;
185         return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
186     }
187     // A unicode not fallen into the range of representing by surrogate pair, return as it is
188     return first;
189 }
190 
DebuggerConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start)191 size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len,
192     size_t start)
193 {
194     if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
195         return 0;
196     }
197     size_t utf8Pos = 0;
198     size_t end = start + utf16Len;
199     for (size_t i = start; i < end; ++i) {
200         uint32_t codePoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
201         if (codePoint == 0) {
202             continue;
203         }
204         utf8Pos += EncodeUTF8(codePoint, utf8Out, utf8Len, utf8Pos);
205     }
206     return utf8Pos;
207 }
208 
IsUTF8(std::string & data)209 bool IsUTF8(std::string& data)
210 {
211     if (data.empty()) {
212         return false;
213     }
214 
215     bool hasZeroByte = false;
216     bool hasMultiByteUTF8 = false;
217 
218     for (size_t i = 0; i < data.size(); ++i) {
219         unsigned char c = data[i];
220 
221         // Check for UTF-16LE byte order mark (BOM)
222         if (i == 0 && data.size() >= INDEX_TWO && data[INDEX_ONE] == UTF16LE_ZERO_BYTE &&
223             (c == UTF16LE_BOM_FF || c == UTF16LE_BOM_FE)) {
224             return false;
225         }
226 
227         // Check for zero bytes, which are common in UTF-16LE
228         if (c == UTF16LE_ZERO_BYTE) {
229             hasZeroByte = true;
230         }
231 
232         // Check for multi-byte UTF-8 sequences
233         if ((c & UTF8_HIGH_BIT) != 0) { // High bit is set, indicating a non-ASCII character
234             if ((c & UTF8_TWO_BYTE_MASK) == UTF8_TWO_BYTE_PATTERN && i + INDEX_ONE < data.size() &&
235                 (data[i + INDEX_ONE ] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) {
236                 // Two-byte UTF-8 character
237                 hasMultiByteUTF8 = true;
238                 i += INDEX_ONE; // Skip the next byte
239             } else if ((c & UTF8_THREE_BYTE_MASK) == UTF8_THREE_BYTE_PATTERN && i + INDEX_TWO < data.size() &&
240                        (data[i + INDEX_ONE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER &&
241                        (data[i + INDEX_TWO] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) {
242                 // Three-byte UTF-8 character
243                 hasMultiByteUTF8 = true;
244                 i += INDEX_TWO; // Skip the next two bytes
245             } else if ((c & UTF8_FOUR_BYTE_MASK) == UTF8_FOUR_BYTE_PATTERN && i + INDEX_THREE < data.size() &&
246                        (data[i + INDEX_ONE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER &&
247                        (data[i + INDEX_TWO] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER &&
248                        (data[i + INDEX_THREE] & UTF8_HIGH_BIT) == UTF8_MULTIBYTE_FOLLOWER) {
249                 // Four-byte UTF-8 character
250                 hasMultiByteUTF8 = true;
251                 i += INDEX_THREE; // Skip the next three bytes
252             }
253         }
254     }
255 
256     if (hasZeroByte && !hasMultiByteUTF8) {
257         // If we found zero bytes and no multi-byte UTF-8 sequences, it's likely UTF-16LE
258         return false;
259     } else if (hasMultiByteUTF8) {
260         // If we found multi-byte UTF-8 sequences, it's likely UTF-8
261         return true;
262     } else {
263         // If all characters are ASCII, it's either pure ASCII or we don't have enough data to determine the encoding
264         return false;
265     }
266 }
267 
ConvertIllegalStr(std::string & str)268 void ConvertIllegalStr(std::string& str)
269 {
270     if (IsUTF8(str)) {
271         uint8_t* buf8 =  reinterpret_cast<uint8_t*>(const_cast<char*>(str.c_str()));
272         size_t utf8Len = str.size();
273         auto utf16Len = MUtf8ToUtf16Size(buf8, utf8Len);
274         std::unique_ptr<uint16_t[]> buf16 = std::make_unique<uint16_t[]>(utf16Len);
275         auto resultLen = ConvertRegionUtf8ToUtf16(buf8, buf16.get(), utf8Len, utf16Len, 0);
276         if (resultLen == utf16Len) {
277             DebuggerConvertRegionUtf16ToUtf8(buf16.get(), buf8, utf16Len, utf8Len, 0);
278         }
279     }
280 }
281 
282 } // namespace OHOS::Ace
283