1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "unicode_ex.h"
17
18 #include <climits>
19 #include <cstdio>
20 #include <cstdlib>
21
22 #include "utils_log.h"
23 using namespace std;
/***************************************UTF8 and UTF16 unicode**********************************************
UTF8
Unicode code point range     UTF-8 byte sequence
U+0000  ~ U+007F             0???????
U+0080  ~ U+07FF             110????? 10??????
U+0800  ~ U+FFFF             1110???? 10?????? 10??????
U+10000 ~ U+10FFFF           11110??? 10?????? 10?????? 10??????

UTF16
Unicode code point range     UTF-16 encoding

U+0000  ~ U+FFFF             stored in 2 bytes (one code unit) whose value equals the code point
U+10000 ~ U+10FFFF           stored in 4 bytes (a surrogate pair) that encode the code point minus 0x10000
**************************************UTF8 and UTF16 unicode**********************************************/
38 namespace OHOS {
namespace {
// Exclusive upper bounds of the code point ranges that fit in 1, 2 and 3 UTF-8 bytes.
constexpr char32_t ONE_BYTE_UTF8 = 0x80;
constexpr char32_t TWO_BYTES_UTF8 = 0x800;
constexpr char32_t THREE_BYTES_UTF8 = 0x10000;

// Inclusive bounds of the UTF-16 surrogate area; these values are not valid code points.
constexpr char32_t UNICODE_RESERVED_START = 0xD800;
constexpr char32_t UNICODE_RESERVED_END = 0xDFFF;
// Largest valid Unicode code point.
constexpr char32_t UNICODE_MAX_NUM = 0x10FFFF;
// Number of payload bits carried by one UTF-8 continuation byte.
constexpr unsigned int UTF8_OFFSET = 6;

// A continuation byte is built as (value | UTF8_BYTE_MARK) & UTF8_BYTE_MASK, i.e. 10xxxxxx.
constexpr char32_t UTF8_BYTE_MASK = 0xBF;
constexpr char32_t UTF8_BYTE_MARK = 0x80;
// Prefix bits OR-ed into the first byte, indexed by the sequence length in bytes (0..4).
constexpr char32_t UTF8_FIRST_BYTE_MARK[] = {
    0x00, 0x00, 0xC0, 0xE0, 0xF0
};
} // namespace
56
// Byte length of one UTF-8 encoded code point; 0 marks a code point that cannot be encoded.
#define UTF8_LENGTH_INVALID 0
#define UTF8_LENGTH_1 1
#define UTF8_LENGTH_2 2
#define UTF8_LENGTH_3 3
#define UTF8_LENGTH_4 4
// Two-bit mask (0b11).
#define UTF8_LEN_MASK 3
// Positions of the individual bytes inside one UTF-8 sequence.
#define UTF8_FIRST_BYTE_INDEX 0
#define UTF8_SECOND_BYTE_INDEX 1
#define UTF8_THIRD_BYTE_INDEX 2
#define UTF8_FORTH_BYTE_INDEX 3
// Number of payload bits contributed by each UTF-8 continuation byte while decoding.
#define UTF8_SHIFT_WIDTH 6
// Number of payload bits held by each half of a UTF-16 surrogate pair
// (one name per conversion direction).
#define STR16_TO_STR8_SHIFT_WIDTH 10
#define UTF16_SHIFT_WIDTH 10
// Output sizes, in bytes, accepted by the UTF-32 to UTF-8 encoder.
#define UTF32_BYTE_SIZE_1 1
#define UTF32_BYTE_SIZE_2 2
#define UTF32_BYTE_SIZE_3 3
#define UTF32_BYTE_SIZE_4 4
74
75 // inner func and dstP is not nullptr
Utf32CodePointToUtf8(uint8_t * dstP,char32_t srcChar,size_t bytes)76 void Utf32CodePointToUtf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
77 {
78 dstP += bytes;
79 if (bytes >= UTF32_BYTE_SIZE_4) {
80 *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
81 srcChar >>= UTF8_OFFSET;
82 }
83
84 if (bytes >= UTF32_BYTE_SIZE_3) {
85 *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
86 srcChar >>= UTF8_OFFSET;
87 }
88
89 if (bytes >= UTF32_BYTE_SIZE_2) {
90 *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
91 srcChar >>= UTF8_OFFSET;
92 }
93
94 if (bytes >= UTF32_BYTE_SIZE_1) {
95 *--dstP = static_cast<uint8_t>(srcChar | UTF8_FIRST_BYTE_MARK[bytes]);
96 }
97 }
98
Utf32CodePointUtf8Length(char32_t srcChar)99 size_t Utf32CodePointUtf8Length(char32_t srcChar)
100 {
101 if (srcChar < ONE_BYTE_UTF8) {
102 return UTF8_LENGTH_1;
103 } else if (srcChar < TWO_BYTES_UTF8) {
104 return UTF8_LENGTH_2;
105 } else if (srcChar < THREE_BYTES_UTF8) {
106 if ((srcChar < UNICODE_RESERVED_START) || (srcChar > UNICODE_RESERVED_END)) {
107 return UTF8_LENGTH_3;
108 } else {
109 // Surrogates are invalid UTF-32 characters.
110 return UTF8_LENGTH_INVALID;
111 }
112 } else if (srcChar <= UNICODE_MAX_NUM) {
113 // Max code point for Unicode is 0x0010FFFF.
114 return UTF8_LENGTH_4;
115 } else {
116 // Invalid UTF-32 character.
117 return UTF8_LENGTH_INVALID;
118 }
119 }
120
121 // get the length of utf8 from utf16
Utf16ToUtf8Length(const char16_t * str16,size_t str16Len)122 int Utf16ToUtf8Length(const char16_t* str16, size_t str16Len)
123 {
124 if (str16 == nullptr || str16Len == 0) {
125 return -1;
126 }
127
128 const char16_t* const str16End = str16 + str16Len;
129 int utf8Len = 0;
130 while (str16 < str16End) {
131 int charLen = 0;
132 if (((*str16 & 0xFC00) == 0xD800) && ((str16 + 1) < str16End)
133 && ((*(str16 + 1) & 0xFC00) == 0xDC00)) {
134 // surrogate pairs are always 4 bytes.
135 charLen = 4;
136 // str16 advance 2 bytes
137 str16 += 2;
138 } else {
139 charLen = Utf32CodePointUtf8Length(static_cast<char32_t>(*str16++));
140 }
141
142 if (utf8Len > (INT_MAX - charLen)) {
143 return -1;
144 }
145 utf8Len += charLen;
146 }
147 return utf8Len;
148 }
149
150 // inner function, utf8Str and utf16Str is not nullptr
StrncpyStr16ToStr8(const char16_t * utf16Str,size_t str16Len,char * utf8Str,size_t str8Len)151 void StrncpyStr16ToStr8(const char16_t* utf16Str, size_t str16Len, char* utf8Str, size_t str8Len)
152 {
153 const char16_t* curUtf16 = utf16Str;
154 const char16_t* const endUtf16 = utf16Str + str16Len;
155 char* cur = utf8Str;
156 while (curUtf16 < endUtf16) {
157 char32_t utf32;
158 // surrogate pairs
159 if (((*curUtf16 & 0xFC00) == 0xD800) && ((curUtf16 + 1) < endUtf16)
160 && (((*(curUtf16 + 1) & 0xFC00)) == 0xDC00)) {
161 utf32 = (*curUtf16++ - 0xD800) << STR16_TO_STR8_SHIFT_WIDTH;
162 utf32 |= *curUtf16++ - 0xDC00;
163 utf32 += 0x10000;
164 } else {
165 utf32 = static_cast<char32_t>(*curUtf16++);
166 }
167 const size_t len = Utf32CodePointUtf8Length(utf32);
168 if (str8Len < len) {
169 break;
170 }
171
172 Utf32CodePointToUtf8(reinterpret_cast<uint8_t*>(cur), utf32, len);
173 cur += len;
174 str8Len -= len;
175 }
176 *cur = '\0';
177 }
178
179 // inner function and str16 is not null
Char16ToChar8(const char16_t * str16,size_t str16Len)180 char* Char16ToChar8(const char16_t* str16, size_t str16Len)
181 {
182 char* str8 = nullptr;
183 int utf8Len = Utf16ToUtf8Length(str16, str16Len);
184 if (utf8Len < 0 || utf8Len >= INT_MAX) {
185 return nullptr;
186 }
187
188 // Allow for closing '\0'
189 utf8Len += 1;
190 str8 = reinterpret_cast<char*>(calloc(utf8Len, sizeof(char)));
191 if (str8 == nullptr) {
192 return nullptr;
193 }
194
195 StrncpyStr16ToStr8(str16, str16Len, str8, utf8Len);
196 return str8;
197 }
198
String16ToString8(const u16string & str16,string & str8)199 bool String16ToString8(const u16string& str16, string& str8)
200 {
201 size_t str16Len = str16.length();
202 if (str16Len < 1) {
203 return false;
204 }
205
206 char* str8Temp = Char16ToChar8(str16.c_str(), str16Len);
207 if (str8Temp == nullptr) {
208 UTILS_LOGD("Str16 to str8 failed, because str8Temp is nullptr!");
209 return false;
210 }
211
212 str8 = str8Temp;
213 free(str8Temp);
214 str8Temp = nullptr;
215 return true;
216 }
217
/**
 * Returns the length (1-4) of a UTF-8 sequence, judged by its first byte only:
 * 1111xxxx : 4
 * 1110xxxx : 3
 * 110xxxxx : 2
 * 10xxxxxx : 1
 * 0xxxxxxx : 1
 * The byte is not validated any further.
 */
static inline size_t Utf8CodePointLen(uint8_t ch)
{
    constexpr uint8_t fourByteLead = 0xF0;
    constexpr uint8_t threeByteLead = 0xE0;
    constexpr uint8_t twoByteLead = 0xC0;

    if (ch >= fourByteLead) {
        return 4;
    }
    if (ch >= threeByteLead) {
        return 3;
    }
    if (ch >= twoByteLead) {
        return 2;
    }
    return 1;
}
230
Utf8ShiftAndMask(uint32_t * codePoint,const uint8_t byte)231 static inline void Utf8ShiftAndMask(uint32_t* codePoint, const uint8_t byte)
232 {
233 *codePoint <<= UTF8_SHIFT_WIDTH;
234 *codePoint |= 0x3F & byte;
235 }
236
Utf8ToUtf32CodePoint(const char * src,size_t length)237 uint32_t Utf8ToUtf32CodePoint(const char* src, size_t length)
238 {
239 uint32_t unicode = 0;
240
241 switch (length) {
242 case UTF8_LENGTH_1:
243 return src[UTF8_FIRST_BYTE_INDEX];
244 case UTF8_LENGTH_2:
245 unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x1f;
246 Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]);
247 return unicode;
248 case UTF8_LENGTH_3:
249 unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x0f;
250 Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]);
251 Utf8ShiftAndMask(&unicode, src[UTF8_THIRD_BYTE_INDEX]);
252 return unicode;
253 case UTF8_LENGTH_4:
254 unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x07;
255 Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]);
256 Utf8ShiftAndMask(&unicode, src[UTF8_THIRD_BYTE_INDEX]);
257 Utf8ShiftAndMask(&unicode, src[UTF8_FORTH_BYTE_INDEX]);
258 return unicode;
259 default:
260 return 0xffff;
261 }
262 }
263
Utf8ToUtf16Length(const char * str8,size_t str8Len)264 int Utf8ToUtf16Length(const char* str8, size_t str8Len)
265 {
266 const char* const str8end = str8 + str8Len;
267 int utf16len = 0;
268 while (str8 < str8end) {
269 utf16len++;
270 size_t u8charlen = Utf8CodePointLen(*str8);
271 if (str8 + u8charlen - 1 >= str8end) {
272 UTILS_LOGE("Get str16 length failed because str8 unicode is illegal!");
273 return -1;
274 }
275 uint32_t codepoint = Utf8ToUtf32CodePoint(str8, u8charlen);
276 if (codepoint > 0xFFFF) {
277 utf16len++; // this will be a surrogate pair in utf16
278 }
279 str8 += u8charlen;
280 }
281 if (str8 != str8end) {
282 UTILS_LOGE("Get str16 length failed because str8length is illegal!");
283 return -1;
284 }
285 return utf16len;
286 }
287
Utf8ToUtf16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)288 char16_t* Utf8ToUtf16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
289 {
290 if (u16len == 0) {
291 return u16str;
292 }
293 const char* const u8end = utf8Str + u8len;
294 const char* u8cur = utf8Str;
295 const char16_t* const u16end = u16str + u16len;
296 char16_t* u16cur = u16str;
297
298 while ((u8cur < u8end) && (u16cur < u16end)) {
299 size_t len = Utf8CodePointLen(*u8cur);
300 uint32_t codepoint = Utf8ToUtf32CodePoint(u8cur, len);
301 // Convert the UTF32 codepoint to one or more UTF16 codepoints
302 if (codepoint <= 0xFFFF) {
303 // Single UTF16 character
304 *u16cur++ = static_cast<char16_t>(codepoint);
305 } else {
306 // Multiple UTF16 characters with surrogates
307 codepoint = codepoint - 0x10000;
308 *u16cur++ = static_cast<char16_t>((codepoint >> UTF16_SHIFT_WIDTH) + 0xD800);
309 if (u16cur >= u16end) {
310 // Ooops... not enough room for this surrogate pair.
311 return u16cur - 1;
312 }
313 *u16cur++ = static_cast<char16_t>((codepoint & 0x3FF) + 0xDC00);
314 }
315
316 u8cur += len;
317 }
318 return u16cur;
319 }
320
StrncpyStr8ToStr16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)321 void StrncpyStr8ToStr16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
322 {
323 char16_t* result = Utf8ToUtf16(utf8Str, u8len, u16str, u16len - 1);
324 *result = 0;
325 return;
326 }
327
328 // inner function and str8 is not null
Char8ToChar16(const char * str8,size_t str8Len)329 char16_t* Char8ToChar16(const char* str8, size_t str8Len)
330 {
331 char16_t* str16 = nullptr;
332 int utf16Len = Utf8ToUtf16Length(str8, str8Len);
333 if (utf16Len < 0) {
334 UTILS_LOGE("Get str16 length failed,length is: %{public}d", utf16Len);
335 return nullptr;
336 }
337
338 // Allow for closing 0
339 utf16Len = utf16Len + 1;
340 str16 = reinterpret_cast<char16_t*>(calloc(utf16Len, sizeof(char16_t)));
341 if (str16 == nullptr) {
342 UTILS_LOGE("Str16 malloc memory failed!");
343 return nullptr;
344 }
345
346 StrncpyStr8ToStr16(str8, str8Len, str16, utf16Len);
347 return str16;
348 }
349
String8ToString16(const string & str8,u16string & str16)350 bool String8ToString16(const string& str8, u16string& str16)
351 {
352 size_t str8len = str8.length();
353 if (str8len < 1) {
354 return false;
355 }
356
357 char16_t* str16Temp = Char8ToChar16(str8.c_str(), str8len);
358 if (str16Temp == nullptr) {
359 UTILS_LOGD("str8 to str16 failed, str16Temp is nullptr!");
360 return false;
361 }
362
363 str16 = str16Temp;
364 free(str16Temp);
365 str16Temp = nullptr;
366 return true;
367 }
368 } // namespace OHOS
369