1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "unicode_ex.h"
17 
18 #include <climits>
19 #include <cstdio>
20 #include <cstdlib>
21 
22 #include "utils_log.h"
23 using namespace std;
/***************************************UTF8 and UTF16 unicode**********************************************
UTF8
Unicode code point                      UTF-8 byte sequence
U+0000 ~ U+007F                         0???????
U+0080 ~ U+07FF                         110????? 10??????
U+0800 ~ U+FFFF                         1110???? 10?????? 10??????
U+10000 ~ U+10FFFF                      11110??? 10?????? 10?????? 10??????

UTF16
Unicode code point                      UTF-16 encoding

U+0000 ~ U+FFFF                         one 2-byte code unit that stores the code point value itself
U+10000 ~ U+10FFFF                      two 2-byte code units (a surrogate pair) that store the code point minus 0x10000
**************************************UTF8 and UTF16 unicode**********************************************/
38 namespace OHOS {
namespace {
// Exclusive upper bounds of the code point ranges that are encoded with
// one, two and three UTF-8 bytes respectively.
constexpr char32_t ONE_BYTE_UTF8 = 0x80;
constexpr char32_t TWO_BYTES_UTF8 = 0x800;
constexpr char32_t THREE_BYTES_UTF8 = 0x10000;

// Inclusive bounds of the UTF-16 surrogate range; values inside it are
// rejected as stand-alone code points.
constexpr char32_t UNICODE_RESERVED_START = 0xD800;
constexpr char32_t UNICODE_RESERVED_END = 0xDFFF;
// Largest code point that is accepted.
constexpr char32_t UNICODE_MAX_NUM = 0x10FFFF;
// Number of payload bits carried by every UTF-8 continuation byte.
constexpr unsigned int UTF8_OFFSET = 6;

// A continuation byte is built as (value | UTF8_BYTE_MARK) & UTF8_BYTE_MASK,
// which yields the bit pattern 10xxxxxx.
constexpr char32_t UTF8_BYTE_MARK = 0x80;
constexpr char32_t UTF8_BYTE_MASK = 0xBF;
// Lead-byte prefix, indexed by the total length (0..4) of the encoded sequence.
constexpr char32_t UTF8_FIRST_BYTE_MARK[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
} // namespace
56 
// Typed constants (formerly object-like macros). Names and values are
// unchanged, so every use in this file keeps compiling and behaving the same.

// Encoded length, in bytes, of one code point in UTF-8; 0 marks a code point
// that cannot be encoded.
constexpr size_t UTF8_LENGTH_INVALID = 0;
constexpr size_t UTF8_LENGTH_1 = 1;
constexpr size_t UTF8_LENGTH_2 = 2;
constexpr size_t UTF8_LENGTH_3 = 3;
constexpr size_t UTF8_LENGTH_4 = 4;
// Used by Utf8CodePointLen both as a shift distance and as a 2-bit mask.
constexpr unsigned int UTF8_LEN_MASK = 3;
// Positions of the individual bytes inside one UTF-8 sequence.
constexpr size_t UTF8_FIRST_BYTE_INDEX = 0;
constexpr size_t UTF8_SECOND_BYTE_INDEX = 1;
constexpr size_t UTF8_THIRD_BYTE_INDEX = 2;
constexpr size_t UTF8_FORTH_BYTE_INDEX = 3;
// Payload bits contributed by one UTF-8 continuation byte.
constexpr unsigned int UTF8_SHIFT_WIDTH = 6;
// Payload bits contributed by each half of a UTF-16 surrogate pair.
constexpr unsigned int STR16_TO_STR8_SHIFT_WIDTH = 10;
constexpr unsigned int UTF16_SHIFT_WIDTH = 10;
// Thresholds on the UTF-8 sequence length used when emitting bytes.
constexpr size_t UTF32_BYTE_SIZE_1 = 1;
constexpr size_t UTF32_BYTE_SIZE_2 = 2;
constexpr size_t UTF32_BYTE_SIZE_3 = 3;
constexpr size_t UTF32_BYTE_SIZE_4 = 4;
74 
75 // inner func and dstP is not nullptr
Utf32CodePointToUtf8(uint8_t * dstP,char32_t srcChar,size_t bytes)76 void Utf32CodePointToUtf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
77 {
78     dstP += bytes;
79     if (bytes >= UTF32_BYTE_SIZE_4) {
80         *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
81         srcChar >>= UTF8_OFFSET;
82     }
83 
84     if (bytes >= UTF32_BYTE_SIZE_3) {
85         *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
86         srcChar >>= UTF8_OFFSET;
87     }
88 
89     if (bytes >= UTF32_BYTE_SIZE_2) {
90         *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
91         srcChar >>= UTF8_OFFSET;
92     }
93 
94     if (bytes >= UTF32_BYTE_SIZE_1) {
95         *--dstP = static_cast<uint8_t>(srcChar | UTF8_FIRST_BYTE_MARK[bytes]);
96     }
97 }
98 
Utf32CodePointUtf8Length(char32_t srcChar)99 size_t Utf32CodePointUtf8Length(char32_t srcChar)
100 {
101     if (srcChar < ONE_BYTE_UTF8) {
102         return UTF8_LENGTH_1;
103     } else if (srcChar < TWO_BYTES_UTF8) {
104         return UTF8_LENGTH_2;
105     } else if (srcChar < THREE_BYTES_UTF8) {
106         if ((srcChar < UNICODE_RESERVED_START) || (srcChar > UNICODE_RESERVED_END)) {
107             return UTF8_LENGTH_3;
108         } else {
109             // Surrogates are invalid UTF-32 characters.
110             return UTF8_LENGTH_INVALID;
111         }
112     } else if (srcChar <= UNICODE_MAX_NUM) {
113         // Max code point for Unicode is 0x0010FFFF.
114         return UTF8_LENGTH_4;
115     } else {
116         // Invalid UTF-32 character.
117         return UTF8_LENGTH_INVALID;
118     }
119 }
120 
121 // get the length of utf8 from utf16
Utf16ToUtf8Length(const char16_t * str16,size_t str16Len)122 int Utf16ToUtf8Length(const char16_t* str16, size_t str16Len)
123 {
124     if (str16 == nullptr || str16Len == 0) {
125         return -1;
126     }
127 
128     const char16_t* const str16End = str16 + str16Len;
129     int utf8Len = 0;
130     while (str16 < str16End) {
131         int charLen = 0;
132         if (((*str16 & 0xFC00) == 0xD800) && ((str16 + 1) < str16End)
133             && ((*(str16 + 1) & 0xFC00) == 0xDC00)) {
134             // surrogate pairs are always 4 bytes.
135             charLen = 4;
136             // str16 advance 2 bytes
137             str16 += 2;
138         } else {
139             charLen = Utf32CodePointUtf8Length(static_cast<char32_t>(*str16++));
140         }
141 
142         if (utf8Len > (INT_MAX - charLen)) {
143             return -1;
144         }
145         utf8Len += charLen;
146     }
147     return utf8Len;
148 }
149 
150 // inner function, utf8Str and utf16Str is not nullptr
StrncpyStr16ToStr8(const char16_t * utf16Str,size_t str16Len,char * utf8Str,size_t str8Len)151 void StrncpyStr16ToStr8(const char16_t* utf16Str, size_t str16Len, char* utf8Str, size_t str8Len)
152 {
153     const char16_t* curUtf16 = utf16Str;
154     const char16_t* const endUtf16 = utf16Str + str16Len;
155     char* cur = utf8Str;
156     while (curUtf16 < endUtf16) {
157         char32_t utf32;
158         // surrogate pairs
159         if (((*curUtf16 & 0xFC00) == 0xD800) && ((curUtf16 + 1) < endUtf16)
160             && (((*(curUtf16 + 1) & 0xFC00)) == 0xDC00)) {
161             utf32 = (*curUtf16++ - 0xD800) << STR16_TO_STR8_SHIFT_WIDTH;
162             utf32 |= *curUtf16++ - 0xDC00;
163             utf32 += 0x10000;
164         } else {
165             utf32 = static_cast<char32_t>(*curUtf16++);
166         }
167         const size_t len = Utf32CodePointUtf8Length(utf32);
168         if (str8Len < len) {
169             break;
170         }
171 
172         Utf32CodePointToUtf8(reinterpret_cast<uint8_t*>(cur), utf32, len);
173         cur += len;
174         str8Len -= len;
175     }
176     *cur = '\0';
177 }
178 
179 // inner function and str16 is not null
Char16ToChar8(const char16_t * str16,size_t str16Len)180 char* Char16ToChar8(const char16_t* str16, size_t str16Len)
181 {
182     char* str8 = nullptr;
183     int utf8Len = Utf16ToUtf8Length(str16, str16Len);
184     if (utf8Len < 0 || utf8Len >= INT_MAX) {
185         return nullptr;
186     }
187 
188     // Allow for closing '\0'
189     utf8Len += 1;
190     str8 = reinterpret_cast<char*>(calloc(utf8Len, sizeof(char)));
191     if (str8 == nullptr) {
192         return nullptr;
193     }
194 
195     StrncpyStr16ToStr8(str16, str16Len, str8, utf8Len);
196     return str8;
197 }
198 
String16ToString8(const u16string & str16,string & str8)199 bool String16ToString8(const u16string& str16, string& str8)
200 {
201     size_t str16Len = str16.length();
202     if (str16Len < 1) {
203         return false;
204     }
205 
206     char* str8Temp = Char16ToChar8(str16.c_str(), str16Len);
207     if (str8Temp == nullptr) {
208         UTILS_LOGD("Str16 to str8 failed, because str8Temp is nullptr!");
209         return false;
210     }
211 
212     str8 = str8Temp;
213     free(str8Temp);
214     str8Temp = nullptr;
215     return true;
216 }
217 
218 /**
219 * return 1-4 by first byte
220 * 1111xxxx : 4
221 * 1110xxxx : 3
222 * 110xxxxx : 2
223 * 10xxxxxx : 1
224 * 0xxxxxxx : 1
225 */
Utf8CodePointLen(uint8_t ch)226 static inline size_t Utf8CodePointLen(uint8_t ch)
227 {
228     return ((0xe5000000 >> ((ch >> UTF8_LEN_MASK) & 0x1e)) & UTF8_LEN_MASK) + 1;
229 }
230 
Utf8ShiftAndMask(uint32_t * codePoint,const uint8_t byte)231 static inline void Utf8ShiftAndMask(uint32_t* codePoint, const uint8_t byte)
232 {
233     *codePoint <<= UTF8_SHIFT_WIDTH;
234     *codePoint |= 0x3F & byte;
235 }
236 
Utf8ToUtf32CodePoint(const char * src,size_t length)237 uint32_t Utf8ToUtf32CodePoint(const char* src, size_t length)
238 {
239     uint32_t unicode = 0;
240 
241     switch (length) {
242         case UTF8_LENGTH_1:
243             return src[UTF8_FIRST_BYTE_INDEX];
244         case UTF8_LENGTH_2:
245             unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x1f;
246             Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]);
247             return unicode;
248         case UTF8_LENGTH_3:
249             unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x0f;
250             Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]);
251             Utf8ShiftAndMask(&unicode, src[UTF8_THIRD_BYTE_INDEX]);
252             return unicode;
253         case UTF8_LENGTH_4:
254             unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x07;
255             Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]);
256             Utf8ShiftAndMask(&unicode, src[UTF8_THIRD_BYTE_INDEX]);
257             Utf8ShiftAndMask(&unicode, src[UTF8_FORTH_BYTE_INDEX]);
258             return unicode;
259         default:
260             return 0xffff;
261     }
262 }
263 
Utf8ToUtf16Length(const char * str8,size_t str8Len)264 int Utf8ToUtf16Length(const char* str8, size_t str8Len)
265 {
266     const char* const str8end = str8 + str8Len;
267     int utf16len = 0;
268     while (str8 < str8end) {
269         utf16len++;
270         size_t u8charlen = Utf8CodePointLen(*str8);
271         if (str8 + u8charlen - 1 >= str8end) {
272             UTILS_LOGE("Get str16 length failed because str8 unicode is illegal!");
273             return -1;
274         }
275         uint32_t codepoint = Utf8ToUtf32CodePoint(str8, u8charlen);
276         if (codepoint > 0xFFFF) {
277             utf16len++; // this will be a surrogate pair in utf16
278         }
279         str8 += u8charlen;
280     }
281     if (str8 != str8end) {
282         UTILS_LOGE("Get str16 length failed because str8length is illegal!");
283         return -1;
284     }
285     return utf16len;
286 }
287 
Utf8ToUtf16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)288 char16_t* Utf8ToUtf16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
289 {
290     if (u16len == 0) {
291         return u16str;
292     }
293     const char* const u8end = utf8Str + u8len;
294     const char* u8cur = utf8Str;
295     const char16_t* const u16end = u16str + u16len;
296     char16_t* u16cur = u16str;
297 
298     while ((u8cur < u8end) && (u16cur < u16end)) {
299         size_t len = Utf8CodePointLen(*u8cur);
300         uint32_t codepoint = Utf8ToUtf32CodePoint(u8cur, len);
301         // Convert the UTF32 codepoint to one or more UTF16 codepoints
302         if (codepoint <= 0xFFFF) {
303             // Single UTF16 character
304             *u16cur++ = static_cast<char16_t>(codepoint);
305         } else {
306             // Multiple UTF16 characters with surrogates
307             codepoint = codepoint - 0x10000;
308             *u16cur++ = static_cast<char16_t>((codepoint >> UTF16_SHIFT_WIDTH) + 0xD800);
309             if (u16cur >= u16end) {
310                 // Ooops...  not enough room for this surrogate pair.
311                 return u16cur - 1;
312             }
313             *u16cur++ = static_cast<char16_t>((codepoint & 0x3FF) + 0xDC00);
314         }
315 
316         u8cur += len;
317     }
318     return u16cur;
319 }
320 
StrncpyStr8ToStr16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)321 void StrncpyStr8ToStr16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
322 {
323     char16_t* result = Utf8ToUtf16(utf8Str, u8len, u16str, u16len - 1);
324     *result = 0;
325     return;
326 }
327 
328 // inner function and str8 is not null
Char8ToChar16(const char * str8,size_t str8Len)329 char16_t* Char8ToChar16(const char* str8, size_t str8Len)
330 {
331     char16_t* str16 = nullptr;
332     int utf16Len = Utf8ToUtf16Length(str8, str8Len);
333     if (utf16Len < 0) {
334         UTILS_LOGE("Get str16 length failed,length is: %{public}d", utf16Len);
335         return nullptr;
336     }
337 
338     // Allow for closing 0
339     utf16Len = utf16Len + 1;
340     str16 = reinterpret_cast<char16_t*>(calloc(utf16Len, sizeof(char16_t)));
341     if (str16 == nullptr) {
342         UTILS_LOGE("Str16 malloc memory failed!");
343         return nullptr;
344     }
345 
346     StrncpyStr8ToStr16(str8, str8Len, str16, utf16Len);
347     return str16;
348 }
349 
String8ToString16(const string & str8,u16string & str16)350 bool String8ToString16(const string& str8, u16string& str16)
351 {
352     size_t str8len = str8.length();
353     if (str8len < 1) {
354         return false;
355     }
356 
357     char16_t* str16Temp = Char8ToChar16(str8.c_str(), str8len);
358     if (str16Temp == nullptr) {
359         UTILS_LOGD("str8 to str16 failed, str16Temp is nullptr!");
360         return false;
361     }
362 
363     str16 = str16Temp;
364     free(str16Temp);
365     str16Temp = nullptr;
366     return true;
367 }
368 } // namespace OHOS
369