1 /*
2  * Copyright (c) 2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "converter.h"
17 
18 #include <climits>
19 #include <codecvt>
20 #include <locale>
21 
22 using namespace std;
23 
24 namespace OHOS::buffer {
25 
// Tell whether a UTF-8 byte stands for a code point on its own.
// Returns true when the most significant bit of u8Char is clear (values 0x00..0x7F).
bool IsOneByte(uint8_t u8Char)
{
    // 0x80 : the smallest value that has the most significant bit set
    return u8Char < 0x80;
}
30 
Utf8ToUtf16BEToData(const unsigned char * data,u16string & u16Str,string::size_type & index,uint8_t & c1)31 void Utf8ToUtf16BEToData(const unsigned char *data, u16string &u16Str, string::size_type &index, uint8_t &c1)
32 {
33     uint8_t c2 = data[++index]; // The second byte
34     uint8_t c3 = data[++index]; // The third byte
35     uint8_t c4 = data[++index]; // The forth byte
36     // Calculate the UNICODE code point value (3 bits lower for the first byte, 6 bits for the other)
37     // 3 : shift left 3 times of UTF8_VALID_BITS
38     uint32_t codePoint = ((c1 & LOWER_3_BITS_MASK) << (3 * UTF8_VALID_BITS)) |
39         // 2 : shift left 2 times of UTF8_VALID_BITS
40         ((c2 & LOWER_6_BITS_MASK) << (2 * UTF8_VALID_BITS)) |
41         ((c3 & LOWER_6_BITS_MASK) << UTF8_VALID_BITS) |
42         (c4 & LOWER_6_BITS_MASK);
43     // In UTF-16, U+10000 to U+10FFFF represent surrogate pairs with two 16-bit units
44     if (codePoint >= UTF16_SPECIAL_VALUE) {
45         codePoint -= UTF16_SPECIAL_VALUE;
46         // 10 : a half of 20 , shift right 10 bits
47         u16Str.push_back(static_cast<char16_t>((codePoint >> 10) | HIGH_AGENT_MASK));
48         u16Str.push_back(static_cast<char16_t>((codePoint & LOWER_10_BITS_MASK) | LOW_AGENT_MASK));
49     } else { // In UTF-16, U+0000 to U+D7FF and U+E000 to U+FFFF are Unicode code point values
50         // U+D800 to U+DFFF are invalid characters, for simplicity,
51         // assume it does not exist (if any, not encoded)
52         u16Str.push_back(static_cast<char16_t>(codePoint));
53     }
54 }
55 
Utf8ToUtf16BE(const string & u8Str,bool * ok)56 u16string Utf8ToUtf16BE(const string &u8Str, bool *ok)
57 {
58     u16string u16Str = u"";
59     u16Str.reserve(u8Str.size());
60     string::size_type len = u8Str.length();
61     const unsigned char *data = reinterpret_cast<const unsigned char *>(u8Str.data());
62     bool isOk = true;
63     for (string::size_type i = 0; i < len; ++i) {
64         uint8_t c1 = data[i]; // The first byte
65         if (IsOneByte(c1)) { // only 1 byte represents the UNICODE code point
66             u16Str.push_back(static_cast<char16_t>(c1));
67             continue;
68         }
69         switch (c1 & HIGER_4_BITS_MASK) {
70             case FOUR_BYTES_STYLE: { // 4 byte characters, from 0x10000 to 0x10FFFF
71                 Utf8ToUtf16BEToData(data, u16Str, i, c1);
72                 break;
73             }
74             case THREE_BYTES_STYLE: { // 3 byte characters, from 0x800 to 0xFFFF
75                 uint8_t c2 = data[++i]; // The second byte
76                 uint8_t c3 = data[++i]; // The third byte
77                 // Calculates the UNICODE code point value
78                 // (4 bits lower for the first byte, 6 bits lower for the other)
79                 // 2 : shift left 2 times of UTF8_VALID_BITS
80                 uint32_t codePoint = ((c1 & LOWER_4_BITS_MASK) << (2 * UTF8_VALID_BITS)) |
81                     ((c2 & LOWER_6_BITS_MASK) << UTF8_VALID_BITS) |
82                     (c3 & LOWER_6_BITS_MASK);
83                 u16Str.push_back(static_cast<char16_t>(codePoint));
84                 break;
85             }
86             case TWO_BYTES_STYLE1: // 2 byte characters, from 0x80 to 0x7FF
87             case TWO_BYTES_STYLE2: {
88                 uint8_t c2 = data[++i]; // The second byte
89                 // Calculates the UNICODE code point value
90                 // (5 bits lower for the first byte, 6 bits lower for the other)
91                 uint32_t codePoint = ((c1 & LOWER_5_BITS_MASK) << UTF8_VALID_BITS) |
92                     (c2 & LOWER_6_BITS_MASK);
93                 u16Str.push_back(static_cast<char16_t>(codePoint));
94                 break;
95             }
96             default: {
97                 isOk = false;
98                 break;
99             }
100         }
101     }
102     if (ok != nullptr) {
103         *ok = isOk;
104     }
105     return u16Str;
106 }
107 
// Swap the two bytes of every 16-bit code unit of wstr.
// Returns a new string of the same length; wstr itself is not modified.
// Applying the function twice gives back the original string.
std::u16string Utf16BEToLE(const std::u16string &wstr)
{
    std::u16string swapped;
    swapped.reserve(wstr.size()); // the result has exactly as many code units as the input
    for (const char16_t unit : wstr) {
        // 8 : a code unit consists of two bytes of 8 bits each
        const char16_t high = static_cast<char16_t>((unit >> 8) & 0x00FF);
        const char16_t low = static_cast<char16_t>(unit & 0x00FF);
        swapped.push_back(static_cast<char16_t>((low << 8) | high));
    }
    return swapped;
}
121 
Utf16BEToANSI(const u16string & wstr)122 string Utf16BEToANSI(const u16string &wstr)
123 {
124     string ret = "";
125     for (u16string::const_iterator it = wstr.begin(); it != wstr.end(); ++it) {
126         char16_t wc = (*it);
127         // get the lower bit from the UNICODE code point
128         char c = static_cast<char>(wc & LOWER_8_BITS_MASK);
129         ret.push_back(c);
130     }
131     return ret;
132 }
133 
Utf8ToUtf16BEToANSI(const string & str)134 string Utf8ToUtf16BEToANSI(const string &str)
135 {
136     u16string u16Str = Utf8ToUtf16BE(str);
137     string ret = Utf16BEToANSI(u16Str);
138     return ret;
139 }
140 
// Tell whether c may appear in the data part of a Base64 or Base64url string.
// Accepted are the ASCII letters and digits plus '+', '/', '-' and '_'.
// The padding character '=' is not accepted.
// The ranges are tested explicitly instead of with isalnum(), whose result depends on the
// current C locale and may therefore accept bytes outside of ASCII.
bool IsBase64Char(unsigned char c)
{
    const bool isUpper = (c >= 'A') && (c <= 'Z');
    const bool isLower = (c >= 'a') && (c <= 'z');
    const bool isDigit = (c >= '0') && (c <= '9');
    const bool isSymbol = (c == '+') || (c == '/') || (c == '-') || (c == '_');
    return isUpper || isLower || isDigit || isSymbol;
}
145 
146 /**
147 * Base64Encode - Base64 encode
148 * @src: Data to be encoded
149 * @len: Length of the data to be encoded
150 * Returns: Allocated buffer of outLen bytes of encoded data,
151 * or empty string on failure
152 */
Base64Encode(const unsigned char * src,size_t len,EncodingType type)153 string Base64Encode(const unsigned char *src, size_t len, EncodingType type)
154 {
155     if (src == nullptr) {
156         return string();
157     }
158     unsigned char *out = nullptr;
159     unsigned char *pos = nullptr;
160     const unsigned char *pEnd = nullptr;
161     const unsigned char *pStart = nullptr;
162     size_t outLen = 4 * ((len + 2) / 3); // 3-byte blocks to 4-byte
163 
164     if (outLen < len) {
165         return string(); // integer overflow
166     }
167 
168     string outStr = "";
169     outStr.resize(outLen);
170     out = reinterpret_cast<unsigned char *>(&outStr[0]);
171 
172     pEnd = src + len;
173     pStart = src;
174     pos = out;
175 
176     string table = BASE64_TABLE;
177     if (type == BASE64URL) {
178         table = BASE64URL_TABLE;
179     }
180     // 3 : 3 bytes is just 24 bits which is 4 times of 6 bits
181     while (pEnd - pStart >= 3) {
182         // 2 : add two zeros in front of the first set of 6 bits to become a new 8 binary bits
183         *pos = table[pStart[0] >> 2];
184         // 4 : add two zeros in front of the following second set of 6 bits to become the new 8 binary bits
185         *(pos + 1) = table[((pStart[0] & LOWER_2_BITS_MASK) << 4) | (pStart[1] >> 4)];
186         // 2 : 4 : 6 : add two zeros in front of the following third set of 6 bits to become the new 8 binary bits
187         *(pos + 2) = table[((pStart[1] & LOWER_4_BITS_MASK) << 2) | (pStart[2] >> 6)];
188         // 2 : 3 : add two zeros in front of the following forth set of 6 bits to become the new 8 binary bits
189         *(pos + 3) = table[pStart[2] & LOWER_6_BITS_MASK];
190         // 4 : the pointer of pos scrolls off 4 bytes to point the next 4 bytes of encoded chars
191         pos += 4;
192         // 3 : the pointer of pStart scrolls off 3 bytes to point the next 3 bytes of which will be encoded chars
193         pStart += 3;
194     }
195 
196     // process the last set of less than 3 bytes of data
197     if (pEnd - pStart > 0) {
198         // 2 : add two zeros in front of the first set of 6 bits to become a new 8 binary bits
199         *pos = table[pStart[0] >> 2];
200         if (pEnd - pStart == 1) { // one byte remaining
201             // 4 : paddle the last two bits of the last byte with two zeros in front of it and four zeros after it
202             *(pos + 1) = table[(pStart[0] & LOWER_2_BITS_MASK) << 4];
203             // 2 : fill in the missing bytes with '='
204             *(pos + 2) = '=';
205         } else { // two bytes remaining
206             // 4 : add two zeros in front of the second set of 6 bits to become the new 8 binary bits
207             *(pos + 1) = table[((pStart[0] & LOWER_2_BITS_MASK) << 4) | (pStart[1] >> 4)];
208             // 2 : paddle the last four bits of the last byte with two zeros in front of it and two zeros after it
209             *(pos + 2) = table[(pStart[1] & LOWER_4_BITS_MASK) << 2];
210         }
211         // 3 : fill in the missing bytes with '='
212         *(pos + 3) = '=';
213     }
214 
215     if (type == BASE64URL) {
216         size_t poss = outStr.find_last_not_of('=');
217         if (poss != std::string::npos) {
218             outStr.erase(poss + 1);
219         }
220     }
221     return outStr;
222 }
223 
Base64Decode(string const & encodedStr,EncodingType type)224 string Base64Decode(string const& encodedStr, EncodingType type)
225 {
226     size_t len = encodedStr.size();
227     unsigned int index = 0;
228     unsigned int cursor = 0;
229     unsigned char charArray4[4] = {0}; // an array to stage a group of indexes for encoded string
230     unsigned char charArray3[3] = {0}; // an array to stage a set of original string
231     string ret = "";
232     string table = BASE64_TABLE;
233 
234     if (type == BASE64URL) {
235         table = BASE64URL_TABLE;
236     }
237     while ((encodedStr[cursor] != '=') && IsBase64Char(encodedStr[cursor])) {
238         // stage a 4-byte string to charArray4
239         charArray4[index] = encodedStr[cursor];
240         index++;
241         cursor++;
242         if (index == 4) { // 4 : after 4 chars is assigned to charArray4
243             // 4 : fill data into charArray4
244             for (index = 0; index < 4; index++) {
245                 charArray4[index] = table.find(charArray4[index]) & LOWER_8_BITS_MASK;
246             }
247             // get the last six bits of the first byte of charArray4 and the first valid
248             // 2 : 4 : two bits(except two higer bits) of the second byte, combine them to a new byte
249             charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4);
250             // get the last four bits of the second byte of charArray4 and the first valid
251             // 4 : 2 : four bits(except two higer bits) of the third byte, combine them to a new byte
252             charArray3[1] = ((charArray4[1] & LOWER_4_BITS_MASK) << 4) + ((charArray4[2] & MIDDLE_4_BITS_MASK) >> 2);
253             // get the last two bits of the third byte of charArray4 and the forth byte,
254             // 2 : 3 : 6 : combine them to a new byte
255             charArray3[2] = ((charArray4[2] & LOWER_2_BITS_MASK) << 6) + charArray4[3];
256             // 3 : assigns the decoded string to the return value
257             for (index = 0; index < 3; index++) {
258                 ret += charArray3[index];
259             }
260             index = 0;
261         }
262         if (cursor > len - 1) {
263             break;
264         }
265     }
266 
267     if (index != 0) {
268         // fill data into charArray4
269         for (unsigned int i = 0; i < index; i++) {
270             charArray4[i] = table.find(charArray4[i]) & LOWER_8_BITS_MASK;
271         }
272         // get the last six bits of the first byte of charArray4 and the first valid
273         // 2 : 4 : two bits(except two higer bits) of the second byte, combine them to a new byte
274         charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4);
275         // get the last four bits of the second byte of charArray4 and the first valid
276         // 4 : 2 : four bits(except two higer bits) of the third byte, combine them to a new byte
277         charArray3[1] = ((charArray4[1] & LOWER_4_BITS_MASK) << 4) + ((charArray4[2] & LOWER_6_BITS_MASK) >> 2);
278         // assigns the decoded string to the return value
279         for (unsigned int i = 0; i < index - 1; i++) {
280             ret += charArray3[i];
281         }
282     }
283 
284     return ret;
285 }
286 
// Tell whether hex is a non-empty string that consists of hexadecimal digits only.
// Returns false for an empty string and as soon as a character outside of
// 0 ~ 9, A ~ F, a ~ f is found.
bool IsValidHex(const std::string &hex)
{
    if (hex.empty()) {
        return false;
    }
    for (const char c : hex) {
        const bool isDigit = (c >= '0') && (c <= '9');
        const bool isUpper = (c >= 'A') && (c <= 'F');
        const bool isLower = (c >= 'a') && (c <= 'f');
        if (!isDigit && !isUpper && !isLower) {
            return false;
        }
    }
    return true;
}
302 
// Decode a string of hexadecimal digits into the bytes it describes.
//   hexStr - text to decode, two digits (upper or lower case) per byte
// Returns the decoded bytes. Decoding stops at the first pair that contains a character
// which is not a hexadecimal digit; a single digit left over at the end is ignored.
std::string HexDecode(const std::string &hexStr)
{
    // Value of one hexadecimal digit, -1 for every other character.
    const auto nibbleOf = [](char c) -> int {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + 10; // 10 : value of the digit 'a'
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + 10; // 10 : value of the digit 'A'
        }
        return -1;
    };

    // 2 : means a half length of hex str's size
    const std::string::size_type pairCount = hexStr.size() / 2;
    std::string nums;
    nums.reserve(pairCount);
    for (std::string::size_type i = 0; i < pairCount; i++) {
        // 2 : offsets are i * 2 and i * 2 + 1
        const int high = nibbleOf(hexStr[i * 2]);
        const int low = nibbleOf(hexStr[i * 2 + 1]);
        if (high < 0 || low < 0) {
            break;
        }
        // 4 : the first digit of a pair holds the upper 4 bits of the byte
        nums.push_back(static_cast<char>((high << 4) | low));
    }

    return nums;
}
326 
// Walk from patIndex towards the start of pat and look for the byte that equals the
// last byte of pat (pat[patLen - 1]).
// Returns the distance between the last index and the position found,
// or patLen when no such position exists at or before patIndex.
int GetGoodSuffixLengthByLastChar(uint8_t *pat, int patIndex, int patLen)
{
    const int lastIndex = patLen - 1;
    for (int pos = patIndex; pos >= 0; --pos) {
        if (pat[pos] == pat[lastIndex]) {
            return lastIndex - pos;
        }
    }
    // nothing found: same value as the distance to the position -1
    return lastIndex + 1;
}
// Walk from patIndex towards the end of pat and look for the byte that equals the
// first byte of pat (pat[0]).
// Returns the index found, or tarlen when no such index exists in [patIndex, tarlen).
int GetGoodSuffixLengthByFirstChar(uint8_t *pat, int patIndex, int tarlen)
{
    for (int pos = patIndex; pos < tarlen; ++pos) {
        if (pat[pos] == pat[0]) {
            return pos;
        }
    }
    return tarlen;
}
354 
// Look for singleChar in pat in front of patIndex, walking towards the start of pat.
//   pat        - pattern bytes
//   singleChar - byte to look for; it is compared as an unsigned byte, so values of
//                0x80 and above are found no matter whether char is signed
//   patIndex   - the search covers pat[patIndex - 1] down to pat[0]
// Returns the distance between patIndex and the position found,
// or patIndex + 1 when singleChar does not occur in front of patIndex.
int GetBadCharLengthInReverseOrder(uint8_t *pat, char singleChar, int patIndex)
{
    const uint8_t wanted = static_cast<uint8_t>(singleChar);
    for (int i = patIndex - 1; i >= 0; --i) {
        if (pat[i] == wanted) {
            return patIndex - i;
        }
    }
    // nothing found: same value as the distance to the position -1
    return patIndex + 1;
}
368 
// Look for singleChar in pat from patIndex on, walking towards the end of pat.
//   pat        - pattern bytes
//   singleChar - byte to look for; it is compared as an unsigned byte, so values of
//                0x80 and above are found no matter whether char is signed
//   patIndex   - first index that is examined
//   tarlen     - number of bytes in pat
// Returns the index found, or tarlen when singleChar does not occur in [patIndex, tarlen).
int GetBadCharLengthInSequence(uint8_t *pat, char singleChar, int patIndex, int tarlen)
{
    const uint8_t wanted = static_cast<uint8_t>(singleChar);
    for (int i = patIndex; i < tarlen; i++) {
        if (pat[i] == wanted) {
            return i;
        }
    }
    return tarlen;
}
381 
FindLastIndex(uint8_t * source,uint8_t * target,int soulen,int tarlen)382 int FindLastIndex(uint8_t *source, uint8_t *target, int soulen, int tarlen)
383 {
384     if (source == nullptr || target == nullptr) {
385         return -1;
386     }
387     if (soulen < tarlen || tarlen == 0) {
388         return -1;
389     }
390     int i = soulen - tarlen;
391     int j = 0;
392 
393     while (i >= 0) {
394         if (source[i] == target[j]) {
395             if (j == tarlen - 1) {
396                 return i - (tarlen - 1);
397             }
398             i++;
399             j++;
400         } else {
401             if (j == 0) {
402                 int badValue = GetBadCharLengthInSequence(target, source[i], j, tarlen);
403                 i = i - badValue;
404                 j = 0;
405             } else {
406                 int badValue = GetBadCharLengthInSequence(target, source[i], j, tarlen);
407                 int goodSuffix = GetGoodSuffixLengthByFirstChar(target, j, tarlen);
408                 int distance = badValue > goodSuffix ? badValue : goodSuffix;
409                 i = i - distance;
410                 j = 0;
411             }
412         }
413     }
414     return -1;
415 }
416 
FindIndexInner(uint8_t * target,uint8_t * source,int tarlen,int & indexI,int & indexJ)417 bool FindIndexInner(uint8_t* target, uint8_t* source, int tarlen, int &indexI, int &indexJ)
418 {
419     if (indexJ == tarlen - 1) {
420         int badValue = GetBadCharLengthInReverseOrder(target, source[indexI], indexJ);
421         indexI = indexI + badValue;
422     } else {
423         int badValue = GetBadCharLengthInReverseOrder(target, source[indexI], indexJ);
424         int goodSuffix = GetGoodSuffixLengthByLastChar(target, indexJ, tarlen);
425         int distance = badValue > goodSuffix ? badValue : goodSuffix;
426         long addVal = static_cast<long>(indexI) + tarlen;
427         long addRst = addVal + distance;
428         if (abs(addVal) > INT_MAX || abs(addRst) > INT_MAX) {
429             return false;
430         }
431         indexI = indexI + tarlen - 1 - indexJ + distance;
432         indexJ = tarlen - 1;
433     }
434     return true;
435 }
436 
FindIndex(uint8_t * source,uint8_t * target,int soulen,int tarlen)437 int FindIndex(uint8_t* source, uint8_t* target, int soulen, int tarlen)
438 {
439     if (source == nullptr || target == nullptr) {
440         return -1;
441     }
442     if (soulen < tarlen || tarlen == 0) {
443         return -1;
444     }
445     int i = tarlen - 1;
446     int j = tarlen - 1;
447     while (i < soulen) {
448         if (source[i] == target[j]) {
449             if (j == 0) {
450                 return i;
451             }
452             i--;
453             j--;
454         } else {
455             bool flag = FindIndexInner(target, source, tarlen, i, j);
456             if (!flag) {
457                 return -1;
458             }
459         }
460     }
461     return -1;
462 }
463 }
464