1 /*
2  * Copyright (c) 2023 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "utf8_utils.h"
17 
namespace OHOS::Request::Utf8Utils {

// Total encoded length, in octets, of each multi-byte UTF-8 form.
static constexpr size_t TWO_OCTET = 2;
static constexpr size_t THREE_OCTET = 3;
static constexpr size_t FOUR_OCTET = 4;

// Maps a lead byte to the length of the UTF-8 sequence it introduces.
// See https://tools.ietf.org/html/rfc3629
// Returns 0 for a byte that cannot start a sequence: continuation bytes
// (0x80-0xBF), 0xC0, 0xC1 and everything above 0xF4.
size_t Utf8CharWidth(uint8_t b)
{
    if (b <= 0x7F) {
        return 1;
    }
    if (b >= 0xC2 && b <= 0xDF) {
        return TWO_OCTET;
    }
    if (b >= 0xE0 && b <= 0xEF) {
        return THREE_OCTET;
    }
    if (b >= 0xF0 && b <= 0xF4) {
        return FOUR_OCTET;
    }
    return 0;
}

// Advances index by one and, when the new position is still inside v, stores
// that byte in next and returns true. When the new position is past the end,
// returns false and leaves next untouched (index stays advanced).
bool GetNextByte(const std::vector<uint8_t> &v, size_t &index, uint8_t &next)
{
    ++index;
    const bool available = index < v.size();
    if (available) {
        next = v[index];
    }
    return available;
}

// https://tools.ietf.org/html/rfc3629
// UTF8-1      = %x00-7F
// UTF8-2      = %xC2-DF UTF8-tail
//
// Steps index to the following byte and reports whether it exists and is a
// UTF8-tail (0x80-0xBF).
bool Check2Bytes(const std::vector<uint8_t> &v, size_t &index)
{
    uint8_t tail = 0;
    if (!GetNextByte(v, index, tail)) {
        return false;
    }
    return tail >= 0x80 && tail <= 0xBF;
}

// https://tools.ietf.org/html/rfc3629
// UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
//               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
//
// first is the lead byte of the sequence. index is advanced past each byte
// consumed; when true is returned it has moved forward by two.
bool Check3Bytes(const std::vector<uint8_t> &v, const size_t &first, size_t &index)
{
    uint8_t second = 0;
    if (!GetNextByte(v, index, second)) {
        return false;
    }

    // The second octet is a plain tail except after 0xE0 and 0xED, where the
    // grammar narrows the permitted range.
    uint8_t lowest = 0x80;
    uint8_t highest = 0xBF;
    if (first == 0xE0) {
        lowest = 0xA0;
    } else if (first == 0xED) {
        highest = 0x9F;
    } else if (first < 0xE1 || first > 0xEF) {
        return false;
    }
    if (second < lowest || second > highest) {
        return false;
    }

    // One more tail byte completes the sequence.
    return Check2Bytes(v, index);
}

// https://tools.ietf.org/html/rfc3629
// UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
//               %xF4 %x80-8F 2( UTF8-tail )
//
// first is the lead byte of the sequence. index is advanced past each byte
// consumed; when true is returned it has moved forward by three.
bool Check4Bytes(const std::vector<uint8_t> &v, const size_t &first, size_t &index)
{
    uint8_t second = 0;
    if (!GetNextByte(v, index, second)) {
        return false;
    }

    // The second octet is a plain tail except after 0xF0 and 0xF4, where the
    // grammar narrows the permitted range.
    uint8_t lowest = 0x80;
    uint8_t highest = 0xBF;
    if (first == 0xF0) {
        lowest = 0x90;
    } else if (first == 0xF4) {
        highest = 0x8F;
    } else if (first < 0xF1 || first > 0xF3) {
        return false;
    }
    if (second < lowest || second > highest) {
        return false;
    }

    // Two more tail bytes complete the sequence.
    if (!Check2Bytes(v, index)) {
        return false;
    }
    return Check2Bytes(v, index);
}

// Returns true when every byte of v belongs to a sequence accepted by the
// checks above; an empty vector is accepted. Stops at the first violation.
bool RunUtf8Validation(const std::vector<uint8_t> &v)
{
    for (size_t pos = 0; pos < v.size(); ++pos) {
        const uint8_t lead = v[pos];

        // Single-byte (<= 0x7F) characters need no further inspection.
        if (lead <= 0x7F) {
            continue;
        }

        // Each checker leaves pos on the last byte of the sequence it
        // accepted, so the loop increment lands on the next lead byte.
        bool wellFormed = false;
        switch (Utf8CharWidth(lead)) {
            case TWO_OCTET:
                wellFormed = Check2Bytes(v, pos);
                break;
            case THREE_OCTET:
                wellFormed = Check3Bytes(v, lead, pos);
                break;
            case FOUR_OCTET:
                wellFormed = Check4Bytes(v, lead, pos);
                break;
            default:
                break;
        }
        if (!wellFormed) {
            return false;
        }
    }
    return true;
}
} // namespace OHOS::Request::Utf8Utils