1 /*
2 * Copyright (c) 2023 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "utf8_utils.h"
17
namespace OHOS::Request::Utf8Utils {

// Total length in bytes (lead byte included) of each multi-byte UTF-8 form.
static constexpr size_t TWO_OCTET = 2;
static constexpr size_t THREE_OCTET = 3;
static constexpr size_t FOUR_OCTET = 4;

// Maps a lead byte to the length of the UTF-8 sequence it introduces.
// Returns 0 when the byte can never start a sequence: continuation bytes
// (0x80-0xBF), 0xC0/0xC1 and everything above 0xF4.
// https://tools.ietf.org/html/rfc3629
size_t Utf8CharWidth(uint8_t b)
{
    if (b <= 0x7F) {
        return 1; // UTF8-1
    }
    if (b >= 0xC2 && b <= 0xDF) {
        return TWO_OCTET;
    }
    if (b >= 0xE0 && b <= 0xEF) {
        return THREE_OCTET;
    }
    if (b >= 0xF0 && b <= 0xF4) {
        return FOUR_OCTET;
    }
    return 0;
}

// Advances `index` by one and, if it still addresses an element of `v`,
// copies that element into `next`.
// Returns false (leaving `next` untouched) when the advanced index is past
// the end; note that `index` has been incremented in either case.
bool GetNextByte(const std::vector<uint8_t> &v, size_t &index, uint8_t &next)
{
    ++index;
    const bool inBounds = index < v.size();
    if (inBounds) {
        next = v[index];
    }
    return inBounds;
}

// https://tools.ietf.org/html/rfc3629
// UTF8-1 = %x00-7F
// UTF8-2 = %xC2-DF UTF8-tail
//
// Consumes the byte following `index` and reports whether it is a UTF8-tail
// (0x80-0xBF). Fails when there is no following byte.
bool Check2Bytes(const std::vector<uint8_t> &v, size_t &index)
{
    uint8_t tail = 0;
    if (!GetNextByte(v, index, tail)) {
        return false;
    }
    // 0x80-0xBF are exactly the bytes whose two top bits are "10".
    return (tail & 0xC0) == 0x80;
}

// https://tools.ietf.org/html/rfc3629
// UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
//          %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
//
// `first` is the lead byte located at `index`. The second octet is checked
// against the window permitted for that lead byte, then the third octet must
// be a plain UTF8-tail. On success `index` addresses the last octet.
bool Check3Bytes(const std::vector<uint8_t> &v, const size_t &first, size_t &index)
{
    uint8_t second = 0;
    if (!GetNextByte(v, index, second)) {
        return false;
    }

    // Default window is the ordinary tail range; 0xE0 and 0xED narrow it.
    uint8_t low = 0x80;
    uint8_t high = 0xBF;
    if (first == 0xE0) {
        low = 0xA0;
    } else if (first == 0xED) {
        high = 0x9F;
    } else if (first < 0xE1 || first > 0xEF) {
        return false; // not a three-octet lead byte
    }

    if (second < low || second > high) {
        return false;
    }
    return Check2Bytes(v, index);
}

// https://tools.ietf.org/html/rfc3629
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
//          %xF4 %x80-8F 2( UTF8-tail )
//
// `first` is the lead byte located at `index`. The second octet is checked
// against the window permitted for that lead byte, then the third and fourth
// octets must be plain UTF8-tails. On success `index` addresses the last octet.
bool Check4Bytes(const std::vector<uint8_t> &v, const size_t &first, size_t &index)
{
    uint8_t second = 0;
    if (!GetNextByte(v, index, second)) {
        return false;
    }

    // Default window is the ordinary tail range; 0xF0 and 0xF4 narrow it.
    uint8_t low = 0x80;
    uint8_t high = 0xBF;
    if (first == 0xF0) {
        low = 0x90;
    } else if (first == 0xF4) {
        high = 0x8F;
    } else if (first < 0xF1 || first > 0xF3) {
        return false; // not a four-octet lead byte
    }

    if (second < low || second > high) {
        return false;
    }
    if (!Check2Bytes(v, index)) {
        return false;
    }
    return Check2Bytes(v, index);
}

// Returns true when every byte of `v` belongs to a well-formed UTF-8
// sequence as defined by the checks above; an empty vector is valid.
bool RunUtf8Validation(const std::vector<uint8_t> &v)
{
    const size_t total = v.size();

    for (size_t pos = 0; pos < total; ++pos) {
        const uint8_t lead = v[pos];

        // <= 0x7F means single byte.
        if (lead <= 0x7F) {
            continue;
        }

        // The helpers leave `pos` on the final octet of the sequence; the
        // loop increment then moves on to the next lead byte.
        bool wellFormed = false;
        switch (Utf8CharWidth(lead)) {
            case TWO_OCTET:
                wellFormed = Check2Bytes(v, pos);
                break;
            case THREE_OCTET:
                wellFormed = Check3Bytes(v, lead, pos);
                break;
            case FOUR_OCTET:
                wellFormed = Check4Bytes(v, lead, pos);
                break;
            default:
                break;
        }
        if (!wellFormed) {
            return false;
        }
    }
    return true;
}
} // namespace OHOS::Request::Utf8Utils