1 /*
2 * Copyright (c) 2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef API_BASE_UTIL_UTF8_DECODE_H
17 #define API_BASE_UTIL_UTF8_DECODE_H
18
19 #include <cstdint>
20
21 #include <base/containers/string_view.h>
22 #include <base/namespace.h>
23 #include <base/util/log.h>
24
BASE_BEGIN_NAMESPACE()25 BASE_BEGIN_NAMESPACE()
namespace {

/** Decoder state: a complete codepoint has just been decoded (also the initial state). */
constexpr uint32_t UTF8_ACCEPT = 0U;
/** Decoder state: the byte sequence is invalid. Every transition out of this state leads back to it. */
constexpr uint32_t UTF8_REJECT = 1U;

/** Lookup table driving decode().
 * Entries [0, 256) map an input byte to its character class (0x0..0xb).
 * Entries [256, 400) hold the transition table: 9 states x 16 slots per state, indexed by
 * 256 + state * 16 + characterClass, yielding the next state.
 * NOTE(review): layout looks like Bjoern Hoehrmann's DFA based utf8 decoder - confirm attribution.
 */
constexpr uint8_t utf8d[] = {
    // Character class of each byte value.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..1f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..3f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..5f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..7f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 80..9f
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..bf
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..df
    0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3,                 // e0..ef
    0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8,                 // f0..ff
    // State transitions, 16 entries per state.
    0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1,                 // s0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s1..s2
    1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s3..s4
    1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s5..s6
    1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s7..s8
};
static_assert(sizeof(utf8d) == 256U + 9U * 16U, "utf8d must hold 256 class entries and 9 * 16 transitions");

/** Feed one byte to the utf8 decoder.
 * @param state Decoder state, updated in place. Must be UTF8_ACCEPT when starting a new codepoint.
 * @param codep Codepoint accumulator, updated in place. Holds the decoded codepoint once UTF8_ACCEPT is returned.
 * @param byte Next byte of the utf8 encoded input.
 * @return The new decoder state: UTF8_ACCEPT, UTF8_REJECT, or another value when more bytes are needed.
 */
constexpr inline uint32_t decode(uint32_t* state, uint32_t* codep, unsigned char byte)
{
    constexpr uint32_t transitionBase = 256U;   // transitions are stored after the per-byte classes
    constexpr uint32_t slotsPerState = 16U;     // each state owns 16 transition slots
    constexpr uint32_t continuationMask = 0x3fU; // payload bits of a continuation byte
    constexpr uint32_t continuationBits = 6U;

    const uint32_t byteClass = utf8d[byte];

    if (*state == UTF8_ACCEPT) {
        // First byte of a sequence: the class doubles as the number of marker bits to strip.
        *codep = (0xffU >> byteClass) & byte;
    } else {
        // Inside a sequence: make room for six more payload bits and append them.
        *codep = (*codep << continuationBits) | (byte & continuationMask);
    }

    *state = utf8d[transitionBase + (*state) * slotsPerState + byteClass];
    return *state;
}
} // namespace
58
59 /** Decode utf8 encoded string.
60 * @param buf Utf8 encoded string pointer, moved to next codepoint on success.
61 * @return Next unicode codepoint on success, 0 otherwise.
62 */
GetCharUtf8(const char ** buf)63 static uint32_t GetCharUtf8(const char** buf)
64 {
65 uint32_t state = 0U;
66 uint32_t codepoint = 0U;
67
68 while (**buf) {
69 decode(&state, &codepoint, static_cast<unsigned char>(**buf));
70 (*buf)++;
71 switch (state) {
72 case UTF8_ACCEPT:
73 return codepoint;
74 case UTF8_REJECT:
75 BASE_LOG_E("invalid utf8 sequence\n");
76 return 0;
77 }
78 }
79 return 0;
80 }
81
82 /** Count valid character in provided utf8 encoded string.
83 * @param string Utf8 encoded string.
84 * @return Valid unicode codepoint count in provided utf8 string.
85 */
CountGlyphsUtf8(const BASE_NS::string_view string)86 static uint32_t CountGlyphsUtf8(const BASE_NS::string_view string)
87 {
88 uint32_t state = 0U;
89 uint32_t codepoint = 0U;
90 uint32_t count = 0U;
91 const char* s = string.data();
92 const char* sEnd = string.data() + string.length();
93
94 for (; (s < sEnd) && *s; ++s) {
95 if (!decode(&state, &codepoint, static_cast<unsigned char>(*s))) {
96 count += 1U;
97 }
98 }
99 if (state != UTF8_ACCEPT) {
100 BASE_LOG_E("malformed utf8 string\n");
101 }
102 return count;
103 }
104 BASE_END_NAMESPACE()
105 #endif // API_BASE_UTIL_UTF8_DECODE_H
106