1 /*
2  * Copyright (c) 2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef API_BASE_UTIL_UTF8_DECODE_H
17 #define API_BASE_UTIL_UTF8_DECODE_H
18 
19 #include <cstdint>
20 
21 #include <base/containers/string_view.h>
22 #include <base/namespace.h>
23 #include <base/util/log.h>
24 
BASE_BEGIN_NAMESPACE()25 BASE_BEGIN_NAMESPACE()
namespace {

// Decoder states with a special meaning to callers. UTF8_ACCEPT is both the initial state and the state reached
// when a complete code point has been decoded; UTF8_REJECT is entered when the input is not valid UTF-8.
constexpr uint32_t UTF8_ACCEPT = 0;
constexpr uint32_t UTF8_REJECT = 1;

// Lookup table used by decode(). The first 256 entries map an input byte to its character class. The entries
// after that hold the state transitions, 16 entries per state, indexed by 256 + state * 16 + character class.
constexpr uint8_t utf8d[] = {
    // Character class of each byte value.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // bytes 00..1f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // bytes 20..3f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // bytes 40..5f
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // bytes 60..7f
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // bytes 80..9f
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // bytes a0..bf
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // bytes c0..df
    0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3,                 // bytes e0..ef
    0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8,                 // bytes f0..ff
    // Next state for each (state, character class) pair.
    0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1,                 // state 0
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // states 1..2
    1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // states 3..4
    1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // states 5..6
    1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // states 7..8
};

// 256 character class entries followed by 9 states with 16 transitions each.
static_assert(sizeof(utf8d) == 256U + 9U * 16U, "unexpected utf8 decoder table size");

/** Feed one byte to the utf8 decoder.
 * @param state Decoder state, updated in place. Must hold UTF8_ACCEPT before the first byte of a code point.
 * @param codep Code point accumulator, updated in place. Holds the decoded value once UTF8_ACCEPT is returned.
 * @param byte Next input byte.
 * @return The new decoder state: UTF8_ACCEPT when a code point is complete, UTF8_REJECT on invalid input, any
 * other value when more bytes are required.
 */
constexpr uint32_t decode(uint32_t* state, uint32_t* codep, unsigned char byte)
{
    constexpr uint32_t transitionBase = 256U;    // offset of the transition part of utf8d
    constexpr uint32_t classesPerState = 16U;    // table entries reserved for each state
    constexpr uint32_t continuationMask = 0x3fU; // payload bits taken from a byte inside a sequence
    constexpr uint32_t continuationBits = 6U;

    const uint32_t charClass = utf8d[byte];

    if (*state == UTF8_ACCEPT) {
        // First byte of a sequence: the character class tells how many high bits are not payload.
        *codep = (0xffU >> charClass) & byte;
    } else {
        // Inside a sequence: append the low six bits to what has been collected so far.
        *codep = (*codep << continuationBits) | (byte & continuationMask);
    }

    *state = utf8d[transitionBase + (*state * classesPerState) + charClass];
    return *state;
}
} // namespace
58 
59 /** Decode utf8 encoded string.
60  * @param buf Utf8 encoded string pointer, moved to next codepoint on success.
61  * @return Next unicode codepoint on success, 0 otherwise.
62  */
GetCharUtf8(const char ** buf)63 static uint32_t GetCharUtf8(const char** buf)
64 {
65     uint32_t state = 0U;
66     uint32_t codepoint = 0U;
67 
68     while (**buf) {
69         decode(&state, &codepoint, static_cast<unsigned char>(**buf));
70         (*buf)++;
71         switch (state) {
72             case UTF8_ACCEPT:
73                 return codepoint;
74             case UTF8_REJECT:
75                 BASE_LOG_E("invalid utf8 sequence\n");
76                 return 0;
77         }
78     }
79     return 0;
80 }
81 
82 /** Count valid character in provided utf8 encoded string.
83  * @param string Utf8 encoded string.
84  * @return Valid unicode codepoint count in provided utf8 string.
85  */
CountGlyphsUtf8(const BASE_NS::string_view string)86 static uint32_t CountGlyphsUtf8(const BASE_NS::string_view string)
87 {
88     uint32_t state = 0U;
89     uint32_t codepoint = 0U;
90     uint32_t count = 0U;
91     const char* s = string.data();
92     const char* sEnd = string.data() + string.length();
93 
94     for (; (s < sEnd) && *s; ++s) {
95         if (!decode(&state, &codepoint, static_cast<unsigned char>(*s))) {
96             count += 1U;
97         }
98     }
99     if (state != UTF8_ACCEPT) {
100         BASE_LOG_E("malformed utf8 string\n");
101     }
102     return count;
103 }
104 BASE_END_NAMESPACE()
105 #endif // API_BASE_UTIL_UTF8_DECODE_H
106