1 /*
2 * Copyright (c) 2023 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
#include "regex_rule.h"

#include <memory>

#include "i18n_hilog.h"
#include "phonenumbers/phonenumberutil.h"
#include "phonenumbers/phonenumber.h"
#include "phonenumbers/shortnumberinfo.h"
20
21 namespace OHOS {
22 namespace Global {
23 namespace I18n {
24 using i18n::phonenumbers::PhoneNumberMatch;
25 using i18n::phonenumbers::PhoneNumber;
26 using i18n::phonenumbers::PhoneNumberUtil;
27 using i18n::phonenumbers::ShortNumberInfo;
28
RegexRule(icu::UnicodeString & regex,std::string & isValidType,std::string & handleType,std::string & insensitive,std::string & type)29 RegexRule::RegexRule(icu::UnicodeString& regex, std::string& isValidType, std::string& handleType,
30 std::string& insensitive, std::string& type)
31 {
32 this->regex = regex;
33 if (type == "CONTAIN") {
34 // 9 indicates a certain execution logic of the border rule.
35 this->type = 9;
36 } else if (type == "CONTAIN_OR_INTERSECT") {
37 // 8 indicates a certain execution logic of the border rule.
38 this->type = 8;
39 } else {
40 this->type = 0;
41 }
42 this->status = U_ZERO_ERROR;
43 this->isValidType = isValidType;
44 this->handleType = handleType;
45 this->insensitive = insensitive;
46 if (regex.length() == 0) {
47 return;
48 }
49 if (U_FAILURE(this->status)) {
50 HILOG_ERROR_I18N("member pattern construct failed.");
51 }
52 }
53
~RegexRule()54 RegexRule::~RegexRule()
55 {
56 }
57
CountDigits(icu::UnicodeString & str)58 int RegexRule::CountDigits(icu::UnicodeString& str)
59 {
60 int count = 0;
61 int len = str.length();
62 for (int i = 0; i < len; i++) {
63 if (u_isdigit(str[i])) {
64 count++;
65 }
66 }
67 return count;
68 }
69
GetType()70 int RegexRule::GetType()
71 {
72 return type;
73 }
74
GetPattern()75 icu::RegexPattern* RegexRule::GetPattern()
76 {
77 // Sets whether regular expression matching is case sensitive
78 if (insensitive == "True") {
79 return icu::RegexPattern::compile(this->regex, URegexpFlag::UREGEX_CASE_INSENSITIVE, this->status);
80 } else {
81 return icu::RegexPattern::compile(this->regex, 0, this->status);
82 }
83 }
84
IsValid(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)85 PhoneNumberMatch* RegexRule::IsValid(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
86 {
87 if (isValidType == "PreSuf") {
88 return IsValidPreSuf(possibleNumber, message);
89 } else if (isValidType == "Code") {
90 return IsValidCode(possibleNumber, message);
91 } else if (isValidType == "Rawstr") {
92 return IsValidRawstr(possibleNumber, message);
93 }
94 return IsValidDefault(possibleNumber, message);
95 }
96
97 // Check the preifx or suffix of possibleNumber
IsValidPreSuf(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)98 PhoneNumberMatch* RegexRule::IsValidPreSuf(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
99 {
100 if (possibleNumber != nullptr) {
101 if (possibleNumber->start() - 1 >= 0) {
102 return IsValidStart(possibleNumber, message);
103 }
104 if (possibleNumber->end() <= message.length() - 1) {
105 return IsValidEnd(possibleNumber, message);
106 }
107 }
108 return possibleNumber;
109 }
110
111 // check the suffix of possibleNumber
IsValidEnd(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)112 PhoneNumberMatch* RegexRule::IsValidEnd(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
113 {
114 icu::UnicodeString after = message.tempSubString(possibleNumber->end());
115 bool isTwo = true;
116 int len = after.length();
117 // check the 1st and 2nd char of the suffix.
118 for (int i = 0; i < len; i++) {
119 UChar32 afterChar = after[i];
120 if (i == 0 && !u_isUUppercase(afterChar)) {
121 isTwo = false;
122 break;
123 }
124 // 2 is the third position in the string.
125 if (i < 2 && u_isUAlphabetic(afterChar)) {
126 if (u_isUUppercase(afterChar)) {
127 continue;
128 } else {
129 isTwo = false;
130 break;
131 }
132 }
133 // 1 and 2 are the second and third position in the string, respectively.
134 if (i == 1 || i == 2) {
135 if (afterChar == '-' || afterChar == '\'') {
136 isTwo = false;
137 break;
138 } else if (u_isdigit(afterChar) || u_isspace(afterChar)) {
139 break;
140 } else if (!u_isUAlphabetic(afterChar)) {
141 break;
142 } else {
143 isTwo = false;
144 break;
145 }
146 }
147 }
148 if (!isTwo) {
149 return possibleNumber;
150 } else {
151 return nullptr;
152 }
153 }
154
155 // check the prefix of possibleNumber
IsValidStart(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)156 PhoneNumberMatch* RegexRule::IsValidStart(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
157 {
158 icu::UnicodeString before = message.tempSubString(0, possibleNumber->start());
159 bool isTwo = true;
160 int len = before.length();
161 for (int i = 0; i < len; i++) {
162 char beforeChar = before[len - 1 - i];
163 if (i == 0 && !u_isUUppercase(beforeChar)) {
164 isTwo = false;
165 break;
166 }
167 // 2 is the third position in the string.
168 if (i < 2 && u_isUAlphabetic(beforeChar)) {
169 if (u_isUUppercase(beforeChar)) {
170 continue;
171 } else {
172 isTwo = false;
173 break;
174 }
175 }
176 if (beforeChar == '-' || beforeChar == '\'') {
177 isTwo = false;
178 break;
179 } else if (u_isdigit(beforeChar) || u_isspace(beforeChar)) {
180 break;
181 } else if (!u_isUAlphabetic(beforeChar)) {
182 break;
183 } else {
184 isTwo = false;
185 break;
186 }
187 }
188 if (!isTwo) {
189 return possibleNumber;
190 } else {
191 return nullptr;
192 }
193 }
194
IsValidDefault(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)195 PhoneNumberMatch* RegexRule::IsValidDefault(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
196 {
197 return possibleNumber;
198 }
199
PrefixValid(icu::UnicodeString & number,int length)200 bool RegexRule::PrefixValid(icu::UnicodeString& number, int length)
201 {
202 icu::UnicodeString preNumber = number.tempSubString(0, length);
203 if (length == 1) {
204 if (number[0] == '0' || number[0] == '1' || number[0] == '+') {
205 return true;
206 }
207 // 3 indicates the first three digits of a phone number.
208 } else if (length == 3) {
209 if (preNumber == "400" || preNumber == "800") {
210 return true;
211 }
212 // 5 indicates the first five digits of a phone number.
213 } else if (length == 5) {
214 if (preNumber == "11808" || preNumber == "17909" || preNumber == "12593" ||
215 preNumber == "17951" || preNumber == "17911") {
216 return true;
217 }
218 }
219 return false;
220 }
221
NumberValid(icu::UnicodeString & number)222 bool RegexRule::NumberValid(icu::UnicodeString& number)
223 {
224 int lengthOne = 1;
225 // 3 indicates the first three digits of a phone number.
226 int lengthThree = 3;
227 // 11 is the number of digits in the phone number.
228 if (number[0] == '1' && CountDigits(number) > 11) {
229 // 5 indicates the first five digits of a phone number.
230 int lengthFive = 5;
231 if (!PrefixValid(number, lengthFive)) {
232 return false;
233 }
234 // 12 is the number of digits, 0 and 1 indicate the first and second position, respectively.
235 } else if (number[0] == '0' && CountDigits(number) > 12 && number[1] != '0') {
236 return false;
237 // 10 is the number of digits in the phone number.
238 } else if (PrefixValid(number, lengthThree) && CountDigits(number) != 10) {
239 return false;
240 // 9 is the number of digits in the phone number.
241 } else if (!PrefixValid(number, lengthOne) && !PrefixValid(number, lengthThree) && CountDigits(number) >= 9) {
242 if (number.trim()[0] != '9' && number.trim()[0] != '1') {
243 return false;
244 }
245 // 4 is the number of digits in the phone number.
246 } else if (CountDigits(number) <= 4) {
247 return false;
248 }
249 return true;
250 }
251
IsValidCode(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)252 PhoneNumberMatch* RegexRule::IsValidCode(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
253 {
254 bool isValid = true;
255 icu::UnicodeString number = possibleNumber->raw_string().c_str();
256 // Processes the ;ext= extention number format
257 int32_t ind = number.trim().indexOf(";ext=");
258 if (ind != -1) {
259 number = number.trim().tempSubString(0, ind);
260 }
261 if (number[0] == '(' || number[0] == '[') {
262 StartWithBrackets(number);
263 }
264 isValid = NumberValid(number);
265 if (isValid) {
266 return possibleNumber;
267 } else {
268 return nullptr;
269 }
270 }
271
IsValidRawstr(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)272 PhoneNumberMatch* RegexRule::IsValidRawstr(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
273 {
274 bool isValid = true;
275 icu::UnicodeString number = possibleNumber->raw_string().c_str();
276 // Processes the ;ext= extention number format
277 int32_t ind = number.trim().indexOf(";ext=");
278 if (ind != -1) {
279 number = number.trim().tempSubString(0, ind);
280 }
281 if (number[0] == '(' || number[0] == '[') {
282 number = number.tempSubString(1);
283 }
284 // 8 is the number of digits in the phone number.
285 if (number[0] != '0' && CountDigits(number) == 8) {
286 isValid = false;
287 }
288 // 4 is the number of digits in the phone number.
289 if (CountDigits(number) <= 4) {
290 isValid = false;
291 }
292 if (isValid) {
293 return possibleNumber;
294 } else {
295 return nullptr;
296 }
297 }
298
Handle(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)299 std::vector<MatchedNumberInfo> RegexRule::Handle(PhoneNumberMatch *possibleNumber, icu::UnicodeString& message)
300 {
301 if (handleType == "Operator") {
302 return HandleOperator(possibleNumber, message);
303 } else if (handleType == "Blank") {
304 return HandleBlank(possibleNumber, message);
305 } else if (handleType == "Slant") {
306 return HandleSlant(possibleNumber, message);
307 } else if (handleType == "StartWithMobile") {
308 return HandleStartWithMobile(possibleNumber, message);
309 } else if (handleType == "EndWithMobile") {
310 return HandleEndWithMobile(possibleNumber, message);
311 }
312 return HandleDefault(possibleNumber, message);
313 }
314
HandleDefault(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)315 std::vector<MatchedNumberInfo> RegexRule::HandleDefault(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
316 {
317 MatchedNumberInfo matcher;
318 matcher.SetBegin(0);
319 matcher.SetEnd(1);
320 icu::UnicodeString content = "";
321 matcher.SetContent(content);
322 std::vector<MatchedNumberInfo> matchedNumberInfoList;
323 matchedNumberInfoList.push_back(matcher);
324 return matchedNumberInfoList;
325 }
326
HandleOperator(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)327 std::vector<MatchedNumberInfo> RegexRule::HandleOperator(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
328 {
329 MatchedNumberInfo matcher;
330 if (possibleNumber->raw_string()[0] == '(' || possibleNumber->raw_string()[0] == '[') {
331 matcher.SetBegin(possibleNumber->start() + 1);
332 } else {
333 matcher.SetBegin(possibleNumber->start());
334 }
335 matcher.SetEnd(possibleNumber->end());
336 matcher.SetContent(message);
337 std::vector<MatchedNumberInfo> matchedNumberInfoList;
338 matchedNumberInfoList.push_back(matcher);
339 return matchedNumberInfoList;
340 }
341
HandleBlank(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)342 std::vector<MatchedNumberInfo> RegexRule::HandleBlank(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
343 {
344 // exclude phone number 5201314
345 icu::UnicodeString speString = "5201314";
346 MatchedNumberInfo matchedNumberInfo;
347 icu::UnicodeString number = possibleNumber->raw_string().c_str();
348 icu::RegexPattern* pattern = GetPattern();
349 UErrorCode status;
350 icu::RegexMatcher* matcher = pattern->matcher(number, status);
351 UErrorCode negativeStatus = U_ZERO_ERROR;
352 // exclude phone number 2333333
353 icu::UnicodeString negativeRegex = "(?<![-\\d])(23{6,7})(?![-\\d])";
354 icu::RegexMatcher negativePattern(negativeRegex, 0, negativeStatus);
355 negativePattern.reset(number);
356 std::vector<MatchedNumberInfo> matchedNumberInfoList;
357 if (matcher != nullptr && matcher->find()) {
358 if (negativePattern.find() || number == speString) {
359 return matchedNumberInfoList;
360 }
361 if (possibleNumber->raw_string()[0] != '(' && possibleNumber->raw_string()[0] != '[') {
362 matchedNumberInfo.SetBegin(matcher->start(status) + possibleNumber->start());
363 } else {
364 matchedNumberInfo.SetBegin(possibleNumber->start());
365 }
366 matchedNumberInfo.SetEnd(matcher->end(status) + possibleNumber->start());
367 matchedNumberInfo.SetContent(number);
368 matchedNumberInfoList.push_back(matchedNumberInfo);
369 }
370 delete matcher;
371 delete pattern;
372 return matchedNumberInfoList;
373 }
374
HandleSlant(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)375 std::vector<MatchedNumberInfo> RegexRule::HandleSlant(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
376 {
377 MatchedNumberInfo matchedNumberInfo;
378 MatchedNumberInfo numberInfo;
379 icu::UnicodeString number = possibleNumber->raw_string().c_str();
380 icu::RegexPattern* pattern = GetPattern();
381 UErrorCode status;
382 icu::RegexMatcher* matcher = pattern->matcher(number, status);
383 std::vector<MatchedNumberInfo> matchedNumberInfoList;
384 if (matcher != nullptr && matcher->find()) {
385 int start = matcher->start(status);
386 std::vector<MatchedNumberInfo> tempList = GetNumbersWithSlant(number);
387 // 2 is the size of tempList.
388 if (tempList.size() == 2 && start == 1) {
389 start = 0;
390 }
391 if (tempList.size() > 0) {
392 matchedNumberInfo.SetBegin(tempList[0].GetBegin() + start + possibleNumber->start());
393 matchedNumberInfo.SetEnd(tempList[0].GetEnd() + possibleNumber->start());
394 icu::UnicodeString contentFirst = tempList[0].GetContent();
395 matchedNumberInfo.SetContent(contentFirst);
396 matchedNumberInfoList.push_back(matchedNumberInfo);
397 // 2 is the size of tempList.
398 if (tempList.size() == 2) {
399 numberInfo.SetBegin(tempList[1].GetBegin() + start + possibleNumber->start());
400 numberInfo.SetEnd(tempList[1].GetEnd() + possibleNumber->start());
401 icu::UnicodeString contentSecond = tempList[1].GetContent();
402 numberInfo.SetContent(contentSecond);
403 matchedNumberInfoList.push_back(numberInfo);
404 }
405 }
406 }
407 delete matcher;
408 delete pattern;
409 return matchedNumberInfoList;
410 }
411
HandleStartWithMobile(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)412 std::vector<MatchedNumberInfo> RegexRule::HandleStartWithMobile(PhoneNumberMatch* possibleNumber,
413 icu::UnicodeString& message)
414 {
415 return HandlePossibleNumberWithPattern(possibleNumber, message, false);
416 }
417
HandleEndWithMobile(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)418 std::vector<MatchedNumberInfo> RegexRule::HandleEndWithMobile(PhoneNumberMatch* possibleNumber,
419 icu::UnicodeString& message)
420 {
421 return HandlePossibleNumberWithPattern(possibleNumber, message, true);
422 }
423
424 // Handle phone number starting with '(' or '['
StartWithBrackets(icu::UnicodeString & number)425 void RegexRule::StartWithBrackets(icu::UnicodeString& number)
426 {
427 icu::UnicodeString right = "";
428 if (number[0] == '(') {
429 right = ')';
430 }
431 if (number[0] == '[') {
432 right = ']';
433 }
434 int neind = number.indexOf(right);
435 if (neind != -1) {
436 icu::UnicodeString phoneStr = number.tempSubString(0, neind);
437 int phoneLength = CountDigits(phoneStr);
438 icu::UnicodeString extraStr = number.tempSubString(neind);
439 int extra = CountDigits(extraStr);
440 // 4 is the number of numbers in parentheses, 1 and 2 are the number of numbers outside parentheses.
441 if ((phoneLength > 4) && (extra == 1 || extra == 2)) {
442 number = number.tempSubString(1, neind - 1);
443 } else {
444 number = number.tempSubString(1);
445 }
446 } else {
447 number = number.tempSubString(1);
448 }
449 }
450
451 // identify short number separated by '/'
GetNumbersWithSlant(icu::UnicodeString & testStr)452 std::vector<MatchedNumberInfo> RegexRule::GetNumbersWithSlant(icu::UnicodeString& testStr)
453 {
454 std::vector<MatchedNumberInfo> shortList;
455 PhoneNumberUtil* pnu = PhoneNumberUtil::GetInstance();
456 ShortNumberInfo* shortInfo = new (std::nothrow) ShortNumberInfo();
457 if (shortInfo == nullptr) {
458 HILOG_ERROR_I18N("ShortNumberInfo construct failed.");
459 return shortList;
460 }
461 std::string numberFisrt = "";
462 std::string numberEnd = "";
463 int slantIndex = 0;
464 for (int i = 0; i < testStr.length(); i++) {
465 if (testStr[i] == '/' || testStr[i] == '|') {
466 slantIndex = i;
467 testStr.tempSubString(0, i).toUTF8String(numberFisrt);
468 testStr.tempSubString(i + 1).toUTF8String(numberEnd);
469 }
470 }
471 PhoneNumber phoneNumberFirst;
472 PhoneNumber phoneNumberEnd;
473 pnu->Parse(numberFisrt, "CN", &phoneNumberFirst);
474 pnu->Parse(numberEnd, "CN", &phoneNumberEnd);
475 if (shortInfo->IsValidShortNumber(phoneNumberFirst)) {
476 MatchedNumberInfo matchedNumberInfoFirst;
477 matchedNumberInfoFirst.SetBegin(0);
478 matchedNumberInfoFirst.SetEnd(slantIndex);
479 icu::UnicodeString contentFirst = numberFisrt.c_str();
480 matchedNumberInfoFirst.SetContent(contentFirst);
481 shortList.push_back(matchedNumberInfoFirst);
482 }
483 if (shortInfo->IsValidShortNumber(phoneNumberEnd)) {
484 MatchedNumberInfo matchedNumberInfoEnd;
485 matchedNumberInfoEnd.SetBegin(slantIndex + 1);
486 matchedNumberInfoEnd.SetEnd(testStr.length());
487 icu::UnicodeString contentEnd = numberEnd.c_str();
488 matchedNumberInfoEnd.SetContent(contentEnd);
489 shortList.push_back(matchedNumberInfoEnd);
490 }
491 delete shortInfo;
492 return shortList;
493 }
494
HandlePossibleNumberWithPattern(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message,bool isStartsWithNumber)495 std::vector<MatchedNumberInfo> RegexRule::HandlePossibleNumberWithPattern(PhoneNumberMatch* possibleNumber,
496 icu::UnicodeString& message, bool isStartsWithNumber)
497 {
498 UErrorCode status = U_ZERO_ERROR;
499 std::vector<MatchedNumberInfo> matchedList;
500 icu::UnicodeString possible = possibleNumber->raw_string().c_str();
501 icu::RegexPattern* pattern = GetPattern();
502 icu::RegexMatcher* mat = pattern->matcher(message, status);
503 while (mat != nullptr && mat->find(status)) {
504 int start = mat->start(status);
505 int end = mat->end(status);
506 icu::UnicodeString matched = message.tempSubString(start, end - start);
507 bool isMatch = isStartsWithNumber ? matched.startsWith(possible) : matched.endsWith(possible);
508 if (isMatch) {
509 MatchedNumberInfo info;
510 info.SetBegin(isStartsWithNumber ? start : end - possible.length());
511 info.SetEnd(isStartsWithNumber ? (start + possible.length()) : end);
512 info.SetContent(possible);
513 matchedList.push_back(info);
514 }
515 }
516 delete mat;
517 delete pattern;
518 return matchedList;
519 }
520 } // namespace I18n
521 } // namespace Global
522 } // namespace OHOS