1 /*
2  * Copyright (c) 2023 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 #include "regex_rule.h"
16 #include "i18n_hilog.h"
17 #include "phonenumbers/phonenumberutil.h"
18 #include "phonenumbers/phonenumber.h"
19 #include "phonenumbers/shortnumberinfo.h"
20 
21 namespace OHOS {
22 namespace Global {
23 namespace I18n {
24 using i18n::phonenumbers::PhoneNumberMatch;
25 using i18n::phonenumbers::PhoneNumber;
26 using i18n::phonenumbers::PhoneNumberUtil;
27 using i18n::phonenumbers::ShortNumberInfo;
28 
RegexRule(icu::UnicodeString & regex,std::string & isValidType,std::string & handleType,std::string & insensitive,std::string & type)29 RegexRule::RegexRule(icu::UnicodeString& regex, std::string& isValidType, std::string& handleType,
30     std::string& insensitive, std::string& type)
31 {
32     this->regex = regex;
33     if (type == "CONTAIN") {
34         // 9 indicates a certain execution logic of the border rule.
35         this->type = 9;
36     } else if (type == "CONTAIN_OR_INTERSECT") {
37         // 8 indicates a certain execution logic of the border rule.
38         this->type = 8;
39     } else {
40         this->type = 0;
41     }
42     this->status = U_ZERO_ERROR;
43     this->isValidType = isValidType;
44     this->handleType = handleType;
45     this->insensitive = insensitive;
46     if (regex.length() == 0) {
47         return;
48     }
49     if (U_FAILURE(this->status)) {
50         HILOG_ERROR_I18N("member pattern construct failed.");
51     }
52 }
53 
~RegexRule()54 RegexRule::~RegexRule()
55 {
56 }
57 
CountDigits(icu::UnicodeString & str)58 int RegexRule::CountDigits(icu::UnicodeString& str)
59 {
60     int count = 0;
61     int len = str.length();
62     for (int i = 0; i < len; i++) {
63         if (u_isdigit(str[i])) {
64             count++;
65         }
66     }
67     return count;
68 }
69 
GetType()70 int RegexRule::GetType()
71 {
72     return type;
73 }
74 
GetPattern()75 icu::RegexPattern* RegexRule::GetPattern()
76 {
77     // Sets whether regular expression matching is case sensitive
78     if (insensitive == "True") {
79         return icu::RegexPattern::compile(this->regex, URegexpFlag::UREGEX_CASE_INSENSITIVE, this->status);
80     } else {
81         return icu::RegexPattern::compile(this->regex, 0, this->status);
82     }
83 }
84 
IsValid(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)85 PhoneNumberMatch* RegexRule::IsValid(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
86 {
87     if (isValidType == "PreSuf") {
88         return IsValidPreSuf(possibleNumber, message);
89     } else if (isValidType == "Code") {
90         return IsValidCode(possibleNumber, message);
91     } else if (isValidType == "Rawstr") {
92         return IsValidRawstr(possibleNumber, message);
93     }
94     return IsValidDefault(possibleNumber, message);
95 }
96 
97 // Check the preifx or suffix of possibleNumber
IsValidPreSuf(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)98 PhoneNumberMatch* RegexRule::IsValidPreSuf(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
99 {
100     if (possibleNumber != nullptr) {
101         if (possibleNumber->start() - 1 >= 0) {
102             return IsValidStart(possibleNumber, message);
103         }
104         if (possibleNumber->end() <= message.length() - 1) {
105             return IsValidEnd(possibleNumber, message);
106         }
107     }
108     return possibleNumber;
109 }
110 
111 // check the suffix of possibleNumber
IsValidEnd(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)112 PhoneNumberMatch* RegexRule::IsValidEnd(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
113 {
114     icu::UnicodeString after = message.tempSubString(possibleNumber->end());
115     bool isTwo = true;
116     int len = after.length();
117     // check the 1st and 2nd char of the suffix.
118     for (int i = 0; i < len; i++) {
119         UChar32 afterChar = after[i];
120         if (i == 0 && !u_isUUppercase(afterChar)) {
121             isTwo = false;
122             break;
123         }
124         // 2 is the third position in the string.
125         if (i < 2 && u_isUAlphabetic(afterChar)) {
126             if (u_isUUppercase(afterChar)) {
127                 continue;
128             } else {
129                 isTwo = false;
130                 break;
131             }
132         }
133         // 1 and 2 are the second and third position in the string, respectively.
134         if (i == 1 || i == 2) {
135             if (afterChar == '-' || afterChar == '\'') {
136                 isTwo = false;
137                 break;
138             } else if (u_isdigit(afterChar) || u_isspace(afterChar)) {
139                 break;
140             } else if (!u_isUAlphabetic(afterChar)) {
141                 break;
142             } else {
143                 isTwo = false;
144                 break;
145             }
146         }
147     }
148     if (!isTwo) {
149         return possibleNumber;
150     } else {
151         return nullptr;
152     }
153 }
154 
155 // check the prefix of possibleNumber
IsValidStart(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)156 PhoneNumberMatch* RegexRule::IsValidStart(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
157 {
158     icu::UnicodeString before = message.tempSubString(0, possibleNumber->start());
159     bool isTwo = true;
160     int len = before.length();
161     for (int i = 0; i < len; i++) {
162         char beforeChar = before[len - 1 - i];
163         if (i == 0 && !u_isUUppercase(beforeChar)) {
164             isTwo = false;
165             break;
166         }
167         // 2 is the third position in the string.
168         if (i < 2 && u_isUAlphabetic(beforeChar)) {
169             if (u_isUUppercase(beforeChar)) {
170                 continue;
171             } else {
172                 isTwo = false;
173                 break;
174             }
175         }
176         if (beforeChar == '-' || beforeChar == '\'') {
177             isTwo = false;
178             break;
179         } else if (u_isdigit(beforeChar) || u_isspace(beforeChar)) {
180             break;
181         } else if (!u_isUAlphabetic(beforeChar)) {
182             break;
183         } else {
184             isTwo = false;
185             break;
186         }
187     }
188     if (!isTwo) {
189         return possibleNumber;
190     } else {
191         return nullptr;
192     }
193 }
194 
IsValidDefault(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)195 PhoneNumberMatch* RegexRule::IsValidDefault(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
196 {
197     return possibleNumber;
198 }
199 
PrefixValid(icu::UnicodeString & number,int length)200 bool RegexRule::PrefixValid(icu::UnicodeString& number, int length)
201 {
202     icu::UnicodeString preNumber = number.tempSubString(0, length);
203     if (length == 1) {
204         if (number[0] == '0' || number[0] == '1' || number[0] == '+') {
205             return true;
206         }
207     // 3 indicates the first three digits of a phone number.
208     } else if (length == 3) {
209         if (preNumber == "400" || preNumber == "800") {
210             return true;
211         }
212     // 5 indicates the first five digits of a phone number.
213     } else if (length == 5) {
214         if (preNumber == "11808" || preNumber == "17909" || preNumber == "12593" ||
215             preNumber == "17951" || preNumber == "17911") {
216             return true;
217         }
218     }
219     return false;
220 }
221 
NumberValid(icu::UnicodeString & number)222 bool RegexRule::NumberValid(icu::UnicodeString& number)
223 {
224     int lengthOne = 1;
225     // 3 indicates the first three digits of a phone number.
226     int lengthThree = 3;
227     // 11 is the number of digits in the phone number.
228     if (number[0] == '1' && CountDigits(number) > 11) {
229         // 5 indicates the first five digits of a phone number.
230         int lengthFive = 5;
231         if (!PrefixValid(number, lengthFive)) {
232             return false;
233         }
234     // 12 is the number of digits, 0 and 1 indicate the first and second position, respectively.
235     } else if (number[0] == '0' && CountDigits(number) > 12 && number[1] != '0') {
236         return false;
237     // 10 is the number of digits in the phone number.
238     } else if (PrefixValid(number, lengthThree) && CountDigits(number) != 10) {
239         return false;
240     // 9 is the number of digits in the phone number.
241     } else if (!PrefixValid(number, lengthOne) && !PrefixValid(number, lengthThree) && CountDigits(number) >= 9) {
242         if (number.trim()[0] != '9' && number.trim()[0] != '1') {
243             return false;
244         }
245     // 4 is the number of digits in the phone number.
246     } else if (CountDigits(number) <= 4) {
247         return false;
248     }
249     return true;
250 }
251 
IsValidCode(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)252 PhoneNumberMatch* RegexRule::IsValidCode(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
253 {
254     bool isValid = true;
255     icu::UnicodeString number = possibleNumber->raw_string().c_str();
256     // Processes the ;ext= extention number format
257     int32_t ind = number.trim().indexOf(";ext=");
258     if (ind != -1) {
259         number = number.trim().tempSubString(0, ind);
260     }
261     if (number[0] == '(' || number[0] == '[') {
262         StartWithBrackets(number);
263     }
264     isValid = NumberValid(number);
265     if (isValid) {
266         return possibleNumber;
267     } else {
268         return nullptr;
269     }
270 }
271 
IsValidRawstr(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)272 PhoneNumberMatch* RegexRule::IsValidRawstr(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
273 {
274     bool isValid = true;
275     icu::UnicodeString number = possibleNumber->raw_string().c_str();
276     // Processes the ;ext= extention number format
277     int32_t ind = number.trim().indexOf(";ext=");
278     if (ind != -1) {
279         number = number.trim().tempSubString(0, ind);
280     }
281     if (number[0] == '(' || number[0] == '[') {
282         number = number.tempSubString(1);
283     }
284     // 8 is the number of digits in the phone number.
285     if (number[0] != '0' && CountDigits(number) == 8) {
286         isValid = false;
287     }
288     // 4 is the number of digits in the phone number.
289     if (CountDigits(number) <= 4) {
290         isValid = false;
291     }
292     if (isValid) {
293         return possibleNumber;
294     } else {
295         return nullptr;
296     }
297 }
298 
Handle(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)299 std::vector<MatchedNumberInfo> RegexRule::Handle(PhoneNumberMatch *possibleNumber, icu::UnicodeString& message)
300 {
301     if (handleType == "Operator") {
302         return HandleOperator(possibleNumber, message);
303     } else if (handleType == "Blank") {
304         return HandleBlank(possibleNumber, message);
305     } else if (handleType == "Slant") {
306         return HandleSlant(possibleNumber, message);
307     } else if (handleType == "StartWithMobile") {
308         return HandleStartWithMobile(possibleNumber, message);
309     } else if (handleType == "EndWithMobile") {
310         return HandleEndWithMobile(possibleNumber, message);
311     }
312     return HandleDefault(possibleNumber, message);
313 }
314 
HandleDefault(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)315 std::vector<MatchedNumberInfo> RegexRule::HandleDefault(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
316 {
317     MatchedNumberInfo matcher;
318     matcher.SetBegin(0);
319     matcher.SetEnd(1);
320     icu::UnicodeString content = "";
321     matcher.SetContent(content);
322     std::vector<MatchedNumberInfo> matchedNumberInfoList;
323     matchedNumberInfoList.push_back(matcher);
324     return matchedNumberInfoList;
325 }
326 
HandleOperator(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)327 std::vector<MatchedNumberInfo> RegexRule::HandleOperator(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
328 {
329     MatchedNumberInfo matcher;
330     if (possibleNumber->raw_string()[0] == '(' || possibleNumber->raw_string()[0] == '[') {
331         matcher.SetBegin(possibleNumber->start() + 1);
332     } else {
333         matcher.SetBegin(possibleNumber->start());
334     }
335     matcher.SetEnd(possibleNumber->end());
336     matcher.SetContent(message);
337     std::vector<MatchedNumberInfo> matchedNumberInfoList;
338     matchedNumberInfoList.push_back(matcher);
339     return matchedNumberInfoList;
340 }
341 
HandleBlank(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)342 std::vector<MatchedNumberInfo> RegexRule::HandleBlank(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
343 {
344     // exclude phone number 5201314
345     icu::UnicodeString speString = "5201314";
346     MatchedNumberInfo matchedNumberInfo;
347     icu::UnicodeString number = possibleNumber->raw_string().c_str();
348     icu::RegexPattern* pattern = GetPattern();
349     UErrorCode status;
350     icu::RegexMatcher* matcher = pattern->matcher(number, status);
351     UErrorCode negativeStatus = U_ZERO_ERROR;
352     // exclude phone number 2333333
353     icu::UnicodeString negativeRegex = "(?<![-\\d])(23{6,7})(?![-\\d])";
354     icu::RegexMatcher negativePattern(negativeRegex, 0, negativeStatus);
355     negativePattern.reset(number);
356     std::vector<MatchedNumberInfo> matchedNumberInfoList;
357     if (matcher != nullptr && matcher->find()) {
358         if (negativePattern.find() || number == speString) {
359             return matchedNumberInfoList;
360         }
361         if (possibleNumber->raw_string()[0] != '(' && possibleNumber->raw_string()[0] != '[') {
362             matchedNumberInfo.SetBegin(matcher->start(status) + possibleNumber->start());
363         } else {
364             matchedNumberInfo.SetBegin(possibleNumber->start());
365         }
366         matchedNumberInfo.SetEnd(matcher->end(status) + possibleNumber->start());
367         matchedNumberInfo.SetContent(number);
368         matchedNumberInfoList.push_back(matchedNumberInfo);
369     }
370     delete matcher;
371     delete pattern;
372     return matchedNumberInfoList;
373 }
374 
HandleSlant(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)375 std::vector<MatchedNumberInfo> RegexRule::HandleSlant(PhoneNumberMatch* possibleNumber, icu::UnicodeString& message)
376 {
377     MatchedNumberInfo matchedNumberInfo;
378     MatchedNumberInfo numberInfo;
379     icu::UnicodeString number = possibleNumber->raw_string().c_str();
380     icu::RegexPattern* pattern = GetPattern();
381     UErrorCode status;
382     icu::RegexMatcher* matcher = pattern->matcher(number, status);
383     std::vector<MatchedNumberInfo> matchedNumberInfoList;
384     if (matcher != nullptr && matcher->find()) {
385         int start = matcher->start(status);
386         std::vector<MatchedNumberInfo> tempList = GetNumbersWithSlant(number);
387         // 2 is the size of tempList.
388         if (tempList.size() == 2 && start == 1) {
389             start = 0;
390         }
391         if (tempList.size() > 0) {
392             matchedNumberInfo.SetBegin(tempList[0].GetBegin() + start + possibleNumber->start());
393             matchedNumberInfo.SetEnd(tempList[0].GetEnd() + possibleNumber->start());
394             icu::UnicodeString contentFirst = tempList[0].GetContent();
395             matchedNumberInfo.SetContent(contentFirst);
396             matchedNumberInfoList.push_back(matchedNumberInfo);
397             // 2 is the size of tempList.
398             if (tempList.size() == 2) {
399                 numberInfo.SetBegin(tempList[1].GetBegin() + start + possibleNumber->start());
400                 numberInfo.SetEnd(tempList[1].GetEnd() + possibleNumber->start());
401                 icu::UnicodeString contentSecond = tempList[1].GetContent();
402                 numberInfo.SetContent(contentSecond);
403                 matchedNumberInfoList.push_back(numberInfo);
404             }
405         }
406     }
407     delete matcher;
408     delete pattern;
409     return matchedNumberInfoList;
410 }
411 
HandleStartWithMobile(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)412 std::vector<MatchedNumberInfo> RegexRule::HandleStartWithMobile(PhoneNumberMatch* possibleNumber,
413     icu::UnicodeString& message)
414 {
415     return HandlePossibleNumberWithPattern(possibleNumber, message, false);
416 }
417 
HandleEndWithMobile(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message)418 std::vector<MatchedNumberInfo> RegexRule::HandleEndWithMobile(PhoneNumberMatch* possibleNumber,
419     icu::UnicodeString& message)
420 {
421     return HandlePossibleNumberWithPattern(possibleNumber, message, true);
422 }
423 
424 // Handle phone number starting with '(' or '['
StartWithBrackets(icu::UnicodeString & number)425 void RegexRule::StartWithBrackets(icu::UnicodeString& number)
426 {
427     icu::UnicodeString right = "";
428     if (number[0] == '(') {
429         right = ')';
430     }
431     if (number[0] == '[') {
432         right = ']';
433     }
434     int neind = number.indexOf(right);
435     if (neind != -1) {
436         icu::UnicodeString phoneStr = number.tempSubString(0, neind);
437         int phoneLength = CountDigits(phoneStr);
438         icu::UnicodeString extraStr = number.tempSubString(neind);
439         int extra = CountDigits(extraStr);
440         // 4 is the number of numbers in parentheses, 1 and 2 are the number of numbers outside parentheses.
441         if ((phoneLength > 4) && (extra == 1 || extra == 2)) {
442             number = number.tempSubString(1, neind - 1);
443         } else {
444             number = number.tempSubString(1);
445         }
446     } else {
447         number = number.tempSubString(1);
448     }
449 }
450 
451 // identify short number separated by '/'
GetNumbersWithSlant(icu::UnicodeString & testStr)452 std::vector<MatchedNumberInfo> RegexRule::GetNumbersWithSlant(icu::UnicodeString& testStr)
453 {
454     std::vector<MatchedNumberInfo> shortList;
455     PhoneNumberUtil* pnu = PhoneNumberUtil::GetInstance();
456     ShortNumberInfo* shortInfo = new (std::nothrow) ShortNumberInfo();
457     if (shortInfo == nullptr) {
458         HILOG_ERROR_I18N("ShortNumberInfo construct failed.");
459         return shortList;
460     }
461     std::string numberFisrt = "";
462     std::string numberEnd = "";
463     int slantIndex = 0;
464     for (int i = 0; i < testStr.length(); i++) {
465         if (testStr[i] == '/' || testStr[i] == '|') {
466             slantIndex = i;
467             testStr.tempSubString(0, i).toUTF8String(numberFisrt);
468             testStr.tempSubString(i + 1).toUTF8String(numberEnd);
469         }
470     }
471     PhoneNumber phoneNumberFirst;
472     PhoneNumber phoneNumberEnd;
473     pnu->Parse(numberFisrt, "CN", &phoneNumberFirst);
474     pnu->Parse(numberEnd, "CN", &phoneNumberEnd);
475     if (shortInfo->IsValidShortNumber(phoneNumberFirst)) {
476         MatchedNumberInfo matchedNumberInfoFirst;
477         matchedNumberInfoFirst.SetBegin(0);
478         matchedNumberInfoFirst.SetEnd(slantIndex);
479         icu::UnicodeString contentFirst = numberFisrt.c_str();
480         matchedNumberInfoFirst.SetContent(contentFirst);
481         shortList.push_back(matchedNumberInfoFirst);
482     }
483     if (shortInfo->IsValidShortNumber(phoneNumberEnd)) {
484         MatchedNumberInfo matchedNumberInfoEnd;
485         matchedNumberInfoEnd.SetBegin(slantIndex + 1);
486         matchedNumberInfoEnd.SetEnd(testStr.length());
487         icu::UnicodeString contentEnd = numberEnd.c_str();
488         matchedNumberInfoEnd.SetContent(contentEnd);
489         shortList.push_back(matchedNumberInfoEnd);
490     }
491     delete shortInfo;
492     return shortList;
493 }
494 
HandlePossibleNumberWithPattern(PhoneNumberMatch * possibleNumber,icu::UnicodeString & message,bool isStartsWithNumber)495 std::vector<MatchedNumberInfo> RegexRule::HandlePossibleNumberWithPattern(PhoneNumberMatch* possibleNumber,
496     icu::UnicodeString& message, bool isStartsWithNumber)
497 {
498     UErrorCode status = U_ZERO_ERROR;
499     std::vector<MatchedNumberInfo> matchedList;
500     icu::UnicodeString possible = possibleNumber->raw_string().c_str();
501     icu::RegexPattern* pattern = GetPattern();
502     icu::RegexMatcher* mat = pattern->matcher(message, status);
503     while (mat != nullptr && mat->find(status)) {
504         int start = mat->start(status);
505         int end = mat->end(status);
506         icu::UnicodeString matched = message.tempSubString(start, end - start);
507         bool isMatch = isStartsWithNumber ? matched.startsWith(possible) : matched.endsWith(possible);
508         if (isMatch) {
509             MatchedNumberInfo info;
510             info.SetBegin(isStartsWithNumber ? start : end - possible.length());
511             info.SetEnd(isStartsWithNumber ? (start + possible.length()) : end);
512             info.SetContent(possible);
513             matchedList.push_back(info);
514         }
515     }
516     delete mat;
517     delete pattern;
518     return matchedList;
519 }
520 } // namespace I18n
521 } // namespace Global
522 } // namespace OHOS