1 /*
2  * Copyright (c) 2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 #include "csv_parser.h"
16 
17 #include <algorithm>
18 #include <cctype>
19 
META_BEGIN_NAMESPACE()20 META_BEGIN_NAMESPACE()
21 
22 CsvParser::CsvParser(BASE_NS::string_view csv, const char delimiter) : delimiter_(delimiter), csv_(csv) {}
23 
GetRow(CsvRow & row)24 bool CsvParser::GetRow(CsvRow& row)
25 {
26     auto nextRow = ParseRow();
27     row.swap(nextRow);
28     return !row.empty();
29 }
30 
Reset()31 void CsvParser::Reset()
32 {
33     pos_ = 0;
34 }
35 
36 /**
37  * @brief Returns a trimmed string based on state.
38  * @param sv The string to trim.
39  * @param state State of the parser.
40  * @return If state is QUOTED, returns the string itself. Otherwise returns the string
41  *         trimmed from trailing and leading whitespace.
42  */
Trimmed(BASE_NS::string_view sv,State state)43 BASE_NS::string_view CsvParser::Trimmed(BASE_NS::string_view sv, State state)
44 {
45     if (state == QUOTED) {
46         return sv;
47     }
48     constexpr auto nspace = [](unsigned char ch) { return !std::isspace(static_cast<int>(ch)); };
49     sv.remove_suffix(std::distance(std::find_if(sv.rbegin(), sv.rend(), nspace).base(), sv.end()));
50     sv.remove_prefix(std::find_if(sv.begin(), sv.end(), nspace) - sv.begin());
51     return sv;
52 }
53 
/**
 * @brief Resolves the character that follows a backslash into the character it stands for.
 * @param next The character immediately after the backslash.
 * @return A pair whose first member is true when next forms a recognized escape sequence
 *         (n, t, backslash or double quote), with the resulting character in the second
 *         member. For any other character the pair is { false, next }.
 */
std::pair<bool, char> HandleEscaped(char next)
{
    switch (next) {
        case 'n':
            return { true, '\n' };
        case 't':
            return { true, '\t' };
        case '\\':
        case '"':
            // These escape to themselves
            return { true, next };
        default:
            return { false, next };
    }
}
76 
ParseRow()77 CsvParser::CsvRow CsvParser::ParseRow()
78 {
79     BASE_NS::vector<BASE_NS::string> items;
80     BASE_NS::string item;
81     State state { NO_QUOTE };
82 
83     while (pos_ < csv_.size()) {
84         auto c = csv_[pos_++];
85         if (c == '\r') { // Ignore carriage returns
86             continue;
87         }
88         if (c == '"') {
89             if (state == IN_QUOTE && pos_ < csv_.size() - 1 && csv_[pos_] == '"') {
90                 // Double quotes interpreted as a single quote
91                 item += c;
92                 pos_++;
93             } else { // Begin/end quote
94                 state = (state == NO_QUOTE) ? IN_QUOTE : QUOTED;
95                 if (state == IN_QUOTE) {
96                     // Quoted part starts, ignore anything before it
97                     item.clear();
98                 }
99             }
100         } else if (c == delimiter_ && state != IN_QUOTE) {
101             // Delimiter found while not within quotes, move to next item
102             items.emplace_back(Trimmed(item, state));
103             item.clear();
104             state = NO_QUOTE;
105         } else if (c == '\n' && state != IN_QUOTE) {
106             // End of line while not within quotes, the row is complete
107             break;
108         } else if (state != QUOTED) {
109             // By default include character in result, unless we already had
110             // quoted content, then anything outside of quotes is ignored until
111             // next delimiter
112             if (c == '\\' && pos_ < csv_.size() - 1) {
113                 if (auto esc = HandleEscaped(csv_[pos_]); esc.first) {
114                     item += esc.second;
115                     pos_++;
116                     continue;
117                 }
118             }
119             item += c;
120         }
121     }
122 
123     // Any leftover since the last delimiter is the last item on the row
124     if (auto trimmed = Trimmed(item, state); !trimmed.empty()) {
125         items.emplace_back(trimmed);
126     }
127     return items;
128 }
129 
130 META_END_NAMESPACE()
131