1 // Copyright (c) 2023 Huawei Device Co., Ltd.
2 // Licensed under the Apache License, Version 2.0 (the "License");
3 // you may not use this file except in compliance with the License.
4 // You may obtain a copy of the License at
5 //
6 //     http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 //! HTTP url [`PercentEncoder`].
15 //!
16 //! URI references are used to target requests, indicate redirects, and define
17 //! relationships.
18 //!
19 //! [`PercentEncoder`]: https://url.spec.whatwg.org/#fragment-percent-encode
20 
21 use std::str;
22 use std::str::{Bytes, Chars};
23 
24 use crate::error::HttpError;
25 use crate::request::uri::{InvalidUri, Uri};
26 
/// A decoded character paired with the slice of the input that holds its
/// UTF-8 bytes.
type Utf8Char<'a> = (char, &'a str);

// Each constant below lists the printable ASCII bytes that are percent-encoded
// in one URL component. Control bytes, 0x7f and non-ASCII bytes are encoded
// regardless of the set (see `should_percent_encoding`).
// NOTE(review): the sets look like the WHATWG URL percent-encode sets for the
// matching component — confirm against the spec before changing them.

/// Extra bytes encoded inside the userinfo component.
const USERINFO: &[u8; 19] = b" \"#<>?`{}/:;=@[]^|\\";
/// Extra bytes encoded inside the fragment component.
const FRAGMENT: &[u8; 5] = b" \"<>`";
/// Extra bytes encoded inside the path component.
const PATH: &[u8; 9] = b" \"#<>?`{}";
/// Extra bytes encoded inside the query component.
const QUERY: &[u8; 6] = b" \"#<>\'";
33 
/// HTTP url percent encoding implementation.
///
/// [`PercentEncoder::parse`] reads a URL component by component (scheme,
/// `//`, userinfo, authority, path, query, fragment) and returns a new
/// `String` in which the bytes that are not allowed in each component are
/// replaced by `%XX` escapes.
///
/// # Examples
///
/// ```
/// use ylong_http::request::uri::PercentEncoder;
///
/// let url = "https://www.example.com/data/测试文件.txt";
/// let encoded = PercentEncoder::parse(url).unwrap();
/// assert_eq!(
///     encoded,
///     "https://www.example.com/data/%E6%B5%8B%E8%AF%95%E6%96%87%E4%BB%B6.txt"
/// );
/// ```
pub struct PercentEncoder {
    // Output buffer that accumulates the encoded URL while parsing.
    normalized: Normalized,
}
51 
52 impl PercentEncoder {
53     /// Percent-coding entry.
parse(origin: &str) -> Result<String, HttpError>54     pub fn parse(origin: &str) -> Result<String, HttpError> {
55         let mut encoder = Self {
56             normalized: Normalized::from_size(origin.len()),
57         };
58         let bytes = UrlChars {
59             remaining: origin.chars(),
60         };
61         let remaining = encoder.parse_scheme(bytes)?;
62         let remaining = encoder.parse_double_slash(remaining)?;
63         let remaining = encoder.parse_userinfo(remaining)?;
64         let remaining = encoder.parse_authority(remaining)?;
65         let remaining = encoder.parse_path(remaining)?;
66         encoder.parse_query_and_fragment(remaining)?;
67         Ok(encoder.normalized.url())
68     }
69 
parse_scheme<'a>(&mut self, mut origin: UrlChars<'a>) -> Result<UrlChars<'a>, HttpError>70     fn parse_scheme<'a>(&mut self, mut origin: UrlChars<'a>) -> Result<UrlChars<'a>, HttpError> {
71         while let Some(char) = origin.next() {
72             match char {
73                 'a'..='z' | 'A'..='Z' | '0'..='9' | '+' | '-' | '.' => {
74                     self.normalized.push(char.to_ascii_lowercase())
75                 }
76                 ':' => {
77                     self.normalized.push(char);
78                     return Ok(origin);
79                 }
80                 _ => return Err(InvalidUri::InvalidScheme.into()),
81             }
82         }
83         Err(InvalidUri::InvalidScheme.into())
84     }
85 
parse_double_slash<'a>(&mut self, origin: UrlChars<'a>) -> Result<UrlChars<'a>, HttpError>86     fn parse_double_slash<'a>(&mut self, origin: UrlChars<'a>) -> Result<UrlChars<'a>, HttpError> {
87         let mut chars = origin.clone();
88         let mut count = 0;
89         loop {
90             let mut tmp_chars = chars.clone();
91             if matches!(tmp_chars.next(), Some(c) if matches!(c, '/')) {
92                 count += 1;
93                 chars = tmp_chars;
94                 self.normalized.push('/');
95             } else {
96                 break;
97             }
98         }
99         if count == 2 {
100             Ok(chars)
101         } else {
102             Err(InvalidUri::InvalidScheme.into())
103         }
104     }
105 
parse_userinfo<'a>(&mut self, mut origin: UrlChars<'a>) -> Result<UrlChars<'a>, HttpError>106     fn parse_userinfo<'a>(&mut self, mut origin: UrlChars<'a>) -> Result<UrlChars<'a>, HttpError> {
107         let mut chars = origin.clone();
108 
109         let mut size = 0;
110         let mut after_at = None;
111         while let Some(ch) = chars.next() {
112             match ch {
113                 '@' => {
114                     after_at = Some((size, chars.clone()));
115                 }
116                 '/' | '?' | '#' => break,
117                 _ => {}
118             }
119             size += 1;
120         }
121 
122         let (mut info_len, remaining) = match after_at {
123             None => {
124                 return Ok(origin);
125             }
126             Some((0, remaining)) => {
127                 if matches!(remaining.clone().next(), Some(c) if matches!(c, '/' | '?' | '#')) {
128                     return Err(InvalidUri::UriMissHost.into());
129                 }
130                 return Ok(remaining);
131             }
132             Some(at) => at,
133         };
134 
135         let mut has_username = false;
136         while info_len > 0 {
137             info_len -= 1;
138             if let Some(ch) = origin.next_u8() {
139                 if ch.0 == ':' && !has_username {
140                     has_username = true;
141                     self.normalized.push(':')
142                 } else {
143                     self.normalized.percent_encoding_push(ch, USERINFO);
144                 }
145             }
146         }
147         self.normalized.push('@');
148 
149         Ok(remaining)
150     }
parse_authority<'a>(&mut self, mut origin: UrlChars<'a>) -> Result<UrlChars<'a>, HttpError>151     fn parse_authority<'a>(&mut self, mut origin: UrlChars<'a>) -> Result<UrlChars<'a>, HttpError> {
152         loop {
153             let chars = origin.clone();
154             let c = if let Some(ch) = origin.next() {
155                 ch
156             } else {
157                 break;
158             };
159             match c {
160                 '/' | '?' | '#' => {
161                     origin = chars;
162                     break;
163                 }
164                 _ => {
165                     self.normalized.push(c);
166                 }
167             }
168         }
169         Ok(origin)
170     }
171 
parse_path<'a>(&mut self, mut origin: UrlChars<'a>) -> Result<UrlChars<'a>, HttpError>172     fn parse_path<'a>(&mut self, mut origin: UrlChars<'a>) -> Result<UrlChars<'a>, HttpError> {
173         loop {
174             let chars = origin.clone();
175 
176             let (ch, u8_str) = if let Some((ch, u8_str)) = origin.next_u8() {
177                 (ch, u8_str)
178             } else {
179                 break;
180             };
181             match ch {
182                 '/' => {
183                     self.normalized.push(ch);
184                 }
185                 '#' | '?' => {
186                     origin = chars;
187                     break;
188                 }
189                 _ => {
190                     self.normalized.percent_encoding_push((ch, u8_str), PATH);
191                 }
192             }
193         }
194 
195         Ok(origin)
196     }
197 
parse_query_and_fragment(&mut self, mut origin: UrlChars) -> Result<(), HttpError>198     fn parse_query_and_fragment(&mut self, mut origin: UrlChars) -> Result<(), HttpError> {
199         let mut remaining = origin.clone();
200         match origin.first_valid() {
201             None => {}
202             Some('?') => {
203                 self.normalized.push('?');
204                 let chars = self.parse_query(origin)?;
205                 remaining = chars;
206             }
207             Some('#') => {
208                 self.normalized.push('#');
209                 remaining = origin;
210             }
211             _ => {
212                 return Err(InvalidUri::InvalidFormat.into());
213             }
214         }
215         self.parse_fragment(remaining)
216     }
217 
parse_query<'a>(&mut self, mut origin: UrlChars<'a>) -> Result<UrlChars<'a>, HttpError>218     fn parse_query<'a>(&mut self, mut origin: UrlChars<'a>) -> Result<UrlChars<'a>, HttpError> {
219         while let Some((ch, u8_str)) = origin.next_u8() {
220             match ch {
221                 '#' => {
222                     self.normalized.push('#');
223                     break;
224                 }
225                 _ => self.normalized.percent_encoding_push((ch, u8_str), QUERY),
226             }
227         }
228 
229         Ok(origin)
230     }
231 
parse_fragment(&mut self, mut origin: UrlChars) -> Result<(), HttpError>232     fn parse_fragment(&mut self, mut origin: UrlChars) -> Result<(), HttpError> {
233         while let Some(utf8) = origin.next_u8() {
234             self.normalized.percent_encoding_push(utf8, FRAGMENT);
235         }
236         Ok(())
237     }
238 }
239 
240 pub(crate) struct Normalized {
241     url: String,
242 }
243 
244 impl Normalized {
from_size(size: usize) -> Self245     pub(crate) fn from_size(size: usize) -> Self {
246         Self {
247             url: String::with_capacity(size),
248         }
249     }
push(&mut self, ch: char)250     pub(crate) fn push(&mut self, ch: char) {
251         if !matches!(ch, '\t' | '\r' | '\n') {
252             self.url.push(ch);
253         }
254     }
255 
percent_encoding_push(&mut self, u8_ch: Utf8Char, char_set: &[u8])256     pub(crate) fn percent_encoding_push(&mut self, u8_ch: Utf8Char, char_set: &[u8]) {
257         let (ch, u8_str) = u8_ch;
258         if !matches!(ch, '\t' | '\r' | '\n') {
259             self.percent_encoding_char(u8_str, char_set);
260         }
261     }
262 
percent_encoding_char(&mut self, u8_str: &str, char_set: &[u8])263     pub(crate) fn percent_encoding_char(&mut self, u8_str: &str, char_set: &[u8]) {
264         let mut start = 0;
265         for (index, &byte) in u8_str.as_bytes().iter().enumerate() {
266             if should_percent_encoding(byte, char_set) {
267                 if start < index {
268                     let unencoded =
269                         unsafe { str::from_utf8_unchecked(&u8_str.as_bytes()[start..index]) };
270                     self.url.push_str(unencoded);
271                 }
272                 let encoded = percent_hex(byte);
273                 self.url.push('%');
274                 self.url.push_str(encoded);
275 
276                 start = index + 1;
277             }
278         }
279 
280         let ch_len = u8_str.len();
281         if start < ch_len {
282             let unencoded = unsafe { str::from_utf8_unchecked(&u8_str.as_bytes()[start..ch_len]) };
283             self.url.push_str(unencoded);
284         }
285     }
286 
url(self) -> String287     pub(crate) fn url(self) -> String {
288         self.url
289     }
290 }
291 
292 #[derive(Clone)]
293 struct UrlChars<'a> {
294     remaining: Chars<'a>,
295 }
296 
297 impl<'a> UrlChars<'a> {
next_u8(&mut self) -> Option<Utf8Char>298     pub(crate) fn next_u8(&mut self) -> Option<Utf8Char> {
299         let url_str = self.remaining.as_str();
300         self.remaining.next().map(|c| (c, &url_str[..c.len_utf8()]))
301     }
302 
first_valid(&mut self) -> Option<char>303     pub(crate) fn first_valid(&mut self) -> Option<char> {
304         self.remaining
305             .by_ref()
306             .find(|&c| !matches!(c, '\t' | '\r' | '\n'))
307     }
308 }
309 
310 impl<'a> Iterator for UrlChars<'a> {
311     type Item = char;
312 
next(&mut self) -> Option<Self::Item>313     fn next(&mut self) -> Option<Self::Item> {
314         self.remaining.next()
315     }
316 }
317 
/// Decides whether `byte` must be written as a `%XX` escape.
///
/// A byte is escaped when it lies outside printable ASCII (`0x20..=0x7e`) or
/// when it is listed in `bytes`. If `bytes` itself contains a non-ASCII
/// value, every byte is escaped.
pub(crate) fn should_percent_encoding(byte: u8, bytes: &[u8]) -> bool {
    let printable_ascii = matches!(byte, 0x20..=0x7e);
    if !printable_ascii {
        return true;
    }
    !bytes.is_ascii() || bytes.contains(&byte)
}
321 
/// Returns the two upper-case hexadecimal digits that represent `byte`,
/// e.g. `0x2f` yields `"2F"`.
///
/// The digits come from a static lookup table, so no allocation takes place.
pub(crate) fn percent_hex(byte: u8) -> &'static str {
    // 256 two-character entries. Keeping the table as a `str` (it is pure
    // ASCII) lets it be sliced safely, with no `unsafe` UTF-8 conversion.
    static HEX_ASCII: &str = "\
      000102030405060708090A0B0C0D0E0F\
      101112131415161718191A1B1C1D1E1F\
      202122232425262728292A2B2C2D2E2F\
      303132333435363738393A3B3C3D3E3F\
      404142434445464748494A4B4C4D4E4F\
      505152535455565758595A5B5C5D5E5F\
      606162636465666768696A6B6C6D6E6F\
      707172737475767778797A7B7C7D7E7F\
      808182838485868788898A8B8C8D8E8F\
      909192939495969798999A9B9C9D9E9F\
      A0A1A2A3A4A5A6A7A8A9AAABACADAEAF\
      B0B1B2B3B4B5B6B7B8B9BABBBCBDBEBF\
      C0C1C2C3C4C5C6C7C8C9CACBCCCDCECF\
      D0D1D2D3D4D5D6D7D8D9DADBDCDDDEDF\
      E0E1E2E3E4E5E6E7E8E9EAEBECEDEEEF\
      F0F1F2F3F4F5F6F7F8F9FAFBFCFDFEFF\
      ";
    // `byte` is at most 255, so `index + 2` is at most 512, the table length.
    let index = usize::from(byte) * 2;
    &HEX_ASCII[index..index + 2]
}
344 
#[cfg(test)]
mod ut_uri_percent_encoder {
    use crate::request::uri::percent_encoding::PercentEncoder;
    use crate::request::uri::{InvalidUri, Uri};

    /// Asserts that encoding `$url` fails with the error `$err`.
    macro_rules! err_percent_encode {
        ($url:expr, $err:expr) => {{
            assert_eq!(PercentEncoder::parse($url).err(), $err);
        }};
    }

    /// Asserts that encoding `$url` succeeds and yields `$encoded`.
    macro_rules! success_percent_encode {
        ($url:expr, $encoded:expr) => {{
            assert_eq!(PercentEncoder::parse($url).unwrap(), $encoded);
        }};
    }

    /// UT test cases for `PercentEncoder::parse`.
    ///
    /// # Brief
    /// 1. Calls `PercentEncoder::parse` on urls that contain chinese
    ///    characters, with userinfo, with an empty userinfo and without
    ///    userinfo.
    /// 2. Checks that each encoded url matches the expected string.
    #[test]
    fn url_percent_encode() {
        let cases = [
            (
                "https://测试名:测试密码@www.example.com/data/new-测试文件.txt?from=project-名称#fragment-百分比-encode",
                "https://%E6%B5%8B%E8%AF%95%E5%90%8D:%E6%B5%8B%E8%AF%95%E5%AF%86%E7%A0%81@www.example.com/data/new-%E6%B5%8B%E8%AF%95%E6%96%87%E4%BB%B6.txt?from=project-%E5%90%8D%E7%A7%B0#fragment-%E7%99%BE%E5%88%86%E6%AF%94-encode",
            ),
            (
                "https://@www.example.com/data/new-测试文件.txt?from=project-名称#fragment-百分比-encode",
                "https://www.example.com/data/new-%E6%B5%8B%E8%AF%95%E6%96%87%E4%BB%B6.txt?from=project-%E5%90%8D%E7%A7%B0#fragment-%E7%99%BE%E5%88%86%E6%AF%94-encode",
            ),
            (
                "https://www.example.com/data/new-测试文件.txt#fragment-百分比-encode",
                "https://www.example.com/data/new-%E6%B5%8B%E8%AF%95%E6%96%87%E4%BB%B6.txt#fragment-%E7%99%BE%E5%88%86%E6%AF%94-encode",
            ),
        ];

        for (url, expected) in cases.iter() {
            success_percent_encode!(*url, *expected);
        }
    }

    /// UT test cases for `PercentEncoder::parse`.
    ///
    /// # Brief
    /// 1. Calls `PercentEncoder::parse` on malformed urls: a scheme with an
    ///    illegal character, a missing `':'`, three slashes after the scheme
    ///    and an empty userinfo without a host.
    /// 2. Checks that each call returns the expected error.
    #[test]
    fn url_percent_encode_failure() {
        err_percent_encode!(
            "htt ps://测试名:测试密码@www.example.com/data/new-测试文件.txt?from=project-名称#fragment-百分比-encode",
            Some(InvalidUri::InvalidScheme.into())
        );
        err_percent_encode!("htt ps://", Some(InvalidUri::InvalidScheme.into()));
        err_percent_encode!("https", Some(InvalidUri::InvalidScheme.into()));
        err_percent_encode!(
            "https:///www.example.com",
            Some(InvalidUri::InvalidScheme.into())
        );
        err_percent_encode!("https://@/data", Some(InvalidUri::UriMissHost.into()));
    }
}
409