Gitly


1 // urllib parses URLs and implements query escaping.
2 // See RFC 3986. This module generally follows RFC 3986, except where
3 // it deviates for compatibility reasons.
4 // Based off:   https://github.com/golang/go/blob/master/src/net/url/url.go
5 // Last commit: https://github.com/golang/go/commit/fe2ed5054176935d4adcf13e891715ccf2ee3cce
6 // Copyright 2009 The Go Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style
8 // license that can be found in the LICENSE file.
9 module urllib
10 
11 import strings
12 
// EncodingMode identifies which component of a URL is being processed.
// should_escape, escape and unescape all branch on this value, because each
// component allows a different set of characters to appear unescaped.
enum EncodingMode {
    encode_path // the path as a whole
    encode_path_segment // a single path segment
    encode_host // the host (including an optional :port)
    encode_zone // an IPv6 zone identifier
    encode_user_password // the userinfo part
    encode_query_component // a query key or value
    encode_fragment // the fragment
}
22 
// err_msg_escape is the message used when a malformed or disallowed
// %-escape is encountered while unescaping.
const err_msg_escape = 'unescape: invalid URL escape'
// err_msg_parse is the message prefix used when parsing a URL fails.
const err_msg_parse = 'parse: failed parsing url'
25 
// error_msg builds the text of an error raised by this module.
// The message is prefixed with 'net.urllib.'; when val is not empty it is
// appended in parentheses so the offending input is visible to the caller.
fn error_msg(message string, val string) string {
    prefixed := 'net.urllib.${message}'
    if val == '' {
        return prefixed
    }
    return '${prefixed} (${val})'
}
33 
// should_escape reports whether the byte c must be %-escaped when it appears
// in the URL component selected by mode, according to RFC 3986.
//
// Please be informed that for now should_escape does not check all
// reserved characters correctly. See golang.org/issue/5684.
fn should_escape(c u8, mode EncodingMode) bool {
    // §2.3 Unreserved characters (alphanum)
    if c.is_alnum() {
        return false
    }
    if mode == .encode_host || mode == .encode_zone {
        // §3.2.2 host allows
        // sub-delims = ! $ & ' ( ) * + , ; =
        // as part of reg-name.
        // We add : because we include :port as part of host.
        // We add [ ] because we include [ipv6]:port as part of host.
        // We add < > because they're the only characters left that
        // we could possibly allow, and parse will reject them if we
        // escape them (because hosts can't use %-encoding for
        // ASCII bytes).
        // NOTE: the single quote is a sub-delim; the backslash is not, so it
        // is deliberately absent from this list.
        if c in [`!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `:`, `[`, `]`, `<`, `>`,
            `"`] {
            return false
        }
    }
    match c {
        `-`, `_`, `.`, `~` {
            // §2.3 Unreserved characters (mark)
            return false
        }
        `$`, `&`, `+`, `,`, `/`, `:`, `;`, `=`, `?`, `@` {
            // §2.2 Reserved characters (reserved)
            // Different sections of the URL allow a few of
            // the reserved characters to appear unescaped.
            match mode {
                .encode_path {
                    // §3.3
                    // The RFC allows : @ & = + $ but saves / ; , for assigning
                    // meaning to individual path segments. This package
                    // only manipulates the path as a whole, so we allow those
                    // last three as well. That leaves only ? to escape.
                    return c == `?`
                }
                .encode_path_segment {
                    // §3.3
                    // The RFC allows : @ & = + $ but saves / ; , for assigning
                    // meaning to individual path segments.
                    return c == `/` || c == `;` || c == `,` || c == `?`
                }
                .encode_user_password {
                    // §3.2.1
                    // The RFC allows ; : & = + $ and , in userinfo, so we must
                    // escape only @ / and ?.
                    // The parsing of userinfo treats : as special so we must
                    // escape that too.
                    return c == `@` || c == `/` || c == `?` || c == `:`
                }
                .encode_query_component {
                    // §3.4
                    // The RFC reserves (so we must escape) everything.
                    return true
                }
                .encode_fragment {
                    // §4.1
                    // The RFC text is silent but the grammar allows
                    // everything, so escape nothing.
                    return false
                }
                else {
                    // Host and zone modes fall through to the final
                    // `return true` below.
                }
            }
        }
        else {}
    }

    if mode == .encode_fragment {
        // RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
        // included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
        // need to be escaped. To minimize potential breakage, we apply two restrictions:
        // (1) we always escape sub-delims outside of the fragment, and (2) we always
        // escape single quote to avoid breaking callers that had previously assumed that
        // single quotes would be escaped. See issue #19917.
        match c {
            `!`, `(`, `)`, `*` { return false }
            else {}
        }
    }
    // Everything else must be escaped.
    return true
}
123 
// query_unescape reverses query_escape: every 3-byte sequence of the form
// '%AB' is replaced by the byte 0xAB and every '+' by a space.
// The decoding is delegated to unescape in query-component mode, and any
// error it reports is passed on to the caller.
pub fn query_unescape(s string) !string {
    decoded := unescape(s, .encode_query_component)!
    return decoded
}
132 
// path_unescape reverses path_escape: every 3-byte sequence of the form
// '%AB' is replaced by the byte 0xAB. An error is returned if any % is not
// followed by two hexadecimal digits.
//
// Unlike query_unescape it leaves '+' untouched instead of turning it into
// a space.
pub fn path_unescape(s string) !string {
    decoded := unescape(s, .encode_path_segment)!
    return decoded
}
143 
// unescape decodes the %-escapes in s_; mode specifies which section of the
// URL the string came from, which controls both validation and '+' handling.
//
// It works in two passes. The first pass counts the escapes and validates
// them (and, for host/zone modes, validates the literal bytes too). The
// second pass builds the decoded string.
//
// Returns an error when an escape is malformed, when a host uses %-encoding
// for an ASCII byte other than '%25', when a zone escape decodes to a byte
// that is not allowed in a host, or when a host/zone contains a literal
// ASCII byte that would need escaping.
// In query-component mode a stray '%' that is followed by at least one more
// byte is tolerated and treated as a literal percent sign.
fn unescape(s_ string, mode EncodingMode) !string {
    mut s := s_
    // Count %, check that they're well-formed.
    mut n := 0
    mut has_plus := false
    for i := 0; i < s.len; {
        x := s[i]
        match x {
            `%` {
                n++
                if i + 2 >= s.len || !ishex(s[i + 1]) || !ishex(s[i + 2]) {
                    if mode == .encode_query_component && i + 1 < s.len {
                        // Rewrite the stray '%' as '%25' and step over just the
                        // inserted escape, so that the byte that follows is still
                        // validated by the next iteration (it may itself be a '%').
                        s = s[..i] + '%25' + s[(i + 1)..]
                        i += 3
                        continue
                    }
                    // Report at most the three bytes starting at the bad '%'.
                    s = s[i..]
                    if s.len > 3 {
                        s = s[..3]
                    }
                    return error(error_msg(err_msg_escape, s))
                }
                // From here on s[i + 1] and s[i + 2] are known to exist and to
                // be hex digits, so s[i..i + 3] is always in range.
                //
                // Per https://tools.ietf.org/html/rfc3986#page-21
                // in the host component %-encoding can only be used
                // for non-ASCII bytes.
                // But https://tools.ietf.org/html/rfc6874#section-2
                // introduces %25 being allowed to escape a percent sign
                // in IPv6 scoped-address literals. Yay.
                if mode == .encode_host && unhex(s[i + 1]) < 8 && s[i..i + 3] != '%25' {
                    return error(error_msg(err_msg_escape, s[i..i + 3]))
                }
                if mode == .encode_zone {
                    // RFC 6874 says basically 'anything goes' for zone identifiers
                    // and that even non-ASCII can be redundantly escaped,
                    // but it seems prudent to restrict %-escaped bytes here to those
                    // that are valid host name bytes in their unescaped form.
                    // That is, you can use escaping in the zone identifier but not
                    // to introduce bytes you couldn't just write directly.
                    // But Windows puts spaces here! Yay.
                    v := ((unhex(s[i + 1]) << u8(4)) | unhex(s[i + 2]))
                    if s[i..i + 3] != '%25' && v != ` ` && should_escape(v, .encode_host) {
                        return error(error_msg(err_msg_escape, s[i..i + 3]))
                    }
                }
                i += 3
            }
            `+` {
                has_plus = mode == .encode_query_component
                i++
            }
            else {
                // Literal ASCII bytes in a host or zone must be ones that never
                // need escaping; non-ASCII bytes (>= 0x80) are let through.
                if (mode == .encode_host || mode == .encode_zone) && s[i] < 0x80
                    && should_escape(s[i], mode) {
                    return error(error_msg('unescape: invalid character in host name',
                        s[i..i + 1]))
                }
                i++
            }
        }
    }
    if n == 0 && !has_plus {
        return '${s}' // TODO: `return s` once an autofree bug is fixed
    }
    if s.len < 2 * n {
        return error(error_msg('unescape: invalid escape sequence', ''))
    }
    // Every escape shrinks from three bytes to one.
    mut t := strings.new_builder(s.len - 2 * n)
    for i := 0; i < s.len; i++ {
        x := s[i]
        match x {
            `%` {
                // Defensive: the first pass has already validated every escape.
                if i + 2 >= s.len {
                    return error(error_msg('unescape: invalid escape sequence', ''))
                }
                t.write_string(((unhex(s[i + 1]) << u8(4)) | unhex(s[i + 2])).ascii_str())
                i += 2
            }
            `+` {
                if mode == .encode_query_component {
                    t.write_string(' ')
                } else {
                    t.write_string('+')
                }
            }
            else {
                t.write_string(s[i].ascii_str())
            }
        }
    }
    return t.str()
}
243 
// query_escape returns s escaped for safe use inside a URL query.
// It delegates to escape in query-component mode.
pub fn query_escape(s string) string {
    escaped := escape(s, .encode_query_component)
    return escaped
}
249 
// path_escape returns s escaped for safe use as a single URL path segment;
// special characters (including /) are replaced with %XX sequences as needed.
// It delegates to escape in path-segment mode.
pub fn path_escape(s string) string {
    escaped := escape(s, .encode_path_segment)
    return escaped
}
255 
// escape returns s with every byte for which should_escape(byte, mode) is
// true replaced by its upper-case %XX form. In query-component mode a space
// becomes '+' instead. When nothing needs escaping, s is returned as is.
fn escape(s string, mode EncodingMode) string {
    // First pass: work out how much the output grows.
    mut spaces := 0
    mut escapes := 0
    for ch in s {
        if !should_escape(ch, mode) {
            continue
        }
        if ch == ` ` && mode == .encode_query_component {
            spaces++
        } else {
            escapes++
        }
    }
    if spaces == 0 && escapes == 0 {
        return s
    }
    // Each %-escaped byte takes three bytes instead of one.
    mut out := []u8{len: s.len + 2 * escapes}
    if escapes == 0 {
        // Only spaces need replacing, so the length is unchanged:
        // copy the input and patch the spaces in place.
        copy(mut out, s.bytes())
        for idx in 0 .. s.len {
            if s[idx] == ` ` {
                out[idx] = `+`
            }
        }
        return out.bytestr()
    }
    hex_digits := '0123456789ABCDEF'
    mut pos := 0
    for ch in s {
        if ch == ` ` && mode == .encode_query_component {
            out[pos] = `+`
            pos++
        } else if should_escape(ch, mode) {
            out[pos] = `%`
            out[pos + 1] = hex_digits[ch >> 4]
            out[pos + 2] = hex_digits[ch & 15]
            pos += 3
        } else {
            out[pos] = ch
            pos++
        }
    }
    return out.bytestr()
}
303 
// URL is a parsed URL (technically, a URI reference).
// The general form represented is:
// [scheme:][//[userinfo@]host][/]path[?query][#fragment]
// URLs that do not start with a slash after the scheme are interpreted as:
// scheme:opaque[?query][#fragment]
//
// The path field holds the decoded path: /%47%6f%2f becomes /Go/.
// As a result it is impossible to tell which slashes in the path were
// slashes in the raw URL and which were %2f. When that matters, use
// raw_path, an optional field that is only set when the default encoding of
// path differs from the original text.
//
// The str method obtains the path through escaped_path; see that method for
// details.
pub struct URL {
pub mut:
    scheme      string
    opaque      string    // encoded opaque data
    user        ?Userinfo // username and password information
    host        string    // host or host:port
    path        string    // path (relative paths may omit leading slash)
    raw_path    string    // encoded path hint (see escaped_path method)
    force_query bool      // append a query ('?') even if raw_query is empty
    raw_query   string    // encoded query values, without '?'
    fragment    string    // fragment for references, without '#'
}
330 
// debug returns a multi-line dump of *ALL* the fields of the given URL,
// one `name: value` pair per line, wrapped in `URL{ ... }`.
pub fn (url &URL) debug() string {
    fields := [
        '  scheme: ${url.scheme}',
        '  opaque: ${url.opaque}',
        '  user: ${url.user}',
        '  host: ${url.host}',
        '  path: ${url.path}',
        '  raw_path: ${url.raw_path}',
        '  force_query: ${url.force_query}',
        '  raw_query: ${url.raw_query}',
        '  fragment: ${url.fragment}',
    ]
    return 'URL{\n' + fields.join('\n') + '\n}'
}
335 
// user returns a Userinfo that carries only the given username;
// the password is empty and marked as not set.
pub fn user(username string) Userinfo {
    // password and password_set keep their zero values ('' and false).
    return Userinfo{
        username: username
    }
}
345 
// user_password returns a Userinfo carrying the given username and password,
// with the password marked as set.
//
// This functionality should only be used with legacy web sites.
// RFC 2396 warns that interpreting Userinfo this way
// "is NOT RECOMMENDED, because the passing of authentication
// information in clear text (such as URI) has proven to be a
// security risk in almost every case where it has been used."
fn user_password(username string, password string) Userinfo {
    return Userinfo{
        username:     username
        password:     password
        password_set: true
    }
}
357 
// Userinfo is an immutable holder for the username and password details of
// a URL. A Userinfo value always has a username (possibly empty, as allowed
// by RFC 2396) and optionally a password.
pub struct Userinfo {
pub:
    username     string
    password     string
    password_set bool // true when a password was supplied
}
368 
// empty reports whether both the username and the password are empty strings.
// The password_set flag is not taken into account.
fn (u Userinfo) empty() bool {
    return u.username.len == 0 && u.password.len == 0
}
372 
// str returns the encoded userinfo in the standard form
// 'username[:password]'. An empty Userinfo yields an empty string; the
// ':password' part is only added when password_set is true.
fn (u Userinfo) str() string {
    if u.empty() {
        return ''
    }
    name := escape(u.username, .encode_user_password)
    if !u.password_set {
        return name
    }
    return name + ':' + escape(u.password, .encode_user_password)
}
385 
// split_by_scheme checks whether rawurl has the form scheme:path, where the
// scheme must match [a-zA-Z][a-zA-Z0-9+-.]*.
// If so it returns [scheme, path]; otherwise it returns ['', rawurl].
// A rawurl that starts with ':' is an error (missing protocol scheme).
fn split_by_scheme(rawurl string) ![]string {
    no_scheme := ['', rawurl]
    for pos, ch in rawurl {
        if ch.is_letter() {
            // letters are valid anywhere in a scheme
            continue
        }
        if ch.is_digit() || ch in [`+`, `-`, `.`] {
            // these are valid in a scheme, but not as its first byte
            if pos == 0 {
                return no_scheme
            }
            continue
        }
        if ch == `:` {
            if pos == 0 {
                return error(error_msg('split_by_scheme: missing protocol scheme', ''))
            }
            return [rawurl[..pos], rawurl[pos + 1..]]
        }
        // any other byte before a ':' means there is no valid scheme
        return no_scheme
    }
    return no_scheme
}
411 
// get_scheme returns the scheme part of rawurl, or an empty string when
// rawurl has no valid scheme.
// It returns an error when split_by_scheme fails (rawurl starts with ':').
// The error is propagated as an error rather than being handed back as if
// the error message were the scheme.
fn get_scheme(rawurl string) !string {
    parts := split_by_scheme(rawurl)!
    return parts[0]
}
416 
// split cuts s in two at the first occurrence of sep.
// If cutc is true, sep itself is dropped; otherwise sep stays at the start
// of the second substring.
// If sep does not occur in s, then s and the empty string are returned.
fn split(s string, sep u8, cutc bool) (string, string) {
    pos := s.index_u8(sep)
    if pos < 0 {
        return s, ''
    }
    tail_start := if cutc { pos + 1 } else { pos }
    return s[..pos], s[tail_start..]
}
430 
// parse parses rawurl into a URL structure.
//
// The rawurl may be relative (a path, without a host) or absolute
// (starting with a scheme). Trying to parse a hostname and path
// without a scheme is invalid but may not necessarily return an
// error, due to parsing ambiguities.
//
// Everything after the first '#' is treated as the fragment and is
// unescaped separately from the rest of the URL.
pub fn parse(rawurl string) !URL {
    // Cut off #frag
    base, frag := split(rawurl, `#`, true)
    mut url := parse_url(base, false) or {
        return error(error_msg(err_msg_parse + '[${err.msg()}]', base))
    }
    if frag != '' {
        url.fragment = unescape(frag, .encode_fragment) or {
            return error(error_msg(err_msg_parse + '[${err.msg()}]', base))
        }
    }
    return url
}
452 
// parse_request_uri parses rawurl into a URL structure for an HTTP request.
// It calls parse_url with via_request set, so only absolute URIs or absolute
// paths are accepted, and a leading `//` without a scheme stays part of the
// path.
// The string rawurl is assumed not to have a #fragment suffix.
// (Web browsers strip #fragment before sending the URL to a web server.)
pub fn parse_request_uri(rawurl string) !URL {
    url := parse_url(rawurl, true)!
    return url
}
461 
// parse_url parses a URL from a string in one of two contexts. If
// via_request is true, the URL is assumed to have arrived via an HTTP request,
// in which case only absolute URLs or path-absolute relative URLs are allowed.
// If via_request is false, all forms of relative URLs are allowed.
//
// Returns an error when rawurl contains a control character, is empty while
// via_request is set, has a malformed scheme, has a colon in the first
// segment of a relative path, or has an invalid authority or path escaping.
@[manualfree]
fn parse_url(rawurl string, via_request bool) !URL {
    if string_contains_ctl_u8(rawurl) {
        return error(error_msg('parse_url: invalid control character in URL', rawurl))
    }
    if rawurl == '' && via_request {
        return error(error_msg('parse_url: empty URL', rawurl))
    }
    mut url := URL{
        user: none
    }
    if rawurl == '*' {
        url.path = '*'
        return url
    }
    // Split off possible leading 'http:', 'mailto:', etc.
    // Cannot contain escaped characters.
    p := split_by_scheme(rawurl)!
    url.scheme = p[0].to_lower()
    mut rest := p[1]
    // A trailing '?' that is the ONLY '?' marks an explicitly empty query.
    // Counting all occurrences matters: for '/a?b?' the first '?' starts the
    // query and the trailing one belongs to it.
    if rest.ends_with('?') && rest.count('?') == 1 {
        url.force_query = true
        rest = rest[..rest.len - 1]
    } else {
        r, raw_query := split(rest, `?`, true)
        rest = r
        url.raw_query = raw_query
    }
    if !rest.starts_with('/') {
        if url.scheme != '' {
            // We consider rootless paths per RFC 3986 as opaque.
            url.opaque = rest
            return url
        }
        if via_request {
            return error(error_msg('parse_url: invalid URI for request', ''))
        }
        // Avoid confusion with malformed schemes, like cache_object:foo/bar.
        // See golang.org/issue/16822.
        //
        // RFC 3986, §3.3:
        // In addition, a URI reference (Section 4.1) may be a relative-path reference,
        // in which case the first path segment cannot contain a colon (':') character.
        if colon := rest.index(':') {
            slash := rest.index('/') or { return error('there should be a / in the URL') }
            if colon < slash {
                // First path segment has colon. Not allowed in relative URL.
                return error(error_msg('parse_url: first path segment in URL cannot contain colon',
                    ''))
            }
        }
    }
    if ((url.scheme != '' || !via_request) && !rest.starts_with('///')) && rest.starts_with('//')
        && rest.len > 2 {
        // rest is '//authority[/path]': peel off the authority and parse it.
        authority, r := split(rest[2..], `/`, false)
        rest = r
        a := parse_authority(authority)!
        url.user = a.user
        url.host = a.host
    }
    // Set path and, optionally, raw_path.
    // raw_path is a hint of the encoding of path. We don't want to set it if
    // the default escaping of path is equivalent, to help make sure that people
    // don't rely on it in general.
    url.set_path(rest)!
    return url
}
535 
// ParseAuthorityRes is the result of parse_authority: the two pieces of a
// '[userinfo@]host' authority string.
struct ParseAuthorityRes {
    user ?Userinfo // decoded userinfo
    host string    // decoded host, including any :port
}
540 
// parse_authority splits an authority of the form '[userinfo@]host' at its
// last '@' and decodes both parts.
// Without an '@' the whole string is the host and the user is an empty
// Userinfo. With one, the userinfo is validated with valid_userinfo, split
// at its first ':' and unescaped.
// The password is marked as set whenever the userinfo contains a ':', so
// that 'user:' (explicit empty password) is kept distinct from 'user'.
//
// Returns an error when the userinfo is invalid, when the host fails
// parse_host, or when the username or password contains a bad escape.
fn parse_authority(authority string) !ParseAuthorityRes {
    i := authority.last_index_u8(`@`)
    if i < 0 {
        return ParseAuthorityRes{
            host: parse_host(authority)!
            user: user('')
        }
    }
    raw_user, raw_host := authority[..i], authority[i + 1..]
    if !valid_userinfo(raw_user) {
        return error(error_msg('parse_authority: invalid userinfo', ''))
    }
    host := parse_host(raw_host)!
    name, pwd := split(raw_user, `:`, true)
    // Decide on the presence of ':' rather than on pwd being non-empty,
    // otherwise an explicitly empty password would be dropped.
    auth := if raw_user.contains(':') {
        user_password(unescape(name, .encode_user_password)!, unescape(pwd, .encode_user_password)!)
    } else {
        user(unescape(name, .encode_user_password)!)
    }
    return ParseAuthorityRes{
        user: auth
        host: host
    }
}
565 
// parse_host parses host as an authority without user information, that is
// as host[:port], and returns it unescaped.
// Returns an error when a bracketed host lacks its closing ']', when the
// port part is not ':' followed by digits only, or when unescaping fails.
fn parse_host(host string) !string {
    if host.len > 0 && host[0] == `[` {
        // parse an IP-Literal in RFC 3986 and RFC 6874.
        // E.g., '[fe80::1]', '[fe80::1%25en0]', '[fe80::1]:80'.
        bracket_end := host.last_index_u8(`]`)
        if bracket_end == -1 {
            return error(error_msg("parse_host: missing ']' in host", ''))
        }
        port_part := host[bracket_end + 1..]
        if !valid_optional_port(port_part) {
            return error(error_msg('parse_host: invalid port ${port_part} after host ', ''))
        }
        // RFC 6874 defines that %25 (%-encoded percent) introduces
        // the zone identifier, and the zone identifier can use basically
        // any %-encoding it likes. That's different from the host, which
        // can only %-encode non-ASCII bytes.
        // We do impose some restrictions on the zone, to avoid stupidity
        // like newlines.
        if zone := host[..bracket_end].index('%25') {
            // Decode address, zone and the ']' + port tail with their own rules.
            address := unescape(host[..zone], .encode_host)!
            zone_id := unescape(host[zone..bracket_end], .encode_zone)!
            tail := unescape(host[bracket_end..], .encode_host)!
            return address + zone_id + tail
        }
    } else {
        colon := host.last_index_u8(`:`)
        if colon != -1 {
            port_part := host[colon..]
            if !valid_optional_port(port_part) {
                return error(error_msg('parse_host: invalid port ${port_part} after host ', ''))
            }
        }
    }
    return unescape(host, .encode_host)!
}
604 
// set_path sets the path and raw_path fields of the URL from the escaped
// path p. It keeps the invariant that raw_path is only filled in when p
// differs from the default encoding of the decoded path.
// For example:
// - set_path('/foo/bar')   will set path='/foo/bar' and raw_path=''
// - set_path('/foo%2fbar') will set path='/foo/bar' and raw_path='/foo%2fbar'
// set_path returns an error only if p contains an invalid escaping; on
// success it returns true.
pub fn (mut u URL) set_path(p string) !bool {
    decoded := unescape(p, .encode_path)!
    u.path = decoded
    default_encoding := escape(decoded, .encode_path)
    u.raw_path = if p == default_encoding { '' } else { p }
    return true
}
618 
// escaped_path returns the escaped form of u.path.
// In general there are multiple possible escaped forms of any path.
// escaped_path returns u.raw_path when it is a valid escaping of u.path,
// i.e. when it only contains bytes allowed in an encoded path AND decodes
// to exactly u.path.
// Otherwise escaped_path ignores u.raw_path and computes an escaped
// form on its own.
// The str and request_uri methods use escaped_path to construct
// their results.
// In general, code should call escaped_path instead of
// reading u.raw_path directly.
pub fn (u &URL) escaped_path() string {
    if u.raw_path != '' && valid_encoded_path(u.raw_path) {
        // A raw_path that fails to decode, or that decodes to something other
        // than path, is stale: fall through and re-encode path instead.
        if decoded := unescape(u.raw_path, .encode_path) {
            if decoded == u.path {
                return u.raw_path
            }
        }
    }
    if u.path == '*' {
        return '*' // don't escape (Issue 11202)
    }
    return escape(u.path, .encode_path)
}
638 
// valid_encoded_path reports whether s is a valid encoded path.
// It must not contain any bytes that require escaping during path encoding.
fn valid_encoded_path(s string) bool {
    for i in 0 .. s.len {
        // RFC 3986, Appendix A.
        // pchar = unreserved / pct-encoded / sub-delims / ':' / '@'.
        // should_escape is not quite compliant with the RFC,
        // so we check the sub-delims ourselves and let
        // should_escape handle the others.
        x := s[i]
        match x {
            `!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `:`, `@` {
                // ok - sub-delims (which include the single quote, not the
                // backslash) plus ':' and '@'
            }
            `[`, `]` {
                // ok - not specified in RFC 3986 but left alone by modern browsers
            }
            `%` {
                // ok - percent encoded, will decode
            }
            else {
                if should_escape(s[i], .encode_path) {
                    return false
                }
            }
        }
    }
    return true
}
668 
// valid_optional_port reports whether port is either an empty string
// or matches /^:\d*$/ (a colon followed by zero or more ASCII digits).
fn valid_optional_port(port string) bool {
    if port.len == 0 {
        return true
    }
    if !port.starts_with(':') {
        return false
    }
    for idx in 1 .. port.len {
        if !port[idx].is_digit() {
            return false
        }
    }
    return true
}
685 
// str reassembles the URL into a valid URL string.
// The general form of the result is one of:
//
// scheme:opaque?query#fragment
// scheme://userinfo@host/path?query#fragment
//
// If u.opaque is non-empty, str uses the first form;
// otherwise it uses the second form.
// The host is passed through escape in host mode.
// To obtain the path, str uses u.escaped_path().
//
// In the second form, the following rules apply:
// - if u.scheme is empty, scheme: is omitted.
// - if u.user is none or empty, userinfo@ is omitted.
// - if u.host is empty, host/ is omitted.
// - if u.scheme and u.host are empty and there is no userinfo,
// the entire scheme://userinfo@host/ is omitted.
// - if u.host is non-empty and u.path begins with a /,
// the form host/path does not add its own /.
// - if u.raw_query is empty and u.force_query is false, ?query is omitted.
// - if u.fragment is empty, #fragment is omitted.
pub fn (u URL) str() string {
    mut out := strings.new_builder(200)
    if u.scheme != '' {
        out.write_string(u.scheme)
        out.write_string(':')
    }
    if u.opaque != '' {
        out.write_string(u.opaque)
    } else {
        info := u.user or { Userinfo{} }
        has_user := !info.empty()
        if u.scheme != '' || u.host != '' || has_user {
            if u.host != '' || u.path != '' || has_user {
                out.write_string('//')
            }
            if has_user {
                out.write_string(info.str())
                out.write_string('@')
            }
            if u.host != '' {
                out.write_string(escape(u.host, .encode_host))
            }
        }
        path := u.escaped_path()
        if path != '' && path[0] != `/` && u.host != '' {
            out.write_string('/')
        }
        if out.len == 0 {
            // RFC 3986 §4.2
            // A path segment that contains a colon character (e.g., 'this:that')
            // cannot be used as the first segment of a relative-path reference, as
            // it would be mistaken for a scheme name. Such a segment must be
            // preceded by a dot-segment (e.g., './this:that') to make a relative-
            // path reference.
            colon := path.index_u8(`:`)
            if colon > -1 {
                // TODO: merge with the outer condition once autofree handles
                // tmp expressions like the slice below; until then the slice
                // must only be evaluated inside this guard.
                if path[..colon].index_u8(`/`) == -1 {
                    out.write_string('./')
                }
            }
        }
        out.write_string(path)
    }
    if u.force_query || u.raw_query != '' {
        out.write_string('?')
        out.write_string(u.raw_query)
    }
    if u.fragment != '' {
        out.write_string('#')
        out.write_string(escape(u.fragment, .encode_fragment))
    }
    return out.str()
}
761 
// parse_query parses the URL-encoded query string and returns the Values
// collected from it.
// When any key or value fails to decode, parse_query returns an error
// instead of the partially filled Values.
//
// Query is expected to be a list of key=value settings separated by
// ampersands or semicolons. A setting without an equals sign is
// interpreted as a key set to an empty value.
pub fn parse_query(query string) !Values {
    mut values := new_values()
    parse_query_values(mut values, query)!
    return values
}
780 
// parse_query_silent is the same as parse_query, except that decoding
// errors are ignored: the Values collected from the settings that did decode
// are returned.
fn parse_query_silent(query string) Values {
    mut values := new_values()
    parse_query_values(mut values, query) or {}
    return values
}
788 
// parse_query_values splits query at '&' and ';', splits each non-empty
// setting at its first '=', unescapes key and value, and adds them to m.
// A setting whose key or value fails to unescape is skipped; after the whole
// query has been processed an error is returned if any setting was skipped.
// On success it returns true.
fn parse_query_values(mut m Values, query string) !bool {
    mut failed := false
    mut remaining := query
    for remaining != '' {
        // Peel the next setting off the front of the remaining query.
        mut pair := remaining
        sep := pair.index_any('&;')
        if sep >= 0 {
            remaining = pair[sep + 1..]
            pair = pair[..sep]
        } else {
            remaining = ''
        }
        if pair == '' {
            continue
        }
        // Without '=' the whole setting is the key and the value is empty.
        mut raw_key := pair
        mut raw_value := ''
        if eq := pair.index('=') {
            raw_key = pair[..eq]
            raw_value = pair[eq + 1..]
        }
        key := query_unescape(raw_key) or {
            failed = true
            continue
        }
        value := query_unescape(raw_value) or {
            failed = true
            continue
        }
        m.add(key, value)
    }
    if failed {
        return error(error_msg('parse_query_values: failed parsing query string', ''))
    }
    return true
}
827 
// encode serializes the values into ``URL encoded'' form
// ('bar=baz&foo=quux').
// Every key and every value is passed through query_escape and the
// entries are joined with `&`. An entry whose value is the empty string
// is written as the bare key, with no `=` after it.
// The syntax of the query string is specified in
// RFC 1738 https://datatracker.ietf.org/doc/html/rfc1738
//
// HTTP grammar
//
// httpurl        = "http://" hostport [ "/" hpath [ "?" search ]]
// hpath          = hsegment *[ "/" hsegment ]
// hsegment       = *[ uchar | ";" | ":" | "@" | "&" | "=" ]
// search         = *[ uchar | ";" | ":" | "@" | "&" | "=" ]
pub fn (v Values) encode() string {
    if v.len == 0 {
        return ''
    }
    mut out := strings.new_builder(200)
    for entry in v.data {
        // Entries after the first are separated by an ampersand.
        if out.len > 0 {
            out.write_string('&')
        }
        out.write_string(query_escape(entry.key))
        if entry.value != '' {
            out.write_string('=')
            out.write_string(query_escape(entry.value))
        }
    }
    return out.str()
}
858 
// resolve_path merges the reference path `ref` into `base` and then
// applies the special `.` and `..` segments, per RFC 3986.
// An empty `ref` selects `base`; a `ref` starting with `/` replaces `base`;
// any other `ref` is appended to `base` up to and including its last `/`.
// The result is '' when the merged path is empty, otherwise it always
// starts with a single `/`.
fn resolve_path(base string, ref string) string {
    mut joined := ref
    if ref == '' {
        joined = base
    } else if ref[0] != `/` {
        // Relative reference: keep everything in base up to its last slash.
        slash := base.last_index_u8(`/`)
        joined = base[..slash + 1] + ref
    }
    if joined == '' {
        return ''
    }
    segments := joined.split('/')
    mut kept := []string{}
    for segment in segments {
        if segment == '.' {
            // Current-directory segments are dropped.
            continue
        }
        if segment == '..' {
            // Parent-directory segments remove the previously kept segment, if any.
            if kept.len > 0 {
                kept = unsafe { kept[..kept.len - 1] }
            }
            continue
        }
        kept << segment
    }
    tail := segments[segments.len - 1]
    if tail == '.' || tail == '..' {
        // A trailing dot segment means the joined path ends with a slash.
        kept << ''
    }
    return '/' + kept.join('/').trim_left('/')
}
898 
// is_abs reports whether the URL is absolute, which here means
// that its scheme is not empty.
pub fn (u &URL) is_abs() bool {
    return u.scheme.len > 0
}
904 
// parse parses `ref` as a URL and resolves it in the context of the
// receiver. The provided URL may be relative or absolute.
// A parse failure is returned as an error; otherwise the result is
// whatever resolve_reference returns for the parsed reference.
pub fn (u &URL) parse(ref string) !URL {
    reference := parse(ref)!
    return u.resolve_reference(reference)
}
912 
// resolve_reference resolves a URI reference to an absolute URI from
// an absolute base URI u, per RFC 3986 Section 5.2. The URI reference
// may be relative or absolute. The result is always a new URL value
// built from a copy of `ref`, even when it ends up identical to the
// base or the reference. If `ref` has a scheme, host or userinfo of its
// own, the base is only consulted for a missing scheme.
// An error from set_path is propagated to the caller.
pub fn (u &URL) resolve_reference(ref &URL) !URL {
    mut resolved := *ref
    if ref.scheme == '' {
        resolved.scheme = u.scheme
    }
    ref_userinfo := ref.user or { Userinfo{} }
    if ref.scheme != '' || ref.host != '' || !ref_userinfo.empty() {
        // The 'absoluteURI' or 'net_path' cases: only the reference's own
        // path is normalized, the base path is not involved.
        own_path := resolve_path(ref.escaped_path(), '')
        resolved.set_path(own_path)!
        return resolved
    }
    if ref.opaque != '' {
        // Opaque references carry no authority or path.
        resolved.user = user('')
        resolved.host = ''
        resolved.path = ''
        return resolved
    }
    if ref.path == '' && ref.raw_query == '' {
        // Nothing but (maybe) a fragment in the reference: inherit the
        // base query, and the base fragment too if the reference has none.
        resolved.raw_query = u.raw_query
        if ref.fragment == '' {
            resolved.fragment = u.fragment
        }
    }
    // The 'abs_path' or 'rel_path' cases: authority comes from the base and
    // the reference path is merged into the base path.
    resolved.user = u.user
    resolved.host = u.host
    merged_path := resolve_path(u.escaped_path(), ref.escaped_path())
    resolved.set_path(merged_path)!
    return resolved
}
950 
// query parses raw_query and returns the corresponding values.
// Malformed value pairs are silently discarded.
// To check for errors use parse_query.
pub fn (u &URL) query() Values {
    return parse_query_silent(u.raw_query)
}
958 
// request_uri returns the encoded path?query or opaque?query
// string that would be used in an HTTP request for u.
// Without an opaque part the escaped path is used, falling back to `/`
// when it is empty. An opaque part starting with `//` is prefixed with
// the scheme and a colon. The query is appended after a `?` when
// force_query is set or raw_query is not empty.
pub fn (u &URL) request_uri() string {
    mut uri := ''
    if u.opaque != '' {
        uri = if u.opaque.starts_with('//') { u.scheme + ':' + u.opaque } else { u.opaque }
    } else {
        escaped := u.escaped_path()
        uri = if escaped == '' { '/' } else { escaped }
    }
    if u.force_query || u.raw_query != '' {
        uri = uri + '?' + u.raw_query
    }
    return uri
}
978 
// hostname returns the host part of u.host as produced by
// split_host_port: a valid port number, if present, is stripped.
//
// If the remaining host is enclosed in square brackets, as literal IPv6
// addresses are, the square brackets are removed from the result.
pub fn (u &URL) hostname() string {
    name, _ := split_host_port(u.host)
    return name
}
987 
// port returns the port part of u.host as produced by split_host_port,
// without the leading colon.
// If u.host doesn't contain a valid port, port returns an empty string.
pub fn (u &URL) port() string {
    _, number := split_host_port(u.host)
    return number
}
994 
// split_host_port separates `hostport` into host and port at its last colon.
// The split only happens when valid_optional_port accepts the part starting
// at that colon; otherwise the entire input is kept as host and the port is
// empty. The validity of the host itself is not checked.
// A host longer than one byte that is wrapped in square brackets is
// returned without the brackets.
pub fn split_host_port(hostport string) (string, string) {
    mut host := hostport
    mut port := ''
    sep := hostport.last_index_u8(`:`)
    if sep >= 0 && valid_optional_port(hostport[sep..]) {
        host = hostport[..sep]
        port = hostport[sep + 1..]
    }
    if host.len > 1 && host.starts_with('[') && host.ends_with(']') {
        host = host[1..host.len - 1]
    }
    return host, port
}
1013 
// valid_userinfo reports whether s is a valid userinfo string per RFC 3986
// Section 3.2.1:
// userinfo    = *( unreserved / pct-encoded / sub-delims / ':' )
// unreserved  = ALPHA / DIGIT / '-' / '.' / '_' / '~'
// sub-delims  = '!' / '$' / '&' / ''' / '(' / ')'
//               / '*' / '+' / ',' / ';' / '='
//
// In addition to the grammar above, `%` and `@` are accepted.
// It doesn't validate pct-encoded. The caller does that via fn unescape.
// An empty string is considered valid.
pub fn valid_userinfo(s string) bool {
    for r in s {
        if r.is_alnum() {
            continue
        }
        match r {
            // The apostrophe is one of the RFC 3986 sub-delims; a backslash
            // is not part of the userinfo grammar and is rejected.
            `-`, `.`, `_`, `:`, `~`, `!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `%`,
            `@` {
                continue
            }
            else {
                return false
            }
        }
    }
    return true
}
1039 
// string_contains_ctl_u8 reports whether s contains any ASCII control
// character, i.e. a byte below the space character or the DEL byte 0x7f.
fn string_contains_ctl_u8(s string) bool {
    for b in s {
        if b == 0x7f || b < ` ` {
            return true
        }
    }
    return false
}
1050 
// ishex reports whether c is an ASCII hexadecimal digit:
// `0`-`9`, `a`-`f` or `A`-`F`.
pub fn ishex(c u8) bool {
    is_digit := c >= `0` && c <= `9`
    is_lower := c >= `a` && c <= `f`
    is_upper := c >= `A` && c <= `F`
    return is_digit || is_lower || is_upper
}
1061 
// unhex converts the ASCII hexadecimal digit c to its numeric value
// (0-15). Any byte that is not a hexadecimal digit yields 0.
fn unhex(c u8) u8 {
    if c >= `0` && c <= `9` {
        return c - `0`
    }
    if c >= `a` && c <= `f` {
        return c - `a` + 10
    }
    if c >= `A` && c <= `F` {
        return c - `A` + 10
    }
    return 0
}
1072