v / vlib / net / urllib / urllib.v
1071 lines · 1021 sloc · 31.01 KB · 390efe46a1f46f302ae98c803b8ffbbb333fdb28
Raw
1// urllib parses URLs and implements query escaping.
2// See RFC 3986. This module generally follows RFC 3986, except where
3// it deviates for compatibility reasons.
4// Based off: https://github.com/golang/go/blob/master/src/net/url/url.go
5// Last commit: https://github.com/golang/go/commit/fe2ed5054176935d4adcf13e891715ccf2ee3cce
6// Copyright 2009 The Go Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style
8// license that can be found in the LICENSE file.
9module urllib
10
11import strings
12
// EncodingMode selects the URL component whose escaping rules
// should_escape, escape and unescape have to apply.
enum EncodingMode {
	encode_path // a whole path
	encode_path_segment // a single path segment
	encode_host // host, including an optional :port
	encode_zone // IPv6 zone identifier inside a bracketed host
	encode_user_password // username or password of the userinfo part
	encode_query_component // a query key or value
	encode_fragment // the fragment after `#`
}
22
// err_msg_escape is the message used when a %-escape is malformed or not permitted.
const err_msg_escape = 'unescape: invalid URL escape'
// err_msg_parse is the message prefix used by parse when a URL cannot be parsed.
const err_msg_parse = 'parse: failed parsing url'
25
// error_msg builds the text of a module error: `message` is prefixed with
// 'net.urllib.' and, when `val` is not empty, followed by ' (val)'.
fn error_msg(message string, val string) string {
	prefixed := 'net.urllib.${message}'
	if val == '' {
		return prefixed
	}
	return '${prefixed} (${val})'
}
33
// should_escape reports whether the byte `c` must be %-escaped when it
// appears in the URL component selected by `mode`, according to RFC 3986.
//
// Please be informed that for now should_escape does not check all
// reserved characters correctly. See golang.org/issue/5684.
fn should_escape(c u8, mode EncodingMode) bool {
	// §2.3 Unreserved characters (alphanum)
	if c.is_alnum() {
		return false
	}
	if mode == .encode_host || mode == .encode_zone {
		// §3.2.2 host allows
		// sub-delims = `!` / `$` / `&` / `'` / `(` / `)` / `*` / `+` / `,` / `;` / `=`
		// as part of reg-name.
		// We add : because we include :port as part of host.
		// We add [ ] because we include [ipv6]:port as part of host.
		// We add < > because they're the only characters left that
		// we could possibly allow, and parse will reject them if we
		// escape them (because hosts can't use %-encoding for
		// ASCII bytes).
		// NOTE: the sub-delim here is the single quote, not the backslash;
		// a backslash is not part of sub-delims and has to be escaped.
		if c in [`!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `:`, `[`, `]`, `<`, `>`,
			`"`] {
			return false
		}
	}
	match c {
		`-`, `_`, `.`, `~` {
			// §2.3 Unreserved characters (mark)
			return false
		}
		`$`, `&`, `+`, `,`, `/`, `:`, `;`, `=`, `?`, `@` {
			// §2.2 Reserved characters (reserved)
			// Different sections of the URL allow a few of
			// the reserved characters to appear unescaped.
			match mode {
				.encode_path {
					// §3.3
					// The RFC allows : @ & = + $ but saves / ; , for assigning
					// meaning to individual path segments. This package
					// only manipulates the path as a whole, so we allow those
					// last three as well. That leaves only ? to escape.
					return c == `?`
				}
				.encode_path_segment {
					// §3.3
					// The RFC allows : @ & = + $ but saves / ; , for assigning
					// meaning to individual path segments.
					return c == `/` || c == `;` || c == `,` || c == `?`
				}
				.encode_user_password {
					// §3.2.1
					// The RFC allows `;`, `:`, `&`, `=`, `+`, `$`, and `,` in
					// userinfo, so we must escape only `@`, `/`, and `?`.
					// The parsing of userinfo treats `:` as special so we must escape
					// that too.
					return c == `@` || c == `/` || c == `?` || c == `:`
				}
				.encode_query_component {
					// §3.4
					// The RFC reserves (so we must escape) everything.
					return true
				}
				.encode_fragment {
					// §4.1
					// The RFC text is silent but the grammar allows
					// everything, so escape nothing.
					return false
				}
				else {
					// host and zone: reserved bytes not allowed above fall
					// through to the final `return true`.
				}
			}
		}
		else {}
	}

	if mode == .encode_fragment {
		// RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
		// included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
		// need to be escaped. To minimize potential breakage, we apply two restrictions:
		// (1) we always escape sub-delims outside of the fragment, and (2) we always
		// escape single quote to avoid breaking callers that had previously assumed that
		// single quotes would be escaped. See issue #19917.
		match c {
			`!`, `(`, `)`, `*` { return false }
			else {}
		}
	}
	// Everything else must be escaped.
	return true
}
123
// query_unescape reverses query_escape: every 3-byte sequence of the
// form '%AB' becomes the byte 0xAB and every '+' becomes a space.
// Decoding is delegated to unescape in query-component mode and its
// error, if any, is returned unchanged.
pub fn query_unescape(s string) !string {
	decoded := unescape(s, .encode_query_component)!
	return decoded
}
132
// path_unescape reverses path_escape: every 3-byte sequence of the
// form '%AB' becomes the byte 0xAB. It returns an error if any % is not
// followed by two hexadecimal digits.
//
// Unlike query_unescape it leaves '+' untouched instead of turning it
// into a space.
pub fn path_unescape(s string) !string {
	decoded := unescape(s, .encode_path_segment)!
	return decoded
}
143
// unescape decodes the %-escapes of `s_`; `mode` names the section of the
// URL the string belongs to and therefore which escapes and raw bytes are
// acceptable.
//
// It works in two passes: the first one validates the input and counts the
// escapes, the second one builds the decoded string. In query-component
// mode '+' is decoded to a space, and a malformed '%' that is followed by
// at least one more byte is treated as a literal percent sign instead of
// being reported. In host and zone mode, escapes and raw ASCII bytes that
// are not valid in a host name are rejected with an error.
fn unescape(s_ string, mode EncodingMode) !string {
	mut s := s_
	// Count %, check that they're well-formed.
	mut n := 0
	mut has_plus := false
	for i := 0; i < s.len; {
		x := s[i]
		match x {
			`%` {
				n++
				if i + 2 >= s.len || !ishex(s[i + 1]) || !ishex(s[i + 2]) {
					if mode == .encode_query_component && i + 1 < s.len {
						// Be lenient with queries: rewrite the stray % as %25
						// so that the second pass decodes it to a literal '%'.
						s = s[..i] + '%25' + s[(i + 1)..]
						i += 4 // skip the %25 and the next character
						continue
					}
					// Report at most the three bytes starting at the bad %.
					s = s[i..]
					if s.len > 3 {
						s = s[..3]
					}
					return error(error_msg(err_msg_escape, s))
				}
				// Per https://tools.ietf.org/html/rfc3986#page-21
				// in the host component %-encoding can only be used
				// for non-ASCII bytes.
				// But https://tools.ietf.org/html/rfc6874#section-2
				// introduces %25 being allowed to escape a percent sign
				// in IPv6 scoped-address literals. Yay.
				// NOTE(review): `i + 3 >= s.len` restricts this check to an
				// escape that ends the string; the rule above reads as if it
				// should apply to every escape - confirm before changing.
				if i + 3 >= s.len && mode == .encode_host && unhex(s[i + 1]) < 8
					&& s[i..i + 3] != '%25' {
					return error(error_msg(err_msg_escape, s[i..i + 3]))
				}
				if mode == .encode_zone {
					// RFC 6874 says basically 'anything goes' for zone identifiers
					// and that even non-ASCII can be redundantly escaped,
					// but it seems prudent to restrict %-escaped bytes here to those
					// that are valid host name bytes in their unescaped form.
					// That is, you can use escaping in the zone identifier but not
					// to introduce bytes you couldn't just write directly.
					// But Windows puts spaces here! Yay.
					// NOTE(review): this rejects any zone whose last three
					// bytes are an escape (including a bare '%25') - confirm
					// that this is intended.
					if i + 3 >= s.len {
						return error(error_msg('unescape: invalid escape sequence', ''))
					}
					v := ((unhex(s[i + 1]) << u8(4)) | unhex(s[i + 2]))
					if s[i..i + 3] != '%25' && v != ` ` && should_escape(v, .encode_host) {
						// The error used to be created and silently dropped here.
						return error(error_msg(err_msg_escape, s[i..i + 3]))
					}
				}
				i += 3
			}
			`+` {
				has_plus = mode == .encode_query_component
				i++
			}
			else {
				if (mode == .encode_host || mode == .encode_zone) && s[i] < 0x80
					&& should_escape(s[i], mode) {
					// The error used to be created and silently dropped here.
					return error(error_msg('unescape: invalid character in host name',
						s[i..i + 1]))
				}
				i++
			}
		}
	}
	if n == 0 && !has_plus {
		return '${s}' // TODO: `return s` once an autofree bug is fixed
	}
	if s.len < 2 * n {
		return error(error_msg('unescape: invalid escape sequence', ''))
	}
	// Every escape shrinks from three bytes to one.
	mut t := strings.new_builder(s.len - 2 * n)
	for i := 0; i < s.len; i++ {
		x := s[i]
		match x {
			`%` {
				if i + 2 >= s.len {
					return error(error_msg('unescape: invalid escape sequence', ''))
				}
				t.write_string(((unhex(s[i + 1]) << u8(4)) | unhex(s[i + 2])).ascii_str())
				i += 2
			}
			`+` {
				if mode == .encode_query_component {
					t.write_string(' ')
				} else {
					t.write_string('+')
				}
			}
			else {
				t.write_string(s[i].ascii_str())
			}
		}
	}
	return t.str()
}
243
// query_escape returns `s` escaped in query-component mode, so that it
// can be safely placed inside a URL query.
pub fn query_escape(s string) string {
	escaped := escape(s, .encode_query_component)
	return escaped
}
249
// path_escape returns `s` escaped in path-segment mode, so that it can be
// safely placed inside a URL path segment; special characters (including /)
// are replaced with %XX sequences as needed.
pub fn path_escape(s string) string {
	escaped := escape(s, .encode_path_segment)
	return escaped
}
255
// escape returns `s` with every byte that should_escape flags for `mode`
// replaced by its upper-case %XX form. In query-component mode a space is
// written as '+' instead. When nothing has to change, `s` itself is returned.
fn escape(s string, mode EncodingMode) string {
	// First pass: find out how many bytes become '+' and how many become %XX.
	mut plus_total := 0
	mut pct_total := 0
	for ch in s {
		if !should_escape(ch, mode) {
			continue
		}
		if ch == ` ` && mode == .encode_query_component {
			plus_total++
		} else {
			pct_total++
		}
	}
	if plus_total == 0 && pct_total == 0 {
		return s
	}
	// Each %XX escape needs two bytes more than the byte it replaces.
	mut out := []u8{len: s.len + 2 * pct_total}
	if pct_total == 0 {
		// Same length as the input: copy it and patch the spaces in place.
		copy(mut out, s.bytes())
		for idx in 0 .. s.len {
			if s[idx] == ` ` {
				out[idx] = `+`
			}
		}
		return out.bytestr()
	}
	hex_digits := '0123456789ABCDEF'
	mut w := 0
	for ch in s {
		if ch == ` ` && mode == .encode_query_component {
			out[w] = `+`
			w++
		} else if should_escape(ch, mode) {
			out[w] = `%`
			out[w + 1] = hex_digits[ch >> 4]
			out[w + 2] = hex_digits[ch & 15]
			w += 3
		} else {
			out[w] = ch
			w++
		}
	}
	return out.bytestr()
}
303
// URL holds the components of a parsed URL (technically, a URI reference).
// The general form represented is:
// [scheme:][//[userinfo@]host][/]path[?query][#fragment]
// URLs that do not start with a slash after the scheme are interpreted as:
// scheme:opaque[?query][#fragment]
//
// The path field is kept in decoded form: /%47%6f%2f becomes /Go/.
// Because of that, one cannot tell which slashes of the path were literal
// slashes in the raw URL and which were %2f. When the difference matters,
// use raw_path, an optional field that is only set when the default
// encoding of path differs from the original text.
//
// The str method obtains the path through escaped_path; see that method
// for more details.
pub struct URL {
pub mut:
	scheme      string
	opaque      string    // encoded opaque data
	user        ?Userinfo // username and password information
	host        string    // host or host:port
	path        string    // path (relative paths may omit leading slash)
	raw_path    string    // encoded path hint (see escaped_path method)
	force_query bool      // append a query ('?') even if raw_query is empty
	raw_query   string    // encoded query values, without '?'
	fragment    string    // fragment for references, without '#'
}
330
// debug renders every field of the given URL, one per line, as a string.
pub fn (url &URL) debug() string {
	text := 'URL{\n scheme: ${url.scheme}\n opaque: ${url.opaque}\n user: ${url.user}\n host: ${url.host}\n path: ${url.path}\n raw_path: ${url.raw_path}\n force_query: ${url.force_query}\n raw_query: ${url.raw_query}\n fragment: ${url.fragment}\n}'
	return text
}
335
// user builds a Userinfo that carries only `username`; the password is
// empty and marked as not set.
pub fn user(username string) Userinfo {
	info := Userinfo{
		username:     username
		password:     ''
		password_set: false
	}
	return info
}
345
// user_password builds a Userinfo from `username` and `password`, with the
// password marked as set.
//
// This functionality should only be used with legacy web sites.
// RFC 2396 warns that interpreting Userinfo this way
// ``is NOT RECOMMENDED, because the passing of authentication
// information in clear text (such as URI) has proven to be a
// security risk in almost every case where it has been used.''
fn user_password(username string, password string) Userinfo {
	return Userinfo{
		username:     username
		password:     password
		password_set: true
	}
}
357
// Userinfo is an immutable holder for the username and password details
// of a URL. A Userinfo value always has a username (possibly empty, as
// allowed by RFC 2396) and optionally a password.
pub struct Userinfo {
pub:
	username     string
	password     string
	password_set bool // true when a password was given explicitly
}
368
// empty reports whether both the username and the password are empty.
fn (u Userinfo) empty() bool {
	return u.username.len == 0 && u.password.len == 0
}
372
// str returns the encoded userinfo in the standard form
// 'username[:password]'. An empty Userinfo yields an empty string; the
// ':password' part is only added when password_set is true.
fn (u Userinfo) str() string {
	if u.empty() {
		return ''
	}
	name := escape(u.username, .encode_user_password)
	if !u.password_set {
		return name
	}
	return name + ':' + escape(u.password, .encode_user_password)
}
385
// split_by_scheme checks whether rawurl has the form scheme:path, where
// scheme must match [a-zA-Z][a-zA-Z0-9+-.]*.
// If so, it returns [scheme, path]; otherwise it returns ['', rawurl].
// A rawurl that starts with ':' is reported as an error (missing scheme).
fn split_by_scheme(rawurl string) ![]string {
	for pos, ch in rawurl {
		if ch.is_letter() {
			// letters are valid anywhere in a scheme
			continue
		}
		if ch.is_digit() || ch in [`+`, `-`, `.`] {
			// valid inside a scheme, but not as its first character
			if pos == 0 {
				return ['', rawurl]
			}
			continue
		}
		if ch == `:` {
			if pos == 0 {
				return error(error_msg('split_by_scheme: missing protocol scheme', ''))
			}
			return [rawurl[..pos], rawurl[pos + 1..]]
		}
		// we have encountered an invalid character,
		// so there is no valid scheme
		return ['', rawurl]
	}
	return ['', rawurl]
}
411
// get_scheme returns the scheme part of rawurl, or an empty string when
// rawurl has no valid scheme. An error from split_by_scheme (rawurl starts
// with ':') is propagated to the caller; it used to be returned as if the
// error text were the scheme.
fn get_scheme(rawurl string) !string {
	parts := split_by_scheme(rawurl)!
	return parts[0]
}
416
// split cuts s in two at the first occurrence of sep.
// If cutc is true, sep itself is dropped; otherwise sep stays at the
// start of the second substring.
// If sep does not occur in s, then s and the empty string are returned.
fn split(s string, sep u8, cutc bool) (string, string) {
	pos := s.index_u8(sep)
	if pos < 0 {
		return s, ''
	}
	tail_start := if cutc { pos + 1 } else { pos }
	return s[..pos], s[tail_start..]
}
430
// parse turns rawurl into a URL structure.
//
// The rawurl may be relative (a path, without a host) or absolute
// (starting with a scheme). Trying to parse a hostname and path
// without a scheme is invalid but may not necessarily return an
// error, due to parsing ambiguities.
//
// The part after the first '#' is decoded separately and stored in the
// fragment field. Errors are wrapped with the err_msg_parse prefix and
// carry the part of rawurl before the '#'.
pub fn parse(rawurl string) !URL {
	// Cut off #frag
	before_frag, frag := split(rawurl, `#`, true)
	mut url := parse_url(before_frag, false) or {
		return error(error_msg(err_msg_parse + '[${err.msg()}]', before_frag))
	}
	if frag != '' {
		url.fragment = unescape(frag, .encode_fragment) or {
			return error(error_msg(err_msg_parse + '[${err.msg()}]', before_frag))
		}
	}
	return url
}
452
// parse_request_uri turns rawurl into a URL structure for an HTTP request.
// Only absolute URIs or absolute paths are accepted, and leading `//`
// sequences are preserved as part of the path for request targets.
// The string rawurl is assumed not to have a #fragment suffix.
// (Web browsers strip #fragment before sending the URL to a web server.)
pub fn parse_request_uri(rawurl string) !URL {
	url := parse_url(rawurl, true)!
	return url
}
461
// parse_url parses a URL from a string in one of two contexts. If
// via_request is true, the URL is assumed to have arrived via an HTTP request,
// in which case only absolute URLs or path-absolute relative URLs are allowed.
// If via_request is false, all forms of relative URLs are allowed.
//
// It fails when rawurl contains an ASCII control character, when it is
// empty in the request context, when the scheme, authority or path cannot
// be parsed, or when the first segment of a relative path contains a colon.
@[manualfree]
fn parse_url(rawurl string, via_request bool) !URL {
	if string_contains_ctl_u8(rawurl) {
		return error(error_msg('parse_url: invalid control character in URL', rawurl))
	}
	if rawurl == '' && via_request {
		return error(error_msg('parse_url: empty URL', rawurl))
	}
	mut url := URL{
		user: none
	}
	if rawurl == '*' {
		url.path = '*'
		return url
	}
	// Split off possible leading 'http:', 'mailto:', etc.
	// Cannot contain escaped characters.
	p := split_by_scheme(rawurl)!
	url.scheme = p[0].to_lower()
	mut rest := p[1]
	// A single trailing '?' means "empty query, but keep the question mark".
	// The old test only looked at the first byte of rest, so inputs such as
	// 'a?b?' lost their query and a lone '?' lost the force_query flag.
	if rest.ends_with('?') && rest.count('?') == 1 {
		url.force_query = true
		rest = rest[..rest.len - 1]
	} else {
		r, raw_query := split(rest, `?`, true)
		rest = r
		url.raw_query = raw_query
	}
	if !rest.starts_with('/') {
		if url.scheme != '' {
			// We consider rootless paths per RFC 3986 as opaque.
			url.opaque = rest
			return url
		}
		if via_request {
			return error(error_msg('parse_url: invalid URI for request', ''))
		}
		// Avoid confusion with malformed schemes, like cache_object:foo/bar.
		// See golang.org/issue/16822.
		//
		// RFC 3986, §3.3:
		// In addition, a URI reference (Section 4.1) may be a relative-path reference,
		// in which case the first path segment cannot contain a colon (':') character.
		colon := rest.index(':') or { -1 }
		slash := rest.index('/') or { -1 }
		if colon >= 0 && (slash < 0 || colon < slash) {
			// First path segment has colon. Not allowed in relative URL.
			// (A colon without any slash used to be reported with an
			// unrelated 'there should be a / in the URL' message.)
			return error(error_msg('parse_url: first path segment in URL cannot contain colon',
				''))
		}
	}
	if ((url.scheme != '' || !via_request) && !rest.starts_with('///')) && rest.starts_with('//')
		&& rest.len > 2 {
		authority, r := split(rest[2..], `/`, false)
		rest = r
		a := parse_authority(authority)!
		url.user = a.user
		url.host = a.host
	}
	// Set path and, optionally, raw_path.
	// raw_path is a hint of the encoding of path. We don't want to set it if
	// the default escaping of path is equivalent, to help make sure that people
	// don't rely on it in general.
	url.set_path(rest)!
	return url
}
535
// ParseAuthorityRes is the result of parse_authority: the two parts of
// an authority of the form [userinfo@]host.
struct ParseAuthorityRes {
	user ?Userinfo // decoded userinfo
	host string    // host or host:port, as returned by parse_host
}
540
// parse_authority splits an authority of the form [userinfo@]host at its
// last '@' and parses both halves. Without an '@' the whole string is the
// host and the user is an empty Userinfo. A password is recorded only when
// the text after the first ':' of the userinfo is not empty.
// It fails on invalid userinfo, an invalid host, or invalid escapes.
fn parse_authority(authority string) !ParseAuthorityRes {
	at := authority.last_index_u8(`@`)
	if at < 0 {
		return ParseAuthorityRes{
			host: parse_host(authority)!
			user: user('')
		}
	}
	raw_user := authority[..at]
	raw_host := authority[at + 1..]
	if !valid_userinfo(raw_user) {
		return error(error_msg('parse_authority: invalid userinfo', ''))
	}
	host := parse_host(raw_host)!
	name, pwd := split(raw_user, `:`, true)
	username := unescape(name, .encode_user_password)!
	if pwd == '' {
		return ParseAuthorityRes{
			user: user(username)
			host: host
		}
	}
	password := unescape(pwd, .encode_user_password)!
	return ParseAuthorityRes{
		user: user_password(username, password)
		host: host
	}
}
565
// parse_host parses host as an authority without user
// information, that is, as host[:port], and returns it unescaped.
// It fails when a bracketed host misses its ']', when the port part is
// not valid, or when unescaping fails.
fn parse_host(host string) !string {
	if host.len > 0 && host[0] == `[` {
		// parse an IP-Literal in RFC 3986 and RFC 6874.
		// E.g., '[fe80::1]', '[fe80::1%25en0]', '[fe80::1]:80'.
		rbracket := host.last_index_u8(`]`)
		if rbracket == -1 {
			return error(error_msg("parse_host: missing ']' in host", ''))
		}
		after_bracket := host[rbracket + 1..]
		if !valid_optional_port(after_bracket) {
			return error(error_msg('parse_host: invalid port ${after_bracket} after host ',
				''))
		}
		// RFC 6874 defines that %25 (%-encoded percent) introduces
		// the zone identifier, and the zone identifier can use basically
		// any %-encoding it likes. That's different from the host, which
		// can only %-encode non-ASCII bytes.
		// We do impose some restrictions on the zone, to avoid stupidity
		// like newlines.
		if zone := host[..rbracket].index('%25') {
			// address, zone and the "]:port" tail are decoded separately
			address := unescape(host[..zone], .encode_host)!
			zone_id := unescape(host[zone..rbracket], .encode_zone)!
			tail := unescape(host[rbracket..], .encode_host)!
			return address + zone_id + tail
		}
	} else {
		colon := host.last_index_u8(`:`)
		if colon != -1 {
			port_part := host[colon..]
			if !valid_optional_port(port_part) {
				return error(error_msg('parse_host: invalid port ${port_part} after host ',
					''))
			}
		}
	}
	return unescape(host, .encode_host)
}
604
// set_path fills the path and raw_path fields of the URL from the escaped
// path p. raw_path is only kept when p differs from the default encoding
// of the decoded path.
// For example:
// - set_path('/foo/bar') will set path='/foo/bar' and raw_path=''
// - set_path('/foo%2fbar') will set path='/foo/bar' and raw_path='/foo%2fbar'
// set_path returns an error only if p contains an invalid escaping;
// otherwise it returns true.
pub fn (mut u URL) set_path(p string) !bool {
	decoded := unescape(p, .encode_path)!
	u.path = decoded
	canonical := escape(decoded, .encode_path)
	u.raw_path = if p == canonical { '' } else { p }
	return true
}
618
// escaped_path returns the escaped form of u.path.
// In general there are multiple possible escaped forms of any path.
// escaped_path returns u.raw_path when it is a valid escaping of u.path.
// Otherwise escaped_path ignores u.raw_path and computes an escaped
// form on its own.
// The str and request_uri methods use escaped_path to construct
// their results.
// In general, code should call escaped_path instead of
// reading u.raw_path directly.
pub fn (u &URL) escaped_path() string {
	if u.raw_path != '' && valid_encoded_path(u.raw_path) {
		// Trust the hint only when it decodes cleanly to exactly u.path.
		// A raw_path that fails to decode, or that no longer matches a
		// path changed by the caller, is ignored instead of yielding ''
		// or a stale value.
		if decoded := unescape(u.raw_path, .encode_path) {
			if decoded == u.path {
				return u.raw_path
			}
		}
	}
	if u.path == '*' {
		return '*' // don't escape (Issue 11202)
	}
	return escape(u.path, .encode_path)
}
638
// valid_encoded_path reports whether s is a valid encoded path.
// It must not contain any bytes that require escaping during path encoding.
fn valid_encoded_path(s string) bool {
	for i in 0 .. s.len {
		// RFC 3986, Appendix A.
		// pchar = unreserved / pct-encoded / sub-delims / ':' / '@'.
		// should_escape is not quite compliant with the RFC,
		// so we check the sub-delims ourselves and let
		// should_escape handle the others.
		x := s[i]
		match x {
			// The sub-delim listed here is the single quote; the backslash
			// that used to stand in its place is not a sub-delim and is
			// now handled by should_escape like any other byte.
			`!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `:`, `@` {
				// ok
			}
			`[`, `]` {
				// ok - not specified in RFC 3986 but left alone by modern browsers
			}
			`%` {
				// ok - percent encoded, will decode
			}
			else {
				if should_escape(s[i], .encode_path) {
					return false
				}
			}
		}
	}
	return true
}
668
// valid_optional_port reports whether port is either an empty string
// or a ':' followed by zero or more decimal digits (/^:\d*$/).
fn valid_optional_port(port string) bool {
	if port == '' {
		return true
	}
	if port[0] != `:` {
		return false
	}
	for idx := 1; idx < port.len; idx++ {
		if !port[idx].is_digit() {
			return false
		}
	}
	return true
}
685
// str reassembles the URL into a valid URL string.
// The general form of the result is one of:
//
// scheme:opaque?query#fragment
// scheme://userinfo@host/path?query#fragment
//
// If u.opaque is non-empty, str uses the first form;
// otherwise it uses the second form.
// The host is passed through escape in host mode.
// To obtain the path, str uses u.escaped_path().
//
// In the second form, the following rules apply:
// - if u.scheme is empty, scheme: is omitted.
// - if u.user is none or empty, userinfo@ is omitted.
// - if u.host is empty, host/ is omitted.
// - if u.scheme and u.host are empty and u.user is none or empty,
// the entire scheme://userinfo@host/ is omitted.
// - if u.host is non-empty and u.path begins with a /,
// the form host/path does not add its own /.
// - if u.raw_query is empty, ?query is omitted (unless force_query is set).
// - if u.fragment is empty, #fragment is omitted.
pub fn (u URL) str() string {
	mut sb := strings.new_builder(200)
	if u.scheme != '' {
		sb.write_string(u.scheme)
		sb.write_string(':')
	}
	if u.opaque == '' {
		info := u.user or { Userinfo{} }
		if u.scheme != '' || u.host != '' || !info.empty() {
			if u.host != '' || u.path != '' || !info.empty() {
				sb.write_string('//')
			}
			if !info.empty() {
				sb.write_string(info.str())
				sb.write_string('@')
			}
			if u.host != '' {
				sb.write_string(escape(u.host, .encode_host))
			}
		}
		path := u.escaped_path()
		if path != '' && path[0] != `/` && u.host != '' {
			sb.write_string('/')
		}
		if sb.len == 0 {
			// RFC 3986 §4.2
			// A path segment that contains a colon character (e.g., 'this:that')
			// cannot be used as the first segment of a relative-path reference, as
			// it would be mistaken for a scheme name. Such a segment must be
			// preceded by a dot-segment (e.g., './this:that') to make a relative-
			// path reference.
			colon := path.index_u8(`:`)
			if colon > -1 {
				// TODO: merge the two conditions when autofree handles tmp
				// expressions like `path[..colon]`
				if path[..colon].index_u8(`/`) == -1 {
					sb.write_string('./')
				}
			}
		}
		sb.write_string(path)
	} else {
		sb.write_string(u.opaque)
	}
	if u.force_query || u.raw_query != '' {
		sb.write_string('?')
		sb.write_string(u.raw_query)
	}
	if u.fragment != '' {
		sb.write_string('#')
		sb.write_string(escape(u.fragment, .encode_fragment))
	}
	return sb.str()
}
761
// parse_query parses the URL-encoded query string and returns the
// resulting Values, which lists the values specified for each key.
// Values is typically used for query parameters and form values; unlike
// in the http.Header map, its keys are case-sensitive.
// An error is returned when any key or value cannot be decoded.
//
// Query is expected to be a list of key=value settings separated by
// ampersands or semicolons. A setting without an equals sign is
// interpreted as a key set to an empty value.
pub fn parse_query(query string) !Values {
	mut values := new_values()
	parse_query_values(mut values, query)!
	return values
}
780
// parse_query_silent does the same as parse_query, but ignores decoding
// errors and returns whatever pairs could be parsed.
fn parse_query_silent(query string) Values {
	mut values := new_values()
	parse_query_values(mut values, query) or {}
	return values
}
788
// parse_query_values splits query at '&' and ';', decodes every key=value
// setting with query_unescape and adds it to m. Empty settings are
// skipped. A setting whose key or value cannot be decoded is dropped; the
// remaining settings are still added, and an error is returned at the end.
fn parse_query_values(mut m Values, query string) !bool {
	mut failed := false
	mut remaining := query
	for remaining != '' {
		// Take the next setting off the front of `remaining`.
		mut setting := remaining
		sep := setting.index_any('&;')
		if sep >= 0 {
			remaining = setting[sep + 1..]
			setting = setting[..sep]
		} else {
			remaining = ''
		}
		if setting == '' {
			continue
		}
		mut raw_key := setting
		mut raw_value := ''
		if eq := setting.index('=') {
			raw_value = setting[eq + 1..]
			raw_key = setting[..eq]
		}
		key := query_unescape(raw_key) or {
			failed = true
			continue
		}
		value := query_unescape(raw_value) or {
			failed = true
			continue
		}
		m.add(key, value)
	}
	if failed {
		return error(error_msg('parse_query_values: failed parsing query string', ''))
	}
	return true
}
827
// encode encodes the values into ``URL encoded'' form
// ('bar=baz&foo=quux'). A key whose value is empty is written without '='.
// The syntax of the query string is specified in
// RFC 1738 https://datatracker.ietf.org/doc/html/rfc1738
//
// HTTP grammar
//
// httpurl = "http://" hostport [ "/" hpath [ "?" search ]]
// hpath = hsegment *[ "/" hsegment ]
// hsegment = *[ uchar | ";" | ":" | "@" | "&" | "=" ]
// search = *[ uchar | ";" | ":" | "@" | "&" | "=" ]
pub fn (v Values) encode() string {
	if v.len == 0 {
		return ''
	}
	mut sb := strings.new_builder(200)
	for pair in v.data {
		if sb.len > 0 {
			sb.write_string('&')
		}
		sb.write_string(query_escape(pair.key))
		if pair.value != '' {
			sb.write_string('=')
			sb.write_string(query_escape(pair.value))
		}
	}
	return sb.str()
}
858
// resolve_path resolves ref against base and removes the special path
// segments '.' and '..' from the result, per RFC 3986.
// A ref that starts with '/' replaces base; a relative ref replaces
// everything after the last '/' of base; an empty ref keeps base.
// The result is '' when both are empty, and otherwise starts with exactly one '/'.
fn resolve_path(base string, ref string) string {
	full := if ref == '' {
		base
	} else if ref[0] != `/` {
		base[..base.last_index_u8(`/`) + 1] + ref
	} else {
		ref
	}
	if full == '' {
		return ''
	}
	segments := full.split('/')
	mut kept := []string{}
	for seg in segments {
		if seg == '.' {
			// stays in the current directory: drop
			continue
		}
		if seg == '..' {
			// go one level up, but never above the root
			if kept.len > 0 {
				kept.delete_last()
			}
			continue
		}
		kept << seg
	}
	tail := segments.last()
	if tail == '.' || tail == '..' {
		// Add final slash to the joined path.
		kept << ''
	}
	return '/' + kept.join('/').trim_left('/')
}
898
// is_abs reports whether the URL is absolute,
// which means that its scheme is not empty.
pub fn (u &URL) is_abs() bool {
	return u.scheme.len > 0
}
904
// parse parses the URL `ref` in the context of the receiver. `ref` may be
// relative or absolute. A parse failure is returned as an error; otherwise
// the result is the same as that of resolve_reference.
pub fn (u &URL) parse(ref string) !URL {
	reference := parse(ref)!
	return u.resolve_reference(reference)
}
912
// resolve_reference resolves a URI reference to an absolute URI from
// an absolute base URI u, per RFC 3986 Section 5.2. The URI reference
// may be relative or absolute. resolve_reference always returns a new
// URL value, even if the returned URL is identical to either the
// base or reference. If ref is an absolute URL, then resolve_reference
// ignores base and returns a copy of ref.
pub fn (u &URL) resolve_reference(ref &URL) !URL {
	mut resolved := *ref
	if ref.scheme == '' {
		resolved.scheme = u.scheme
	}
	ref_info := ref.user or { Userinfo{} }
	if ref.scheme != '' || ref.host != '' || !ref_info.empty() {
		// The 'absoluteURI' or 'net_path' cases.
		// The path comes from escaped_path, so it is expected to be
		// validly escaped; a set_path error is propagated nevertheless.
		resolved.set_path(resolve_path(ref.escaped_path(), ''))!
		return resolved
	}
	if ref.opaque != '' {
		resolved.user = user('')
		resolved.host = ''
		resolved.path = ''
		return resolved
	}
	if ref.path == '' && ref.raw_query == '' {
		// Nothing but (maybe) a fragment: inherit the query of the base,
		// and its fragment too when the reference has none.
		resolved.raw_query = u.raw_query
		if ref.fragment == '' {
			resolved.fragment = u.fragment
		}
	}
	// The 'abs_path' or 'rel_path' cases.
	resolved.host = u.host
	resolved.user = u.user
	resolved.set_path(resolve_path(u.escaped_path(), ref.escaped_path()))!
	return resolved
}
950
// query parses raw_query and returns the corresponding values.
// Malformed value pairs are silently discarded.
// To check errors use parse_query.
pub fn (u &URL) query() Values {
	return parse_query_silent(u.raw_query)
}
958
// request_uri returns the encoded path?query or opaque?query
// string that would be used in an HTTP request for u.
// An empty path is rendered as '/'; opaque data that starts with '//'
// is prefixed with 'scheme:'.
pub fn (u &URL) request_uri() string {
	mut target := ''
	if u.opaque == '' {
		escaped := u.escaped_path()
		target = if escaped == '' { '/' } else { escaped }
	} else if u.opaque.starts_with('//') {
		target = u.scheme + ':' + u.opaque
	} else {
		target = u.opaque
	}
	if u.force_query || u.raw_query != '' {
		target += '?' + u.raw_query
	}
	return target
}
978
// hostname returns u.host without its port, if a valid one is present.
//
// If the result is enclosed in square brackets, as literal IPv6 addresses are,
// the square brackets are removed from the result.
pub fn (u &URL) hostname() string {
	name, _ := split_host_port(u.host)
	return name
}
987
// port returns the port part of u.host, without the leading colon.
// If u.host doesn't contain a valid port, port returns an empty string.
pub fn (u &URL) port() string {
	_, number := split_host_port(u.host)
	return number
}
994
// split_host_port separates host and port at the last ':'. If the port is
// not valid, the entire input is returned as host; the validity of the host
// is not checked. Per RFC 3986, ports have to be numeric.
// Square brackets around the host, as used for IPv6 literals, are removed.
pub fn split_host_port(hostport string) (string, string) {
	mut name := hostport
	mut number := ''
	last_colon := hostport.last_index_u8(`:`)
	if last_colon != -1 {
		if valid_optional_port(hostport[last_colon..]) {
			number = hostport[last_colon + 1..]
			name = hostport[..last_colon]
		}
	}
	if name.len > 1 && name[0] == `[` && name.ends_with(']') {
		name = name[1..name.len - 1]
	}
	return name, number
}
1013
// valid_userinfo reports whether s is a valid userinfo string per RFC 3986
// Section 3.2.1:
// userinfo = *( unreserved / pct-encoded / sub-delims / ':' )
// unreserved = ALPHA / DIGIT / '-' / '.' / '_' / '~'
// sub-delims = '!' / '$' / '&' / ''' / '(' / ')'
// / '*' / '+' / ',' / ';' / '='
//
// In addition to that grammar, '@' is accepted as well.
// It doesn't validate pct-encoded. The caller does that via fn unescape.
pub fn valid_userinfo(s string) bool {
	for r in s {
		if r.is_alnum() {
			continue
		}
		match r {
			// The sub-delim accepted here is the single quote; a backslash
			// used to be accepted in its place although it is not part of
			// the grammar above.
			`-`, `.`, `_`, `:`, `~`, `!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `%`,
			`@` {
				continue
			}
			else {
				return false
			}
		}
	}
	return true
}
1039
// string_contains_ctl_u8 reports whether s contains any ASCII control
// character, i.e. a byte below the space character or the byte 0x7f.
fn string_contains_ctl_u8(s string) bool {
	for b in s {
		if b < ` ` || b == 0x7f {
			return true
		}
	}
	return false
}
1050
// ishex reports whether c is an ASCII hexadecimal digit
// (0-9, a-f or A-F).
pub fn ishex(c u8) bool {
	return match c {
		`0`...`9`, `a`...`f`, `A`...`F` { true }
		else { false }
	}
}
1061
// unhex returns the numeric value (0-15) of the ASCII hexadecimal digit c.
// Any byte that is not a hexadecimal digit yields 0.
fn unhex(c u8) u8 {
	if c >= `0` && c <= `9` {
		return c - `0`
	}
	if c >= `a` && c <= `f` {
		return c - `a` + 10
	}
	if c >= `A` && c <= `F` {
		return c - `A` + 10
	}
	return 0
}
1072