| 1 | // urllib parses URLs and implements query escaping. |
| 2 | // See RFC 3986. This module generally follows RFC 3986, except where |
| 3 | // it deviates for compatibility reasons. |
| 4 | // Based off: https://github.com/golang/go/blob/master/src/net/url/url.go |
| 5 | // Last commit: https://github.com/golang/go/commit/fe2ed5054176935d4adcf13e891715ccf2ee3cce |
| 6 | // Copyright 2009 The Go Authors. All rights reserved. |
| 7 | // Use of this source code is governed by a BSD-style |
| 8 | // license that can be found in the LICENSE file. |
| 9 | module urllib |
| 10 | |
| 11 | import strings |
| 12 | |
// EncodingMode selects which part of a URL is being escaped or unescaped.
// should_escape and unescape branch on it to apply component-specific rules.
enum EncodingMode {
	encode_path // a whole path (used by set_path and escaped_path)
	encode_path_segment // a single path segment (used by path_escape / path_unescape)
	encode_host // the host[:port] part of the authority
	encode_zone // the zone identifier of an IPv6 literal (see parse_host)
	encode_user_password // the username or password inside userinfo
	encode_query_component // a query key or value (used by query_escape / query_unescape)
	encode_fragment // the fragment after '#'
}
| 22 | |
// Message stems passed to error_msg, which prepends 'net.urllib.'.
// err_msg_escape is reported by unescape for a malformed %-escape.
const err_msg_escape = 'unescape: invalid URL escape'
// err_msg_parse is reported by parse when parsing the URL or its fragment fails.
const err_msg_parse = 'parse: failed parsing url'
| 25 | |
// error_msg builds the text of a urllib error: `message` prefixed with
// 'net.urllib.', followed by ` (val)` when `val` is not empty.
fn error_msg(message string, val string) string {
	prefixed := 'net.urllib.${message}'
	if val == '' {
		return prefixed
	}
	return '${prefixed} (${val})'
}
| 33 | |
// should_escape reports whether the byte `c` must be %-escaped when it
// appears in the URL component selected by `mode`, according to RFC 3986.
//
// Please be informed that for now should_escape does not check all
// reserved characters correctly. See golang.org/issue/5684.
fn should_escape(c u8, mode EncodingMode) bool {
	// §2.3 Unreserved characters (alphanum)
	if c.is_alnum() {
		return false
	}
	if mode == .encode_host || mode == .encode_zone {
		// §3.2.2 host allows
		// sub-delims = `!` / `$` / `&` / `'` / `(` / `)` / `*` / `+` / `,` / `;` / `=`
		// as part of reg-name.
		// We add : because we include :port as part of host.
		// We add [ ] because we include [ipv6]:port as part of host.
		// We add < > because they're the only characters left that
		// we could possibly allow, and parse will reject them if we
		// escape them (because hosts can't use %-encoding for
		// ASCII bytes).
		// NOTE: the single quote is a sub-delim; the backslash is not and
		// therefore must not be in this list.
		if c in [`!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `:`, `[`, `]`, `<`, `>`,
			`"`] {
			return false
		}
	}
	match c {
		`-`, `_`, `.`, `~` {
			// §2.3 Unreserved characters (mark)
			return false
		}
		`$`, `&`, `+`, `,`, `/`, `:`, `;`, `=`, `?`, `@` {
			// §2.2 Reserved characters (reserved)
			// Different sections of the URL allow a few of
			// the reserved characters to appear unescaped.
			match mode {
				.encode_path {
					// §3.3
					// The RFC allows : @ & = + $ but saves / ; , for assigning
					// meaning to individual path segments. This package
					// only manipulates the path as a whole, so we allow those
					// last three as well. That leaves only ? to escape.
					return c == `?`
				}
				.encode_path_segment {
					// §3.3
					// The RFC allows : @ & = + $ but saves / ; , for assigning
					// meaning to individual path segments.
					return c == `/` || c == `;` || c == `,` || c == `?`
				}
				.encode_user_password {
					// §3.2.1
					// The RFC allows `;`, `:`, `&`, `=`, `+`, `$`, and `,` in
					// userinfo, so we must escape only `@`, `/`, and `?`.
					// The parsing of userinfo treats `:` as special so we must escape
					// that too.
					return c == `@` || c == `/` || c == `?` || c == `:`
				}
				.encode_query_component {
					// §3.4
					// The RFC reserves (so we must escape) everything.
					return true
				}
				.encode_fragment {
					// §4.1
					// The RFC text is silent but the grammar allows
					// everything, so escape nothing.
					return false
				}
				else {
					// host and zone: reserved bytes not allowed above fall
					// through to the final `return true`.
				}
			}
		}
		else {}
	}

	if mode == .encode_fragment {
		// RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
		// included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
		// need to be escaped. To minimize potential breakage, we apply two restrictions:
		// (1) we always escape sub-delims outside of the fragment, and (2) we always
		// escape single quote to avoid breaking callers that had previously assumed that
		// single quotes would be escaped. See issue #19917.
		match c {
			`!`, `(`, `)`, `*` { return false }
			else {}
		}
	}
	// Everything else must be escaped.
	return true
}
| 123 | |
// query_unescape reverses query_escape: every 3-byte sequence of the
// form '%AB' becomes the byte 0xAB, decoded with the query-component rules
// of unescape.
// An error is returned when unescape rejects the input.
pub fn query_unescape(s string) !string {
	decoded := unescape(s, .encode_query_component)!
	return decoded
}
| 132 | |
// path_unescape reverses path_escape: every 3-byte sequence of the
// form '%AB' becomes the byte 0xAB. An error is returned when a % is not
// followed by two hexadecimal digits.
//
// Unlike query_unescape, path_unescape leaves '+' untouched instead of
// turning it into ' ' (space).
pub fn path_unescape(s string) !string {
	decoded := unescape(s, .encode_path_segment)!
	return decoded
}
| 143 | |
// unescape decodes the %-escapes in `s_`; `mode` names the section of the
// URL the string belongs to and selects the component-specific rules:
// - .encode_query_component: '+' decodes to ' ', and a '%' that is not
//   followed by two hex digits (but is not the last byte) is taken as a
//   literal percent sign instead of being rejected.
// - .encode_host: escapes of ASCII bytes are rejected, except '%25'.
// - .encode_zone: an escape may only produce a byte that could have been
//   written directly in a host name, a space, or '%' (via '%25').
// - .encode_host / .encode_zone: unescaped ASCII bytes that should_escape
//   flags are rejected.
// It returns an error for any malformed or disallowed input.
fn unescape(s_ string, mode EncodingMode) !string {
	mut s := s_
	// First pass: count the % escapes and check that they are well-formed.
	mut n := 0
	mut has_plus := false
	for i := 0; i < s.len; {
		x := s[i]
		match x {
			`%` {
				n++
				if i + 2 >= s.len || !ishex(s[i + 1]) || !ishex(s[i + 2]) {
					if mode == .encode_query_component && i + 1 < s.len {
						// Lenient query decoding: rewrite the stray '%' as '%25'.
						// Only the inserted escape is skipped, so that the byte
						// following it (possibly another '%') is still validated.
						s = s[..i] + '%25' + s[(i + 1)..]
						i += 3
						continue
					}
					s = s[i..]
					if s.len > 3 {
						s = s[..3]
					}
					return error(error_msg(err_msg_escape, s))
				}
				// Per https://tools.ietf.org/html/rfc3986#page-21
				// in the host component %-encoding can only be used
				// for non-ASCII bytes.
				// But https://tools.ietf.org/html/rfc6874#section-2
				// introduces %25 being allowed to escape a percent sign
				// in IPv6 scoped-address literals. Yay.
				// (i + 2 < s.len is guaranteed by the check above.)
				if mode == .encode_host && unhex(s[i + 1]) < 8 && s[i..i + 3] != '%25' {
					return error(error_msg(err_msg_escape, s[i..i + 3]))
				}
				if mode == .encode_zone {
					// RFC 6874 says basically 'anything goes' for zone identifiers
					// and that even non-ASCII can be redundantly escaped,
					// but it seems prudent to restrict %-escaped bytes here to those
					// that are valid host name bytes in their unescaped form.
					// That is, you can use escaping in the zone identifier but not
					// to introduce bytes you couldn't just write directly.
					// But Windows puts spaces here! Yay.
					if i + 3 >= s.len {
						return error(error_msg('unescape: invalid escape sequence', ''))
					}
					v := ((unhex(s[i + 1]) << u8(4)) | unhex(s[i + 2]))
					if s[i..i + 3] != '%25' && v != ` ` && should_escape(v, .encode_host) {
						return error(error_msg(err_msg_escape, s[i..i + 3]))
					}
				}
				i += 3
			}
			`+` {
				has_plus = mode == .encode_query_component
				i++
			}
			else {
				if (mode == .encode_host || mode == .encode_zone) && s[i] < 0x80
					&& should_escape(s[i], mode) {
					return error(error_msg('unescape: invalid character in host name',
						s[i..i + 1]))
				}
				i++
			}
		}
	}
	// Nothing to decode: hand back the input unchanged.
	if n == 0 && !has_plus {
		return '${s}' // TODO: `return s` once an autofree bug is fixed
	}
	if s.len < 2 * n {
		return error(error_msg('unescape: invalid escape sequence', ''))
	}
	// Second pass: every escape shrinks from 3 bytes to 1.
	mut t := strings.new_builder(s.len - 2 * n)
	for i := 0; i < s.len; i++ {
		x := s[i]
		match x {
			`%` {
				if i + 2 >= s.len {
					return error(error_msg('unescape: invalid escape sequence', ''))
				}
				t.write_string(((unhex(s[i + 1]) << u8(4)) | unhex(s[i + 2])).ascii_str())
				i += 2
			}
			`+` {
				if mode == .encode_query_component {
					t.write_string(' ')
				} else {
					t.write_string('+')
				}
			}
			else {
				t.write_string(s[i].ascii_str())
			}
		}
	}
	return t.str()
}
| 243 | |
// query_escape returns `s` escaped with the query-component rules of
// escape, so it can be safely placed inside a URL query.
pub fn query_escape(s string) string {
	escaped := escape(s, .encode_query_component)
	return escaped
}
| 249 | |
// path_escape returns `s` escaped so it can be safely placed inside a URL
// path segment; special characters (including /) become %XX sequences.
pub fn path_escape(s string) string {
	escaped := escape(s, .encode_path_segment)
	return escaped
}
| 255 | |
// escape returns `s` with every byte that should_escape flags for `mode`
// replaced by its upper-case %XX form. In .encode_query_component mode a
// space becomes '+' instead. When nothing needs escaping, `s` itself is
// returned.
fn escape(s string, mode EncodingMode) string {
	// Measure first: how many bytes become '+', and how many become %XX.
	mut plus_total := 0
	mut pct_total := 0
	for ch in s {
		if !should_escape(ch, mode) {
			continue
		}
		if ch == ` ` && mode == .encode_query_component {
			plus_total++
		} else {
			pct_total++
		}
	}
	if plus_total == 0 && pct_total == 0 {
		return s
	}
	// Each %XX escape adds two bytes; a '+' replaces a space one-for-one.
	mut out := []u8{len: s.len + 2 * pct_total}
	if pct_total == 0 {
		// Only spaces to rewrite, so the output has the same length as s.
		for idx, ch in s {
			out[idx] = ch
			if ch == ` ` {
				out[idx] = `+`
			}
		}
		return out.bytestr()
	}
	hex_digits := '0123456789ABCDEF'
	mut w := 0
	for ch in s {
		if ch == ` ` && mode == .encode_query_component {
			out[w] = `+`
			w++
			continue
		}
		if should_escape(ch, mode) {
			out[w] = `%`
			out[w + 1] = hex_digits[ch >> 4]
			out[w + 2] = hex_digits[ch & 15]
			w += 3
			continue
		}
		out[w] = ch
		w++
	}
	return out.bytestr()
}
| 303 | |
// URL represents a parsed URL (technically, a URI reference).
// The general form represented is:
// [scheme:][//[userinfo@]host][/]path[?query][#fragment]
// URLs that do not start with a slash after the scheme are interpreted as:
// scheme:opaque[?query][#fragment]
//
// Note that the path field is stored in decoded form: /%47%6f%2f becomes /Go/.
// A consequence is that it is impossible to tell which slashes in the path were
// slashes in the raw URL and which were %2f. This distinction is rarely important,
// but when it is, the code should use raw_path, an optional field which only gets
// set if the default encoding is different from path.
//
// URL's str method uses the escaped_path method to obtain the path. See the
// escaped_path method for more details.
pub struct URL {
pub mut:
	scheme      string
	opaque      string    // encoded opaque data
	user        ?Userinfo // username and password information
	host        string    // host or host:port
	path        string    // path (relative paths may omit leading slash)
	raw_path    string    // encoded path hint (see escaped_path method)
	force_query bool      // append a query ('?') even if raw_query is empty
	raw_query   string    // encoded query values, without '?'
	fragment    string    // fragment for references, without '#'
}
| 330 | |
// debug returns a multi-line string listing *ALL* the fields of the given
// URL, one `name: value` pair per line, wrapped in `URL{ ... }`.
// The values are interpolated as stored, without any escaping.
pub fn (url &URL) debug() string {
	return 'URL{\n scheme: ${url.scheme}\n opaque: ${url.opaque}\n user: ${url.user}\n host: ${url.host}\n path: ${url.path}\n raw_path: ${url.raw_path}\n force_query: ${url.force_query}\n raw_query: ${url.raw_query}\n fragment: ${url.fragment}\n}'
}
| 335 | |
// user returns a Userinfo holding `username`, with an empty password that
// is marked as not set.
pub fn user(username string) Userinfo {
	info := Userinfo{
		username:     username
		password:     ''
		password_set: false
	}
	return info
}
| 345 | |
// user_password returns a Userinfo holding `username` and `password`, with
// the password marked as set.
//
// This functionality should only be used with legacy web sites.
// RFC 2396 warns that interpreting Userinfo this way
// "is NOT RECOMMENDED, because the passing of authentication
// information in clear text (such as URI) has proven to be a
// security risk in almost every case where it has been used."
fn user_password(username string, password string) Userinfo {
	return Userinfo{
		username:     username
		password:     password
		password_set: true
	}
}
| 357 | |
// Userinfo is an immutable encapsulation of username and
// password details for a URL. An existing Userinfo value is guaranteed
// to have a username set (potentially empty, as allowed by RFC 2396),
// and optionally a password.
pub struct Userinfo {
pub:
	username     string
	password     string
	password_set bool // true when a password is present (set by user_password)
}
| 368 | |
// empty reports whether both the username and the password are empty.
fn (u Userinfo) empty() bool {
	if u.username != '' {
		return false
	}
	return u.password == ''
}
| 372 | |
// str returns the encoded userinfo in the standard form
// 'username[:password]', or '' when the Userinfo is empty. The password
// part is only written when password_set is true.
fn (u Userinfo) str() string {
	if u.empty() {
		return ''
	}
	name := escape(u.username, .encode_user_password)
	if !u.password_set {
		return name
	}
	return name + ':' + escape(u.password, .encode_user_password)
}
| 385 | |
// split_by_scheme checks whether rawurl has the form scheme:path,
// where scheme must match [a-zA-Z][a-zA-Z0-9+-.]*.
// If so it returns [scheme, path]; otherwise it returns ['', rawurl].
// A rawurl that starts with ':' is an error (missing scheme).
fn split_by_scheme(rawurl string) ![]string {
	for pos, ch in rawurl {
		if ch.is_letter() {
			// letters are valid anywhere in a scheme
			continue
		}
		if ch == `:` {
			if pos == 0 {
				return error(error_msg('split_by_scheme: missing protocol scheme', ''))
			}
			return [rawurl[..pos], rawurl[pos + 1..]]
		}
		// Digits and + - . may continue a scheme but not start one; any
		// other byte means there is no valid scheme at all.
		may_continue := ch.is_digit() || ch in [`+`, `-`, `.`]
		if !may_continue || pos == 0 {
			return ['', rawurl]
		}
	}
	return ['', rawurl]
}
| 411 | |
// get_scheme returns the scheme of rawurl as found by split_by_scheme
// ('' when rawurl has no valid scheme).
// It returns an error when split_by_scheme fails; the failure is propagated
// instead of being handed back as if the error text were the scheme.
fn get_scheme(rawurl string) !string {
	parts := split_by_scheme(rawurl)!
	return parts[0]
}
| 416 | |
// split cuts s in two at the first occurrence of sep and returns both parts.
// The first part never contains sep. If cutc is true, sep is dropped from
// the second part as well; otherwise the second part starts with sep.
// If sep does not occur in s, then s and the empty string are returned.
fn split(s string, sep u8, cutc bool) (string, string) {
	pos := s.index_u8(sep)
	if pos < 0 {
		return s, ''
	}
	tail_start := if cutc { pos + 1 } else { pos }
	return s[..pos], s[tail_start..]
}
| 430 | |
// parse parses rawurl into a URL structure.
//
// The rawurl may be relative (a path, without a host) or absolute
// (starting with a scheme). Trying to parse a hostname and path
// without a scheme is invalid but may not necessarily return an
// error, due to parsing ambiguities.
//
// Everything after the first '#' is decoded as the fragment. Errors are
// wrapped with the err_msg_parse prefix and the part of rawurl before the
// fragment.
pub fn parse(rawurl string) !URL {
	// Cut off #frag
	before_frag, frag := split(rawurl, `#`, true)
	mut parsed := parse_url(before_frag, false) or {
		return error(error_msg(err_msg_parse + '[${err.msg()}]', before_frag))
	}
	if frag != '' {
		decoded := unescape(frag, .encode_fragment) or {
			return error(error_msg(err_msg_parse + '[${err.msg()}]', before_frag))
		}
		parsed.fragment = decoded
	}
	return parsed
}
| 452 | |
// parse_request_uri parses rawurl into a URL structure for an HTTP request.
// Only absolute URIs or absolute paths are accepted, and a leading `//` is
// kept as part of the path for request targets.
// rawurl is assumed not to carry a #fragment suffix.
// (Web browsers strip #fragment before sending the URL to a web server.)
pub fn parse_request_uri(rawurl string) !URL {
	parsed := parse_url(rawurl, true)!
	return parsed
}
| 461 | |
// parse_url parses a URL from a string in one of two contexts. If
// via_request is true, the URL is assumed to have arrived via an HTTP request,
// in which case only absolute URLs or path-absolute relative URLs are allowed.
// If via_request is false, all forms of relative URLs are allowed.
//
// rawurl must not contain a fragment. An error is returned for control
// characters, an empty request URL, a malformed scheme, a relative
// reference whose first path segment contains a colon, an invalid
// authority, or an invalid path escape.
@[manualfree]
fn parse_url(rawurl string, via_request bool) !URL {
	if string_contains_ctl_u8(rawurl) {
		return error(error_msg('parse_url: invalid control character in URL', rawurl))
	}
	if rawurl == '' && via_request {
		return error(error_msg('parse_url: empty URL', rawurl))
	}
	mut url := URL{
		user: none
	}
	if rawurl == '*' {
		url.path = '*'
		return url
	}
	// Split off possible leading 'http:', 'mailto:', etc.
	// Cannot contain escaped characters.
	p := split_by_scheme(rawurl)!
	url.scheme = p[0].to_lower()
	mut rest := p[1]
	// A '?' that is both the last byte and the only '?' marks an empty but
	// present query. The whole prefix must be inspected, not just its first
	// byte, otherwise 'a?b?' would lose its query.
	if rest.ends_with('?') && !rest[..rest.len - 1].contains('?') {
		url.force_query = true
		rest = rest[..rest.len - 1]
	} else {
		r, raw_query := split(rest, `?`, true)
		rest = r
		url.raw_query = raw_query
	}
	if !rest.starts_with('/') {
		if url.scheme != '' {
			// We consider rootless paths per RFC 3986 as opaque.
			url.opaque = rest
			return url
		}
		if via_request {
			return error(error_msg('parse_url: invalid URI for request', ''))
		}
		// Avoid confusion with malformed schemes, like cache_object:foo/bar.
		// See golang.org/issue/16822.
		//
		// RFC 3986, §3.3:
		// In addition, a URI reference (Section 4.1) may be a relative-path reference,
		// in which case the first path segment cannot contain a colon (':') character.
		if colon := rest.index(':') {
			// Without any '/', the whole of rest is the first path segment.
			slash := rest.index('/') or { rest.len }
			if colon < slash {
				// First path segment has colon. Not allowed in relative URL.
				return error(error_msg('parse_url: first path segment in URL cannot contain colon',
					''))
			}
		}
	}
	// NOTE(review): Go's net/url groups this condition as
	// `scheme != '' || (!via_request && !starts_with('///'))`, and has no
	// length check; the grouping below skips authority parsing for inputs
	// such as 'file:///x' and 'http://'. Left unchanged here — confirm intent.
	if ((url.scheme != '' || !via_request) && !rest.starts_with('///')) && rest.starts_with('//')
		&& rest.len > 2 {
		authority, r := split(rest[2..], `/`, false)
		rest = r
		a := parse_authority(authority)!
		url.user = a.user
		url.host = a.host
	}
	// Set path and, optionally, raw_path.
	// raw_path is a hint of the encoding of path. We don't want to set it if
	// the default escaping of path is equivalent, to help make sure that people
	// don't rely on it in general.
	url.set_path(rest)!
	return url
}
| 535 | |
// ParseAuthorityRes is the result of parse_authority: the decoded userinfo
// and host parts of a URL authority.
struct ParseAuthorityRes {
	user ?Userinfo
	host string
}
| 540 | |
// parse_authority splits `authority` at its last '@' into userinfo and host
// and decodes both. Without an '@' the whole string is the host and the
// user is an empty Userinfo. A non-empty part after the first ':' of the
// userinfo is taken as the password.
// It returns an error for invalid userinfo, an invalid host, or a bad escape.
fn parse_authority(authority string) !ParseAuthorityRes {
	at := authority.last_index_u8(`@`)
	if at < 0 {
		return ParseAuthorityRes{
			host: parse_host(authority)!
			user: user('')
		}
	}
	raw_user := authority[..at]
	if !valid_userinfo(raw_user) {
		return error(error_msg('parse_authority: invalid userinfo', ''))
	}
	host := parse_host(authority[at + 1..])!
	name, pwd := split(raw_user, `:`, true)
	username := unescape(name, .encode_user_password)!
	if pwd == '' {
		return ParseAuthorityRes{
			user: user(username)
			host: host
		}
	}
	password := unescape(pwd, .encode_user_password)!
	return ParseAuthorityRes{
		user: user_password(username, password)
		host: host
	}
}
| 565 | |
// parse_host parses `host` as an authority without user information,
// that is, as host[:port], and returns it in decoded form.
// It returns an error for an unterminated '[', a port that is not all
// digits, or an escape that unescape rejects.
fn parse_host(host string) !string {
	if host.len > 0 && host[0] == `[` {
		// parse an IP-Literal in RFC 3986 and RFC 6874.
		// E.g., '[fe80::1]', '[fe80::1%25en0]', '[fe80::1]:80'.
		close := host.last_index_u8(`]`)
		if close == -1 {
			return error(error_msg("parse_host: missing ']' in host", ''))
		}
		colon_port := host[close + 1..]
		if !valid_optional_port(colon_port) {
			return error(error_msg('parse_host: invalid port ${colon_port} after host ', ''))
		}
		// RFC 6874 defines that %25 (%-encoded percent) introduces
		// the zone identifier, and the zone identifier can use basically
		// any %-encoding it likes. That's different from the host, which
		// can only %-encode non-ASCII bytes.
		// We do impose some restrictions on the zone, to avoid stupidity
		// like newlines.
		if zone := host[..close].index('%25') {
			address := unescape(host[..zone], .encode_host)!
			zone_id := unescape(host[zone..close], .encode_zone)!
			tail := unescape(host[close..], .encode_host)!
			return address + zone_id + tail
		}
	} else {
		colon := host.last_index_u8(`:`)
		if colon != -1 {
			colon_port := host[colon..]
			if !valid_optional_port(colon_port) {
				return error(error_msg('parse_host: invalid port ${colon_port} after host ',
					''))
			}
		}
	}
	decoded := unescape(host, .encode_host)!
	return decoded
}
| 604 | |
// set_path fills the path and raw_path fields of the URL from the escaped
// path p. raw_path is only kept when p differs from the default encoding of
// the decoded path.
// For example:
// - set_path('/foo/bar') will set path='/foo/bar' and raw_path=''
// - set_path('/foo%2fbar') will set path='/foo/bar' and raw_path='/foo%2fbar'
// set_path returns an error only if p contains an invalid escaping;
// otherwise it returns true.
pub fn (mut u URL) set_path(p string) !bool {
	decoded := unescape(p, .encode_path)!
	u.path = decoded
	default_encoding := escape(decoded, .encode_path)
	if default_encoding == p {
		u.raw_path = ''
	} else {
		u.raw_path = p
	}
	return true
}
| 618 | |
// escaped_path returns the escaped form of u.path.
// In general there are multiple possible escaped forms of any path.
// escaped_path returns u.raw_path when it is a valid escaping of u.path,
// i.e. when it is a valid encoded path that decodes to exactly u.path.
// Otherwise (including when u.raw_path fails to decode, or u.path was
// changed after parsing) escaped_path ignores u.raw_path and computes an
// escaped form on its own.
// The str and request_uri methods use escaped_path to construct
// their results.
// In general, code should call escaped_path instead of
// reading u.raw_path directly.
pub fn (u &URL) escaped_path() string {
	if u.raw_path != '' && valid_encoded_path(u.raw_path) {
		if decoded := unescape(u.raw_path, .encode_path) {
			if decoded == u.path {
				return u.raw_path
			}
		}
	}
	if u.path == '*' {
		return '*' // don't escape (Issue 11202)
	}
	return escape(u.path, .encode_path)
}
| 638 | |
// valid_encoded_path reports whether s is a valid encoded path.
// It must not contain any bytes that require escaping during path encoding.
fn valid_encoded_path(s string) bool {
	for i in 0 .. s.len {
		// RFC 3986, Appendix A.
		// pchar = unreserved / pct-encoded / sub-delims / ':' / '@'.
		// should_escape is not quite compliant with the RFC,
		// so we check the sub-delims ourselves and let
		// should_escape handle the others.
		x := s[i]
		match x {
			// The single quote is a sub-delim; the backslash is not, so it
			// is left to should_escape below.
			`!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `:`, `@` {
				// ok
			}
			`[`, `]` {
				// ok - not specified in RFC 3986 but left alone by modern browsers
			}
			`%` {
				// ok - percent encoded, will decode
			}
			else {
				if should_escape(s[i], .encode_path) {
					return false
				}
			}
		}
	}
	return true
}
| 668 | |
// valid_optional_port reports whether port is either an empty string
// or matches /^:\d*$/ (a colon followed by zero or more ASCII digits).
fn valid_optional_port(port string) bool {
	if port == '' {
		return true
	}
	if port[0] != `:` {
		return false
	}
	for idx in 1 .. port.len {
		if !port[idx].is_digit() {
			return false
		}
	}
	return true
}
| 685 | |
// str reassembles the URL into a valid URL string.
// The general form of the result is one of:
//
// scheme:opaque?query#fragment
// scheme://userinfo@host/path?query#fragment
//
// If u.opaque is non-empty, str uses the first form;
// otherwise it uses the second form.
// The host is written through escape in host mode, the fragment through
// escape in fragment mode, and the path comes from u.escaped_path().
//
// In the second form, the following rules apply:
// - if u.scheme is empty, scheme: is omitted.
// - if u.user is none or empty, userinfo@ is omitted.
// - if u.host is empty, host/ is omitted.
// - if u.scheme and u.host are empty and u.user is none or empty,
// the entire scheme://userinfo@host/ is omitted.
// - if u.host is non-empty and u.path begins with a /,
// the form host/path does not add its own /.
// - if u.raw_query is empty and u.force_query is false, ?query is omitted.
// - if u.fragment is empty, #fragment is omitted.
pub fn (u URL) str() string {
	mut out := strings.new_builder(200)
	if u.scheme != '' {
		out.write_string(u.scheme)
		out.write_string(':')
	}
	if u.opaque == '' {
		info := u.user or { Userinfo{} }
		has_user := !info.empty()
		if u.scheme != '' || u.host != '' || has_user {
			if u.host != '' || u.path != '' || has_user {
				out.write_string('//')
			}
			if has_user {
				out.write_string(info.str())
				out.write_string('@')
			}
			if u.host != '' {
				out.write_string(escape(u.host, .encode_host))
			}
		}
		escaped := u.escaped_path()
		if escaped != '' && escaped[0] != `/` && u.host != '' {
			out.write_string('/')
		}
		if out.len == 0 {
			// RFC 3986 §4.2
			// A path segment that contains a colon character (e.g., 'this:that')
			// cannot be used as the first segment of a relative-path reference, as
			// it would be mistaken for a scheme name. Such a segment must be
			// preceded by a dot-segment (e.g., './this:that') to make a relative-
			// path reference.
			colon := escaped.index_u8(`:`)
			// Kept as two nested checks so the slice is only built when a
			// colon exists (the original notes an autofree limitation here).
			if colon > -1 {
				if escaped[..colon].index_u8(`/`) == -1 {
					out.write_string('./')
				}
			}
		}
		out.write_string(escaped)
	} else {
		out.write_string(u.opaque)
	}
	if u.force_query || u.raw_query != '' {
		out.write_string('?')
		out.write_string(u.raw_query)
	}
	if u.fragment != '' {
		out.write_string('#')
		out.write_string(escape(u.fragment, .encode_fragment))
	}
	return out.str()
}
| 761 | |
// parse_query parses the URL-encoded query string and returns the Values
// collected from it. Values is typically used for query parameters and form
// values; its keys are case-sensitive.
//
// The query is expected to be a list of key=value settings separated by
// ampersands or semicolons. A setting without an equals sign is
// interpreted as a key set to an empty value.
// An error is returned when any key or value fails to decode.
pub fn parse_query(query string) !Values {
	mut values := new_values()
	parse_query_values(mut values, query)!
	return values
}
| 780 | |
// parse_query_silent behaves like parse_query, except that decoding
// errors are ignored: the settings that did decode are still returned.
fn parse_query_silent(query string) Values {
	mut values := new_values()
	parse_query_values(mut values, query) or {}
	return values
}
| 788 | |
// parse_query_values splits `query` at '&' and ';' into key[=value]
// settings, decodes each key and value with query_unescape and adds the
// pair to `m`. Empty settings are skipped. A setting whose key or value
// fails to decode is dropped, the remaining ones are still added, and an
// error is returned at the end. It returns true when every setting decoded.
fn parse_query_values(mut m Values, query string) !bool {
	mut failed := false
	mut remaining := query
	for remaining != '' {
		// Peel the next setting off the front of the query.
		mut setting := remaining
		sep := setting.index_any('&;')
		if sep >= 0 {
			remaining = setting[sep + 1..]
			setting = setting[..sep]
		} else {
			remaining = ''
		}
		if setting == '' {
			continue
		}
		mut raw_key := setting
		mut raw_value := ''
		if eq := setting.index('=') {
			raw_key = setting[..eq]
			raw_value = setting[eq + 1..]
		}
		key := query_unescape(raw_key) or {
			failed = true
			continue
		}
		value := query_unescape(raw_value) or {
			failed = true
			continue
		}
		m.add(key, value)
	}
	if failed {
		return error(error_msg('parse_query_values: failed parsing query string', ''))
	}
	return true
}
| 827 | |
// encode encodes the values into "URL encoded" form
// ('bar=baz&foo=quux').
// Pairs are written in the order they are stored in `v.data` and joined
// with `&`; a pair whose value is empty is written as the bare key,
// without a trailing `=`.
//
// The syntax of the query string is specified in
// RFC 1738 https://datatracker.ietf.org/doc/html/rfc1738
//
// HTTP grammar
//
// httpurl = "http://" hostport [ "/" hpath [ "?" search ]]
// hpath = hsegment *[ "/" hsegment ]
// hsegment = *[ uchar | ";" | ":" | "@" | "&" | "=" ]
// search = *[ uchar | ";" | ":" | "@" | "&" | "=" ]
pub fn (v Values) encode() string {
	if v.len == 0 {
		return ''
	}
	mut sb := strings.new_builder(200)
	for pair in v.data {
		// Separator goes in front of every pair once something was written.
		if sb.len > 0 {
			sb.write_string('&')
		}
		sb.write_string(query_escape(pair.key))
		if pair.value != '' {
			sb.write_string('=')
			sb.write_string(query_escape(pair.value))
		}
	}
	return sb.str()
}
| 858 | |
// resolve_path merges the reference path `ref` into `base` and removes
// the special `.` and `..` segments, per RFC 3986 Section 5.2.
//
// - An empty `ref` resolves to `base`.
// - A `ref` starting with `/` replaces `base` entirely.
// - Any other `ref` replaces everything after the last `/` of `base`.
//
// The result is '' when both inputs are empty; otherwise it always
// starts with `/`. Empty segments (e.g. the leading `//` of '//a/b')
// are preserved; only dot segments are removed.
fn resolve_path(base string, ref string) string {
	mut full := ''
	if ref == '' {
		full = base
	} else if ref[0] != `/` {
		// last_index_u8 yields -1 when base has no `/`, in which case
		// base[..0] is empty and `full` is just `ref`.
		i := base.last_index_u8(`/`)
		full = base[..i + 1] + ref
	} else {
		full = ref
	}
	if full == '' {
		return ''
	}
	src := full.split('/')
	mut dst := []string{}
	for elem in src {
		match elem {
			'.' {
				// drop
			}
			'..' {
				// Step back one segment; extra `..` at the root are ignored.
				if dst.len > 0 {
					dst.delete_last()
				}
			}
			else {
				dst << elem
			}
		}
	}
	last := src[src.len - 1]
	if last == '.' || last == '..' {
		// Add final slash to the joined path.
		dst << ''
	}
	joined := dst.join('/')
	// Guarantee exactly one leading slash is contributed here: keep the
	// joined path as is when it already starts with `/` (so that empty
	// leading segments such as `//a` survive), otherwise prepend one.
	if joined.starts_with('/') {
		return joined
	}
	return '/' + joined
}
| 898 | |
// is_abs reports whether the URL is absolute,
// i.e. whether its scheme is non-empty.
pub fn (u &URL) is_abs() bool {
	return u.scheme.len > 0
}
| 904 | |
// parse parses the URL string `ref` in the context of the receiver.
// `ref` may be relative or absolute. An error is returned when `ref`
// cannot be parsed; otherwise the result is the same as calling
// resolve_reference with the parsed reference.
pub fn (u &URL) parse(ref string) !URL {
	reference := parse(ref)!
	return u.resolve_reference(reference)
}
| 912 | |
// resolve_reference resolves the URI reference `ref` against the
// absolute base URI `u`, per RFC 3986 Section 5.2. The reference may be
// relative or absolute. The result is always a new URL value, even when
// it is identical to the base or the reference. If `ref` is an absolute
// URL, the base is ignored and a copy of `ref` (with its path cleaned
// of dot segments) is returned.
pub fn (u &URL) resolve_reference(ref &URL) !URL {
	mut resolved := *ref
	if ref.scheme == '' {
		resolved.scheme = u.scheme
	}
	ref_user := ref.user or { Userinfo{} }
	if ref.scheme != '' || ref.host != '' || !ref_user.empty() {
		// The 'absoluteURI' or 'net_path' cases: only the reference's own
		// path is normalised. The path comes from escaped_path(), so
		// set_path is presumably not expected to fail here; any error is
		// nevertheless propagated to the caller.
		resolved.set_path(resolve_path(ref.escaped_path(), ''))!
		return resolved
	}
	if ref.opaque != '' {
		// Opaque references carry no authority or path of their own.
		resolved.user = user('')
		resolved.host = ''
		resolved.path = ''
		return resolved
	}
	if ref.path == '' && ref.raw_query == '' {
		// Empty path and query: inherit the base query, and the base
		// fragment as well when the reference has none.
		resolved.raw_query = u.raw_query
		if ref.fragment == '' {
			resolved.fragment = u.fragment
		}
	}
	// The 'abs_path' or 'rel_path' cases: authority comes from the base
	// and the reference path is merged into the base path.
	resolved.host = u.host
	resolved.user = u.user
	base_path := u.escaped_path()
	ref_path := ref.escaped_path()
	resolved.set_path(resolve_path(base_path, ref_path))!
	return resolved
}
| 950 | |
// query parses raw_query and returns the corresponding values.
// Malformed value pairs are silently discarded.
// To check for errors use parse_query.
pub fn (u &URL) query() Values {
	return parse_query_silent(u.raw_query)
}
| 958 | |
// request_uri returns the encoded path?query or opaque?query
// string that would be used in an HTTP request for u.
//
// Without an opaque part the escaped path is used, falling back to `/`
// when it is empty. An opaque part starting with `//` is prefixed with
// `scheme:`. The `?query` suffix is appended when force_query is set or
// raw_query is non-empty.
pub fn (u &URL) request_uri() string {
	mut target := ''
	if u.opaque != '' {
		target = if u.opaque.starts_with('//') { u.scheme + ':' + u.opaque } else { u.opaque }
	} else {
		escaped := u.escaped_path()
		target = if escaped == '' { '/' } else { escaped }
	}
	if !u.force_query && u.raw_query == '' {
		return target
	}
	return target + '?' + u.raw_query
}
| 978 | |
// hostname returns the host part of u.host as produced by
// split_host_port: a valid port number, if present, is stripped.
//
// If the result is enclosed in square brackets, as literal IPv6
// addresses are, the square brackets are removed from the result.
pub fn (u &URL) hostname() string {
	name, _ := split_host_port(u.host)
	return name
}
| 987 | |
// port returns the port part of u.host, without the leading colon.
// If u.host doesn't contain a valid port, port returns an empty string.
pub fn (u &URL) port() string {
	_, number := split_host_port(u.host)
	return number
}
| 994 | |
// split_host_port separates `hostport` into host and port.
// The part after the last `:` is treated as the port only when
// valid_optional_port accepts it; otherwise the entire input is kept as
// the host and the port is empty. The validity of the host itself is
// not checked. Square brackets enclosing the host (as used for literal
// IPv6 addresses) are removed.
pub fn split_host_port(hostport string) (string, string) {
	mut host := hostport
	mut port := ''
	sep := hostport.last_index_u8(`:`)
	if sep != -1 && valid_optional_port(hostport[sep..]) {
		host = hostport[..sep]
		port = hostport[sep + 1..]
	}
	bracketed := host.len > 1 && host[0] == `[` && host.ends_with(']')
	if bracketed {
		return host[1..host.len - 1], port
	}
	return host, port
}
| 1013 | |
// valid_userinfo reports whether s is a valid userinfo string per RFC 3986
// Section 3.2.1:
// userinfo = *( unreserved / pct-encoded / sub-delims / ':' )
// unreserved = ALPHA / DIGIT / '-' / '.' / '_' / '~'
// sub-delims = '!' / '$' / '&' / ''' / '(' / ')'
// / '*' / '+' / ',' / ';' / '='
//
// In addition to the grammar above, `%` and `@` are accepted.
// It doesn't validate pct-encoded. The caller does that via fn unescape.
// An empty string is considered valid.
pub fn valid_userinfo(s string) bool {
	for r in s {
		if r.is_alnum() {
			continue
		}
		match r {
			// The apostrophe is a sub-delim and must be accepted; the
			// backslash is not part of the userinfo grammar.
			`-`, `.`, `_`, `:`, `~`, `!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `%`,
			`@` {
				continue
			}
			else {
				return false
			}
		}
	}
	return true
}
| 1039 | |
// string_contains_ctl_u8 reports whether s contains any ASCII control
// character, i.e. a byte below the space character or the DEL byte 0x7f.
fn string_contains_ctl_u8(s string) bool {
	for b in s {
		if b == 0x7f || b < ` ` {
			return true
		}
	}
	return false
}
| 1050 | |
// ishex reports whether c is an ASCII hexadecimal digit
// (`0`-`9`, `a`-`f` or `A`-`F`).
pub fn ishex(c u8) bool {
	is_digit := c >= `0` && c <= `9`
	is_lower := c >= `a` && c <= `f`
	is_upper := c >= `A` && c <= `F`
	return is_digit || is_lower || is_upper
}
| 1061 | |
// unhex returns the numeric value (0-15) of the ASCII hexadecimal
// digit c. Any byte that is not a hexadecimal digit yields 0.
fn unhex(c u8) u8 {
	is_digit := c >= `0` && c <= `9`
	if is_digit {
		return c - `0`
	}
	is_lower := c >= `a` && c <= `f`
	if is_lower {
		return c - `a` + 10
	}
	is_upper := c >= `A` && c <= `F`
	if is_upper {
		return c - `A` + 10
	}
	return 0
}
| 1072 | |