1
0
mirror of https://github.com/vlang/v.git synced 2023-08-10 21:13:21 +03:00
v/vlib/net/urllib/urllib.v

1086 lines
32 KiB
V
Raw Normal View History

2019-09-09 14:21:18 +03:00
// urllib parses URLs and implements query escaping.
// See RFC 3986. This module generally follows RFC 3986, except where
2019-08-01 16:01:03 +03:00
// it deviates for compatibility reasons.
// Based off: https://github.com/golang/go/blob/master/src/net/url/url.go
// Last commit: https://github.com/golang/go/commit/fe2ed5054176935d4adcf13e891715ccf2ee3cce
2019-08-01 16:01:03 +03:00
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
module urllib
import strings
// EncodingMode names the URL component that is being escaped or unescaped.
// should_escape and unescape use it to pick the component specific rules.
enum EncodingMode {
	encode_path // a whole path
	encode_path_segment // a single path segment
	encode_host // host or host:port
	encode_zone // IPv6 zone identifier
	encode_user_password // userinfo (username / password)
	encode_query_component // a query key or value
	encode_fragment // the part after '#'
}
// Message fragments shared by several error sites; they are passed to
// error_msg, which adds the 'net.urllib.' prefix.
const (
	err_msg_escape = 'unescape: invalid URL escape'
	err_msg_parse  = 'parse: failed parsing url'
)
// error_msg formats an error text as 'net.urllib.<message>'. When val is
// not empty it is appended in parentheses: 'net.urllib.<message> (<val>)'.
fn error_msg(message string, val string) string {
	prefixed := 'net.urllib.${message}'
	if val == '' {
		return prefixed
	}
	return '${prefixed} (${val})'
}
// should_escape reports whether the byte c must be percent-encoded when it
// appears in the URL component selected by mode, according to RFC 3986.
//
// Please be informed that for now should_escape does not check all
// reserved characters correctly. See golang.org/issue/5684.
fn should_escape(c u8, mode EncodingMode) bool {
	// §2.3 Unreserved characters (alphanum)
	if (`a` <= c && c <= `z`) || (`A` <= c && c <= `Z`) || (`0` <= c && c <= `9`) {
		return false
	}
	if mode == .encode_host || mode == .encode_zone {
		// §3.2.2 host allows
		//	sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
		// as part of reg-name.
		// We add : because we include :port as part of host.
		// We add [ ] because we include [ipv6]:port as part of host.
		// We add < > because they're the only characters left that
		// we could possibly allow, and parse will reject them if we
		// escape them (because hosts can't use %-encoding for
		// ASCII bytes).
		// The list holds the single quote (a sub-delim), not the backslash,
		// which is not a sub-delim and therefore has to be escaped.
		if c in [`!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `:`, `[`, `]`, `<`, `>`,
			`"`] {
			return false
		}
	}
	match c {
		`-`, `_`, `.`, `~` {
			// §2.3 Unreserved characters (mark)
			return false
		}
		`$`, `&`, `+`, `,`, `/`, `:`, `;`, `=`, `?`, `@` {
			// §2.2 Reserved characters (reserved)
			// Different sections of the URL allow a few of
			// the reserved characters to appear unescaped.
			match mode {
				.encode_path {
					// §3.3
					// The RFC allows : @ & = + $ but saves / ; , for assigning
					// meaning to individual path segments. This package
					// only manipulates the path as a whole, so we allow those
					// last three as well. That leaves only ? to escape.
					return c == `?`
				}
				.encode_path_segment {
					// §3.3
					// The RFC allows : @ & = + $ but saves / ; , for assigning
					// meaning to individual path segments.
					return c == `/` || c == `;` || c == `,` || c == `?`
				}
				.encode_user_password {
					// §3.2.1
					// The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
					// userinfo, so we must escape only '@', '/', and '?'.
					// The parsing of userinfo treats ':' as special so we must escape
					// that too.
					return c == `@` || c == `/` || c == `?` || c == `:`
				}
				.encode_query_component {
					// §3.4
					// The RFC reserves (so we must escape) everything.
					return true
				}
				.encode_fragment {
					// §4.1
					// The RFC text is silent but the grammar allows
					// everything, so escape nothing.
					return false
				}
				else {}
			}
		}
		else {}
	}
	if mode == .encode_fragment {
		// RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
		// included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
		// need to be escaped. To minimize potential breakage, we apply two restrictions:
		// (1) we always escape sub-delims outside of the fragment, and (2) we always
		// escape single quote to avoid breaking callers that had previously assumed that
		// single quotes would be escaped. See issue #19917.
		match c {
			`!`, `(`, `)`, `*` { return false }
			else {}
		}
	}
	// Everything else must be escaped.
	return true
}
// query_unescape is the inverse of query_escape: every '%AB' sequence is
// replaced by the byte 0xAB and every '+' by a space.
// It delegates to unescape in query-component mode, where a '%' that does
// not start a valid escape is kept as a literal '%' as long as at least one
// more byte follows it; a malformed '%' at the very end is an error.
pub fn query_unescape(s string) !string {
	decoded := unescape(s, .encode_query_component)!
	return decoded
}
// path_unescape is the inverse of path_escape: every '%AB' sequence is
// replaced by the byte 0xAB. An error is returned when a '%' is not
// followed by two hexadecimal digits.
//
// Unlike query_unescape it leaves '+' untouched instead of turning it
// into a space.
pub fn path_unescape(s string) !string {
	decoded := unescape(s, .encode_path_segment)!
	return decoded
}
// unescape decodes the %XX sequences of s_; mode specifies which section
// of the URL string is being unescaped.
//
// It works in two passes: the first one validates the escapes and counts
// them, the second one builds the decoded string.
// - In .encode_query_component mode '+' becomes a space, and a '%' that
//   does not start a valid escape is rewritten to '%25' (so it decodes to a
//   literal '%') when at least one more byte follows it.
// - In every other case a malformed escape yields an error.
fn unescape(s_ string, mode EncodingMode) !string {
	mut s := s_
	// Count %, check that they're well-formed.
	mut n := 0
	mut has_plus := false
	for i := 0; i < s.len; {
		x := s[i]
		match x {
			`%` {
				n++
				if i + 2 >= s.len || !ishex(s[i + 1]) || !ishex(s[i + 2]) {
					if mode == .encode_query_component && i + 1 < s.len {
						s = s[..i] + '%25' + s[(i + 1)..]
						// Step over the inserted '%25' only. The byte that followed
						// the stray '%' still has to go through this loop: it may
						// itself be a '%' that needs validating.
						i += 3
						continue
					}
					s = s[i..]
					if s.len > 3 {
						s = s[..3]
					}
					return error(error_msg(urllib.err_msg_escape, s))
				}
				// Per https://tools.ietf.org/html/rfc3986#page-21
				// in the host component %-encoding can only be used
				// for non-ASCII bytes.
				// But https://tools.ietf.org/html/rfc6874#section-2
				// introduces %25 being allowed to escape a percent sign
				// in IPv6 scoped-address literals. Yay.
				// NOTE(review): because of `i + 3 >= s.len` this check only fires
				// for an escape at the very end of s - confirm that is intended.
				if i + 3 >= s.len && mode == .encode_host && unhex(s[i + 1]) < 8
					&& s[i..i + 3] != '%25' {
					return error(error_msg(urllib.err_msg_escape, s[i..i + 3]))
				}
				if mode == .encode_zone {
					// RFC 6874 says basically 'anything goes' for zone identifiers
					// and that even non-ASCII can be redundantly escaped,
					// but it seems prudent to restrict %-escaped bytes here to those
					// that are valid host name bytes in their unescaped form.
					// That is, you can use escaping in the zone identifier but not
					// to introduce bytes you couldn't just write directly.
					// But Windows puts spaces here! Yay.
					if i + 3 >= s.len {
						return error(error_msg('unescape: invalid escape sequence', ''))
					}
					v := ((unhex(s[i + 1]) << u8(4)) | unhex(s[i + 2]))
					if s[i..i + 3] != '%25' && v != ` ` && should_escape(v, .encode_host) {
						// NOTE(review): the error value is built but not returned, so
						// this check does not reject anything. Left as is, because
						// returning it would reject inputs that are accepted today.
						error(error_msg(urllib.err_msg_escape, s[i..i + 3]))
					}
				}
				i += 3
			}
			`+` {
				has_plus = mode == .encode_query_component
				i++
			}
			else {
				if (mode == .encode_host || mode == .encode_zone) && s[i] < 0x80
					&& should_escape(s[i], mode) {
					// NOTE(review): same as above - the error is built but not
					// returned, so invalid host bytes are currently let through.
					error(error_msg('unescape: invalid character in host name', s[i..i + 1]))
				}
				i++
			}
		}
	}
	if n == 0 && !has_plus {
		return '${s}' // TODO `return s` once an autofree bug is fixed
	}
	if s.len < 2 * n {
		return error(error_msg('unescape: invalid escape sequence', ''))
	}
	// Every escape shrinks from three bytes to one.
	mut t := strings.new_builder(s.len - 2 * n)
	for i := 0; i < s.len; i++ {
		x := s[i]
		match x {
			`%` {
				if i + 2 >= s.len {
					return error(error_msg('unescape: invalid escape sequence', ''))
				}
				t.write_string(((unhex(s[i + 1]) << u8(4)) | unhex(s[i + 2])).ascii_str())
				i += 2
			}
			`+` {
				if mode == .encode_query_component {
					t.write_string(' ')
				} else {
					t.write_string('+')
				}
			}
			else {
				t.write_string(s[i].ascii_str())
			}
		}
	}
	return t.str()
}
// query_escape returns s escaped for use inside a URL query. It runs
// escape in query-component mode, where a space is written as '+'.
pub fn query_escape(s string) string {
	escaped := escape(s, .encode_query_component)
	return escaped
}
// path_escape returns s escaped for use as a single URL path segment;
// special characters (including '/') are replaced by %XX sequences.
pub fn path_escape(s string) string {
	escaped := escape(s, .encode_path_segment)
	return escaped
}
// escape percent-encodes every byte of s for which should_escape(byte, mode)
// is true, using upper case hex digits. In query-component mode a space is
// written as '+' instead. s itself is returned when nothing needs escaping.
fn escape(s string, mode EncodingMode) string {
	// Measure first: spaces that turn into '+' keep the length, every other
	// escaped byte grows from one to three bytes.
	mut plus_total := 0
	mut pct_total := 0
	for ch in s {
		if !should_escape(ch, mode) {
			continue
		}
		if ch == ` ` && mode == .encode_query_component {
			plus_total++
		} else {
			pct_total++
		}
	}
	if plus_total == 0 && pct_total == 0 {
		return s
	}
	mut out := []u8{len: s.len + 2 * pct_total}
	if pct_total == 0 {
		// Only spaces have to be rewritten, byte positions stay the same.
		for idx in 0 .. s.len {
			if s[idx] == ` ` {
				out[idx] = `+`
			} else {
				out[idx] = s[idx]
			}
		}
		return out.bytestr()
	}
	hex_digits := '0123456789ABCDEF'
	mut pos := 0
	for ch in s {
		if ch == ` ` && mode == .encode_query_component {
			out[pos] = `+`
			pos++
		} else if should_escape(ch, mode) {
			out[pos] = `%`
			out[pos + 1] = hex_digits[ch >> 4]
			out[pos + 2] = hex_digits[ch & 15]
			pos += 3
		} else {
			out[pos] = ch
			pos++
		}
	}
	return out.bytestr()
}
// URL is a parsed URL (technically, a URI reference).
// The general form represented is:
//
//	[scheme:][//[userinfo@]host][/]path[?query][#fragment]
//
// URLs that do not start with a slash after the scheme are interpreted as:
//
//	scheme:opaque[?query][#fragment]
//
// The path field holds the decoded form: /%47%6f%2f becomes /Go/. It is
// therefore impossible to tell which slashes in path were slashes in the
// raw URL and which were %2f. When that matters, use raw_path, an optional
// field that is only set when the default encoding differs from path.
//
// The str method obtains the path through escaped_path; see that method
// for details.
pub struct URL {
pub mut:
	scheme      string
	opaque      string    // encoded opaque data
	user        &Userinfo = unsafe { nil } // username and password information
	host        string    // host or host:port
	path        string    // path (relative paths may omit leading slash)
	raw_path    string    // encoded path hint (see escaped_path method)
	force_query bool      // append a query ('?') even if raw_query is empty
	raw_query   string    // encoded query values, without '?'
	fragment    string    // fragment for references, without '#'
}
// debug returns a multi-line dump of *ALL* the fields of the given URL,
// one 'name: value' line per field, wrapped in 'URL{' ... '}'.
pub fn (url &URL) debug() string {
	lines := [
		'URL{',
		' scheme: ${url.scheme}',
		' opaque: ${url.opaque}',
		' user: ${url.user}',
		' host: ${url.host}',
		' path: ${url.path}',
		' raw_path: ${url.raw_path}',
		' force_query: ${url.force_query}',
		' raw_query: ${url.raw_query}',
		' fragment: ${url.fragment}',
		'}',
	]
	return lines.join('\n')
}
2019-08-01 16:01:03 +03:00
// user returns a Userinfo that carries the given username, an empty
// password and password_set == false.
pub fn user(username string) &Userinfo {
	return &Userinfo{username, '', false}
}
// user_password returns a Userinfo that carries the given username and
// password, with password_set == true.
//
// This functionality should only be used with legacy web sites.
// RFC 2396 warns that interpreting Userinfo this way
// "is NOT RECOMMENDED, because the passing of authentication
// information in clear text (such as URI) has proven to be a
// security risk in almost every case where it has been used."
fn user_password(username string, password string) &Userinfo {
	return &Userinfo{
		username: username
		password: password
		password_set: true
	}
}
// Userinfo holds the username and password details of a URL. Its fields
// are public but read-only. The username may be empty (as allowed by
// RFC 2396); password_set tells whether a password was given at all.
struct Userinfo {
pub:
	username     string
	password     string
	password_set bool
}
// empty reports whether u is nil or has neither a username nor a password.
fn (u &Userinfo) empty() bool {
	if isnil(u) {
		return true
	}
	return u.username == '' && u.password == ''
}
// str returns the encoded userinfo in the standard form
// 'username[:password]', or '' when u is empty. The ':password' part is
// only written when password_set is true.
fn (u &Userinfo) str() string {
	if u.empty() {
		return ''
	}
	name := escape(u.username, .encode_user_password)
	if !u.password_set {
		return name
	}
	return name + ':' + escape(u.password, .encode_user_password)
}
// split_by_scheme checks whether rawurl has the form scheme:path, where
// scheme must match [a-zA-Z][a-zA-Z0-9+-.]*.
// If so it returns [scheme, path], otherwise ['', rawurl].
// A rawurl that starts with ':' is an error (missing protocol scheme).
fn split_by_scheme(rawurl string) ![]string {
	for pos, ch in rawurl {
		match ch {
			`a`...`z`, `A`...`Z` {
				// letters are fine at any position
			}
			`0`...`9`, `+`, `-`, `.` {
				// allowed in a scheme, but not as its first byte
				if pos == 0 {
					return ['', rawurl]
				}
			}
			`:` {
				if pos == 0 {
					return error(error_msg('split_by_scheme: missing protocol scheme',
						''))
				}
				return [rawurl[..pos], rawurl[pos + 1..]]
			}
			else {
				// we have encountered an invalid character,
				// so there is no valid scheme
				return ['', rawurl]
			}
		}
	}
	return ['', rawurl]
}
// get_scheme returns the scheme of rawurl, or '' when rawurl has none.
// An error from split_by_scheme is propagated to the caller instead of
// being handed back as if its message were the scheme.
fn get_scheme(rawurl string) !string {
	parts := split_by_scheme(rawurl)!
	return parts[0]
}
2023-05-12 09:31:27 +03:00
// split cuts s in two at the first occurrence of sep.
// If cutc is true, sep is dropped; otherwise sep stays at the start of the
// second substring.
// If sep does not occur in s, s and the empty string are returned.
fn split(s string, sep u8, cutc bool) (string, string) {
	pos := s.index_u8(sep)
	if pos < 0 {
		return s, ''
	}
	tail_start := if cutc { pos + 1 } else { pos }
	return s[..pos], s[tail_start..]
}
// parse parses rawurl into a URL structure.
//
// The rawurl may be relative (a path, without a host) or absolute
// (starting with a scheme). Trying to parse a hostname and path
// without a scheme is invalid but may not necessarily return an
// error, due to parsing ambiguities.
//
// Any failure is reported as a 'parse: failed parsing url' error that
// mentions the part of rawurl in front of the fragment.
pub fn parse(rawurl string) !URL {
	// Everything after the first '#' is the fragment.
	base, frag := split(rawurl, `#`, true)
	mut url := parse_url(base, false) or {
		return error(error_msg(urllib.err_msg_parse, base))
	}
	if frag != '' {
		url.fragment = unescape(frag, .encode_fragment) or {
			return error(error_msg(urllib.err_msg_parse, base))
		}
	}
	return url
}
// parse_request_uri parses rawurl into a URL structure. It assumes that
// rawurl was received in an HTTP request, so it is interpreted only as an
// absolute URI or an absolute path (parse_url is called with
// via_request == true).
// rawurl is assumed not to have a #fragment suffix.
// (Web browsers strip #fragment before sending the URL to a web server.)
fn parse_request_uri(rawurl string) !URL {
	url := parse_url(rawurl, true)!
	return url
}
// parse_url parses a URL from a string in one of two contexts. If
// via_request is true, the URL is assumed to have arrived via an HTTP request,
// in which case only absolute URLs or path-absolute relative URLs are allowed.
// If via_request is false, all forms of relative URLs are allowed.
//
// Errors are returned for control characters in rawurl, for an empty
// rawurl in request context, for a relative reference whose first path
// segment contains a colon, and for anything that split_by_scheme,
// parse_authority or set_path reject.
[manualfree]
fn parse_url(rawurl string, via_request bool) !URL {
	if string_contains_ctl_u8(rawurl) {
		return error(error_msg('parse_url: invalid control character in URL', rawurl))
	}
	if rawurl == '' && via_request {
		return error(error_msg('parse_url: empty URL', rawurl))
	}
	mut url := URL{
		user: unsafe { nil }
	}
	if rawurl == '*' {
		url.path = '*'
		return url
	}
	// Split off possible leading 'http:', 'mailto:', etc.
	// Cannot contain escaped characters.
	p := split_by_scheme(rawurl)!
	url.scheme = p[0].to_lower()
	mut rest := p[1]
	// A lone trailing '?' means "empty query, but keep the '?'". The whole
	// of rest has to be inspected for that, not just its first byte.
	if rest.ends_with('?') && rest.count('?') == 1 {
		url.force_query = true
		rest = rest[..rest.len - 1]
	} else {
		r, raw_query := split(rest, `?`, true)
		rest = r
		url.raw_query = raw_query
	}
	if !rest.starts_with('/') {
		if url.scheme != '' {
			// We consider rootless paths per RFC 3986 as opaque.
			url.opaque = rest
			return url
		}
		if via_request {
			return error(error_msg('parse_url: invalid URI for request', ''))
		}
		// Avoid confusion with malformed schemes, like cache_object:foo/bar.
		// See golang.org/issue/16822.
		//
		// RFC 3986, §3.3:
		// In addition, a URI reference (Section 4.1) may be a relative-path reference,
		// in which case the first path segment cannot contain a colon (':') character.
		//
		// A missing ':' or '/' is the normal case for a relative reference,
		// so it maps to -1 here instead of aborting the parse.
		colon := rest.index(':') or { -1 }
		slash := rest.index('/') or { -1 }
		if colon >= 0 && (slash < 0 || colon < slash) {
			// First path segment has colon. Not allowed in relative URL.
			return error(error_msg('parse_url: first path segment in URL cannot contain colon',
				''))
		}
	}
	// NOTE(review): Go's net/url groups this condition as
	// scheme != '' || (!via_request && !starts_with('///')); confirm that the
	// grouping used here (which skips the authority for 'scheme:///path') is
	// intended.
	if ((url.scheme != '' || !via_request) && !rest.starts_with('///')) && rest.starts_with('//') {
		authority, r := split(rest[2..], `/`, false)
		rest = r
		a := parse_authority(authority)!
		url.user = a.user
		url.host = a.host
	}
	// Set path and, optionally, raw_path.
	// raw_path is a hint of the encoding of path. We don't want to set it if
	// the default escaping of path is equivalent, to help make sure that people
	// don't rely on it in general.
	url.set_path(rest)!
	return url
}
// ParseAuthorityRes is the result of parse_authority: the decoded
// userinfo and the host (host or host:port) of an authority.
struct ParseAuthorityRes {
	user &Userinfo = unsafe { nil }
	host string
}
// parse_authority splits authority ('[userinfo@]host[:port]') at its last
// '@' into a host and a userinfo part and decodes both.
// Without an '@' the result carries user(''). The host is parsed first, so
// a host error is reported before any userinfo error.
fn parse_authority(authority string) !ParseAuthorityRes {
	at := authority.last_index('@') or { -1 }
	if at < 0 {
		plain_host := parse_host(authority)!
		return ParseAuthorityRes{
			host: plain_host
			user: user('')
		}
	}
	host := parse_host(authority[at + 1..])!
	raw_userinfo := authority[..at]
	if !valid_userinfo(raw_userinfo) {
		return error(error_msg('parse_authority: invalid userinfo', ''))
	}
	if !raw_userinfo.contains(':') {
		// username only
		only_name := unescape(raw_userinfo, .encode_user_password)!
		return ParseAuthorityRes{
			user: user(only_name)
			host: host
		}
	}
	// 'username:password', cut at the first ':'
	raw_name, raw_pass := split(raw_userinfo, `:`, true)
	name := unescape(raw_name, .encode_user_password)!
	pass := unescape(raw_pass, .encode_user_password)!
	return ParseAuthorityRes{
		user: user_password(name, pass)
		host: host
	}
}
// parse_host parses host as an authority without user
// information. That is, as host[:port].
// It returns the unescaped host, or an error when the port is not valid,
// when a '[' has no matching ']', or when unescaping fails. Unescape
// errors are propagated as errors; they are never returned as the host.
fn parse_host(host string) !string {
	if host.starts_with('[') {
		// parse an IP-Literal in RFC 3986 and RFC 6874.
		// E.g., '[fe80::1]', '[fe80::1%25en0]', '[fe80::1]:80'.
		i := host.last_index(']') or {
			return error(error_msg("parse_host: missing ']' in host", ''))
		}
		colon_port := host[i + 1..]
		if !valid_optional_port(colon_port) {
			return error(error_msg('parse_host: invalid port ${colon_port} after host ',
				''))
		}
		// RFC 6874 defines that %25 (%-encoded percent) introduces
		// the zone identifier, and the zone identifier can use basically
		// any %-encoding it likes. That's different from the host, which
		// can only %-encode non-ASCII bytes.
		// We do impose some restrictions on the zone, to avoid stupidity
		// like newlines.
		if zone := host[..i].index('%25') {
			host1 := unescape(host[..zone], .encode_host)!
			host2 := unescape(host[zone..i], .encode_zone)!
			host3 := unescape(host[i..], .encode_host)!
			return host1 + host2 + host3
		}
	} else if i := host.last_index(':') {
		colon_port := host[i..]
		if !valid_optional_port(colon_port) {
			return error(error_msg('parse_host: invalid port ${colon_port} after host ',
				''))
		}
	}
	return unescape(host, .encode_host)
}
2019-08-01 16:01:03 +03:00
// set_path sets the path and raw_path fields of the URL from the escaped
// path p. raw_path is only filled in when p differs from the default
// encoding of the decoded path.
// For example:
// - set_path('/foo/bar') will set path='/foo/bar' and raw_path=''
// - set_path('/foo%2fbar') will set path='/foo/bar' and raw_path='/foo%2fbar'
// An error is returned (and the URL left untouched) only when p contains
// an invalid escaping; otherwise the result is true.
pub fn (mut u URL) set_path(p string) !bool {
	decoded := unescape(p, .encode_path)!
	u.path = decoded
	if escape(decoded, .encode_path) == p {
		u.raw_path = ''
	} else {
		u.raw_path = p
	}
	return true
}
// escaped_path returns the escaped form of u.path.
// In general there are multiple possible escaped forms of any path.
// escaped_path returns u.raw_path when it is a valid escaping of u.path,
// i.e. when it only contains bytes allowed in an encoded path and decodes
// to exactly u.path.
// Otherwise (including when u.raw_path cannot be decoded) escaped_path
// ignores u.raw_path and computes an escaped form on its own.
// The str and request_uri methods use escaped_path to construct
// their results.
// In general, code should call escaped_path instead of
// reading u.raw_path directly.
pub fn (u &URL) escaped_path() string {
	if u.raw_path != '' && valid_encoded_path(u.raw_path) {
		if decoded := unescape(u.raw_path, .encode_path) {
			// Only trust the hint while it still matches path; path is a
			// public mutable field and may have been changed since.
			if decoded == u.path {
				return u.raw_path
			}
		}
	}
	if u.path == '*' {
		return '*' // don't escape (Issue 11202)
	}
	return escape(u.path, .encode_path)
}
// valid_encoded_path reports whether s is a valid encoded path.
// It must not contain any bytes that require escaping during path encoding.
fn valid_encoded_path(s string) bool {
	for i in 0 .. s.len {
		// RFC 3986, Appendix A.
		// pchar = unreserved / pct-encoded / sub-delims / ':' / '@'.
		// should_escape is not quite compliant with the RFC,
		// so we check the sub-delims ourselves and let
		// should_escape handle the others.
		x := s[i]
		match x {
			// The sub-delims include the single quote; the backslash is not one
			// of them and is left to should_escape below.
			`!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `:`, `@` {
				// ok
			}
			`[`, `]` {
				// ok - not specified in RFC 3986 but left alone by modern browsers
			}
			`%` {
				// ok - percent encoded, will decode
			}
			else {
				if should_escape(s[i], .encode_path) {
					return false
				}
			}
		}
	}
	return true
}
// valid_optional_port reports whether port is either an empty string
// or matches /^:\d*$/ (a colon followed by any number of ASCII digits).
fn valid_optional_port(port string) bool {
	if port.len == 0 {
		return true
	}
	if port[0] != `:` {
		return false
	}
	for idx := 1; idx < port.len; idx++ {
		digit := port[idx] >= `0` && port[idx] <= `9`
		if !digit {
			return false
		}
	}
	return true
}
// str reassembles the URL into a valid URL string.
// The general form of the result is one of:
//
//	scheme:opaque?query#fragment
//	scheme://userinfo@host/path?query#fragment
//
// If u.opaque is non-empty, str uses the first form;
// otherwise it uses the second form.
// The host is passed through escape in host mode.
// To obtain the path, str uses u.escaped_path().
//
// In the second form, the following rules apply:
//	- if u.scheme is empty, scheme: is omitted.
//	- if u.user is empty, userinfo@ is omitted.
//	- if u.host is empty, host/ is omitted.
//	- if u.scheme and u.host are empty and u.user is empty,
//	   the entire scheme://userinfo@host/ is omitted.
//	- if u.host is non-empty and u.path begins with a /,
//	   the form host/path does not add its own /.
//	- if u.raw_query is empty and u.force_query is false, ?query is omitted.
//	- if u.fragment is empty, #fragment is omitted.
pub fn (u URL) str() string {
	mut out := strings.new_builder(200)
	if u.scheme != '' {
		out.write_string(u.scheme)
		out.write_string(':')
	}
	if u.opaque == '' {
		has_user := !u.user.empty()
		if u.scheme != '' || u.host != '' || has_user {
			if u.host != '' || u.path != '' || has_user {
				out.write_string('//')
			}
			if has_user {
				out.write_string(u.user.str())
				out.write_string('@')
			}
			if u.host != '' {
				out.write_string(escape(u.host, .encode_host))
			}
		}
		path := u.escaped_path()
		if path != '' && path[0] != `/` && u.host != '' {
			out.write_string('/')
		}
		if out.len == 0 {
			// RFC 3986 §4.2
			// A path segment that contains a colon character (e.g., 'this:that')
			// cannot be used as the first segment of a relative-path reference, as
			// it would be mistaken for a scheme name. Such a segment must be
			// preceded by a dot-segment (e.g., './this:that') to make a relative-
			// path reference.
			colon := path.index_u8(`:`)
			if colon > -1 {
				// TODO remove this nesting when autofree handles tmp
				// expressions like path[..colon]
				if path[..colon].index_u8(`/`) == -1 {
					out.write_string('./')
				}
			}
		}
		out.write_string(path)
	} else {
		out.write_string(u.opaque)
	}
	if u.force_query || u.raw_query != '' {
		out.write_string('?')
		out.write_string(u.raw_query)
	}
	if u.fragment != '' {
		out.write_string('#')
		out.write_string(escape(u.fragment, .encode_fragment))
	}
	return out.str()
}
// Values maps a string key to a list of values.
// It is typically used for query parameters and form values.
// Unlike in the http.Header map, the keys in a Values map
// are case-sensitive.
//
// parse_query parses the URL-encoded query string and returns the
// Values found in it. When a key or value cannot be decoded, an error is
// returned instead; use parse_query_silent to keep the pairs that did
// decode.
//
// Query is expected to be a list of key=value settings separated by
// ampersands or semicolons. A setting without an equals sign is
// interpreted as a key set to an empty value.
pub fn parse_query(query string) !Values {
	mut values := new_values()
	parse_query_values(mut values, query)!
	return values
}
// parse_query_silent does the same as parse_query, but ignores decoding
// errors: it returns whatever pairs could be parsed.
fn parse_query_silent(query string) Values {
	mut values := new_values()
	parse_query_values(mut values, query) or {}
	return values
}
// parse_query_values splits query at '&' and ';' into key[=value] pairs,
// unescapes both sides with query_unescape and adds every pair that
// decodes cleanly to m. Empty pairs are skipped.
// All pairs are processed even after a failure; if any key or value could
// not be decoded an error is returned at the end, otherwise true.
fn parse_query_values(mut m Values, query string) !bool {
	mut failed := false
	mut remaining := query
	for remaining != '' {
		// Cut the next pair off the front of remaining.
		mut pair := remaining
		sep := pair.index_any('&;')
		if sep < 0 {
			remaining = ''
		} else {
			remaining = pair[sep + 1..]
			pair = pair[..sep]
		}
		if pair == '' {
			continue
		}
		// A pair without '=' is a key with an empty value.
		mut raw_key := pair
		mut raw_value := ''
		if eq := pair.index('=') {
			raw_key = pair[..eq]
			raw_value = pair[eq + 1..]
		}
		key := query_unescape(raw_key) or {
			failed = true
			continue
		}
		value := query_unescape(raw_value) or {
			failed = true
			continue
		}
		m.add(key, value)
	}
	if failed {
		return error(error_msg('parse_query_values: failed parsing query string', ''))
	}
	return true
}
// encode encodes the values into "URL encoded" form
// ('bar=baz&foo=quux'). Keys and values are escaped with query_escape;
// a key whose value is empty is written without '='.
// The syntax of the query string is specified in
// RFC 1738 https://datatracker.ietf.org/doc/html/rfc1738
//
// HTTP grammar
//
// httpurl = "http://" hostport [ "/" hpath [ "?" search ]]
// hpath = hsegment *[ "/" hsegment ]
// hsegment = *[ uchar | ";" | ":" | "@" | "&" | "=" ]
// search = *[ uchar | ";" | ":" | "@" | "&" | "=" ]
pub fn (v Values) encode() string {
	if v.len == 0 {
		return ''
	}
	mut out := strings.new_builder(200)
	for entry in v.data {
		if out.len > 0 {
			out.write_string('&')
		}
		out.write_string(query_escape(entry.key))
		if entry.value != '' {
			out.write_string('=')
			out.write_string(query_escape(entry.value))
		}
	}
	return out.str()
}
// resolve_path resolves ref against base and removes the special path
// segments '.' and '..', per RFC 3986.
// An empty ref yields base; a ref that does not start with '/' replaces
// everything after the last '/' of base. The result is '' when the
// combined path is empty, otherwise it starts with a single '/'.
fn resolve_path(base string, ref string) string {
	mut full := ref
	if ref == '' {
		full = base
	} else if ref[0] != `/` {
		cut := base.last_index('/') or { -1 }
		full = base[..cut + 1] + ref
	}
	if full == '' {
		return ''
	}
	segments := full.split('/')
	mut kept := []string{}
	for seg in segments {
		if seg == '.' {
			// stays in the current directory: drop
			continue
		}
		if seg == '..' {
			// go up one level, if there is one
			if kept.len > 0 {
				kept = kept[..kept.len - 1]
			}
			continue
		}
		kept << seg
	}
	tail := segments[segments.len - 1]
	if tail == '.' || tail == '..' {
		// Add final slash to the joined path.
		kept << ''
	}
	return '/' + kept.join('/').trim_left('/')
}
// is_abs reports whether the URL is absolute, i.e. whether its
// scheme is non-empty.
pub fn (u &URL) is_abs() bool {
	return u.scheme.len > 0
}
// parse parses `ref` as a URL in the context of the receiver. The provided
// URL may be relative or absolute. A parse failure is propagated as an
// error; otherwise the result is that of resolve_reference applied to the
// parsed reference.
pub fn (u &URL) parse(ref string) !URL {
	parsed_ref := parse(ref)!
	resolved := u.resolve_reference(parsed_ref)!
	return resolved
}
// resolve_reference resolves a URI reference to an absolute URI from
// an absolute base URI u, per RFC 3986 Section 5.2. The URI reference
// may be relative or absolute. resolve_reference always returns a new
// URL instance, even if the returned URL is identical to either the
// base or reference. If ref is an absolute URL, then resolve_reference
// ignores base and returns a copy of ref.
// An error from set_path is propagated to the caller.
pub fn (u &URL) resolve_reference(ref &URL) !URL {
	mut url := *ref
	if ref.scheme == '' {
		url.scheme = u.scheme
	}
	if ref.scheme != '' || ref.host != '' || !ref.user.empty() {
		// The 'absoluteURI' or 'net_path' cases.
		// Only the reference's own path is normalised; the base path is not used.
		url.set_path(resolve_path(ref.escaped_path(), ''))!
		return url
	}
	if ref.opaque != '' {
		url.user = user('')
		url.host = ''
		url.path = ''
		return url
	}
	// Inherit the base query only when the reference has no query at all.
	// A bare '?' (force_query set, raw_query empty) is a defined-but-empty
	// query per RFC 3986 Section 5.2.2 and must not be replaced by the base's.
	if ref.path == '' && !ref.force_query && ref.raw_query == '' {
		url.raw_query = u.raw_query
		if ref.fragment == '' {
			url.fragment = u.fragment
		}
	}
	// The 'abs_path' or 'rel_path' cases.
	url.host = u.host
	url.user = u.user
	url.set_path(resolve_path(u.escaped_path(), ref.escaped_path()))!
	return url
}
// query parses raw_query with parse_query_silent and returns the
// corresponding values. Malformed value pairs are silently discarded,
// so use an error-reporting query parser when failures need to be detected.
pub fn (u &URL) query() Values {
	return parse_query_silent(u.raw_query)
}
// request_uri returns the encoded path?query or opaque?query
// string that would be used in an HTTP request for u.
pub fn (u &URL) request_uri() string {
	mut uri := ''
	if u.opaque == '' {
		// No opaque part: use the escaped path, defaulting to the root.
		escaped := u.escaped_path()
		uri = if escaped == '' { '/' } else { escaped }
	} else if u.opaque.starts_with('//') {
		// An opaque part beginning with '//' is prefixed with the scheme.
		uri = u.scheme + ':' + u.opaque
	} else {
		uri = u.opaque
	}
	if !u.force_query && u.raw_query == '' {
		return uri
	}
	return uri + '?' + u.raw_query
}
// hostname returns the host part of u.host as produced by split_host_port,
// i.e. with any valid port number stripped.
//
// If the result is enclosed in square brackets, as literal IPv6 addresses are,
// the square brackets are removed from the result.
pub fn (u &URL) hostname() string {
	name, _ := split_host_port(u.host)
	return name
}
// port returns the port part of u.host as produced by split_host_port,
// without the leading colon. If u.host doesn't contain a valid port,
// port returns an empty string.
pub fn (u &URL) port() string {
	_, port_part := split_host_port(u.host)
	return port_part
}
// split_host_port separates host and port. If the port is not valid, the
// port result is empty and the whole input is treated as the host; the
// validity of the host itself is not checked.
// Square brackets enclosing the host are removed from the result.
pub fn split_host_port(hostport string) (string, string) {
	mut hostname := hostport
	mut portnum := ''
	sep := hostport.last_index_u8(`:`)
	// Split only when there is a colon and the text from it onwards
	// passes valid_optional_port.
	if sep != -1 && valid_optional_port(hostport[sep..]) {
		portnum = hostport[sep + 1..]
		hostname = hostport[..sep]
	}
	if hostname.starts_with('[') && hostname.ends_with(']') {
		hostname = hostname[1..hostname.len - 1]
	}
	return hostname, portnum
}
// valid_userinfo reports whether s is a valid userinfo string per RFC 3986
// Section 3.2.1:
// userinfo = *( unreserved / pct-encoded / sub-delims / ":" )
// unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
// sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
// / "*" / "+" / "," / ";" / "="
//
// In addition to the grammar above, the bytes '%' and '@' are accepted.
// It doesn't validate pct-encoded. The caller does that via fn unescape.
pub fn valid_userinfo(s string) bool {
	for r in s {
		if `A` <= r && r <= `Z` {
			continue
		}
		if `a` <= r && r <= `z` {
			continue
		}
		if `0` <= r && r <= `9` {
			continue
		}
		match r {
			// The apostrophe is a sub-delim; a backslash is not part of the grammar.
			`-`, `.`, `_`, `:`, `~`, `!`, `$`, `&`, `'`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `%`,
			`@` {
				continue
			}
			else {
				return false
			}
		}
	}
	return true
}
// string_contains_ctl_u8 reports whether s contains any ASCII control
// character, i.e. a byte below 0x20 (space) or the byte 0x7f (DEL).
fn string_contains_ctl_u8(s string) bool {
	for ch in s {
		if ch == 0x7f || ch < 0x20 {
			return true
		}
	}
	return false
}
2022-04-15 18:25:45 +03:00
// ishex reports whether c is an ASCII hexadecimal digit
// (`0`-`9`, `a`-`f` or `A`-`F`).
pub fn ishex(c u8) bool {
	is_digit := `0` <= c && c <= `9`
	is_lower := `a` <= c && c <= `f`
	is_upper := `A` <= c && c <= `F`
	return is_digit || is_lower || is_upper
}
2022-04-15 18:25:45 +03:00
// unhex converts the ASCII hexadecimal digit c to its numeric value (0-15).
// Any byte that is not a hexadecimal digit yields 0.
fn unhex(c u8) u8 {
	if `0` <= c && c <= `9` {
		return c - `0`
	}
	if `a` <= c && c <= `f` {
		return c - `a` + 10
	}
	if `A` <= c && c <= `F` {
		return c - `A` + 10
	}
	return 0
}