diff --git a/vlib/builtin/string.v b/vlib/builtin/string.v index efaa62ffb4..45443bce76 100644 --- a/vlib/builtin/string.v +++ b/vlib/builtin/string.v @@ -399,6 +399,16 @@ pub fn (s string) index(p string) int { return -1 } +pub fn (s string) index_any(chars string) int { + for c in chars { + index := s.index(c.str()) + if index != -1 { + return index + } + } + return -1 +} + pub fn (s string) last_index(p string) int { if p.len > s.len { return -1 diff --git a/vlib/net/urllib/urllib.v b/vlib/net/urllib/urllib.v new file mode 100644 index 0000000000..8ccb5a4f75 --- /dev/null +++ b/vlib/net/urllib/urllib.v @@ -0,0 +1,1110 @@ +// Copyright (c) 2019 Alexander Medvednikov. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. + +// Package url parses URLs and implements query escaping. + +// See RFC 3986. This package generally follows RFC 3986, except where +// it deviates for compatibility reasons. + +// Based off: https://github.com/golang/go/blob/master/src/net/url/url.go +// Last commit: https://github.com/golang/go/commit/a326bc6df27309815e4a2ae005adef233cfb9ea9 +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +module urllib + +import strings + +enum EncodingMode { + EncodePath + EncodePathSegment + EncodeHost + EncodeZone + EncodeUserPassword + EncodeQueryComponent + EncodeFragment +} + +const ( + EscapeError = 'invalid URL escape' + ParseError = 'error parsing url' +) + +fn error_msg(message, val string) string { + mut msg := 'net.url: $message' + if val == '' { msg = '$msg ($val)' } + return msg +} + +// Return true if the specified character should be escaped when +// appearing in a URL string, according to RFC 3986. +// +// Please be informed that for now should_escape does not check all +// reserved characters correctly. See golang.org/issue/5684. +fn should_escape(c byte, mode EncodingMode) bool { + // §2.3 Unreserved characters (alphanum) + if `a` <= c && c <= `z` || `A` <= c && c <= `Z` || `0` <= c && c <= `9` { + return false + } + + if mode == .EncodeHost || mode == .EncodeZone { + // §3.2.2 host allows + // sub-delims = `!` / `$` / `&` / ``` / `(` / `)` / `*` / `+` / `,` / `;` / `=` + // as part of reg-name. + // We add : because we include :port as part of host. + // We add [ ] because we include [ipv6]:port as part of host. + // We add < > because they`re the only characters left that + // we could possibly allow, and parse will reject them if we + // escape them (because hosts can`t use %-encoding for + // ASCII bytes). + switch c { + case `!`, `$`, `&`, `\\`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `:`, `[`, `]`, `<`, `>`, `"`: + return false + } + } + + switch c { + case `-`, `_`, `.`, `~`: // §2.3 Unreserved characters (mark) + return false + + case `$`, `&`, `+`, `,`, `/`, `:`, `;`, `=`, `?`, `@`: // §2.2 Reserved characters (reserved) + // Different sections of the URL allow a few of + // the reserved characters to appear unescaped. + switch mode { + case EncodingMode.EncodePath: // §3.3 + // The RFC allows : @ & = + $ but saves / ; , for assigning + // meaning to individual path segments. This package + // only manipulates the path as a whole, so we allow those + // last three as well. That leaves only ? to escape. + return c == `?` + + case EncodingMode.EncodePathSegment: // §3.3 + // The RFC allows : @ & = + $ but saves / ; , for assigning + // meaning to individual path segments. + return c == `/` || c == `;` || c == `,` || c == `?` + + case EncodingMode.EncodeUserPassword: // §3.2.1 + // The RFC allows `;`, `:`, `&`, `=`, `+`, `$`, and `,` in + // userinfo, so we must escape only `@`, `/`, and `?`. + // The parsing of userinfo treats `:` as special so we must escape + // that too. + return c == `@` || c == `/` || c == `?` || c == `:` + + case EncodingMode.EncodeQueryComponent: // §3.4 + // The RFC reserves (so we must escape) everything. + return true + + case EncodingMode.EncodeFragment: // §4.1 + // The RFC text is silent but the grammar allows + // everything, so escape nothing. + return false + } + } + + if mode == .EncodeFragment { + // RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are + // included in reserved from RFC 2396 §2.2. The remaining sub-delims do not + // need to be escaped. To minimize potential breakage, we apply two restrictions: + // (1) we always escape sub-delims outside of the fragment, and (2) we always + // escape single quote to avoid breaking callers that had previously assumed that + // single quotes would be escaped. See issue #19917. + switch c { + case `!`, `(`, `)`, `*`: + return false + } + } + + // Everything else must be escaped. + return true +} + +// query_unescape does the inverse transformation of query_escape, +// converting each 3-byte encoded substring of the form '%AB' into the +// hex-decoded byte 0xAB. +// It returns an error if any % is not followed by two hexadecimal +// digits. +pub fn query_unescape(s string) ?string { + return unescape(s, .EncodeQueryComponent) +} + +// path_unescape does the inverse transformation of path_escape, +// converting each 3-byte encoded substring of the form '%AB' into the +// hex-decoded byte 0xAB. It returns an error if any % is not followed +// by two hexadecimal digits. +// +// path_unescape is identical to query_unescape except that it does not +// unescape '+' to ' ' (space). +pub fn path_unescape(s string) ?string { + return unescape(s, .EncodePathSegment) +} + +// unescape unescapes a string; the mode specifies +// which section of the URL string is being unescaped. +fn unescape(s string, mode EncodingMode) ?string { + // Count %, check that they're well-formed. + mut n := 0 + mut has_plus := false + for i := 0; i < s.len; { + x := s[i] + switch x { + case `%`: + if s == '' { + break + } + n++ + if i+2 >= s.len || !ishex(s[i+1]) || !ishex(s[i+2]) { + s = s.right(i) + if s.len > 3 { + s = s.left(3) + } + return error(error_msg(EscapeError, s)) + } + // Per https://tools.ietf.org/html/rfc3986#page-21 + // in the host component %-encoding can only be used + // for non-ASCII bytes. + // But https://tools.ietf.org/html/rfc6874#section-2 + // introduces %25 being allowed to escape a percent sign + // in IPv6 scoped-address literals. Yay. + if mode == .EncodeHost && unhex(s[i+1]) < 8 && s.substr(i, i+3) != '%25' { + return error(error_msg(EscapeError, s.substr(i, i+3))) + } + if mode == .EncodeZone { + // RFC 6874 says basically 'anything goes' for zone identifiers + // and that even non-ASCII can be redundantly escaped, + // but it seems prudent to restrict %-escaped bytes here to those + // that are valid host name bytes in their unescaped form. + // That is, you can use escaping in the zone identifier but not + // to introduce bytes you couldn't just write directly. + // But Windows puts spaces here! Yay. + v := byte(unhex(s[i+1])<>4] + t[j+2] = x[c1&15] + j += 3 + } else { + t[j] = s[i] + j++ + } + } + return string(t) +} + +// A URL represents a parsed URL (technically, a URI reference). +// +// The general form represented is: +// +// [scheme:][//[userinfo@]host][/]path[?query][#fragment] +// +// URLs that do not start with a slash after the scheme are interpreted as: +// +// scheme:opaque[?query][#fragment] +// +// Note that the path field is stored in decoded form: /%47%6f%2f becomes /Go/. +// A consequence is that it is impossible to tell which slashes in the path were +// slashes in the raw URL and which were %2f. This distinction is rarely important, +// but when it is, the code should use raw_path, an optional field which only gets +// set if the default encoding is different from path. +// +// URL's String method uses the escaped_path method to obtain the path. See the +// escaped_path method for more details. +struct URL { +pub: mut: + scheme string + opaque string // encoded opaque data + user *Userinfo // username and password information + host string // host or host:port + path string // path (relative paths may omit leading slash) + raw_path string // encoded path hint (see escaped_path method) + force_query bool // append a query ('?') even if raw_query is empty + raw_query string // encoded query values, without '?' + fragment string // fragment for references, without '#' +} + +// user returns a Userinfo containing the provided username +// and no password set. +pub fn user(username string) *Userinfo { + return &Userinfo{ + username: username, + password: '', + password_set: false + } +} + +// user_password returns a Userinfo containing the provided username +// and password. +// +// This functionality should only be used with legacy web sites. +// RFC 2396 warns that interpreting Userinfo this way +// ``is NOT RECOMMENDED, because the passing of authentication +// information in clear text (such as URI) has proven to be a +// security risk in almost every case where it has been used.'' +fn user_password(username, password string) *Userinfo { + return &Userinfo{username, password, true} +} + +// The Userinfo type is an immutable encapsulation of username and +// password details for a URL. An existing Userinfo value is guaranteed +// to have a username set (potentially empty, as allowed by RFC 2396), +// and optionally a password. +struct Userinfo { +pub: + username string + password string + password_set bool +} + +fn (u &Userinfo) empty() bool { + return u.username == '' && u.password == '' +} + +// string returns the encoded userinfo information in the standard form +// of 'username[:password]'. +fn (u &Userinfo) string() string { + if u.empty() { + return '' + } + mut s := escape(u.username, .EncodeUserPassword) + if u.password_set { + s += ':' + escape(u.password, .EncodeUserPassword) + } + return s +} + +// Maybe rawurl is of the form scheme:path. +// (scheme must be [a-zA-Z][a-zA-Z0-9+-.]*) +// If so, return [scheme, path]; else return ['', rawurl] +fn split_by_scheme(rawurl string) ?[]string { + for i := 0; i < rawurl.len; i++ { + c := rawurl[i] + if `a` <= c && c <= `z` || `A` <= c && c <= `Z` { + // do nothing + } + else if `0` <= c && c <= `9` || c == `+` || c == `-` || c == `.` { + if i == 0 { + return ['', rawurl] + } + } + else if c == `:` { + if i == 0 { + return error(error_msg('missing protocol scheme', '')) + } + return [rawurl.left(i), rawurl.right(i+1)] + } + else { + // we have encountered an invalid character, + // so there is no valid scheme + return ['', rawurl] + } + } + return ['', rawurl] +} + +fn get_scheme(rawurl string) ?string { + split := split_by_scheme(rawurl) or { + return err + } + return split[0] +} + +// Maybe s is of the form t c u. +// If so, return t, c u (or t, u if cutc == true). +// If not, return s, ''. +fn split(s string, c string, cutc bool) []string { + i := s.index(c) + if i < 0 { + return [s, ''] + } + if cutc { + return [s.left(i), s.right(i+c.len)] + } + return [s.left(i), s.right(i)] +} + +// parse parses rawurl into a URL structure. +// +// The rawurl may be relative (a path, without a host) or absolute +// (starting with a scheme). Trying to parse a hostname and path +// without a scheme is invalid but may not necessarily return an +// error, due to parsing ambiguities. +pub fn parse(rawurl string) ?URL { + // Cut off #frag + p := split(rawurl, '#', true) + u := p[0] + frag := p[1] + mut url := parse_request_url(u, false) or { + return error(error_msg(ParseError, u)) + } + if frag == '' { + return url + } + f := unescape(frag, .EncodeFragment) or { + return error(error_msg(ParseError, u)) + } + url.fragment = f + return url +} + +// parse_request_uri parses rawurl into a URL structure. It assumes that +// rawurl was received in an HTTP request, so the rawurl is interpreted +// only as an absolute URI or an absolute path. +// The string rawurl is assumed not to have a #fragment suffix. +// (Web browsers strip #fragment before sending the URL to a web server.) +fn parse_request_uri(rawurl string) ?URL { + return parse_request_url(rawurl, true) +} + +// parse parses a URL from a string in one of two contexts. If +// via_request is true, the URL is assumed to have arrived via an HTTP request, +// in which case only absolute URLs or path-absolute relative URLs are allowed. +// If via_request is false, all forms of relative URLs are allowed. +fn parse_request_url(rawurl string, via_request bool) ?URL { + if string_contains_ctl_byte(rawurl) { + return error(error_msg('invalid control character in URL', rawurl)) + } + + if rawurl == '' && via_request { + return error(error_msg('empty URL', '')) + } + mut url := URL{} + + if rawurl == '*' { + url.path = '*' + return url + } + + // Split off possible leading 'http:', 'mailto:', etc. + // Cannot contain escaped characters. + p := split_by_scheme(rawurl) or { + return error(err) + } + url.scheme = p[0] + mut rest := p[1] + url.scheme = url.scheme.to_lower() + + // if rest.ends_with('?') && strings.Count(rest, '?') == 1 { + if rest.ends_with('?') && !rest.trim_right('?').contains('?') { + url.force_query = true + rest = rest.left(rest.len-1) + } else { + parts := split(rest, '?', true) + rest = parts[0] + url.raw_query = parts[1] + } + + if !rest.starts_with('/') { + if url.scheme != '' { + // We consider rootless paths per RFC 3986 as opaque. + url.opaque = rest + return url + } + if via_request { + return error(error_msg('invalid URI for request', '')) + } + + // Avoid confusion with malformed schemes, like cache_object:foo/bar. + // See golang.org/issue/16822. + // + // RFC 3986, §3.3: + // In addition, a URI reference (Section 4.1) may be a relative-path reference, + // in which case the first path segment cannot contain a colon (':') character. + colon := rest.index(':') + slash := rest.index('/') + if colon >= 0 && (slash < 0 || colon < slash) { + // First path segment has colon. Not allowed in relative URL. + return error(error_msg('first path segment in URL cannot contain colon', '')) + } + } + + if (url.scheme != '' || !via_request && !rest.starts_with('///')) && rest.starts_with('//') { + parts := split(rest.right(2), '/', false) + authority := parts[0] + rest = parts[1] + a := parse_authority(authority) or { + return error(err) + } + url.user = a.user + url.host = a.host + } + // Set path and, optionally, raw_path. + // raw_path is a hint of the encoding of path. We don't want to set it if + // the default escaping of path is equivalent, to help make sure that people + // don't rely on it in general. + _ := url.set_path(rest) or { + return error(err) + } + return url +} + +struct ParseAuthorityRes { + user &Userinfo + host string +} + +fn parse_authority(authority string) ?ParseAuthorityRes { + i := authority.last_index('@') + mut host := '' + mut user := user('') + if i < 0 { + h := parse_host(authority) or { + return error(err) + } + host = h + } else { + h := parse_host(authority.right(i+1)) or { + return error(err) + } + host = h + } + if i < 0 { + return ParseAuthorityRes{host: host} + } + mut userinfo := authority.left(i) + if !valid_userinfo(userinfo) { + return error(error_msg('invalid userinfo', '')) + } + if !userinfo.contains(':') { + u := unescape(userinfo, .EncodeUserPassword) or { + return error(err) + } + userinfo = u + user = user(userinfo) + } else { + parts := split(userinfo, ':', true) + mut username := parts[0] + mut password := parts[1] + u := unescape(username, .EncodeUserPassword) or { + return error(err) + } + username = u + p := unescape(password, .EncodeUserPassword) or { + return error(err) + } + password = p + user = user_password(username, password) + } + return ParseAuthorityRes{ + user: user + host: host + } +} + +// parse_host parses host as an authority without user +// information. That is, as host[:port]. +fn parse_host(host string) ?string { + if host.starts_with('[') { + // parse an IP-Literal in RFC 3986 and RFC 6874. + // E.g., '[fe80::1]', '[fe80::1%25en0]', '[fe80::1]:80'. + i := host.last_index(']') + if i < 0 { + return error(error_msg('missing \']\' in host', '')) + } + colonport := host.right(i+1) + if !valid_optional_port(colonport) { + return error(error_msg('invalid port $colonport after host ', '')) + } + + // RFC 6874 defines that %25 (%-encoded percent) introduces + // the zone identifier, and the zone identifier can use basically + // any %-encoding it likes. That's different from the host, which + // can only %-encode non-ASCII bytes. + // We do impose some restrictions on the zone, to avoid stupidity + // like newlines. + zone := host.left(i).index('%25') + if zone >= 0 { + host1 := unescape(host.left(zone), .EncodeHost) or { + return err + } + host2 := unescape(host.substr(zone, i), .EncodeZone) or { + return err + } + host3 := unescape(host.right(i), .EncodeHost) or { + return err + } + return host1 + host2 + host3 + } + } + + h := unescape(host, .EncodeHost) or { + return err + } + host = h + return host +} + +// set_path sets the path and raw_path fields of the URL based on the provided +// escaped path p. It maintains the invariant that raw_path is only specified +// when it differs from the default encoding of the path. +// For example: +// - set_path('/foo/bar') will set path='/foo/bar' and raw_path='' +// - set_path('/foo%2fbar') will set path='/foo/bar' and raw_path='/foo%2fbar' +// set_path will return an error only if the provided path contains an invalid +// escaping. +fn (u &URL) set_path(p string) ?bool { + path := unescape(p, .EncodePath) or { + return error(err) + } + u.path = path + escp := escape(path, .EncodePath) + if p == escp { + // Default encoding is fine. + u.raw_path = '' + } else { + u.raw_path = p + } + return true +} + +// escaped_path returns the escaped form of u.path. +// In general there are multiple possible escaped forms of any path. +// escaped_path returns u.raw_path when it is a valid escaping of u.path. +// Otherwise escaped_path ignores u.raw_path and computes an escaped +// form on its own. +// The String and request_uri methods use escaped_path to construct +// their results. +// In general, code should call escaped_path instead of +// reading u.raw_path directly. +fn (u &URL) escaped_path() string { + if u.raw_path != '' && valid_encoded_path(u.raw_path) { + p := unescape(u.raw_path, .EncodePath) + return u.raw_path + } + if u.path == '*' { + return '*' // don't escape (Issue 11202) + } + return escape(u.path, .EncodePath) +} + +// valid_encoded_path reports whether s is a valid encoded path. +// It must not contain any bytes that require escaping during path encoding. +fn valid_encoded_path(s string) bool { + for i := 0; i < s.len; i++ { + // RFC 3986, Appendix A. + // pchar = unreserved / pct-encoded / sub-delims / ':' / '@'. + // should_escape is not quite compliant with the RFC, + // so we check the sub-delims ourselves and let + // should_escape handle the others. + x := s[i] + switch x { + case `!`, `$`, `&`, `\\`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `:`, `@`: + // ok + case `[`, `]`: + // ok - not specified in RFC 3986 but left alone by modern browsers + case `%`: + // ok - percent encoded, will decode + default: + if should_escape(s[i], .EncodePath) { + return false + } + } + } + return true +} + +// valid_optional_port reports whether port is either an empty string +// or matches /^:\d*$/ +fn valid_optional_port(port string) bool { + if port == '' { + return true + } + if port[0] != `:` { + return false + } + for b in port.right(1) { + if b < `0` || b > `9` { + return false + } + } + return true +} + +// str reassembles the URL into a valid URL string. +// The general form of the result is one of: +// +// scheme:opaque?query#fragment +// scheme://userinfo@host/path?query#fragment +// +// If u.opaque is non-empty, String uses the first form; +// otherwise it uses the second form. +// Any non-ASCII characters in host are escaped. +// To obtain the path, String uses u.escaped_path(). +// +// In the second form, the following rules apply: +// - if u.scheme is empty, scheme: is omitted. +// - if u.user is nil, userinfo@ is omitted. +// - if u.host is empty, host/ is omitted. +// - if u.scheme and u.host are empty and u.user is nil, +// the entire scheme://userinfo@host/ is omitted. +// - if u.host is non-empty and u.path begins with a /, +// the form host/path does not add its own /. +// - if u.raw_query is empty, ?query is omitted. +// - if u.fragment is empty, #fragment is omitted. +pub fn (u &URL) str() string { + mut buf := strings.new_builder(200) + if u.scheme != '' { + buf.write(u.scheme) + buf.write(':') + } + if u.opaque != '' { + buf.write(u.opaque) + } else { + if u.scheme != '' || u.host != '' || !u.user.empty() { + if u.host != '' || u.path != '' || !u.user.empty() { + buf.write('//') + } + if !u.user.empty() { + buf.write(u.user.string()) + buf.write('@') + } + if u.host != '' { + buf.write(escape(u.host, .EncodeHost)) + } + } + path := u.escaped_path() + if path != '' && path[0] != `/` && u.host != '' { + buf.write('/') + } + if buf.len == 0 { + // RFC 3986 §4.2 + // A path segment that contains a colon character (e.g., 'this:that') + // cannot be used as the first segment of a relative-path reference, as + // it would be mistaken for a scheme name. Such a segment must be + // preceded by a dot-segment (e.g., './this:that') to make a relative- + // path reference. + i := path.index(':') + if i > -1 && path.left(i).index('/') == -1 { + buf.write('./') + } + } + buf.write(path) + } + if u.force_query || u.raw_query != '' { + buf.write('?') + buf.write(u.raw_query) + } + if u.fragment != '' { + buf.write('#') + buf.write(escape(u.fragment, .EncodeFragment)) + } + return buf.str() +} + +// Values maps a string key to a list of values. +// It is typically used for query parameters and form values. +// Unlike in the http.Header map, the keys in a Values map +// are case-sensitive. + + +// parseQuery parses the URL-encoded query string and returns +// a map listing the values specified for each key. +// parseQuery always returns a non-nil map containing all the +// valid query parameters found; err describes the first decoding error +// encountered, if any. +// +// Query is expected to be a list of key=value settings separated by +// ampersands or semicolons. A setting without an equals sign is +// interpreted as a key set to an empty value. +pub fn parse_query(query string) ?Values { + mut m := new_values() + _ := _parse_query(mut m, query) or { + return error(err) + } + return m +} + +// parse_query_silent is the same as parse_query +// but any errors will be silent +fn parse_query_silent(query string) Values { + mut m := new_values() + _ := _parse_query(mut m, query) + return m +} + +fn _parse_query(m mut Values, query string) ?bool { + mut had_error := false + for query != '' { + mut key := query + mut i := key.index_any('&;') + if i >= 0 { + query = key.right(i+1) + key = key.left(i) + } else { + query = '' + } + if key == '' { + continue + } + mut value := '' + i = key.index('=') + if i >= 0 { + value = key.right(i+1) + key = key.left(i) + } + k := query_unescape(key) or { + had_error = true + continue + } + key = k + + v := query_unescape(value) or { + had_error = true + continue + } + value = v + m.add(key, value) + } + if had_error { + return error(error_msg('error parsing query string', '')) + } + return true +} + +// encode encodes the values into ``URL encoded'' form +// ('bar=baz&foo=quux') sorted by key. +fn (v Values) encode() string { + if v.size == 0 { + return '' + } + mut buf := strings.new_builder(200) + mut keys := []string + for k, _ in v.data { + keys << k + } + keys.sort() + for k in keys { + vs := v.data[k] + key_kscaped := query_escape(k) + for _, val in vs.data { + if buf.len > 0 { + buf.write('&') + } + buf.write(key_kscaped) + buf.write('=') + buf.write(query_escape(val)) + } + } + return buf.str() +} + +// resolve_path applies special path segments from refs and applies +// them to base, per RFC 3986. +fn resolve_path(base, ref string) string { + mut full := '' + if ref == '' { + full = base + } else if ref[0] != `/` { + i := base.last_index('/') + full = base.left(i+1) + ref + } else { + full = ref + } + if full == '' { + return '' + } + mut dst := []string + src := full.split('/') + for _, elem in src { + switch elem { + case '.': + // drop + case '..': + if dst.len > 0 { + dst = dst.left(dst.len-1) + } + default: + dst << elem + } + } + last := src[src.len-1] + if last == '.' || last == '..' { + // Add final slash to the joined path. + dst << '' + } + return '/' + dst.join('/').trim_left('/') +} + +// is_abs reports whether the URL is absolute. +// Absolute means that it has a non-empty scheme. +pub fn (u &URL) is_abs() bool { + return u.scheme != '' +} + +// parse parses a URL in the context of the receiver. The provided URL +// may be relative or absolute. parse returns nil, err on parse +// failure, otherwise its return value is the same as resolve_reference. +pub fn (u &URL) parse(ref string) ?URL { + refurl := parse(ref) or { + return error(err) + } + return u.resolve_reference(refurl) +} + +// resolve_reference resolves a URI reference to an absolute URI from +// an absolute base URI u, per RFC 3986 Section 5.2. The URI reference +// may be relative or absolute. resolve_reference always returns a new +// URL instance, even if the returned URL is identical to either the +// base or reference. If ref is an absolute URL, then resolve_reference +// ignores base and returns a copy of ref. +pub fn (u &URL) resolve_reference(ref &URL) ?URL { + mut url := *ref + if ref.scheme == '' { + url.scheme = u.scheme + } + if ref.scheme != '' || ref.host != '' || !ref.user.empty() { + // The 'absoluteURI' or 'net_path' cases. + // We can ignore the error from set_path since we know we provided a + // validly-escaped path. + url.set_path(resolve_path(ref.escaped_path(), '')) + return url + } + if ref.opaque != '' { + url.user = user('') + url.host = '' + url.path = '' + return url + } + if ref.path == '' && ref.raw_query == '' { + url.raw_query = u.raw_query + if ref.fragment == '' { + url.fragment = u.fragment + } + } + // The 'abs_path' or 'rel_path' cases. + url.host = u.host + url.user = u.user + url.set_path(resolve_path(u.escaped_path(), ref.escaped_path())) + return url +} + +// query parses raw_query and returns the corresponding values. +// It silently discards malformed value pairs. +// To check errors use parseQuery. +pub fn (u &URL) query() Values { + v := parse_query_silent(u.raw_query) + return v +} + +// request_uri returns the encoded path?query or opaque?query +// string that would be used in an HTTP request for u. +pub fn (u &URL) request_uri() string { + mut result := u.opaque + if result == '' { + result = u.escaped_path() + if result == '' { + result = '/' + } + } else { + if result.starts_with('//') { + result = u.scheme + ':' + result + } + } + if u.force_query || u.raw_query != '' { + result += '?' + u.raw_query + } + return result +} + +// hostname returns u.host, without any port number. +// +// If host is an IPv6 literal with a port number, hostname returns the +// IPv6 literal without the square brackets. IPv6 literals may include +// a zone identifier. +pub fn (u &URL) hostname() string { + return strip_port(u.host) +} + +// port returns the port part of u.host, without the leading colon. +// If u.host doesn't contain a port, port returns an empty string. +pub fn (u &URL) port() string { + return port_only(u.host) +} + +pub fn strip_port(hostport string) string { + colon := hostport.index(':') + if colon == -1 { + return hostport + } + i := hostport.index(']') + if i != -1 { + return hostport.left(i).trim_left('[') + } + return hostport.left(colon) +} + +pub fn port_only(hostport string) string { + colon := hostport.index(':') + if colon == -1 { + return '' + } + i := hostport.index(']:') + if i != -1 { + return hostport.right(i+(']:'.len)) + } + if hostport.contains(']') { + return '' + } + return hostport.right(colon+(':'.len)) +} + +// valid_userinfo reports whether s is a valid userinfo string per RFC 3986 +// Section 3.2.1: +// userinfo = *( unreserved / pct-encoded / sub-delims / ':' ) +// unreserved = ALPHA / DIGIT / '-' / '.' / '_' / '~' +// sub-delims = '!' / '$' / '&' / ''' / '(' / ')' +// / '*' / '+' / ',' / ';' / '=' +// +// It doesn't validate pct-encoded. The caller does that via fn unescape. +pub fn valid_userinfo(s string) bool { + for r in s { + if `A` <= r && r <= `Z` { + continue + } + if `a` <= r && r <= `z` { + continue + } + if `0` <= r && r <= `9` { + continue + } + switch r { + case `-`, `.`, `_`, `:`, `~`, `!`, `$`, `&`, `\\`, + `(`, `)`, `*`, `+`, `,`, `;`, `=`, `%`, `@`: + continue + default: + return false + } + } + return true +} + +// string_contains_ctl_byte reports whether s contains any ASCII control character. +fn string_contains_ctl_byte(s string) bool { + for i := 0; i < s.len; i++ { + b := s[i] + if b < ` ` || b == 0x7f { + return true + } + } + return false +} + +pub fn ishex(c byte) bool { + if `0` <= c && c <= `9` { + return true + } else if `a` <= c && c <= `f` { + return true + } else if `A` <= c && c <= `F` { + return true + } + return false +} + +fn unhex(c byte) byte { + if `0` <= c && c <= `9` { + return c - `0` + } else if `a` <= c && c <= `f` { + return c - `a` + 10 + } else if `A` <= c && c <= `F` { + return c - `A` + 10 + } + return 0 +} diff --git a/vlib/net/urllib/urllib_test.v b/vlib/net/urllib/urllib_test.v new file mode 100644 index 0000000000..0b254d7004 --- /dev/null +++ b/vlib/net/urllib/urllib_test.v @@ -0,0 +1,23 @@ +// Copyright (c) 2019 Alexander Medvednikov. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. + +import net.urllib + +fn test_new_urllib() { + test_query := 'Hellö Wörld@vlang' + assert urllib.query_escape(test_query) == 'Hell%C3%B6+W%C3%B6rld%40vlang' + + test_url := 'https://joe:pass@www.mydomain.com:8080/som/url?param1=test1¶m2=test2&foo=bar#testfragment' + u := urllib.parse(test_url) or { + assert false + return + } + assert u.scheme == 'https' && + u.hostname() == 'www.mydomain.com' && + u.port() == '8080' && + u.path == '/som/url' && + u.fragment == 'testfragment' && + u.user.username == 'joe' && + u.user.password == 'pass' +} diff --git a/vlib/net/urllib/values.v b/vlib/net/urllib/values.v new file mode 100644 index 0000000000..13b9ddbbed --- /dev/null +++ b/vlib/net/urllib/values.v @@ -0,0 +1,69 @@ +// Copyright (c) 2019 Alexander Medvednikov. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. + +module urllib + +struct ValueStruct { +pub: +mut: + data []string +} + +type Value ValueStruct + +struct Values { +pub: +mut: + data map[string]Value + size int +} + +fn new_values() Values { + return Values{ + data: map[string]Value{} + } +} + +pub fn (v Value) all() []string { + return v.data +} + +// Get gets the first value associated with the given key. +// If there are no values associated with the key, Get returns +// the empty string. To access multiple values, use the map +// directly. +fn (v Values) get(key string) string { + if v.data.size == 0 { + return '' + } + vs := v.data[key] + if vs.data.len == 0 { + return '' + } + return vs.data[0] +} + +// Set sets the key to value. It replaces any existing +// values. +fn (v Values) set(key, value string) { + v.data[key].data = [value] + v.size = v.data.size +} + +// Add adds the value to key. It appends to any existing +// values associated with key. +fn (v mut Values) add(key, value string) { + mut a := v.data[key] + a.data << value +} + +// Del deletes the values associated with key. +fn (v mut Values) del(key string) { + v.data.delete(key) + v.size = v.data.size +} + +pub fn (v Values) iter() map[string]Value { + return v.data +} diff --git a/vlib/strings/builder.v b/vlib/strings/builder.v index 59a5f32ab8..82399f0f15 100644 --- a/vlib/strings/builder.v +++ b/vlib/strings/builder.v @@ -7,6 +7,7 @@ module strings struct Builder { mut: buf []byte +pub: len int }