diff options
| author | Joel Klinghed <the_jk@yahoo.com> | 2017-02-28 21:50:44 +0100 |
|---|---|---|
| committer | Joel Klinghed <the_jk@yahoo.com> | 2017-02-28 21:50:44 +0100 |
| commit | c029d90d1975e124d237605f1edb2be16bd05b5d (patch) | |
| tree | 9df87ffb365354bdb74a969440b32c8304bdbcb7 /src/url.cc | |
Initial commit
Diffstat (limited to 'src/url.cc')
| -rw-r--r-- | src/url.cc | 901 |
1 files changed, 901 insertions, 0 deletions
diff --git a/src/url.cc b/src/url.cc new file mode 100644 index 0000000..b4e6c0f --- /dev/null +++ b/src/url.cc @@ -0,0 +1,901 @@ +// -*- mode: c++; c-basic-offset: 2; -*- + +#include "common.hh" + +#include <algorithm> +#include <cstring> +#include <iostream> +#include <memory> + +#include "paths.hh" +#include "url.hh" + +namespace { + +class UrlImpl : public Url { +public: + UrlImpl() + : userinfo_(nullptr), userinfo_unescaped_(nullptr), port_(0), + path_(nullptr), path_unescaped_(nullptr), query_(nullptr), + query_unescaped_(nullptr), fragment_(nullptr) { + } + ~UrlImpl() override { + delete[] userinfo_; + if (userinfo_unescaped_ != userinfo_) delete[] userinfo_unescaped_; + delete[] path_; + if (path_unescaped_ != path_) delete[] path_unescaped_; + delete[] query_; + if (query_unescaped_ != query_) delete[] query_unescaped_; + delete[] fragment_; + } + UrlImpl(UrlImpl const& url); + + bool parse(std::string const& url, Url const* base); + + Url* copy() const override { + return new UrlImpl(*this); + } + + std::string const& scheme() const override { + return scheme_; + } + + char const* userinfo() const override; + + char const* userinfo_escaped() const override { + return userinfo_; + } + + std::string const& host() const override { + return host_; + } + + uint16_t port() const override { + return port_; + } + + std::string path() const override; + + std::string path_escaped() const override { + return path_; + } + + bool query(std::string const& name, std::string* value) const override; + + char const* full_query() const override; + + char const* full_query_escaped() const override { + return query_; + } + + char const* fragment() const override { + return fragment_; + } + + void print(std::ostream& out, bool path = true, bool query = true, + bool fragment = true) const override; + +private: + char const* parse_authority(char const* pos); + char const* parse_query(char const* pos); + char const* parse_fragment(char const* pos); + bool relative(std::string const& url, Url const* base); + + std::string scheme_; + char* userinfo_; + mutable char* userinfo_unescaped_; + std::string host_; + uint16_t port_; + char* path_; + mutable char* path_unescaped_; + char* query_; + mutable char* query_unescaped_; + char* fragment_; +}; + +char* unescape(char* start, char* end, bool query); +char* escape(char const* str, char const* safe); +char* dup(char const* start, char const* end); +void lower(std::string& str); + +bool is_ipv4(char const* start, char const* end); +bool is_ipv6(char const* start, char const* end); + +char const subdelims[] = "!$&'()*+,;="; + +UrlImpl::UrlImpl(UrlImpl const& url) + : Url(), scheme_(url.scheme_), host_(url.host_), port_(url.port_) { + userinfo_ = dup(url.userinfo_, nullptr); + userinfo_unescaped_ = dup(url.userinfo_unescaped_, nullptr); + path_ = dup(url.path_, nullptr); + path_unescaped_ = dup(url.path_unescaped_, nullptr); + query_ = dup(url.query_, nullptr); + query_unescaped_ = dup(url.query_unescaped_, nullptr); + fragment_ = dup(url.fragment_, nullptr); +} + +inline bool is_alpha(char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + +inline bool is_digit(char c) { + return c >= '0' && c <= '9'; +} + +inline bool is_unreserved(char c) { + return is_alpha(c) || is_digit(c) || c == '-' || c == '.' || c == '_' || + c == '~'; +} + +inline bool is_hex(char c) { + return is_digit(c) || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); +} + +char* emptystr() { + char* ret = new char[1]; + ret[0] = '\0'; + return ret; +} + +bool UrlImpl::parse(std::string const& url, Url const* base) { + char const* pos = url.c_str(), *start; + + // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] + + // scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) + start = pos; + if (!is_alpha(*pos)) { + return relative(url, base); + } + pos++; + while (is_alpha(*pos) || is_digit(*pos) || *pos == '+' || + *pos == '-' || *pos == '.') { + pos++; + } + if (*pos != ':') { + return relative(url, base); + } + scheme_.assign(start, pos); + lower(scheme_); + pos++; + + /* + hier-part = "//" authority path-abempty + / path-absolute + / path-rootless + / path-empty + */ + if (memcmp(pos, "//", 2)) { + if (pos[0] == '/' && pos[1] != '/') { + // path-absolute = "/" [ segment-nz *( "/" segment ) ] + start = pos++; + } else if (pos[0] != '/') { + /* + path-rootless = segment-nz *( "/" segment ) + path-empty = 0<pchar> + */ + start = pos; + } + while (true) { + if (is_unreserved(*pos)) { + pos++; + } else if (*pos == '%' && is_hex(pos[1]) && is_hex(pos[2])) { + pos += 3; + } else if (*pos && strchr(subdelims, *pos)) { + pos++; + } else if (*pos == ':' || *pos == '@' || *pos == '/') { + pos++; + } else { + break; + } + } + + path_ = dup(start, pos); + } else { + pos += 2; + pos = parse_authority(pos); + if (!pos) { + return relative(url, base); + } + + if (*pos == '/') { + /* + path-abempty = *( "/" segment ) + segment = *pchar + pchar = unreserved / pct-encoded / sub-delims / ":" / "@" + */ + start = pos++; + while (true) { + if (is_unreserved(*pos)) { + pos++; + } else if (*pos == '%' && is_hex(pos[1]) && is_hex(pos[2])) { + pos += 3; + } else if (*pos && strchr(subdelims, *pos)) { + pos++; + } else if (*pos == ':' || *pos == '@' || *pos == '/') { + pos++; + } else { + break; + } + } + + path_ = dup(start, pos); + } else { + path_ = emptystr(); + } + } + + if (*pos == '?') { + pos = parse_query(pos); + if (!pos) { + return relative(url, base); + } + } + + if (*pos == '#') { + pos = parse_fragment(pos); + if (!pos) { + return relative(url, base); + } + } + + if (*pos != '\0') { + return relative(url, base); + } + + return true; +} + +bool UrlImpl::relative(std::string const& url, Url const* base) { + char const* pos = url.c_str(); + char const* start; + if (!base) return false; + + scheme_.clear(); + if (userinfo_unescaped_ != userinfo_) { + delete[] userinfo_unescaped_; userinfo_unescaped_ = nullptr; + } + delete[] userinfo_; userinfo_ = nullptr; + host_.clear(); + port_ = 0; + if (path_unescaped_ != path_) { + delete[] path_unescaped_; path_unescaped_ = nullptr; + } + delete[] path_; path_ = nullptr; + if (query_unescaped_ != query_) { + delete[] query_unescaped_; query_unescaped_ = nullptr; + } + delete[] query_; query_ = nullptr; + delete[] fragment_; fragment_ = nullptr; + + /* + relative-part = "//" authority path-abempty + / path-absolute + / path-noscheme + / path-empty + */ + if (memcmp(pos, "//", 2)) { + if (pos[0] == '/' && pos[1] != '/') { + // path-absolute = "/" [ segment-nz *( "/" segment ) ] + start = pos++; + } else if (pos[0] != '/') { + /* + path-noscheme = segment-nz-nc *( "/" segment ) + path-empty = 0<pchar> + */ + start = pos; + } + while (true) { + if (is_unreserved(*pos)) { + pos++; + } else if (*pos == '%' && is_hex(pos[1]) && is_hex(pos[2])) { + pos += 3; + } else if (*pos && strchr(subdelims, *pos)) { + pos++; + } else if (*pos == ':' || *pos == '@' || *pos == '/') { + pos++; + } else { + break; + } + } + + path_ = dup(start, pos); + } else { + pos += 2; + pos = parse_authority(pos); + if (!pos) return false; + + if (*pos == '/') { + /* + path-abempty = *( "/" segment ) + segment = *pchar + pchar = unreserved / pct-encoded / sub-delims / ":" / "@" + */ + start = pos++; + while (true) { + if (is_unreserved(*pos)) { + pos++; + } else if (*pos == '%' && is_hex(pos[1]) && is_hex(pos[2])) { + pos += 3; + } else if (*pos && strchr(subdelims, *pos)) { + pos++; + } else if (*pos == ':' || *pos == '@' || *pos == '/') { + pos++; + } else { + break; + } + } + + path_ = dup(start, pos); + } else { + path_ = emptystr(); + } + } + + if (*pos == '?') { + pos = parse_query(pos); + if (!pos) return false; + } + + if (*pos == '#') { + pos = parse_fragment(pos); + if (!pos) return false; + } + + if (*pos != '\0') return false; + + scheme_ = base->scheme(); + if (host_.empty()) { + auto userinfo = base->userinfo(); + if (userinfo) userinfo_ = dup(userinfo, userinfo + strlen(userinfo)); + host_ = base->host(); + port_ = base->port(); + } + if (path_[0] != '/') { + std::string tmp( + Paths::join(!base->path().empty() ? base->path() : "/", path_)); + delete[] path_; + path_ = dup(tmp.data(), tmp.data() + tmp.size()); + } + return true; +} + +char const* UrlImpl::parse_authority(char const* pos) { + /* authority = [ userinfo "@" ] host [ ":" port ] + userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) + host = IP-literal / IPv4address / reg-name + IP-literal = "[" ( IPv6address / IPvFuture ) "]" + IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) + IPv6address = 6( h16 ":" ) ls32 + / "::" 5( h16 ":" ) ls32 + / [ h16 ] "::" 4( h16 ":" ) ls32 + / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 + / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 + / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 + / [ *4( h16 ":" ) h16 ] "::" ls32 + / [ *5( h16 ":" ) h16 ] "::" h16 + / [ *6( h16 ":" ) h16 ] "::" + ls32 = ( h16 ":" h16 ) / IPv4address + h16 = 1*4HEXDIG + IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet + dec-octet = DIGIT ; 0-9 + / %x31-39 DIGIT ; 10-99 + / "1" 2DIGIT ; 100-199 + / "2" %x30-34 DIGIT ; 200-249 + / "25" %x30-35 ; 250-255 + reg-name = *( unreserved / pct-encoded / sub-delims ) + port = *DIGIT + */ + char const* start = pos; + char const* at = nullptr, *colon = nullptr; + char const* host_start, *host_end, *tmp; + while (true) { + if (is_unreserved(*pos)) { + pos++; + } else if (*pos == '%' && is_hex(pos[1]) && is_hex(pos[2])) { + pos += 3; + } else if (*pos && strchr(subdelims, *pos)) { + pos++; + } else if (*pos == '[' || *pos == ']') { + pos++; + } else if (*pos == ':') { + colon = pos++; + } else if (*pos == '@') { + if (at) { + return nullptr; + } + colon = nullptr; + at = pos++; + } else { + break; + } + } + + // userinfo? + if (at) { + host_start = at + 1; + userinfo_ = dup(start, at); + if (strchr(userinfo_, '[') || strchr(userinfo_, ']')) { + return nullptr; + } + } else { + host_start = start; + } + + // port? + if (colon) { + tmp = colon + 1; + if (tmp < pos && is_digit(*tmp)) { + uint16_t v = *tmp - '0'; + tmp++; + host_end = colon; + while (tmp < pos) { + uint16_t x; + if (!is_digit(*tmp)) { + host_end = pos; + break; + } + x = v; + v *= 10; + if (v < x) { + return nullptr; + } + v += *tmp - '0'; + tmp++; + } + if (host_end == colon) { + port_ = v; + } + } else { + host_end = pos; + } + } else { + host_end = pos; + } + + if (*host_start == '[') { + if (host_end[-1] != ']' || host_start + 1 >= host_end - 1) { + return nullptr; + } + host_start++; + host_end--; + if (*host_start == 'v') { + if (!is_hex(host_start[1]) || host_start[2] != '.') { + return nullptr; + } + host_.assign(host_start, host_end - host_start); + lower(host_); + } else { + if (!is_ipv6(host_start, host_end)) { + return nullptr; + } + host_.assign(host_start, host_end - host_start); + lower(host_); + } + if (host_.find('[') != std::string::npos || + host_.find(']') != std::string::npos) { + return nullptr; + } + } else { + tmp = host_start; + while (tmp < host_end) { + if (*tmp == '[' || *tmp == ']') { + return nullptr; + } + tmp++; + } + tmp = unescape(const_cast<char*>(host_start), const_cast<char*>(host_end), + false); + host_ = tmp; + lower(host_); + delete[] const_cast<char*>(tmp); + } + if (host_.empty()) return nullptr; + return pos; +} + +char const* UrlImpl::parse_query(char const* pos) { + // query = *( pchar / "/" / "?" ) + char const* start = ++pos; + while (true) { + if (is_unreserved(*pos)) { + pos++; + } else if (*pos == '%' && is_hex(pos[1]) && is_hex(pos[2])) { + pos += 3; + } else if (*pos && strchr(subdelims, *pos)) { + pos++; + } else if (*pos == ':' || *pos == '@' || *pos == '/' || + *pos == '?') { + pos++; + } else { + break; + } + } + + query_ = dup(start, pos); + return pos; +} + +char const* UrlImpl::parse_fragment(char const* pos) { + // fragment = *( pchar / "/" / "?" ) + char const* start = ++pos; + while (true) { + if (is_unreserved(*pos)) { + pos++; + } else if (*pos == '%' && is_hex(pos[1]) && is_hex(pos[2])) { + pos += 3; + } else if (*pos && strchr(subdelims, *pos)) { + pos++; + } else if (*pos == ':' || *pos == '@' || *pos == '/' || + *pos == '?') { + pos++; + } else { + break; + } + } + + fragment_ = unescape((char*) start, (char*) pos, false); + return pos; +} + +char const* UrlImpl::userinfo() const { + if (userinfo_ && !userinfo_unescaped_) { + userinfo_unescaped_ = unescape(userinfo_, nullptr, false); + } + return userinfo_unescaped_; +} + +std::string UrlImpl::path() const { + if (!path_unescaped_) { + path_unescaped_ = unescape(path_, nullptr, false); + } + return path_unescaped_; +} + +bool UrlImpl::query(std::string const& name, std::string* value) const { + char* pos; + if (!query_ || !*query_) { + return false; + } + pos = query_; + while (true) { + char* next = pos, *eq; + char* tmp; + while (*next && *next != '=' && *next != '&') { + next++; + } + if (*next == '=') { + eq = next++; + while (*next && *next != '&') { + next++; + } + } else { + eq = next; + } + tmp = unescape(pos, eq, true); + if (name.compare(tmp) == 0) { + delete[] tmp; + if (eq != next) { + std::unique_ptr<char[]> tmp(unescape(eq + 1, next, true)); + if (value) value->assign(tmp.get()); + return true; + } else { + if (value) value->clear(); + return true; + } + } + delete[] tmp; + if (!*next) { + return false; + } + pos = next + 1; + } +} + +char const* UrlImpl::full_query() const { + if (query_ && !query_unescaped_) { + query_unescaped_ = unescape(query_, nullptr, true); + } + return query_unescaped_; +} + +bool unhex(char c, uint8_t* ret) { + if (is_digit(c)) { + *ret = c - '0'; + return true; + } else if (c >= 'A' && c <= 'F') { + *ret = c - 'A' + 10; + return true; + } else if (c >= 'a' && c <= 'f') { + *ret = c - 'a' + 10; + return true; + } + return false; +} + +char* unescape(char* start, char* end, bool query) { + char* pos = start; + char* ret; + size_t o; + if (!start) { + return nullptr; + } + if (end) { + for (; pos < end; pos++) { + if (*pos == '%' || (query && *pos == '+')) { + break; + } + } + if (pos == end) { + return dup(start, end); + } + } else { + for (; *pos; pos++) { + if (*pos == '%' || (query && *pos == '+')) { + break; + } + } + if (!*pos) { + return start; + } + end = pos + strlen(pos); + } + ret = new char[end - start + 1]; + o = 0; + while (true) { + uint8_t h, l; + memcpy(ret + o, start, pos - start); + o += pos - start; + if (query && *pos == '+') { + ret[o++] = ' '; + pos++; + } else if (pos + 3 <= end && unhex(pos[1], &h) && unhex(pos[2], &l)) { + ret[o++] = h << 4 | l; + pos += 3; + } else { + ret[o++] = *(pos++); + } + start = pos; + while (pos < end && *pos != '%' && !(query && *pos == '+')) { + pos++; + } + if (pos == end) { + memcpy(ret + o, start, pos - start); + o += pos - start; + break; + } + } + ret[o] = '\0'; + return ret; +} + +bool unsafe(char c, char const* safe) { + if (is_unreserved(c) || (c && strchr(safe, c))) { + return false; + } + return true; +} + +char hex(uint8_t c) { + return c < 10 ? '0' + c : 'A' + c - 10; +} + +char* escape(char const* str, char const* safe) { + char const* pos = str; + size_t len; + char* ret; + size_t o; + while (*pos && !unsafe(*pos, safe)) { + pos++; + } + if (!*pos) { + return (char*) str; + } + len = strlen(pos); + ret = new char[(pos - str) + len * 3 + 1]; + o = 0; + while (true) { + memcpy(ret + o, str, pos - str); + o += pos - str; + ret[o++] = '%'; + ret[o++] = hex(*((const uint8_t*)pos) >> 4); + ret[o++] = hex(*((const uint8_t*)pos) & 0xf); + str = ++pos; + while (*pos && !unsafe(*pos, safe)) { + pos++; + } + if (!*pos) { + memcpy(ret + o, str, pos - str); + o += pos - str; + break; + } + } + ret[o] = '\0'; + return ret; +} + +char* dup(char const* start, char const* end) { + if (!start) return nullptr; + if (!end) end = start + strlen(start); + size_t len = end - start; + char* ret; + assert(start <= end); + ret = new char[len + 1]; + memcpy(ret, start, len); + ret[len] = '\0'; + return ret; +} + +inline char ascii_tolower(char c) { + return (c >= 'A' && c <= 'Z') ? ('a' + c - 'A') : c; +} + +void lower(std::string& str) { + std::transform(str.begin(), str.end(), str.begin(), ascii_tolower); +} + +bool is_ipv4(char const* start, char const* end) { + char const* pos = start; + size_t i = 0; + while (true) { + if (pos[0] == '2' && pos + 2 <= end) { + if (pos[1] == '5') { + if (pos[2] < '0' || pos[2] > '5') { + break; + } + } else { + if (pos[1] < '0' || pos[1] > '4' || !is_digit(pos[2])) { + break; + } + } + pos += 3; + } else if (pos[0] == '1' && pos + 2 <= end && + is_digit(pos[1]) && is_digit(pos[2])) { + pos += 3; + } else if ((pos[0] >= '1' && pos[0] <= '9') && pos + 1 <= end && + is_digit(pos[1])) { + pos += 2; + } else if (is_digit(pos[0])) { + pos++; + } else { + break; + } + + i++; + if (pos == end || *pos != '.') { + break; + } + pos++; + if (pos == end) { + return false; + } + } + return i == 4 && pos == end; +} + +size_t walk_hex(char const* start, char const* end, size_t max) { + size_t i = 0; + while (max-- && start < end) { + if (!is_hex(*start)) break; + start++; + i++; + } + return i; +} + +bool is_ipv6(char const* start, char const* end) { + size_t i = 0, j = 0, x; + char const* pos = start; + bool empty = false; + while (pos < end) { + if (*pos == ':') { + pos++; + if (*pos == ':') { + empty = true; + pos++; + break; + } + if (i == 0) { + return false; + } + } + x = walk_hex(pos, end, 4); + if (x == 0) { + return false; + } + pos += x; + i++; + } + + if (pos < end) { + while (true) { + x = walk_hex(pos, end, 4); + if (x == 0) { + return false; + } + pos += x; + if (pos == end) { + j++; + break; + } + if (*pos != ':') { + pos -= x; + break; + } + pos++; + j++; + } + } + + if (pos != end) { + if (!is_ipv4(pos, end)) { + return false; + } + j += 2; + } + + if (!empty) { + return i == 8; + } + + if (i + j > 7) { + return false; + } + return true; +} + +void UrlImpl::print(std::ostream& out, bool path, bool query, bool fragment) + const { + out << scheme_; + if (!host_.empty()) { + out << "://"; + if (userinfo_) { + out << userinfo_ << '@'; + } + out << host_; + if (port_) { + out << ':' << port_; + } + } else { + out << ':'; + } + if (path) { + out << path_; + } + if (query && query_ && *query_) { + out << '?' << query_; + } + if (fragment && fragment_ && *fragment_) { + out << '#'; + char* tmp = escape(fragment_, "/?"); + out << tmp; + if (tmp != fragment_) delete[] tmp; + } +} + +bool null_eq(char const* a1, char const* a2) { + if (a1 == a2) return true; + return a1 && a2 && strcmp(a1, a2) == 0; +} + +} // namespace + +// static +Url* Url::parse(std::string const& url, Url const* base) { + UrlImpl* ret = new UrlImpl(); + if (ret->parse(url, base)) return ret; + delete ret; + return nullptr; +} + +bool Url::operator==(Url const& url) const { + if (scheme() != url.scheme()) return false; + if (host() != url.host()) return false; + if (port() != url.port()) return false; + if (path_escaped() != url.path_escaped()) return false; + if (!null_eq(userinfo_escaped(), url.userinfo_escaped())) return false; + if (!null_eq(full_query_escaped(), url.full_query_escaped())) return false; + if (!null_eq(fragment(), url.fragment())) return false; + return true; +} + |
