// -*- mode: c++; c-basic-offset: 2; -*- #include "common.hh" #include #include #include #include #include "paths.hh" #include "url.hh" namespace { class UrlImpl : public Url { public: UrlImpl() : userinfo_(nullptr), userinfo_unescaped_(nullptr), port_(0), path_(nullptr), path_unescaped_(nullptr), query_(nullptr), query_unescaped_(nullptr), fragment_(nullptr) { } ~UrlImpl() override { delete[] userinfo_; if (userinfo_unescaped_ != userinfo_) delete[] userinfo_unescaped_; delete[] path_; if (path_unescaped_ != path_) delete[] path_unescaped_; delete[] query_; if (query_unescaped_ != query_) delete[] query_unescaped_; delete[] fragment_; } UrlImpl(UrlImpl const& url); bool parse(std::string const& url, Url const* base); Url* copy() const override { return new UrlImpl(*this); } std::string const& scheme() const override { return scheme_; } char const* userinfo() const override; char const* userinfo_escaped() const override { return userinfo_; } std::string const& host() const override { return host_; } uint16_t port() const override { return port_; } std::string path() const override; std::string path_escaped() const override { return path_; } bool query(std::string const& name, std::string* value) const override; char const* full_query() const override; char const* full_query_escaped() const override { return query_; } char const* fragment() const override { return fragment_; } void print(std::ostream& out, bool path = true, bool query = true, bool fragment = true) const override; private: char const* parse_authority(char const* pos); char const* parse_query(char const* pos); char const* parse_fragment(char const* pos); bool relative(std::string const& url, Url const* base); std::string scheme_; char* userinfo_; mutable char* userinfo_unescaped_; std::string host_; uint16_t port_; char* path_; mutable char* path_unescaped_; char* query_; mutable char* query_unescaped_; char* fragment_; }; char* unescape(char* start, char* end, bool query); char* escape(char const* str, char const* safe); char* dup(char const* start, char const* end); void lower(std::string& str); bool is_ipv4(char const* start, char const* end); bool is_ipv6(char const* start, char const* end); char const subdelims[] = "!$&'()*+,;="; UrlImpl::UrlImpl(UrlImpl const& url) : Url(), scheme_(url.scheme_), host_(url.host_), port_(url.port_) { userinfo_ = dup(url.userinfo_, nullptr); userinfo_unescaped_ = dup(url.userinfo_unescaped_, nullptr); path_ = dup(url.path_, nullptr); path_unescaped_ = dup(url.path_unescaped_, nullptr); query_ = dup(url.query_, nullptr); query_unescaped_ = dup(url.query_unescaped_, nullptr); fragment_ = dup(url.fragment_, nullptr); } inline bool is_alpha(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } inline bool is_digit(char c) { return c >= '0' && c <= '9'; } inline bool is_unreserved(char c) { return is_alpha(c) || is_digit(c) || c == '-' || c == '.' || c == '_' || c == '~'; } inline bool is_hex(char c) { return is_digit(c) || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); } char* emptystr() { char* ret = new char[1]; ret[0] = '\0'; return ret; } bool UrlImpl::parse(std::string const& url, Url const* base) { char const* pos = url.c_str(), *start; // URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] // scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) start = pos; if (!is_alpha(*pos)) { return relative(url, base); } pos++; while (is_alpha(*pos) || is_digit(*pos) || *pos == '+' || *pos == '-' || *pos == '.') { pos++; } if (*pos != ':') { return relative(url, base); } scheme_.assign(start, pos); lower(scheme_); pos++; /* hier-part = "//" authority path-abempty / path-absolute / path-rootless / path-empty */ if (memcmp(pos, "//", 2)) { if (pos[0] == '/' && pos[1] != '/') { // path-absolute = "/" [ segment-nz *( "/" segment ) ] start = pos++; } else if (pos[0] != '/') { /* path-rootless = segment-nz *( "/" segment ) path-empty = 0 */ start = pos; } while (true) { if (is_unreserved(*pos)) { pos++; } else if (*pos == '%' && is_hex(pos[1]) && is_hex(pos[2])) { pos += 3; } else if (*pos && strchr(subdelims, *pos)) { pos++; } else if (*pos == ':' || *pos == '@' || *pos == '/') { pos++; } else { break; } } path_ = dup(start, pos); } else { pos += 2; pos = parse_authority(pos); if (!pos) { return relative(url, base); } if (*pos == '/') { /* path-abempty = *( "/" segment ) segment = *pchar pchar = unreserved / pct-encoded / sub-delims / ":" / "@" */ start = pos++; while (true) { if (is_unreserved(*pos)) { pos++; } else if (*pos == '%' && is_hex(pos[1]) && is_hex(pos[2])) { pos += 3; } else if (*pos && strchr(subdelims, *pos)) { pos++; } else if (*pos == ':' || *pos == '@' || *pos == '/') { pos++; } else { break; } } path_ = dup(start, pos); } else { path_ = emptystr(); } } if (*pos == '?') { pos = parse_query(pos); if (!pos) { return relative(url, base); } } if (*pos == '#') { pos = parse_fragment(pos); if (!pos) { return relative(url, base); } } if (*pos != '\0') { return relative(url, base); } return true; } bool UrlImpl::relative(std::string const& url, Url const* base) { char const* pos = url.c_str(); char const* start; if (!base) return false; scheme_.clear(); if (userinfo_unescaped_ != userinfo_) { delete[] userinfo_unescaped_; userinfo_unescaped_ = nullptr; } delete[] userinfo_; userinfo_ = nullptr; host_.clear(); port_ = 0; if (path_unescaped_ != path_) { delete[] path_unescaped_; path_unescaped_ = nullptr; } delete[] path_; path_ = nullptr; if (query_unescaped_ != query_) { delete[] query_unescaped_; query_unescaped_ = nullptr; } delete[] query_; query_ = nullptr; delete[] fragment_; fragment_ = nullptr; /* relative-part = "//" authority path-abempty / path-absolute / path-noscheme / path-empty */ if (memcmp(pos, "//", 2)) { if (pos[0] == '/' && pos[1] != '/') { // path-absolute = "/" [ segment-nz *( "/" segment ) ] start = pos++; } else if (pos[0] != '/') { /* path-noscheme = segment-nz-nc *( "/" segment ) path-empty = 0 */ start = pos; } while (true) { if (is_unreserved(*pos)) { pos++; } else if (*pos == '%' && is_hex(pos[1]) && is_hex(pos[2])) { pos += 3; } else if (*pos && strchr(subdelims, *pos)) { pos++; } else if (*pos == ':' || *pos == '@' || *pos == '/') { pos++; } else { break; } } path_ = dup(start, pos); } else { pos += 2; pos = parse_authority(pos); if (!pos) return false; if (*pos == '/') { /* path-abempty = *( "/" segment ) segment = *pchar pchar = unreserved / pct-encoded / sub-delims / ":" / "@" */ start = pos++; while (true) { if (is_unreserved(*pos)) { pos++; } else if (*pos == '%' && is_hex(pos[1]) && is_hex(pos[2])) { pos += 3; } else if (*pos && strchr(subdelims, *pos)) { pos++; } else if (*pos == ':' || *pos == '@' || *pos == '/') { pos++; } else { break; } } path_ = dup(start, pos); } else { path_ = emptystr(); } } if (*pos == '?') { pos = parse_query(pos); if (!pos) return false; } if (*pos == '#') { pos = parse_fragment(pos); if (!pos) return false; } if (*pos != '\0') return false; scheme_ = base->scheme(); if (host_.empty()) { auto userinfo = base->userinfo(); if (userinfo) userinfo_ = dup(userinfo, userinfo + strlen(userinfo)); host_ = base->host(); port_ = base->port(); } if (path_[0] != '/') { std::string tmp( Paths::join(!base->path().empty() ? base->path() : "/", path_)); delete[] path_; path_ = dup(tmp.data(), tmp.data() + tmp.size()); } return true; } char const* UrlImpl::parse_authority(char const* pos) { /* authority = [ userinfo "@" ] host [ ":" port ] userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) host = IP-literal / IPv4address / reg-name IP-literal = "[" ( IPv6address / IPvFuture ) "]" IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) IPv6address = 6( h16 ":" ) ls32 / "::" 5( h16 ":" ) ls32 / [ h16 ] "::" 4( h16 ":" ) ls32 / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 / [ *4( h16 ":" ) h16 ] "::" ls32 / [ *5( h16 ":" ) h16 ] "::" h16 / [ *6( h16 ":" ) h16 ] "::" ls32 = ( h16 ":" h16 ) / IPv4address h16 = 1*4HEXDIG IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet dec-octet = DIGIT ; 0-9 / %x31-39 DIGIT ; 10-99 / "1" 2DIGIT ; 100-199 / "2" %x30-34 DIGIT ; 200-249 / "25" %x30-35 ; 250-255 reg-name = *( unreserved / pct-encoded / sub-delims ) port = *DIGIT */ char const* start = pos; char const* at = nullptr, *colon = nullptr; char const* host_start, *host_end, *tmp; while (true) { if (is_unreserved(*pos)) { pos++; } else if (*pos == '%' && is_hex(pos[1]) && is_hex(pos[2])) { pos += 3; } else if (*pos && strchr(subdelims, *pos)) { pos++; } else if (*pos == '[' || *pos == ']') { pos++; } else if (*pos == ':') { colon = pos++; } else if (*pos == '@') { if (at) { return nullptr; } colon = nullptr; at = pos++; } else { break; } } // userinfo? if (at) { host_start = at + 1; userinfo_ = dup(start, at); if (strchr(userinfo_, '[') || strchr(userinfo_, ']')) { return nullptr; } } else { host_start = start; } // port? if (colon) { tmp = colon + 1; if (tmp < pos && is_digit(*tmp)) { uint16_t v = *tmp - '0'; tmp++; host_end = colon; while (tmp < pos) { uint16_t x; if (!is_digit(*tmp)) { host_end = pos; break; } x = v; v *= 10; if (v < x) { return nullptr; } v += *tmp - '0'; tmp++; } if (host_end == colon) { port_ = v; } } else { host_end = pos; } } else { host_end = pos; } if (*host_start == '[') { if (host_end[-1] != ']' || host_start + 1 >= host_end - 1) { return nullptr; } host_start++; host_end--; if (*host_start == 'v') { if (!is_hex(host_start[1]) || host_start[2] != '.') { return nullptr; } host_.assign(host_start, host_end - host_start); lower(host_); } else { if (!is_ipv6(host_start, host_end)) { return nullptr; } host_.assign(host_start, host_end - host_start); lower(host_); } if (host_.find('[') != std::string::npos || host_.find(']') != std::string::npos) { return nullptr; } } else { tmp = host_start; while (tmp < host_end) { if (*tmp == '[' || *tmp == ']') { return nullptr; } tmp++; } tmp = unescape(const_cast(host_start), const_cast(host_end), false); host_ = tmp; lower(host_); delete[] const_cast(tmp); } if (host_.empty()) return nullptr; return pos; } char const* UrlImpl::parse_query(char const* pos) { // query = *( pchar / "/" / "?" ) char const* start = ++pos; while (true) { if (is_unreserved(*pos)) { pos++; } else if (*pos == '%' && is_hex(pos[1]) && is_hex(pos[2])) { pos += 3; } else if (*pos && strchr(subdelims, *pos)) { pos++; } else if (*pos == ':' || *pos == '@' || *pos == '/' || *pos == '?') { pos++; } else { break; } } query_ = dup(start, pos); return pos; } char const* UrlImpl::parse_fragment(char const* pos) { // fragment = *( pchar / "/" / "?" ) char const* start = ++pos; while (true) { if (is_unreserved(*pos)) { pos++; } else if (*pos == '%' && is_hex(pos[1]) && is_hex(pos[2])) { pos += 3; } else if (*pos && strchr(subdelims, *pos)) { pos++; } else if (*pos == ':' || *pos == '@' || *pos == '/' || *pos == '?') { pos++; } else { break; } } fragment_ = unescape((char*) start, (char*) pos, false); return pos; } char const* UrlImpl::userinfo() const { if (userinfo_ && !userinfo_unescaped_) { userinfo_unescaped_ = unescape(userinfo_, nullptr, false); } return userinfo_unescaped_; } std::string UrlImpl::path() const { if (!path_unescaped_) { path_unescaped_ = unescape(path_, nullptr, false); } return path_unescaped_; } bool UrlImpl::query(std::string const& name, std::string* value) const { char* pos; if (!query_ || !*query_) { return false; } pos = query_; while (true) { char* next = pos, *eq; char* tmp; while (*next && *next != '=' && *next != '&') { next++; } if (*next == '=') { eq = next++; while (*next && *next != '&') { next++; } } else { eq = next; } tmp = unescape(pos, eq, true); if (name.compare(tmp) == 0) { delete[] tmp; if (eq != next) { std::unique_ptr tmp(unescape(eq + 1, next, true)); if (value) value->assign(tmp.get()); return true; } else { if (value) value->clear(); return true; } } delete[] tmp; if (!*next) { return false; } pos = next + 1; } } char const* UrlImpl::full_query() const { if (query_ && !query_unescaped_) { query_unescaped_ = unescape(query_, nullptr, true); } return query_unescaped_; } bool unhex(char c, uint8_t* ret) { if (is_digit(c)) { *ret = c - '0'; return true; } else if (c >= 'A' && c <= 'F') { *ret = c - 'A' + 10; return true; } else if (c >= 'a' && c <= 'f') { *ret = c - 'a' + 10; return true; } return false; } char* unescape(char* start, char* end, bool query) { char* pos = start; char* ret; size_t o; if (!start) { return nullptr; } if (end) { for (; pos < end; pos++) { if (*pos == '%' || (query && *pos == '+')) { break; } } if (pos == end) { return dup(start, end); } } else { for (; *pos; pos++) { if (*pos == '%' || (query && *pos == '+')) { break; } } if (!*pos) { return start; } end = pos + strlen(pos); } ret = new char[end - start + 1]; o = 0; while (true) { uint8_t h, l; memcpy(ret + o, start, pos - start); o += pos - start; if (query && *pos == '+') { ret[o++] = ' '; pos++; } else if (pos + 3 <= end && unhex(pos[1], &h) && unhex(pos[2], &l)) { ret[o++] = h << 4 | l; pos += 3; } else { ret[o++] = *(pos++); } start = pos; while (pos < end && *pos != '%' && !(query && *pos == '+')) { pos++; } if (pos == end) { memcpy(ret + o, start, pos - start); o += pos - start; break; } } ret[o] = '\0'; return ret; } bool unsafe(char c, char const* safe) { if (is_unreserved(c) || (c && strchr(safe, c))) { return false; } return true; } char hex(uint8_t c) { return c < 10 ? '0' + c : 'A' + c - 10; } char* escape(char const* str, char const* safe) { char const* pos = str; size_t len; char* ret; size_t o; while (*pos && !unsafe(*pos, safe)) { pos++; } if (!*pos) { return (char*) str; } len = strlen(pos); ret = new char[(pos - str) + len * 3 + 1]; o = 0; while (true) { memcpy(ret + o, str, pos - str); o += pos - str; ret[o++] = '%'; ret[o++] = hex(*((const uint8_t*)pos) >> 4); ret[o++] = hex(*((const uint8_t*)pos) & 0xf); str = ++pos; while (*pos && !unsafe(*pos, safe)) { pos++; } if (!*pos) { memcpy(ret + o, str, pos - str); o += pos - str; break; } } ret[o] = '\0'; return ret; } char* dup(char const* start, char const* end) { if (!start) return nullptr; if (!end) end = start + strlen(start); size_t len = end - start; char* ret; assert(start <= end); ret = new char[len + 1]; memcpy(ret, start, len); ret[len] = '\0'; return ret; } inline char ascii_tolower(char c) { return (c >= 'A' && c <= 'Z') ? ('a' + c - 'A') : c; } void lower(std::string& str) { std::transform(str.begin(), str.end(), str.begin(), ascii_tolower); } bool is_ipv4(char const* start, char const* end) { char const* pos = start; size_t i = 0; while (true) { if (pos[0] == '2' && pos + 2 <= end) { if (pos[1] == '5') { if (pos[2] < '0' || pos[2] > '5') { break; } } else { if (pos[1] < '0' || pos[1] > '4' || !is_digit(pos[2])) { break; } } pos += 3; } else if (pos[0] == '1' && pos + 2 <= end && is_digit(pos[1]) && is_digit(pos[2])) { pos += 3; } else if ((pos[0] >= '1' && pos[0] <= '9') && pos + 1 <= end && is_digit(pos[1])) { pos += 2; } else if (is_digit(pos[0])) { pos++; } else { break; } i++; if (pos == end || *pos != '.') { break; } pos++; if (pos == end) { return false; } } return i == 4 && pos == end; } size_t walk_hex(char const* start, char const* end, size_t max) { size_t i = 0; while (max-- && start < end) { if (!is_hex(*start)) break; start++; i++; } return i; } bool is_ipv6(char const* start, char const* end) { size_t i = 0, j = 0, x; char const* pos = start; bool empty = false; while (pos < end) { if (*pos == ':') { pos++; if (*pos == ':') { empty = true; pos++; break; } if (i == 0) { return false; } } x = walk_hex(pos, end, 4); if (x == 0) { return false; } pos += x; i++; } if (pos < end) { while (true) { x = walk_hex(pos, end, 4); if (x == 0) { return false; } pos += x; if (pos == end) { j++; break; } if (*pos != ':') { pos -= x; break; } pos++; j++; } } if (pos != end) { if (!is_ipv4(pos, end)) { return false; } j += 2; } if (!empty) { return i == 8; } if (i + j > 7) { return false; } return true; } void UrlImpl::print(std::ostream& out, bool path, bool query, bool fragment) const { out << scheme_; if (!host_.empty()) { out << "://"; if (userinfo_) { out << userinfo_ << '@'; } out << host_; if (port_) { out << ':' << port_; } } else { out << ':'; } if (path) { out << path_; } if (query && query_ && *query_) { out << '?' << query_; } if (fragment && fragment_ && *fragment_) { out << '#'; char* tmp = escape(fragment_, "/?"); out << tmp; if (tmp != fragment_) delete[] tmp; } } bool null_eq(char const* a1, char const* a2) { if (a1 == a2) return true; return a1 && a2 && strcmp(a1, a2) == 0; } } // namespace // static Url* Url::parse(std::string const& url, Url const* base) { UrlImpl* ret = new UrlImpl(); if (ret->parse(url, base)) return ret; delete ret; return nullptr; } bool Url::operator==(Url const& url) const { if (scheme() != url.scheme()) return false; if (host() != url.host()) return false; if (port() != url.port()) return false; if (path_escaped() != url.path_escaped()) return false; if (!null_eq(userinfo_escaped(), url.userinfo_escaped())) return false; if (!null_eq(full_query_escaped(), url.full_query_escaped())) return false; if (!null_eq(fragment(), url.fragment())) return false; return true; }