From 7b75100e61cbed7a59360332197a9a811ddfe1b7 Mon Sep 17 00:00:00 2001 From: Amos Jeffries Date: Thu, 21 May 2020 14:42:02 +0000 Subject: [PATCH] Add flexible RFC 3986 URI encoder (#617) Use AnyP::Uri namespace to self-document encoder scope and coding type. Use SBuf and CharacterSet for more flexible input and actions than previous RFC 1738 encoder. Allowing callers to trivially determine which characters are encoded. --- src/anyp/Uri.cc | 193 +++++++++++++++++++++---------------- src/anyp/Uri.h | 18 +++- src/base/CharacterSet.cc | 7 ++ src/base/CharacterSet.h | 3 + src/clients/Client.cc | 37 ++++--- src/tests/stub_StatHist.cc | 2 +- src/tests/stub_libanyp.cc | 2 +- 7 files changed, 161 insertions(+), 101 deletions(-) diff --git a/src/anyp/Uri.cc b/src/anyp/Uri.cc index 7ac3ac866b..a98ae8c4ee 100644 --- a/src/anyp/Uri.cc +++ b/src/anyp/Uri.cc @@ -30,6 +30,56 @@ static const char valid_hostname_chars[] = "[:]" ; +/// Characters which are valid within a URI userinfo section +static const CharacterSet & +UserInfoChars() +{ + /* + * RFC 3986 section 3.2.1 + * + * userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) + * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" + * pct-encoded = "%" HEXDIG HEXDIG + * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" + */ + static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") + + CharacterSet::ALPHA + + CharacterSet::DIGIT; + return userInfoValid; +} + +/** + * Governed by RFC 3986 section 2.1 + */ +SBuf +AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore) +{ + if (buf.isEmpty()) + return buf; + + Parser::Tokenizer tk(buf); + SBuf goodSection; + // optimization for the arguably common "no encoding necessary" case + if (tk.prefix(goodSection, ignore) && tk.atEnd()) + return buf; + + SBuf output; + output.reserveSpace(buf.length() * 3); // worst case: encode all chars + output.append(goodSection); // may be empty + + while (!tk.atEnd()) { + // TODO: Add Tokenizer::parseOne(void). + const auto ch = tk.remaining()[0]; + output.appendf("%%%02X", static_cast(ch)); // TODO: Optimize using a table + (void)tk.skip(ch); + + if (tk.prefix(goodSection, ignore)) + output.append(goodSection); + } + + return output; +} + const SBuf & AnyP::Uri::Asterisk() { @@ -557,7 +607,10 @@ AnyP::Uri::absolute() const getScheme() == AnyP::PROTO_UNKNOWN; if (allowUserInfo && !userInfo().isEmpty()) { - absolute_.append(userInfo()); + static const CharacterSet uiChars = CharacterSet(UserInfoChars()) + .remove('%') + .rename("userinfo-reserved"); + absolute_.append(Encode(userInfo(), uiChars)); absolute_.append("@", 1); } absolute_.append(authority()); @@ -565,7 +618,7 @@ AnyP::Uri::absolute() const absolute_.append(host()); absolute_.append(":", 1); } - absolute_.append(path()); + absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed. } return absolute_; @@ -619,102 +672,76 @@ urlCanonicalFakeHttps(const HttpRequest * request) return request->canonicalCleanUrl(); } -/* - * Test if a URL is relative. +/** + * Test if a URL is a relative reference. + * + * Governed by RFC 3986 section 4.2 + * + * relative-ref = relative-part [ "?" query ] [ "#" fragment ] * - * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will - * appear before a ':'. + * relative-part = "//" authority path-abempty + * / path-absolute + * / path-noscheme + * / path-empty */ bool urlIsRelative(const char *url) { - const char *p; + if (!url) + return false; // no URL - if (url == NULL) { - return (false); - } - if (*url == '\0') { - return (false); - } + /* + * RFC 3986 section 5.2.3 + * + * path = path-abempty ; begins with "/" or is empty + * / path-absolute ; begins with "/" but not "//" + * / path-noscheme ; begins with a non-colon segment + * / path-rootless ; begins with a segment + * / path-empty ; zero characters + */ - for (p = url; *p != '\0' && *p != ':' && *p != '/'; ++p); + if (*url == '\0') + return true; // path-empty - if (*p == ':') { - return (false); + if (*url == '/') { + // RFC 3986 section 5.2.3 + // path-absolute ; begins with "/" but not "//" + if (url[1] == '/') + return true; // network-path reference, aka. 'scheme-relative URI' + else + return true; // path-absolute, aka 'absolute-path reference' } - return (true); -} - -/* - * Convert a relative URL to an absolute URL using the context of a given - * request. - * - * It is assumed that you have already ensured that the URL is relative. - * - * If NULL is returned it is an indication that the method in use in the - * request does not distinguish between relative and absolute and you should - * use the url unchanged. - * - * If non-NULL is returned, it is up to the caller to free the resulting - * memory using safe_free(). - */ -char * -urlMakeAbsolute(const HttpRequest * req, const char *relUrl) -{ - if (req->method.id() == Http::METHOD_CONNECT) { - return (NULL); + for (const auto *p = url; *p != '\0' && *p != '/' && *p != '?' && *p != '#'; ++p) { + if (*p == ':') + return false; // colon is forbidden in first segment } - char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char)); - - if (req->url.getScheme() == AnyP::PROTO_URN) { - // XXX: this is what the original code did, but it seems to break the - // intended behaviour of this function. It returns the stored URN path, - // not converting the given one into a URN... - snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute())); - return (urlbuf); - } + return true; // path-noscheme, path-abempty, path-rootless +} - SBuf authorityForm = req->url.authority(); // host[:port] - const SBuf &scheme = req->url.getScheme().image(); - size_t urllen = snprintf(urlbuf, MAX_URL, SQUIDSBUFPH "://" SQUIDSBUFPH "%s" SQUIDSBUFPH, - SQUIDSBUFPRINT(scheme), - SQUIDSBUFPRINT(req->url.userInfo()), - !req->url.userInfo().isEmpty() ? "@" : "", - SQUIDSBUFPRINT(authorityForm)); - - // if the first char is '/' assume its a relative path - // XXX: this breaks on scheme-relative URLs, - // but we should not see those outside ESI, and rarely there. - // XXX: also breaks on any URL containing a '/' in the query-string portion - if (relUrl[0] == '/') { - xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); +void +AnyP::Uri::addRelativePath(const char *relUrl) +{ + // URN cannot be merged + if (getScheme() == AnyP::PROTO_URN) + return; + + // TODO: Handle . and .. segment normalization + + const auto lastSlashPos = path_.rfind('/'); + // TODO: To optimize and simplify, add and use SBuf::replace(). + const auto relUrlLength = strlen(relUrl); + if (lastSlashPos == SBuf::npos) { + // start replacing the whole path + path_.reserveCapacity(1 + relUrlLength); + path_.assign("/", 1); } else { - SBuf path = req->url.path(); - SBuf::size_type lastSlashPos = path.rfind('/'); - - if (lastSlashPos == SBuf::npos) { - // replace the whole path with the given bit(s) - urlbuf[urllen] = '/'; - ++urllen; - xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); - } else { - // replace only the last (file?) segment with the given bit(s) - ++lastSlashPos; - if (lastSlashPos > MAX_URL - urllen - 1) { - // XXX: crops bits in the middle of the combined URL. - lastSlashPos = MAX_URL - urllen - 1; - } - SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos)); - urllen += lastSlashPos; - if (urllen + 1 < MAX_URL) { - xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1); - } - } + // start replacing just the last segment + path_.reserveCapacity(lastSlashPos + 1 + relUrlLength); + path_.chop(0, lastSlashPos+1); } - - return (urlbuf); + path_.append(relUrl, relUrlLength); } int diff --git a/src/anyp/Uri.h b/src/anyp/Uri.h index 39cd199c1a..265741b0f1 100644 --- a/src/anyp/Uri.h +++ b/src/anyp/Uri.h @@ -77,6 +77,8 @@ public: } void userInfo(const SBuf &s) {userInfo_=s; touch();} + /// \returns raw userinfo subcomponent (or an empty string) + /// the caller is responsible for caller-specific encoding const SBuf &userInfo() const {return userInfo_;} void host(const char *src); @@ -98,12 +100,27 @@ public: void path(const SBuf &p) {path_=p; touch();} const SBuf &path() const; + /** + * Merge a relative-path URL into the existing URI details. + * Implements RFC 3986 section 5.2.3 + * + * The caller must ensure relUrl is a valid relative-path. + * + * NP: absolute-path are also accepted, but path() method + * should be used instead when possible. + */ + void addRelativePath(const char *relUrl); + /// the static '/' default URL-path static const SBuf &SlashPath(); /// the static '*' pseudo-URI static const SBuf &Asterisk(); + /// %-encode characters in a buffer which do not conform to + /// the provided set of expected characters. + static SBuf Encode(const SBuf &, const CharacterSet &expected); + /** * The authority-form URI for currently stored values. * @@ -199,7 +216,6 @@ void urlInitialize(void); char *urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &, const AnyP::UriScheme &); const char *urlCanonicalFakeHttps(const HttpRequest * request); bool urlIsRelative(const char *); -char *urlMakeAbsolute(const HttpRequest *, const char *); char *urlRInternal(const char *host, unsigned short port, const char *dir, const char *name); char *urlInternal(const char *dir, const char *name); bool urlAppendDomain(char *host); ///< apply append_domain config to the given hostname diff --git a/src/base/CharacterSet.cc b/src/base/CharacterSet.cc index 3d83b41265..6c02b52885 100644 --- a/src/base/CharacterSet.cc +++ b/src/base/CharacterSet.cc @@ -50,6 +50,13 @@ CharacterSet::add(const unsigned char c) return *this; } +CharacterSet & +CharacterSet::remove(const unsigned char c) +{ + chars_[static_cast(c)] = 0; + return *this; +} + CharacterSet & CharacterSet::addRange(unsigned char low, unsigned char high) { diff --git a/src/base/CharacterSet.h b/src/base/CharacterSet.h index 77785555dc..376eae938d 100644 --- a/src/base/CharacterSet.h +++ b/src/base/CharacterSet.h @@ -41,6 +41,9 @@ public: /// add a given character to the character set CharacterSet & add(const unsigned char c); + /// remove a given character from the character set + CharacterSet & remove(const unsigned char c); + /// add a list of character ranges, expressed as pairs [low,high], including both ends CharacterSet & addRange(unsigned char low, unsigned char high); diff --git a/src/clients/Client.cc b/src/clients/Client.cc index cdc01877c4..08d3c448cb 100644 --- a/src/clients/Client.cc +++ b/src/clients/Client.cc @@ -455,33 +455,40 @@ sameUrlHosts(const char *url1, const char *url2) static void purgeEntriesByHeader(HttpRequest *req, const char *reqUrl, Http::Message *rep, Http::HdrType hdr) { - const char *hdrUrl, *absUrl; - - absUrl = NULL; - hdrUrl = rep->header.getStr(hdr); - if (hdrUrl == NULL) { + const auto hdrUrl = rep->header.getStr(hdr); + if (!hdrUrl) return; - } /* * If the URL is relative, make it absolute so we can find it. * If it's absolute, make sure the host parts match to avoid DOS attacks * as per RFC 2616 13.10. */ + SBuf absUrlMaker; + const char *absUrl = nullptr; if (urlIsRelative(hdrUrl)) { - absUrl = urlMakeAbsolute(req, hdrUrl); - if (absUrl != NULL) { - hdrUrl = absUrl; + if (req->method.id() == Http::METHOD_CONNECT) + absUrl = hdrUrl; // TODO: merge authority-uri and hdrUrl + else if (req->url.getScheme() == AnyP::PROTO_URN) + absUrl = req->url.absolute().c_str(); + else { + AnyP::Uri tmpUrl = req->url; + if (*hdrUrl == '/') { + // RFC 3986 section 4.2: absolute-path reference + // for this logic replace the entire request-target URI path + tmpUrl.path(hdrUrl); + } else { + tmpUrl.addRelativePath(reqUrl); + } + absUrlMaker = tmpUrl.absolute(); + absUrl = absUrlMaker.c_str(); } } else if (!sameUrlHosts(reqUrl, hdrUrl)) { return; - } + } else + absUrl = hdrUrl; - purgeEntriesByUrl(req, hdrUrl); - - if (absUrl != NULL) { - safe_free(absUrl); - } + purgeEntriesByUrl(req, absUrl); } // some HTTP methods should purge matching cache entries diff --git a/src/tests/stub_StatHist.cc b/src/tests/stub_StatHist.cc index 3ae289dc3b..f2b3ca37b6 100644 --- a/src/tests/stub_StatHist.cc +++ b/src/tests/stub_StatHist.cc @@ -16,7 +16,7 @@ class StoreEntry; void StatHist::dump(StoreEntry * sentry, StatHistBinDumper * bd) const STUB void StatHist::enumInit(unsigned int i) STUB_NOP -void StatHist::count(double d) STUB_NOP +void StatHist::count(double) {/* STUB_NOP */} double statHistDeltaMedian(const StatHist & A, const StatHist & B) STUB_RETVAL(0.0) double statHistDeltaPctile(const StatHist & A, const StatHist & B, double pctile) STUB_RETVAL(0.0) void StatHist::logInit(unsigned int i, double d1, double d2) STUB_NOP diff --git a/src/tests/stub_libanyp.cc b/src/tests/stub_libanyp.cc index 7d49a6faf7..0b97499f4b 100644 --- a/src/tests/stub_libanyp.cc +++ b/src/tests/stub_libanyp.cc @@ -18,6 +18,7 @@ bool AnyP::Uri::parse(const HttpRequestMethod&, const SBuf &) STUB_RETVAL(true) void AnyP::Uri::host(const char *) STUB static SBuf nil; const SBuf &AnyP::Uri::path() const STUB_RETVAL(nil) +void AnyP::Uri::addRelativePath(const char *) STUB const SBuf &AnyP::Uri::SlashPath() { static SBuf slash("/"); @@ -33,7 +34,6 @@ SBuf &AnyP::Uri::absolute() const STUB_RETVAL(nil) void urlInitialize() STUB const char *urlCanonicalFakeHttps(const HttpRequest *) STUB_RETVAL(nullptr) bool urlIsRelative(const char *) STUB_RETVAL(false) -char *urlMakeAbsolute(const HttpRequest *, const char *)STUB_RETVAL(nullptr) char *urlRInternal(const char *, unsigned short, const char *, const char *) STUB_RETVAL(nullptr) char *urlInternal(const char *, const char *) STUB_RETVAL(nullptr) int matchDomainName(const char *, const char *, enum MatchDomainNameFlags) STUB_RETVAL(0) -- 2.39.5