From 7b75100e61cbed7a59360332197a9a811ddfe1b7 Mon Sep 17 00:00:00 2001
From: Amos Jeffries <yadij@users.noreply.github.com>
Date: Thu, 21 May 2020 14:42:02 +0000
Subject: [PATCH] Add flexible RFC 3986 URI encoder (#617)

Use AnyP::Uri namespace to self-document encoder scope and
coding type.

Use SBuf and CharacterSet for more flexible input and actions
than previous RFC 1738 encoder. Allowing callers to trivially
determine which characters are encoded.
---
 src/anyp/Uri.cc            | 193 +++++++++++++++++++++----------------
 src/anyp/Uri.h             |  18 +++-
 src/base/CharacterSet.cc   |   7 ++
 src/base/CharacterSet.h    |   3 +
 src/clients/Client.cc      |  37 ++++---
 src/tests/stub_StatHist.cc |   2 +-
 src/tests/stub_libanyp.cc  |   2 +-
 7 files changed, 161 insertions(+), 101 deletions(-)
diff --git a/src/anyp/Uri.cc b/src/anyp/Uri.cc
index 7ac3ac866b..a98ae8c4ee 100644
--- a/src/anyp/Uri.cc
+++ b/src/anyp/Uri.cc
@@ -30,6 +30,56 @@ static const char valid_hostname_chars[] =
     "[:]"
     ;
 
+/// Characters which are valid within a URI userinfo section
+static const CharacterSet &
+UserInfoChars()
+{
+    /*
+     * RFC 3986 section 3.2.1
+     *
+     *  userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
+     *  unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
+     *  pct-encoded   = "%" HEXDIG HEXDIG
+     *  sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
+     */
+    static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
+                                      CharacterSet::ALPHA +
+                                      CharacterSet::DIGIT;
+    return userInfoValid;
+}
+
+/**
+ * Governed by RFC 3986 section 2.1
+ */
+SBuf
+AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
+{
+    if (buf.isEmpty())
+        return buf;
+
+    Parser::Tokenizer tk(buf);
+    SBuf goodSection;
+    // optimization for the arguably common "no encoding necessary" case
+    if (tk.prefix(goodSection, ignore) && tk.atEnd())
+        return buf;
+
+    SBuf output;
+    output.reserveSpace(buf.length() * 3); // worst case: encode all chars
+    output.append(goodSection); // may be empty
+
+    while (!tk.atEnd()) {
+        // TODO: Add Tokenizer::parseOne(void).
+        const auto ch = tk.remaining()[0];
+        output.appendf("%%%02X", static_cast<unsigned int>(ch)); // TODO: Optimize using a table
+        (void)tk.skip(ch);
+
+        if (tk.prefix(goodSection, ignore))
+            output.append(goodSection);
+    }
+
+    return output;
+}
+
 const SBuf &
 AnyP::Uri::Asterisk()
 {
@@ -557,7 +607,10 @@ AnyP::Uri::absolute() const
                                        getScheme() == AnyP::PROTO_UNKNOWN;
 
             if (allowUserInfo && !userInfo().isEmpty()) {
-                absolute_.append(userInfo());
+                static const CharacterSet uiChars = CharacterSet(UserInfoChars())
+                                                    .remove('%')
+                                                    .rename("userinfo-reserved");
+                absolute_.append(Encode(userInfo(), uiChars));
                 absolute_.append("@", 1);
             }
             absolute_.append(authority());
@@ -565,7 +618,7 @@ AnyP::Uri::absolute() const
             absolute_.append(host());
             absolute_.append(":", 1);
         }
-        absolute_.append(path());
+        absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed.
     }
 
     return absolute_;
@@ -619,102 +672,76 @@ urlCanonicalFakeHttps(const HttpRequest * request)
     return request->canonicalCleanUrl();
 }
 
-/*
- * Test if a URL is relative.
+/**
+ * Test if a URL is a relative reference.
+ *
+ * Governed by RFC 3986 section 4.2
+ *
+ *  relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
  *
- * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
- * appear before a ':'.
+ *  relative-part = "//" authority path-abempty
+ *                / path-absolute
+ *                / path-noscheme
+ *                / path-empty
  */
 bool
 urlIsRelative(const char *url)
 {
-    const char *p;
+    if (!url)
+        return false; // no URL
 
-    if (url == NULL) {
-        return (false);
-    }
-    if (*url == '\0') {
-        return (false);
-    }
+    /*
+     * RFC 3986 section 5.2.3
+     *
+     * path          = path-abempty    ; begins with "/" or is empty
+     *               / path-absolute   ; begins with "/" but not "//"
+     *               / path-noscheme   ; begins with a non-colon segment
+     *               / path-rootless   ; begins with a segment
+     *               / path-empty      ; zero characters
+     */
 
-    for (p = url; *p != '\0' && *p != ':' && *p != '/'; ++p);
+    if (*url == '\0')
+        return true; // path-empty
 
-    if (*p == ':') {
-        return (false);
+    if (*url == '/') {
+        // RFC 3986 section 5.2.3
+        // path-absolute   ; begins with "/" but not "//"
+        if (url[1] == '/')
+            return true; // network-path reference, aka. 'scheme-relative URI'
+        else
+            return true; // path-absolute, aka 'absolute-path reference'
     }
-    return (true);
-}
-
-/*
- * Convert a relative URL to an absolute URL using the context of a given
- * request.
- *
- * It is assumed that you have already ensured that the URL is relative.
- *
- * If NULL is returned it is an indication that the method in use in the
- * request does not distinguish between relative and absolute and you should
- * use the url unchanged.
- *
- * If non-NULL is returned, it is up to the caller to free the resulting
- * memory using safe_free().
- */
-char *
-urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
-{
 
-    if (req->method.id() == Http::METHOD_CONNECT) {
-        return (NULL);
+    for (const auto *p = url; *p != '\0' && *p != '/' && *p != '?' && *p != '#'; ++p) {
+        if (*p == ':')
+            return false; // colon is forbidden in first segment
     }
 
-    char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
-
-    if (req->url.getScheme() == AnyP::PROTO_URN) {
-        // XXX: this is what the original code did, but it seems to break the
-        // intended behaviour of this function. It returns the stored URN path,
-        // not converting the given one into a URN...
-        snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute()));
-        return (urlbuf);
-    }
+    return true; // path-noscheme, path-abempty, path-rootless
+}
 
-    SBuf authorityForm = req->url.authority(); // host[:port]
-    const SBuf &scheme = req->url.getScheme().image();
-    size_t urllen = snprintf(urlbuf, MAX_URL, SQUIDSBUFPH "://" SQUIDSBUFPH "%s" SQUIDSBUFPH,
-                             SQUIDSBUFPRINT(scheme),
-                             SQUIDSBUFPRINT(req->url.userInfo()),
-                             !req->url.userInfo().isEmpty() ? "@" : "",
-                             SQUIDSBUFPRINT(authorityForm));
-
-    // if the first char is '/' assume its a relative path
-    // XXX: this breaks on scheme-relative URLs,
-    // but we should not see those outside ESI, and rarely there.
-    // XXX: also breaks on any URL containing a '/' in the query-string portion
-    if (relUrl[0] == '/') {
-        xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
+void
+AnyP::Uri::addRelativePath(const char *relUrl)
+{
+    // URN cannot be merged
+    if (getScheme() == AnyP::PROTO_URN)
+        return;
+
+    // TODO: Handle . and .. segment normalization
+
+    const auto lastSlashPos = path_.rfind('/');
+    // TODO: To optimize and simplify, add and use SBuf::replace().
+    const auto relUrlLength = strlen(relUrl);
+    if (lastSlashPos == SBuf::npos) {
+        // start replacing the whole path
+        path_.reserveCapacity(1 + relUrlLength);
+        path_.assign("/", 1);
     } else {
-        SBuf path = req->url.path();
-        SBuf::size_type lastSlashPos = path.rfind('/');
-
-        if (lastSlashPos == SBuf::npos) {
-            // replace the whole path with the given bit(s)
-            urlbuf[urllen] = '/';
-            ++urllen;
-            xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
-        } else {
-            // replace only the last (file?) segment with the given bit(s)
-            ++lastSlashPos;
-            if (lastSlashPos > MAX_URL - urllen - 1) {
-                // XXX: crops bits in the middle of the combined URL.
-                lastSlashPos = MAX_URL - urllen - 1;
-            }
-            SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos));
-            urllen += lastSlashPos;
-            if (urllen + 1 < MAX_URL) {
-                xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
-            }
-        }
+        // start replacing just the last segment
+        path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
+        path_.chop(0, lastSlashPos+1);
     }
-
-    return (urlbuf);
+    path_.append(relUrl, relUrlLength);
 }
 
 int
diff --git a/src/anyp/Uri.h b/src/anyp/Uri.h
index 39cd199c1a..265741b0f1 100644
--- a/src/anyp/Uri.h
+++ b/src/anyp/Uri.h
@@ -77,6 +77,8 @@ public:
     }
 
     void userInfo(const SBuf &s) {userInfo_=s; touch();}
+    /// \returns raw userinfo subcomponent (or an empty string)
+    /// the caller is responsible for caller-specific encoding
     const SBuf &userInfo() const {return userInfo_;}
 
     void host(const char *src);
@@ -98,12 +100,27 @@ public:
     void path(const SBuf &p) {path_=p; touch();}
     const SBuf &path() const;
 
+    /**
+     * Merge a relative-path URL into the existing URI details.
+     * Implements RFC 3986 section 5.2.3
+     *
+     * The caller must ensure relUrl is a valid relative-path.
+     *
+     * NP: absolute-path are also accepted, but path() method
+     * should be used instead when possible.
+     */
+    void addRelativePath(const char *relUrl);
+
     /// the static '/' default URL-path
     static const SBuf &SlashPath();
 
     /// the static '*' pseudo-URI
     static const SBuf &Asterisk();
 
+    /// %-encode characters in a buffer which do not conform to
+    /// the provided set of expected characters.
+    static SBuf Encode(const SBuf &, const CharacterSet &expected);
+
     /**
      * The authority-form URI for currently stored values.
      *
@@ -199,7 +216,6 @@ void urlInitialize(void);
 char *urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &, const AnyP::UriScheme &);
 const char *urlCanonicalFakeHttps(const HttpRequest * request);
 bool urlIsRelative(const char *);
-char *urlMakeAbsolute(const HttpRequest *, const char *);
 char *urlRInternal(const char *host, unsigned short port, const char *dir, const char *name);
 char *urlInternal(const char *dir, const char *name);
 bool urlAppendDomain(char *host); ///< apply append_domain config to the given hostname
diff --git a/src/base/CharacterSet.cc b/src/base/CharacterSet.cc
index 3d83b41265..6c02b52885 100644
--- a/src/base/CharacterSet.cc
+++ b/src/base/CharacterSet.cc
@@ -50,6 +50,13 @@ CharacterSet::add(const unsigned char c)
     return *this;
 }
 
+CharacterSet &
+CharacterSet::remove(const unsigned char c)
+{
+    chars_[static_cast<uint8_t>(c)] = 0;
+    return *this;
+}
+
 CharacterSet &
 CharacterSet::addRange(unsigned char low, unsigned char high)
 {
diff --git a/src/base/CharacterSet.h b/src/base/CharacterSet.h
index 77785555dc..376eae938d 100644
--- a/src/base/CharacterSet.h
+++ b/src/base/CharacterSet.h
@@ -41,6 +41,9 @@ public:
     /// add a given character to the character set
     CharacterSet & add(const unsigned char c);
 
+    /// remove a given character from the character set
+    CharacterSet & remove(const unsigned char c);
+
     /// add a list of character ranges, expressed as pairs [low,high], including both ends
     CharacterSet & addRange(unsigned char low, unsigned char high);
 
diff --git a/src/clients/Client.cc b/src/clients/Client.cc
index cdc01877c4..08d3c448cb 100644
--- a/src/clients/Client.cc
+++ b/src/clients/Client.cc
@@ -455,33 +455,40 @@ sameUrlHosts(const char *url1, const char *url2)
 static void
 purgeEntriesByHeader(HttpRequest *req, const char *reqUrl, Http::Message *rep, Http::HdrType hdr)
 {
-    const char *hdrUrl, *absUrl;
-
-    absUrl = NULL;
-    hdrUrl = rep->header.getStr(hdr);
-    if (hdrUrl == NULL) {
+    const auto hdrUrl = rep->header.getStr(hdr);
+    if (!hdrUrl)
         return;
-    }
 
     /*
      * If the URL is relative, make it absolute so we can find it.
      * If it's absolute, make sure the host parts match to avoid DOS attacks
      * as per RFC 2616 13.10.
      */
+    SBuf absUrlMaker;
+    const char *absUrl = nullptr;
     if (urlIsRelative(hdrUrl)) {
-        absUrl = urlMakeAbsolute(req, hdrUrl);
-        if (absUrl != NULL) {
-            hdrUrl = absUrl;
+        if (req->method.id() == Http::METHOD_CONNECT)
+            absUrl = hdrUrl; // TODO: merge authority-uri and hdrUrl
+        else if (req->url.getScheme() == AnyP::PROTO_URN)
+            absUrl = req->url.absolute().c_str();
+        else {
+            AnyP::Uri tmpUrl = req->url;
+            if (*hdrUrl == '/') {
+                // RFC 3986 section 4.2: absolute-path reference
+                // for this logic replace the entire request-target URI path
+                tmpUrl.path(hdrUrl);
+            } else {
+                tmpUrl.addRelativePath(reqUrl);
+            }
+            absUrlMaker = tmpUrl.absolute();
+            absUrl = absUrlMaker.c_str();
         }
     } else if (!sameUrlHosts(reqUrl, hdrUrl)) {
         return;
-    }
+    } else
+        absUrl = hdrUrl;
 
-    purgeEntriesByUrl(req, hdrUrl);
-
-    if (absUrl != NULL) {
-        safe_free(absUrl);
-    }
+    purgeEntriesByUrl(req, absUrl);
 }
 
 // some HTTP methods should purge matching cache entries
diff --git a/src/tests/stub_StatHist.cc b/src/tests/stub_StatHist.cc
index 3ae289dc3b..f2b3ca37b6 100644
--- a/src/tests/stub_StatHist.cc
+++ b/src/tests/stub_StatHist.cc
@@ -16,7 +16,7 @@ class StoreEntry;
 
 void StatHist::dump(StoreEntry * sentry, StatHistBinDumper * bd) const STUB
 void StatHist::enumInit(unsigned int i) STUB_NOP
-void StatHist::count(double d) STUB_NOP
+void StatHist::count(double) {/* STUB_NOP */}
 double statHistDeltaMedian(const StatHist & A, const StatHist & B) STUB_RETVAL(0.0)
 double statHistDeltaPctile(const StatHist & A, const StatHist & B, double pctile) STUB_RETVAL(0.0)
 void StatHist::logInit(unsigned int i, double d1, double d2) STUB_NOP
diff --git a/src/tests/stub_libanyp.cc b/src/tests/stub_libanyp.cc
index 7d49a6faf7..0b97499f4b 100644
--- a/src/tests/stub_libanyp.cc
+++ b/src/tests/stub_libanyp.cc
@@ -18,6 +18,7 @@ bool AnyP::Uri::parse(const HttpRequestMethod&, const SBuf &) STUB_RETVAL(true)
 void AnyP::Uri::host(const char *) STUB
 static SBuf nil;
 const SBuf &AnyP::Uri::path() const STUB_RETVAL(nil)
+void AnyP::Uri::addRelativePath(const char *) STUB
 const SBuf &AnyP::Uri::SlashPath()
 {
     static SBuf slash("/");
@@ -33,7 +34,6 @@ SBuf &AnyP::Uri::absolute() const STUB_RETVAL(nil)
 void urlInitialize() STUB
 const char *urlCanonicalFakeHttps(const HttpRequest *) STUB_RETVAL(nullptr)
 bool urlIsRelative(const char *) STUB_RETVAL(false)
-char *urlMakeAbsolute(const HttpRequest *, const char *)STUB_RETVAL(nullptr)
 char *urlRInternal(const char *, unsigned short, const char *, const char *) STUB_RETVAL(nullptr)
 char *urlInternal(const char *, const char *) STUB_RETVAL(nullptr)
 int matchDomainName(const char *, const char *, enum MatchDomainNameFlags) STUB_RETVAL(0)
-- 
2.39.5