Add flexible RFC 3986 URI encoder (#617)

author Amos Jeffries <yadij@users.noreply.github.com>

Thu, 21 May 2020 14:42:02 +0000 (14:42 +0000)

committer Amos Jeffries <yadij@users.noreply.github.com>

Fri, 29 May 2020 18:38:59 +0000 (06:38 +1200)
author Amos Jeffries <yadij@users.noreply.github.com>
Thu, 21 May 2020 14:42:02 +0000 (14:42 +0000)
committer Amos Jeffries <yadij@users.noreply.github.com>
Fri, 29 May 2020 18:38:59 +0000 (06:38 +1200)
diff --git a/src/anyp/Uri.cc b/src/anyp/Uri.cc

index cbe55b9cb3d3e04d4747b7fd90359ca62a2b99b4..e4909ff1d1f321ffb8c62e86467912ea6ec62b5a 100644 (file)
--- a/src/anyp/Uri.cc
+++ b/src/anyp/Uri.cc
@@ -30,6 +30,56 @@ static const char valid_hostname_chars[] =
      "[:]"
      ;
  
+/// Characters which are valid within a URI userinfo section
+static const CharacterSet &
+UserInfoChars()
+{
+    /*
+     * RFC 3986 section 3.2.1
+     *
+     *  userinfo      = *( unreserved / pct-encoded / sub-delims / ":" )
+     *  unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
+     *  pct-encoded   = "%" HEXDIG HEXDIG
+     *  sub-delims    = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
+     */
+    static const auto userInfoValid = CharacterSet("userinfo", ":-._~%!$&'()*+,;=") +
+                                      CharacterSet::ALPHA +
+                                      CharacterSet::DIGIT;
+    return userInfoValid;
+}
+
+/**
+ * Governed by RFC 3986 section 2.1
+ */
+SBuf
+AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
+{
+    if (buf.isEmpty())
+        return buf;
+
+    Parser::Tokenizer tk(buf);
+    SBuf goodSection;
+    // optimization for the arguably common "no encoding necessary" case
+    if (tk.prefix(goodSection, ignore) && tk.atEnd())
+        return buf;
+
+    SBuf output;
+    output.reserveSpace(buf.length() * 3); // worst case: encode all chars
+    output.append(goodSection); // may be empty
+
+    while (!tk.atEnd()) {
+        // TODO: Add Tokenizer::parseOne(void).
+        const auto ch = tk.remaining()[0];
+        output.appendf("%%%02X", static_cast<unsigned int>(ch)); // TODO: Optimize using a table
+        (void)tk.skip(ch);
+
+        if (tk.prefix(goodSection, ignore))
+            output.append(goodSection);
+    }
+
+    return output;
+}
+
  const SBuf &
  AnyP::Uri::Asterisk()
  {
@@ -557,7 +607,10 @@ AnyP::Uri::absolute() const
                                         getScheme() == AnyP::PROTO_UNKNOWN;
  
              if (allowUserInfo && !userInfo().isEmpty()) {
-                absolute_.append(userInfo());
+                static const CharacterSet uiChars = CharacterSet(UserInfoChars())
+                                                    .remove('%')
+                                                    .rename("userinfo-reserved");
+                absolute_.append(Encode(userInfo(), uiChars));
                  absolute_.append("@", 1);
              }
              absolute_.append(authority());
@@ -565,7 +618,7 @@ AnyP::Uri::absolute() const
              absolute_.append(host());
              absolute_.append(":", 1);
          }
-        absolute_.append(path());
+        absolute_.append(path()); // TODO: Encode each URI subcomponent in path_ as needed.
      }
  
      return absolute_;
@@ -619,102 +672,76 @@ urlCanonicalFakeHttps(const HttpRequest * request)
      return request->canonicalCleanUrl();
  }
  
-/*
- * Test if a URL is relative.
+/**
+ * Test if a URL is a relative reference.
+ *
+ * Governed by RFC 3986 section 4.2
+ *
+ *  relative-ref  = relative-part [ "?" query ] [ "#" fragment ]
   *
- * RFC 2396, Section 5 (Page 17) implies that in a relative URL, a '/' will
- * appear before a ':'.
+ *  relative-part = "//" authority path-abempty
+ *                / path-absolute
+ *                / path-noscheme
+ *                / path-empty
   */
  bool
  urlIsRelative(const char *url)
  {
-    const char *p;
+    if (!url)
+        return false; // no URL
  
-    if (url == NULL) {
-        return (false);
-    }
-    if (*url == '\0') {
-        return (false);
-    }
+    /*
+     * RFC 3986 section 5.2.3
+     *
+     * path          = path-abempty    ; begins with "/" or is empty
+     *               / path-absolute   ; begins with "/" but not "//"
+     *               / path-noscheme   ; begins with a non-colon segment
+     *               / path-rootless   ; begins with a segment
+     *               / path-empty      ; zero characters
+     */
  
-    for (p = url; *p != '\0' && *p != ':' && *p != '/'; ++p);
+    if (*url == '\0')
+        return true; // path-empty
  
-    if (*p == ':') {
-        return (false);
+    if (*url == '/') {
+        // RFC 3986 section 5.2.3
+        // path-absolute   ; begins with "/" but not "//"
+        if (url[1] == '/')
+            return true; // network-path reference, aka. 'scheme-relative URI'
+        else
+            return true; // path-absolute, aka 'absolute-path reference'
      }
-    return (true);
-}
-
-/*
- * Convert a relative URL to an absolute URL using the context of a given
- * request.
- *
- * It is assumed that you have already ensured that the URL is relative.
- *
- * If NULL is returned it is an indication that the method in use in the
- * request does not distinguish between relative and absolute and you should
- * use the url unchanged.
- *
- * If non-NULL is returned, it is up to the caller to free the resulting
- * memory using safe_free().
- */
-char *
-urlMakeAbsolute(const HttpRequest * req, const char *relUrl)
-{
  
-    if (req->method.id() == Http::METHOD_CONNECT) {
-        return (NULL);
+    for (const auto *p = url; *p != '\0' && *p != '/' && *p != '?' && *p != '#'; ++p) {
+        if (*p == ':')
+            return false; // colon is forbidden in first segment
      }
  
-    char *urlbuf = (char *)xmalloc(MAX_URL * sizeof(char));
-
-    if (req->url.getScheme() == AnyP::PROTO_URN) {
-        // XXX: this is what the original code did, but it seems to break the
-        // intended behaviour of this function. It returns the stored URN path,
-        // not converting the given one into a URN...
-        snprintf(urlbuf, MAX_URL, SQUIDSBUFPH, SQUIDSBUFPRINT(req->url.absolute()));
-        return (urlbuf);
-    }
+    return true; // path-noscheme, path-abempty, path-rootless
+}
  
-    SBuf authorityForm = req->url.authority(); // host[:port]
-    const SBuf &scheme = req->url.getScheme().image();
-    size_t urllen = snprintf(urlbuf, MAX_URL, SQUIDSBUFPH "://" SQUIDSBUFPH "%s" SQUIDSBUFPH,
-                             SQUIDSBUFPRINT(scheme),
-                             SQUIDSBUFPRINT(req->url.userInfo()),
-                             !req->url.userInfo().isEmpty() ? "@" : "",
-                             SQUIDSBUFPRINT(authorityForm));
-
-    // if the first char is '/' assume its a relative path
-    // XXX: this breaks on scheme-relative URLs,
-    // but we should not see those outside ESI, and rarely there.
-    // XXX: also breaks on any URL containing a '/' in the query-string portion
-    if (relUrl[0] == '/') {
-        xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
+void
+AnyP::Uri::addRelativePath(const char *relUrl)
+{
+    // URN cannot be merged
+    if (getScheme() == AnyP::PROTO_URN)
+        return;
+
+    // TODO: Handle . and .. segment normalization
+
+    const auto lastSlashPos = path_.rfind('/');
+    // TODO: To optimize and simplify, add and use SBuf::replace().
+    const auto relUrlLength = strlen(relUrl);
+    if (lastSlashPos == SBuf::npos) {
+        // start replacing the whole path
+        path_.reserveCapacity(1 + relUrlLength);
+        path_.assign("/", 1);
      } else {
-        SBuf path = req->url.path();
-        SBuf::size_type lastSlashPos = path.rfind('/');
-
-        if (lastSlashPos == SBuf::npos) {
-            // replace the whole path with the given bit(s)
-            urlbuf[urllen] = '/';
-            ++urllen;
-            xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
-        } else {
-            // replace only the last (file?) segment with the given bit(s)
-            ++lastSlashPos;
-            if (lastSlashPos > MAX_URL - urllen - 1) {
-                // XXX: crops bits in the middle of the combined URL.
-                lastSlashPos = MAX_URL - urllen - 1;
-            }
-            SBufToCstring(&urlbuf[urllen], path.substr(0,lastSlashPos));
-            urllen += lastSlashPos;
-            if (urllen + 1 < MAX_URL) {
-                xstrncpy(&urlbuf[urllen], relUrl, MAX_URL - urllen - 1);
-            }
-        }
+        // start replacing just the last segment
+        path_.reserveCapacity(lastSlashPos + 1 + relUrlLength);
+        path_.chop(0, lastSlashPos+1);
      }
-
-    return (urlbuf);
+    path_.append(relUrl, relUrlLength);
  }
  
  int
diff --git a/src/anyp/Uri.h b/src/anyp/Uri.h

index 05115901974a9eff4ae040561d7e9553273f55d4..d056888550493f39eb616c66320770ad4f6e2840 100644 (file)
--- a/src/anyp/Uri.h
+++ b/src/anyp/Uri.h
@@ -78,6 +78,8 @@ public:
      }
  
      void userInfo(const SBuf &s) {userInfo_=s; touch();}
+    /// \returns raw userinfo subcomponent (or an empty string)
+    /// the caller is responsible for caller-specific encoding
      const SBuf &userInfo() const {return userInfo_;}
  
      void host(const char *src);
@@ -97,12 +99,27 @@ public:
      void path(const SBuf &p) {path_=p; touch();}
      const SBuf &path() const;
  
+    /**
+     * Merge a relative-path URL into the existing URI details.
+     * Implements RFC 3986 section 5.2.3
+     *
+     * The caller must ensure relUrl is a valid relative-path.
+     *
+     * NP: absolute-path are also accepted, but path() method
+     * should be used instead when possible.
+     */
+    void addRelativePath(const char *relUrl);
+
      /// the static '/' default URL-path
      static const SBuf &SlashPath();
  
      /// the static '*' pseudo-URI
      static const SBuf &Asterisk();
  
+    /// %-encode characters in a buffer which do not conform to
+    /// the provided set of expected characters.
+    static SBuf Encode(const SBuf &, const CharacterSet &expected);
+
      /**
       * The authority-form URI for currently stored values.
       *
@@ -198,7 +215,6 @@ void urlInitialize(void);
  char *urlCanonicalCleanWithoutRequest(const SBuf &url, const HttpRequestMethod &, const AnyP::UriScheme &);
  const char *urlCanonicalFakeHttps(const HttpRequest * request);
  bool urlIsRelative(const char *);
-char *urlMakeAbsolute(const HttpRequest *, const char *);
  char *urlRInternal(const char *host, unsigned short port, const char *dir, const char *name);
  char *urlInternal(const char *dir, const char *name);
  bool urlAppendDomain(char *host); ///< apply append_domain config to the given hostname
diff --git a/src/base/CharacterSet.cc b/src/base/CharacterSet.cc

index 3d83b412656a62c70f5d89cc48e22ce3f17e27f2..6c02b52885c4275a588749b856dd7a7f69ff9d41 100644 (file)
--- a/src/base/CharacterSet.cc
+++ b/src/base/CharacterSet.cc
@@ -50,6 +50,13 @@ CharacterSet::add(const unsigned char c)
      return *this;
  }
  
+CharacterSet &
+CharacterSet::remove(const unsigned char c)
+{
+    chars_[static_cast<uint8_t>(c)] = 0;
+    return *this;
+}
+
  CharacterSet &
  CharacterSet::addRange(unsigned char low, unsigned char high)
  {
diff --git a/src/base/CharacterSet.h b/src/base/CharacterSet.h

index 77785555dcacc4ff9f3b92cc5e9b387e5dfd38a1..376eae938d592cddc5f4a96824554777ec50d536 100644 (file)
--- a/src/base/CharacterSet.h
+++ b/src/base/CharacterSet.h
@@ -41,6 +41,9 @@ public:
      /// add a given character to the character set
      CharacterSet & add(const unsigned char c);
  
+    /// remove a given character from the character set
+    CharacterSet & remove(const unsigned char c);
+
      /// add a list of character ranges, expressed as pairs [low,high], including both ends
      CharacterSet & addRange(unsigned char low, unsigned char high);
  
diff --git a/src/clients/Client.cc b/src/clients/Client.cc

index 50ac9e4450567c450b703302ec950830b87a0aa9..985e4b4716aa0adcc4dba9735c83284951631b82 100644 (file)
--- a/src/clients/Client.cc
+++ b/src/clients/Client.cc
@@ -468,33 +468,40 @@ sameUrlHosts(const char *url1, const char *url2)
  static void
  purgeEntriesByHeader(HttpRequest *req, const char *reqUrl, HttpMsg *rep, Http::HdrType hdr)
  {
-    const char *hdrUrl, *absUrl;
-
-    absUrl = NULL;
-    hdrUrl = rep->header.getStr(hdr);
-    if (hdrUrl == NULL) {
+    const auto hdrUrl = rep->header.getStr(hdr);
+    if (!hdrUrl)
          return;
-    }
  
      /*
       * If the URL is relative, make it absolute so we can find it.
       * If it's absolute, make sure the host parts match to avoid DOS attacks
       * as per RFC 2616 13.10.
       */
+    SBuf absUrlMaker;
+    const char *absUrl = nullptr;
      if (urlIsRelative(hdrUrl)) {
-        absUrl = urlMakeAbsolute(req, hdrUrl);
-        if (absUrl != NULL) {
-            hdrUrl = absUrl;
+        if (req->method.id() == Http::METHOD_CONNECT)
+            absUrl = hdrUrl; // TODO: merge authority-uri and hdrUrl
+        else if (req->url.getScheme() == AnyP::PROTO_URN)
+            absUrl = req->url.absolute().c_str();
+        else {
+            AnyP::Uri tmpUrl = req->url;
+            if (*hdrUrl == '/') {
+                // RFC 3986 section 4.2: absolute-path reference
+                // for this logic replace the entire request-target URI path
+                tmpUrl.path(hdrUrl);
+            } else {
+                tmpUrl.addRelativePath(reqUrl);
+            }
+            absUrlMaker = tmpUrl.absolute();
+            absUrl = absUrlMaker.c_str();
          }
      } else if (!sameUrlHosts(reqUrl, hdrUrl)) {
          return;
-    }
+    } else
+        absUrl = hdrUrl;
  
-    purgeEntriesByUrl(req, hdrUrl);
-
-    if (absUrl != NULL) {
-        safe_free(absUrl);
-    }
+    purgeEntriesByUrl(req, absUrl);
  }
  
  // some HTTP methods should purge matching cache entries
diff --git a/src/tests/stub_StatHist.cc b/src/tests/stub_StatHist.cc

index 659f771b8c162ebef8e3bca2bfc58a6b1f592aff..941a8de2ff1afd291143414c4a739f8809024bc4 100644 (file)
--- a/src/tests/stub_StatHist.cc
+++ b/src/tests/stub_StatHist.cc
@@ -16,7 +16,7 @@ class StoreEntry;
  
  void StatHist::dump(StoreEntry * sentry, StatHistBinDumper * bd) const STUB
  void StatHist::enumInit(unsigned int i) STUB_NOP
-void StatHist::count(double d) STUB_NOP
+void StatHist::count(double) {/* STUB_NOP */}
  double statHistDeltaMedian(const StatHist & A, const StatHist & B) STUB_RETVAL(0.0)
  double statHistDeltaPctile(const StatHist & A, const StatHist & B, double pctile) STUB_RETVAL(0.0)
  void StatHist::logInit(unsigned int i, double d1, double d2) STUB
diff --git a/src/tests/stub_libanyp.cc b/src/tests/stub_libanyp.cc

index 96c10af62d450bdfbb21bea13a7eaafe9e271d73..2d617d926f62ab520471207fc5d18c32ede5f60f 100644 (file)
--- a/src/tests/stub_libanyp.cc
+++ b/src/tests/stub_libanyp.cc
@@ -18,6 +18,7 @@ bool AnyP::Uri::parse(const HttpRequestMethod&, const SBuf &) STUB_RETVAL(true)
  void AnyP::Uri::host(const char *) STUB
  static SBuf nil;
  const SBuf &AnyP::Uri::path() const STUB_RETVAL(nil)
+void AnyP::Uri::addRelativePath(const char *) STUB
  const SBuf &AnyP::Uri::SlashPath()
  {
      static SBuf slash("/");
@@ -33,7 +34,6 @@ SBuf &AnyP::Uri::absolute() const STUB_RETVAL(nil)
  void urlInitialize() STUB
  const char *urlCanonicalFakeHttps(const HttpRequest *) STUB_RETVAL(nullptr)
  bool urlIsRelative(const char *) STUB_RETVAL(false)
-char *urlMakeAbsolute(const HttpRequest *, const char *)STUB_RETVAL(nullptr)
  char *urlRInternal(const char *, unsigned short, const char *, const char *) STUB_RETVAL(nullptr)
  char *urlInternal(const char *, const char *) STUB_RETVAL(nullptr)
  int matchDomainName(const char *, const char *, uint) STUB_RETVAL(0)
author	Amos Jeffries <yadij@users.noreply.github.com>
	Thu, 21 May 2020 14:42:02 +0000 (14:42 +0000)
committer	Amos Jeffries <yadij@users.noreply.github.com>
	Fri, 29 May 2020 18:38:59 +0000 (06:38 +1200)
src/anyp/Uri.cc		patch \| blob \| blame \| history
src/anyp/Uri.h		patch \| blob \| blame \| history
src/base/CharacterSet.cc		patch \| blob \| blame \| history
src/base/CharacterSet.h		patch \| blob \| blame \| history
src/clients/Client.cc		patch \| blob \| blame \| history
src/tests/stub_StatHist.cc		patch \| blob \| blame \| history
src/tests/stub_libanyp.cc		patch \| blob \| blame \| history