Add AnyP::Uri::Decode() (#1626)

author Francesco Chemolli <5175948+kinkie@users.noreply.github.com>

Wed, 31 Jan 2024 18:21:13 +0000 (18:21 +0000)

committer Squid Anubis <squid-anubis@squid-cache.org>

Wed, 31 Jan 2024 18:21:22 +0000 (18:21 +0000)
author Francesco Chemolli <5175948+kinkie@users.noreply.github.com>
Wed, 31 Jan 2024 18:21:13 +0000 (18:21 +0000)
committer Squid Anubis <squid-anubis@squid-cache.org>
Wed, 31 Jan 2024 18:21:22 +0000 (18:21 +0000)
diff --git a/src/anyp/Uri.cc b/src/anyp/Uri.cc

index 2f9a7191f41ad801fe7a733769fd3f1ebbf6ff03..aba1bab559369447ac1bf2de8e7e95c3287b76b7 100644 (file)
--- a/src/anyp/Uri.cc
+++ b/src/anyp/Uri.cc
@@ -81,6 +81,29 @@ AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
      return output;
  }
  
+SBuf
+AnyP::Uri::Decode(const SBuf &buf) // %-decodes buf; throws TextException on a malformed %XX triplet
+{
+    SBuf output; // decoded bytes accumulated so far
+    Parser::Tokenizer tok(buf);
+    while (!tok.atEnd()) {
+        SBuf token;
+        static const auto unencodedChars = CharacterSet("percent", "%").complement("unencoded"); // complement of {'%'}; function-local static, so built once
+        if (tok.prefix(token, unencodedChars)) // copy the run of non-'%' characters verbatim
+            output.append(token);
+
+        // we are either at '%' or at end of input
+        if (tok.skip('%')) {
+            int64_t hex1 = 0, hex2 = 0; // high and low nibbles of the decoded byte
+            if (tok.int64(hex1, 16, false, 1) && tok.int64(hex2, 16, false, 1)) // presumably: base 16, no sign, at most 1 character each -- confirm against Parser::Tokenizer::int64()
+                output.append(static_cast<char>((hex1 << 4) | hex2));
+            else
+                throw TextException("invalid pct-encoded triplet", Here()); // '%' not followed by two hex digits (e.g. "%", "%1", "%1Z")
+        }
+    }
+    return output;
+}
+
  const SBuf &
  AnyP::Uri::Asterisk()
  {
diff --git a/src/anyp/Uri.h b/src/anyp/Uri.h

index a90a3b7d4710b2eeb776ac32de3758b3dfcc5885..81090e63cd81496074c6f56882ed42f57ac389a6 100644 (file)
--- a/src/anyp/Uri.h
+++ b/src/anyp/Uri.h
@@ -113,6 +113,9 @@ public:
      /// the provided set of expected characters.
      static SBuf Encode(const SBuf &, const CharacterSet &expected);
  
+    /// %-decode the given buffer
+    static SBuf Decode(const SBuf &);
+
      /**
       * The authority-form URI for currently stored values.
       *
diff --git a/src/base/CharacterSet.cc b/src/base/CharacterSet.cc

index efcb057690d52d0fe4f60868dfbf517882135da1..3793b7c745c6bc111f53f21a8bc06179f209be4d 100644 (file)
--- a/src/base/CharacterSet.cc
+++ b/src/base/CharacterSet.cc
@@ -160,3 +160,11 @@ CharacterSet::ETAGC("ETAGC", {{0x21,0x21},{0x23,0x7e},{0x80,0xff}}),
  CharacterSet::TOKEN68C("TOKEN68C","-._~+/0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
  ;
  
+const CharacterSet &
+CharacterSet::RFC3986_UNRESERVED() // accessor for the RFC 3986 "unreserved" character set
+{
+    // RFC 3986: unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
+    static const auto chars = new CharacterSet("RFC3986_UNRESERVED", "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-._~"); // allocated on first call; no matching delete here (NOTE(review): presumably deliberate so the set stays usable during static destruction -- confirm)
+    return *chars;
+}
+
diff --git a/src/base/CharacterSet.h b/src/base/CharacterSet.h

index a2439d4803811f4b5477acf882c173fb77601b6c..880072c0306f76a996961bff4d9fd47dc219c6d7 100644 (file)
--- a/src/base/CharacterSet.h
+++ b/src/base/CharacterSet.h
@@ -118,6 +118,9 @@ public:
      // token68 (internal characters only, excludes '=' terminator)
      static const CharacterSet TOKEN68C;
  
+    /// allowed URI characters that do not have a reserved purpose, RFC 3986
+    static const CharacterSet &RFC3986_UNRESERVED();
+
  private:
      /** index of characters in this set
       *
diff --git a/src/http/one/RequestParser.cc b/src/http/one/RequestParser.cc

index 8dcd1e7eacb0b983d95c8c9c7365cced826ca837..6d73d4c028160cd90df705808428bdb7158b077c 100644 (file)
--- a/src/http/one/RequestParser.cc
+++ b/src/http/one/RequestParser.cc
@@ -98,9 +98,7 @@ UriValidCharacters()
          CharacterSet("gen-delims", ":/?#[]@") +
          CharacterSet("sub-delims", "!$&'()*+,;=") +
          // RFC 3986 section 2.3 - unreserved characters
-        CharacterSet::ALPHA +
-        CharacterSet::DIGIT +
-        CharacterSet("unreserved", "-._~") +
+        CharacterSet::RFC3986_UNRESERVED() +
          // RFC 3986 section 2.1 - percent encoding "%" HEXDIG
          CharacterSet("pct-encoded", "%") +
          CharacterSet::HEXDIG;
diff --git a/src/tests/testURL.cc b/src/tests/testURL.cc

index 2600a72a2009a522e476b55819abcc4059f764c6..d50e22db77fab2ca4e95170c32cdc632ff01574c 100644 (file)
--- a/src/tests/testURL.cc
+++ b/src/tests/testURL.cc
@@ -9,8 +9,11 @@
  #include "squid.h"
  
  #include "anyp/Uri.h"
+#include "base/CharacterSet.h"
+#include "base/TextException.h"
  #include "compat/cppunit.h"
  #include "debug/Stream.h"
+#include "sbuf/Stream.h"
  #include "unitTestMain.h"
  
  #include <cppunit/TestAssert.h>
@@ -25,11 +28,13 @@ class TestUri : public CPPUNIT_NS::TestFixture
      CPPUNIT_TEST_SUITE(TestUri);
      CPPUNIT_TEST(testConstructScheme);
      CPPUNIT_TEST(testDefaultConstructor);
+    CPPUNIT_TEST(testEncoding);
      CPPUNIT_TEST_SUITE_END();
  
  protected:
      void testConstructScheme();
      void testDefaultConstructor();
+    void testEncoding();
  };
  CPPUNIT_TEST_SUITE_REGISTRATION(TestUri);
  
@@ -81,6 +86,49 @@ TestUri::testDefaultConstructor()
      delete urlPointer;
  }
  
+void
+TestUri::testEncoding()
+{
+    const std::vector< std::pair<SBuf, SBuf> > basicTestCases = {
+        {SBuf(""), SBuf("")},
+        {SBuf("foo"), SBuf("foo")},
+        {SBuf("%"), SBuf("%25")},
+        {SBuf("%foo"), SBuf("%25foo")},
+        {SBuf("foo%"), SBuf("foo%25")},
+        {SBuf("fo%o"), SBuf("fo%25o")},
+        {SBuf("fo%%o"), SBuf("fo%25%25o")},
+        {SBuf("fo o"), SBuf("fo%20o")},
+        {SBuf("?1"), SBuf("%3F1")},
+        {SBuf("\377"), SBuf("%FF")},
+        {SBuf("fo\0o", 4), SBuf("fo%00o")},
+    };
+
+    for (const auto &testCase: basicTestCases) {
+        CPPUNIT_ASSERT_EQUAL(testCase.first, AnyP::Uri::Decode(testCase.second));
+        CPPUNIT_ASSERT_EQUAL(testCase.second, AnyP::Uri::Encode(testCase.first, CharacterSet::RFC3986_UNRESERVED()));
+    };
+
+    const auto invalidEncodings = {
+        SBuf("%"),
+        SBuf("%%"),
+        SBuf("%%%"),
+        SBuf("%1"),
+        SBuf("%1Z"),
+        SBuf("%1\000", 2),
+        SBuf("%1\377"),
+        SBuf("%\0002", 3),
+        SBuf("%\3772"),
+    };
+
+    for (const auto &invalidEncoding: invalidEncodings) {
+        // test various input positions of an invalid escape sequence
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(invalidEncoding), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf("word", invalidEncoding)), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf(invalidEncoding, "word")), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf("word", invalidEncoding, "word")), TextException);
+    };
+}
+
  int
  main(int argc, char *argv[])
  {
author	Francesco Chemolli <5175948+kinkie@users.noreply.github.com>
	Wed, 31 Jan 2024 18:21:13 +0000 (18:21 +0000)
committer	Squid Anubis <squid-anubis@squid-cache.org>
	Wed, 31 Jan 2024 18:21:22 +0000 (18:21 +0000)
src/anyp/Uri.cc		patch \| blob \| blame \| history
src/anyp/Uri.h		patch \| blob \| blame \| history
src/base/CharacterSet.cc		patch \| blob \| blame \| history
src/base/CharacterSet.h		patch \| blob \| blame \| history
src/http/one/RequestParser.cc		patch \| blob \| blame \| history
src/tests/testURL.cc		patch \| blob \| blame \| history