git.ipfire.org Git - thirdparty/squid.git/commitdiff
Add AnyP::Uri::Decode() (#1626)
author Francesco Chemolli <5175948+kinkie@users.noreply.github.com>
Wed, 31 Jan 2024 18:21:13 +0000 (18:21 +0000)
committer Squid Anubis <squid-anubis@squid-cache.org>
Wed, 31 Jan 2024 18:21:22 +0000 (18:21 +0000)
Implement percent-decoding according to RFC 3986. Currently unused.
TODO: Upgrade rfc1738_unescape() callers.

src/anyp/Uri.cc
src/anyp/Uri.h
src/base/CharacterSet.cc
src/base/CharacterSet.h
src/http/one/RequestParser.cc
src/tests/testURL.cc

index 2f9a7191f41ad801fe7a733769fd3f1ebbf6ff03..aba1bab559369447ac1bf2de8e7e95c3287b76b7 100644 (file)
@@ -81,6 +81,29 @@ AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
     return output;
 }
 
+SBuf
+AnyP::Uri::Decode(const SBuf &buf) // returns buf with each %XX triplet replaced by one char; throws TextException on a malformed triplet
+{
+    SBuf output;
+    Parser::Tokenizer tok(buf);
+    while (!tok.atEnd()) {
+        SBuf token;
+        static const auto unencodedChars = CharacterSet("percent", "%").complement("unencoded"); // complement of the set holding only '%'
+        if (tok.prefix(token, unencodedChars))
+            output.append(token); // the run before the next '%' is copied as is
+
+        // we are either at '%' or at end of input
+        if (tok.skip('%')) {
+            int64_t hex1 = 0, hex2 = 0;
+            if (tok.int64(hex1, 16, false, 1) && tok.int64(hex2, 16, false, 1)) // NOTE(review): presumably base 16, no sign, at most one character per call -- confirm against Parser::Tokenizer::int64()
+                output.append(static_cast<char>((hex1 << 4) | hex2)); // hex1 supplies the high four bits, hex2 the low four
+            else
+                throw TextException("invalid pct-encoded triplet", Here()); // reached when either of the two reads above fails
+        }
+    }
+    return output;
+}
+
 const SBuf &
 AnyP::Uri::Asterisk()
 {
index a90a3b7d4710b2eeb776ac32de3758b3dfcc5885..81090e63cd81496074c6f56882ed42f57ac389a6 100644 (file)
@@ -113,6 +113,9 @@ public:
     /// the provided set of expected characters.
     static SBuf Encode(const SBuf &, const CharacterSet &expected);
 
+    /// %-decode the given buffer; throws TextException on an invalid pct-encoded triplet
+    static SBuf Decode(const SBuf &);
+
     /**
      * The authority-form URI for currently stored values.
      *
index efcb057690d52d0fe4f60868dfbf517882135da1..3793b7c745c6bc111f53f21a8bc06179f209be4d 100644 (file)
@@ -160,3 +160,11 @@ CharacterSet::ETAGC("ETAGC", {{0x21,0x21},{0x23,0x7e},{0x80,0xff}}),
 CharacterSet::TOKEN68C("TOKEN68C","-._~+/0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 ;
 
+const CharacterSet &
+CharacterSet::RFC3986_UNRESERVED()
+{
+    // RFC 3986: unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
+    static const auto chars = new CharacterSet("RFC3986_UNRESERVED", "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-._~"); // function-local static: built once, on the first call
+    return *chars; // the set is never deleted; NOTE(review): presumably to sidestep static destruction order problems -- confirm
+}
+
index a2439d4803811f4b5477acf882c173fb77601b6c..880072c0306f76a996961bff4d9fd47dc219c6d7 100644 (file)
@@ -118,6 +118,9 @@ public:
     // token68 (internal characters only, excludes '=' terminator)
     static const CharacterSet TOKEN68C;
 
+    /// allowed URI characters that do not have a reserved purpose, RFC 3986
+    static const CharacterSet &RFC3986_UNRESERVED();
+
 private:
     /** index of characters in this set
      *
index 8dcd1e7eacb0b983d95c8c9c7365cced826ca837..6d73d4c028160cd90df705808428bdb7158b077c 100644 (file)
@@ -98,9 +98,7 @@ UriValidCharacters()
         CharacterSet("gen-delims", ":/?#[]@") +
         CharacterSet("sub-delims", "!$&'()*+,;=") +
         // RFC 3986 section 2.3 - unreserved characters
-        CharacterSet::ALPHA +
-        CharacterSet::DIGIT +
-        CharacterSet("unreserved", "-._~") +
+        CharacterSet::RFC3986_UNRESERVED() +
         // RFC 3986 section 2.1 - percent encoding "%" HEXDIG
         CharacterSet("pct-encoded", "%") +
         CharacterSet::HEXDIG;
index 2600a72a2009a522e476b55819abcc4059f764c6..d50e22db77fab2ca4e95170c32cdc632ff01574c 100644 (file)
@@ -9,8 +9,11 @@
 #include "squid.h"
 
 #include "anyp/Uri.h"
+#include "base/CharacterSet.h"
+#include "base/TextException.h"
 #include "compat/cppunit.h"
 #include "debug/Stream.h"
+#include "sbuf/Stream.h"
 #include "unitTestMain.h"
 
 #include <cppunit/TestAssert.h>
@@ -25,11 +28,13 @@ class TestUri : public CPPUNIT_NS::TestFixture
     CPPUNIT_TEST_SUITE(TestUri);
     CPPUNIT_TEST(testConstructScheme);
     CPPUNIT_TEST(testDefaultConstructor);
+    CPPUNIT_TEST(testEncoding);
     CPPUNIT_TEST_SUITE_END();
 
 protected:
     void testConstructScheme();
     void testDefaultConstructor();
+    void testEncoding();
 };
 CPPUNIT_TEST_SUITE_REGISTRATION(TestUri);
 
@@ -81,6 +86,49 @@ TestUri::testDefaultConstructor()
     delete urlPointer;
 }
 
+void
+TestUri::testEncoding()
+{
+    const std::vector< std::pair<SBuf, SBuf> > basicTestCases = { // {decoded, encoded} pairs
+        {SBuf(""), SBuf("")},
+        {SBuf("foo"), SBuf("foo")},
+        {SBuf("%"), SBuf("%25")},
+        {SBuf("%foo"), SBuf("%25foo")},
+        {SBuf("foo%"), SBuf("foo%25")},
+        {SBuf("fo%o"), SBuf("fo%25o")},
+        {SBuf("fo%%o"), SBuf("fo%25%25o")},
+        {SBuf("fo o"), SBuf("fo%20o")},
+        {SBuf("?1"), SBuf("%3F1")},
+        {SBuf("\377"), SBuf("%FF")},
+        {SBuf("fo\0o", 4), SBuf("fo%00o")},
+    };
+    // each pair must convert in both directions
+    for (const auto &testCase: basicTestCases) {
+        CPPUNIT_ASSERT_EQUAL(testCase.first, AnyP::Uri::Decode(testCase.second));
+        CPPUNIT_ASSERT_EQUAL(testCase.second, AnyP::Uri::Encode(testCase.first, CharacterSet::RFC3986_UNRESERVED()));
+    }
+
+    const auto invalidEncodings = { // inputs that Decode() must reject
+        SBuf("%"),
+        SBuf("%%"),
+        SBuf("%%%"),
+        SBuf("%1"),
+        SBuf("%1Z"),
+        SBuf("%1\000", 3), // explicit length keeps the NUL that stands in for the second hex digit
+        SBuf("%1\377"),
+        SBuf("%\0002", 3), // NUL stands in for the first hex digit
+        SBuf("%\3772"),
+    };
+
+    for (const auto &invalidEncoding: invalidEncodings) {
+        // test various input positions of an invalid escape sequence
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(invalidEncoding), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf("word", invalidEncoding)), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf(invalidEncoding, "word")), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf("word", invalidEncoding, "word")), TextException);
+    }
+}
+
 int
 main(int argc, char *argv[])
 {