From: Francesco Chemolli <5175948+kinkie@users.noreply.github.com>
Date: Wed, 31 Jan 2024 18:21:13 +0000 (+0000)
Subject: Add AnyP::Uri::Decode() (#1626)
X-Git-Tag: SQUID_7_0_1~225
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=26256f285c4258d428687df94f37edac78d4de48;p=thirdparty%2Fsquid.git

Add AnyP::Uri::Decode() (#1626)

Implement percent-decoding according to RFC 3986. Currently unused.

TODO: Upgrade rfc1738_unescape() callers.
---
Review notes (text between "---" and the first "diff --git" is not part of
any hunk):

* Restored template arguments that were lost in extraction:
  static_cast<char> in AnyP::Uri::Decode() and std::pair<SBuf, SBuf> in
  TestUri::testEncoding().
* The header name of the bare "#include" context line in
  src/tests/testURL.cc was lost the same way and cannot be recovered from
  this patch. TODO: restore it from the target file before applying.
* Line breaks and indentation were reconstructed. TODO: confirm context lines
  against the tree, or apply with whitespace-insensitive matching.
* testEncoding(): SBuf("%1\000", 2) now uses length 3 so that the NUL byte is
  part of the tested input (length 2 made it a duplicate of SBuf("%1"));
  removed the stray ';' after the two for loops.
* Comments were added to Decode(), its declaration, and the test. The "+"
  line counts of the affected hunks were updated accordingly, so the
  post-image ids on the "index" lines of src/anyp/Uri.cc, src/anyp/Uri.h and
  src/tests/testURL.cc no longer describe the resulting files.

diff --git a/src/anyp/Uri.cc b/src/anyp/Uri.cc
index 2f9a7191f4..aba1bab559 100644
--- a/src/anyp/Uri.cc
+++ b/src/anyp/Uri.cc
@@ -81,6 +81,32 @@ AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
     return output;
 }
 
+SBuf
+AnyP::Uri::Decode(const SBuf &buf)
+{
+    SBuf output;
+    Parser::Tokenizer tok(buf);
+    while (!tok.atEnd()) {
+        SBuf token;
+        // a run of characters other than '%' is copied to the output as is
+        static const auto unencodedChars = CharacterSet("percent", "%").complement("unencoded");
+        if (tok.prefix(token, unencodedChars))
+            output.append(token);
+
+        // we are either at '%' or at end of input
+        if (tok.skip('%')) {
+            int64_t hex1 = 0, hex2 = 0;
+            // the two hex digits are parsed by separate int64() calls (the trailing 1
+            // presumably limits each call to one character; confirm in Parser::Tokenizer)
+            if (tok.int64(hex1, 16, false, 1) && tok.int64(hex2, 16, false, 1))
+                output.append(static_cast<char>((hex1 << 4) | hex2));
+            else
+                throw TextException("invalid pct-encoded triplet", Here());
+        }
+    }
+    return output;
+}
+
 const SBuf &
 AnyP::Uri::Asterisk()
 {
diff --git a/src/anyp/Uri.h b/src/anyp/Uri.h
index a90a3b7d47..81090e63cd 100644
--- a/src/anyp/Uri.h
+++ b/src/anyp/Uri.h
@@ -113,6 +113,11 @@ public:
     /// the provided set of expected characters.
     static SBuf Encode(const SBuf &, const CharacterSet &expected);
 
+    /// %-decode the given buffer
+    /// \returns a copy of the buffer with each "%" HEXDIG HEXDIG triplet replaced by the byte it encodes
+    /// \throws TextException when a '%' is not followed by two hexadecimal digits
+    static SBuf Decode(const SBuf &);
+
     /**
      * The authority-form URI for currently stored values.
      *
diff --git a/src/base/CharacterSet.cc b/src/base/CharacterSet.cc
index efcb057690..3793b7c745 100644
--- a/src/base/CharacterSet.cc
+++ b/src/base/CharacterSet.cc
@@ -160,3 +160,11 @@ CharacterSet::ETAGC("ETAGC", {{0x21,0x21},{0x23,0x7e},{0x80,0xff}}),
 CharacterSet::TOKEN68C("TOKEN68C","-._~+/0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")
 ;
 
+const CharacterSet &
+CharacterSet::RFC3986_UNRESERVED()
+{
+    // RFC 3986: unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
+    static const auto chars = new CharacterSet("RFC3986_UNRESERVED", "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-._~");
+    return *chars;
+}
+
diff --git a/src/base/CharacterSet.h b/src/base/CharacterSet.h
index a2439d4803..880072c030 100644
--- a/src/base/CharacterSet.h
+++ b/src/base/CharacterSet.h
@@ -118,6 +118,9 @@ public:
     // token68 (internal characters only, excludes '=' terminator)
     static const CharacterSet TOKEN68C;
 
+    /// allowed URI characters that do not have a reserved purpose, RFC 3986
+    static const CharacterSet &RFC3986_UNRESERVED();
+
 private:
     /** index of characters in this set
      *
diff --git a/src/http/one/RequestParser.cc b/src/http/one/RequestParser.cc
index 8dcd1e7eac..6d73d4c028 100644
--- a/src/http/one/RequestParser.cc
+++ b/src/http/one/RequestParser.cc
@@ -98,9 +98,7 @@ UriValidCharacters()
         CharacterSet("gen-delims", ":/?#[]@") +
         CharacterSet("sub-delims", "!$&'()*+,;=") +
         // RFC 3986 section 2.3 - unreserved characters
-        CharacterSet::ALPHA +
-        CharacterSet::DIGIT +
-        CharacterSet("unreserved", "-._~") +
+        CharacterSet::RFC3986_UNRESERVED() +
         // RFC 3986 section 2.1 - percent encoding "%" HEXDIG
         CharacterSet("pct-encoded", "%") +
         CharacterSet::HEXDIG;
diff --git a/src/tests/testURL.cc b/src/tests/testURL.cc
index 2600a72a20..d50e22db77 100644
--- a/src/tests/testURL.cc
+++ b/src/tests/testURL.cc
@@ -9,8 +9,11 @@
 #include "squid.h"
 
 #include "anyp/Uri.h"
+#include "base/CharacterSet.h"
+#include "base/TextException.h"
 #include "compat/cppunit.h"
 #include "debug/Stream.h"
+#include "sbuf/Stream.h"
 #include "unitTestMain.h"
 
 #include
@@ -25,11 +28,13 @@ class TestUri : public CPPUNIT_NS::TestFixture
     CPPUNIT_TEST_SUITE(TestUri);
     CPPUNIT_TEST(testConstructScheme);
     CPPUNIT_TEST(testDefaultConstructor);
+    CPPUNIT_TEST(testEncoding);
     CPPUNIT_TEST_SUITE_END();
 
 protected:
     void testConstructScheme();
     void testDefaultConstructor();
+    void testEncoding();
 };
 
 CPPUNIT_TEST_SUITE_REGISTRATION(TestUri);
@@ -81,6 +86,51 @@ TestUri::testDefaultConstructor()
     delete urlPointer;
 }
 
+void
+TestUri::testEncoding()
+{
+    // each pair is {decoded form, RFC 3986 percent-encoded form}
+    const std::vector< std::pair<SBuf, SBuf> > basicTestCases = {
+        {SBuf(""), SBuf("")},
+        {SBuf("foo"), SBuf("foo")},
+        {SBuf("%"), SBuf("%25")},
+        {SBuf("%foo"), SBuf("%25foo")},
+        {SBuf("foo%"), SBuf("foo%25")},
+        {SBuf("fo%o"), SBuf("fo%25o")},
+        {SBuf("fo%%o"), SBuf("fo%25%25o")},
+        {SBuf("fo o"), SBuf("fo%20o")},
+        {SBuf("?1"), SBuf("%3F1")},
+        {SBuf("\377"), SBuf("%FF")},
+        {SBuf("fo\0o", 4), SBuf("fo%00o")},
+    };
+
+    for (const auto &testCase: basicTestCases) {
+        CPPUNIT_ASSERT_EQUAL(testCase.first, AnyP::Uri::Decode(testCase.second));
+        CPPUNIT_ASSERT_EQUAL(testCase.second, AnyP::Uri::Encode(testCase.first, CharacterSet::RFC3986_UNRESERVED()));
+    }
+
+    // inputs that Decode() must reject; explicit lengths keep embedded NUL bytes
+    const auto invalidEncodings = {
+        SBuf("%"),
+        SBuf("%%"),
+        SBuf("%%%"),
+        SBuf("%1"),
+        SBuf("%1Z"),
+        SBuf("%1\000", 3),
+        SBuf("%1\377"),
+        SBuf("%\0002", 3),
+        SBuf("%\3772"),
+    };
+
+    for (const auto &invalidEncoding: invalidEncodings) {
+        // test various input positions of an invalid escape sequence
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(invalidEncoding), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf("word", invalidEncoding)), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf(invalidEncoding, "word")), TextException);
+        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf("word", invalidEncoding, "word")), TextException);
+    }
+}
+
 int
 main(int argc, char *argv[])
 {