From ad365edba79f043387446a36a930700ecc9d6d19 Mon Sep 17 00:00:00 2001
From: Alex Rousskov <rousskov@measurement-factory.com>
Date: Sun, 19 Oct 2025 19:42:38 +0000
Subject: [PATCH] Bug 5501: Squid may exit when ACLs decode an invalid URI
 (#2145)

    2025/08/14 09:28:51| FATAL: invalid pct-encoded triplet
        exception location: Uri.cc(102) Decode

The bug affects url_regex and urllogin ACLs. However, not every use of
those ACLs results in a FATAL exit. The exact preconditions are unknown.

Three pct-encoding (RFC 3986) error handling algorithms were considered:


### Algorithm A: An ACL that cannot decode, mismatches

This algorithm is similar to the algorithm used for handling "ACL is
used in context without ALE" and similar errors, but there is a
significant context difference: Those "without ALE" errors are Squid
misconfigurations or bugs! Decoding failures, on the other hand, are
caused by request properties outside of admin or Squid control.

With this algorithm, a request can easily avoid a "deny urlHasX" rule
match by injecting an invalid pct-encoding (e.g., `X%bad`). Such
injections may not be practical for URLs of most resources outside of
client control because most servers are unlikely to recognize the
malformed URL as something useful for the client. As for resources that
client does control, a urlHasX ACL cannot be effective for those anyway
because the client can change URLs.

Algorithm A does not let Squid admins match problematic URLs!


### Algorithm B: An ACL that cannot decode X, tests raw/encoded X

With this algorithm, a request can trigger some "allow urlHasY" rule
matches by injecting an invalid pct-encoding that looks like Y (e.g., if
an "allow" rule looks for the word `good`, a request may contain a
`%good` or `%XXgood` sequence). Just like with algorithm A, such
injections probably have little practical value, for similar reasons.

Algorithm B lets Squid admins match problematic URLs.


### Algorithm C: An ACL that cannot decode X, tests partially decoded X

With this algorithm, a "partially decoded X" is X where invalid
pct-encoding sequences (or their parts) are left "as is" while valid
pct-encoding triplets are decoded. This is actually a family of similar
algorithms because there are multiple ways to define invalid
pct-encoding sequence boundaries in certain URLs! For example,
`%6Fne%f%6Fo` can be replaced with `one%foo` or `one%f%6Fo`. This
additional complexity/uncertainty aggravates the two concerns below.

Algorithm B notes apply to algorithm C as well.

Algorithm C lets admins match problematic URLs but, again, it requires
that admins know exactly how Squid is going to isolate problematic
pct-encoding triplets (e.g., skip/leave just `%` byte that starts an
invalid pct-encoding sequence or the following two bytes as well).

Algorithm C family includes rfc1738_unescape() behavior. That decoding
function was used for the two ACLs before commit cbb9bf12 and commit
226394f2 started to use AnyP::Uri::Decode() added in commit 26256f28.
For example, rfc1738_unescape() decodes `%%` as `%` and leaves some
other invalid pct-encoding one-, two-, and three-byte sequences in the
decoded result. It is unlikely that many admins know exactly what that
old decoding does, but they could tune their rules to "work" as they
expect for specific cases. Those rules could stop working after the
above commits (v7.0.1+) and this change, to their surprise.


This change implements Algorithm B:

* Unlike Algorithm A, B allows admins to match bad URLs.
* Unlike Algorithm C, B does not force admins to guess how Squid
  mangles a bad URL before matching it.

Also updated ACLs documentation to reflect current implementation.
---
 src/acl/Url.cc            |  5 ++---
 src/acl/UrlLogin.cc       |  4 ++--
 src/anyp/Uri.cc           | 20 ++++++++++++++++----
 src/anyp/Uri.h            | 10 +++++++++-
 src/cf.data.pre           | 23 ++++++++++++++++++++++-
 src/clients/FtpGateway.cc | 17 +++++++++++++++--
 src/tests/testURL.cc      | 14 +++++++++-----
 7 files changed, 75 insertions(+), 18 deletions(-)

diff --git a/src/acl/Url.cc b/src/acl/Url.cc
index 12a3702c07..4c1563ec52 100644
--- a/src/acl/Url.cc
+++ b/src/acl/Url.cc
@@ -20,8 +20,7 @@ Acl::UrlCheck::match(ACLChecklist * const ch)
     const auto checklist = Filled(ch);
 
     // TODO: Consider refactoring so that effectiveRequestUri() returns decoded URI.
-    auto decodedUri = AnyP::Uri::Decode(checklist->request->effectiveRequestUri());
-    const auto result = data->match(decodedUri.c_str());
-    return result;
+    // XXX: c_str() truncates where %00 was decoded
+    return data->match(AnyP::Uri::DecodeOrDupe(checklist->request->effectiveRequestUri()).c_str());
 }
 
diff --git a/src/acl/UrlLogin.cc b/src/acl/UrlLogin.cc
index c4c5fed944..f13d70b635 100644
--- a/src/acl/UrlLogin.cc
+++ b/src/acl/UrlLogin.cc
@@ -24,7 +24,7 @@ Acl::UrlLoginCheck::match(ACLChecklist * const ch)
         return 0; // nothing can match
     }
 
-    auto decodedUserInfo = AnyP::Uri::Decode(checklist->request->url.userInfo());
-    return data->match(decodedUserInfo.c_str());
+    // XXX: c_str() truncates where %00 was decoded
+    return data->match(AnyP::Uri::DecodeOrDupe(checklist->request->url.userInfo()).c_str());
 }
 
diff --git a/src/anyp/Uri.cc b/src/anyp/Uri.cc
index 2b648866b8..0c80f10212 100644
--- a/src/anyp/Uri.cc
+++ b/src/anyp/Uri.cc
@@ -101,7 +101,7 @@ AnyP::Uri::Encode(const SBuf &buf, const CharacterSet &ignore)
     return output;
 }
 
-SBuf
+std::optional<SBuf>
 AnyP::Uri::Decode(const SBuf &buf)
 {
     SBuf output;
@@ -114,16 +114,28 @@ AnyP::Uri::Decode(const SBuf &buf)
 
         // we are either at '%' or at end of input
         if (tok.skip('%')) {
+            const auto rawBytesAfterPercent = tok.remaining();
             int64_t hex1 = 0, hex2 = 0;
-            if (tok.int64(hex1, 16, false, 1) && tok.int64(hex2, 16, false, 1))
+            if (tok.int64(hex1, 16, false, 1) && tok.int64(hex2, 16, false, 1)) {
                 output.append(static_cast<char>((hex1 << 4) | hex2));
-            else
-                throw TextException("invalid pct-encoded triplet", Here());
+            } else {
+                // see TestUri::testEncoding() for invalid pct-encoding sequence examples
+                debugs(23, 3, "invalid pct-encoding sequence starting at %" << rawBytesAfterPercent);
+                return std::nullopt;
+            }
         }
     }
     return output;
 }
 
+SBuf
+AnyP::Uri::DecodeOrDupe(const SBuf &input)
+{
+    if (const auto decoded = Decode(input))
+        return *decoded;
+    return input;
+}
+
 const SBuf &
 AnyP::Uri::Asterisk()
 {
diff --git a/src/anyp/Uri.h b/src/anyp/Uri.h
index 1c0361cc64..70ff6b7411 100644
--- a/src/anyp/Uri.h
+++ b/src/anyp/Uri.h
@@ -116,7 +116,15 @@ public:
     static SBuf Encode(const SBuf &, const CharacterSet &expected);
 
     /// %-decode the given buffer
-    static SBuf Decode(const SBuf &);
+    /// \retval std::nullopt on decoding failures
+    /// \sa DecodeOrDupe()
+    static std::optional<SBuf> Decode(const SBuf &);
+
+    /// %-decode the given buffer
+    /// \retval decoded input if input obeys RFC 3986 Percent-Encoding rules
+    /// \retval an input copy if input violates RFC 3986 Percent-Encoding rules
+    /// \sa Decode()
+    static SBuf DecodeOrDupe(const SBuf &input);
 
     /**
      * The authority-form URI for currently stored values.
diff --git a/src/cf.data.pre b/src/cf.data.pre
index 451d64b67d..898fc9af48 100644
--- a/src/cf.data.pre
+++ b/src/cf.data.pre
@@ -1245,8 +1245,29 @@ ENDIF
 
 	acl aclname url_regex [-i] ^http:// ...
 	  # POSIX extended regex matching on whole URL [fast]
+	  #
+	  # If request URL contains only valid pct-encoded triplets (RFC 3986),
+	  # all of them are decoded before matching (e.g., `%25` triplet is
+	  # replaced with a single `%` character). If request URL contains at
+	  # least one `%` character that does not start a valid pct-encoded
+	  # triplet (e.g., `%%`, `%X`, or `%2Y`), then the URL is not decoded at
+	  # all (i.e. the raw request URL is used for matching).
+	  #
+	  # If a request URL is decoded as described above, then all request URL
+	  # characters starting with the decoded `%00` pct-encoded triplet (if
+	  # any) are ignored during matching. There is currently no way to match
+	  # that triplet itself in a correctly percent-encoded URL.
+	  #
+	  # ACL parameters are not decoded.
+
 	acl aclname urllogin [-i] [^a-zA-Z0-9] ...
-	  # POSIX extended regex matching on URL login field
+	  # POSIX extended regex matching on URL login field [fast]
+	  #
+	  # This ACL does not match requests with a URL that lacks a login field.
+	  #
+	  # This ACL handles RFC 3986 pct-encoded triplets in the login field as
+	  # url_regex ACL handles those triplets in the entire request URL.
+
 	acl aclname urlpath_regex [-i] \.gif$ ...
 	  # POSIX extended regex matching on URL path [fast]
 
diff --git a/src/clients/FtpGateway.cc b/src/clients/FtpGateway.cc
index 1001ad057d..6352ce8c05 100644
--- a/src/clients/FtpGateway.cc
+++ b/src/clients/FtpGateway.cc
@@ -142,6 +142,7 @@ public:
 
     int checkAuth(const HttpHeader * req_hdr);
     void checkUrlpath();
+    std::optional<SBuf> decodedRequestUriPath() const;
     void buildTitleUrl();
     void writeReplyBody(const char *, size_t len);
     void completeForwarding() override;
@@ -2303,6 +2304,15 @@ ftpReadQuit(Ftp::Gateway * ftpState)
     ftpState->serverComplete();
 }
 
+/// absolute request URI path after successful decoding of all pct-encoding sequences
+std::optional<SBuf>
+Ftp::Gateway::decodedRequestUriPath() const
+{
+    return AnyP::Uri::Decode(request->url.absolutePath());
+}
+
+/// \prec !ftpState->flags.try_slash_hack
+/// \prec ftpState->decodedRequestUriPath()
 static void
 ftpTrySlashHack(Ftp::Gateway * ftpState)
 {
@@ -2315,8 +2325,9 @@ ftpTrySlashHack(Ftp::Gateway * ftpState)
         wordlistDestroy(&ftpState->pathcomps);
 
     /* Build the new path */
+    // XXX: Conversion to c-string effectively truncates where %00 was decoded
     safe_free(ftpState->filepath);
-    ftpState->filepath = SBufToCstring(AnyP::Uri::Decode(ftpState->request->url.absolutePath()));
+    ftpState->filepath = SBufToCstring(ftpState->decodedRequestUriPath().value());
 
     /* And off we go */
     ftpGetFile(ftpState);
@@ -2371,13 +2382,15 @@ ftpFail(Ftp::Gateway *ftpState)
            " reply code " << code << "flags(" <<
            (ftpState->flags.isdir?"IS_DIR,":"") <<
            (ftpState->flags.try_slash_hack?"TRY_SLASH_HACK":"") << "), " <<
+           "decodable_filepath=" << bool(ftpState->decodedRequestUriPath()) << ' ' <<
            "mdtm=" << ftpState->mdtm << ", size=" << ftpState->theSize <<
            "slashhack=" << (slashHack? "T":"F"));
 
     /* Try the / hack to support "Netscape" FTP URL's for retrieving files */
     if (!ftpState->flags.isdir &&   /* Not a directory */
             !ftpState->flags.try_slash_hack && !slashHack && /* Not doing slash hack */
-            ftpState->mdtm <= 0 && ftpState->theSize < 0) { /* Not known as a file */
+            ftpState->mdtm <= 0 && ftpState->theSize < 0 && /* Not known as a file */
+            ftpState->decodedRequestUriPath()) {
 
         switch (ftpState->state) {
 
diff --git a/src/tests/testURL.cc b/src/tests/testURL.cc
index b7609df5c4..b53abe12e7 100644
--- a/src/tests/testURL.cc
+++ b/src/tests/testURL.cc
@@ -104,7 +104,9 @@ TestUri::testEncoding()
     };
 
     for (const auto &testCase: basicTestCases) {
-        CPPUNIT_ASSERT_EQUAL(testCase.first, AnyP::Uri::Decode(testCase.second));
+        const auto decoded = AnyP::Uri::Decode(testCase.second);
+        CPPUNIT_ASSERT(decoded);
+        CPPUNIT_ASSERT_EQUAL(testCase.first, *decoded);
         CPPUNIT_ASSERT_EQUAL(testCase.second, AnyP::Uri::Encode(testCase.first, CharacterSet::RFC3986_UNRESERVED()));
     };
 
@@ -112,6 +114,7 @@ TestUri::testEncoding()
         SBuf("%"),
         SBuf("%%"),
         SBuf("%%%"),
+        SBuf("%0"),
         SBuf("%1"),
         SBuf("%1Z"),
         SBuf("%1\000", 2),
@@ -122,10 +125,11 @@ TestUri::testEncoding()
 
     for (const auto &invalidEncoding: invalidEncodings) {
         // test various input positions of an invalid escape sequence
-        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(invalidEncoding), TextException);
-        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf("word", invalidEncoding)), TextException);
-        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf(invalidEncoding, "word")), TextException);
-        CPPUNIT_ASSERT_THROW(AnyP::Uri::Decode(ToSBuf("word", invalidEncoding, "word")), TextException);
+        CPPUNIT_ASSERT(!AnyP::Uri::Decode(invalidEncoding));
+        CPPUNIT_ASSERT(!AnyP::Uri::Decode(ToSBuf("word", invalidEncoding)));
+        CPPUNIT_ASSERT(!AnyP::Uri::Decode(ToSBuf(invalidEncoding, "word")));
+        CPPUNIT_ASSERT(!AnyP::Uri::Decode(ToSBuf("word", invalidEncoding, "word")));
+        CPPUNIT_ASSERT_EQUAL(invalidEncoding, AnyP::Uri::DecodeOrDupe(invalidEncoding));
     };
 }
 
-- 
2.47.3