From: Alex Rousskov <rousskov@measurement-factory.com>
Date: Sun, 7 Jul 2024 03:03:00 +0000 (+0000)
Subject: Fix Tokenizer::int64() parsing of "0" when guessing base (#1842)
X-Git-Tag: SQUID_7_0_1~95
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8cc5e88c7fb0d73ce413d1871641b93403d1ea1a;p=thirdparty%2Fsquid.git

Fix Tokenizer::int64() parsing of "0" when guessing base (#1842)

Known bug victims in current code were tcp_outgoing_mark,
mark_client_packet, clientside_mark, and mark_client_connection
directives as well as client_connection_mark and (deprecated)
clientside_mark ACLs if they were configured to match a zero mark using
"0" or "0/..." syntax:

    ERROR: configuration failure: NfMarkConfig: invalid value '0/10'...
    exception location: NfMarkConfig.cc(23) getNfmark

Probably broken since 2014 commit 01f2137d.
---

diff --git a/src/parser/Tokenizer.cc b/src/parser/Tokenizer.cc
index 51654bae90..6544fb4482 100644
--- a/src/parser/Tokenizer.cc
+++ b/src/parser/Tokenizer.cc
@@ -264,7 +264,6 @@ Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf:
     if (base == 0) {
         if ( *s == '0') {
             base = 8;
-            ++s;
         } else {
             base = 10;
         }
diff --git a/src/tests/testTokenizer.cc b/src/tests/testTokenizer.cc
index e69fec4f95..28532bce0b 100644
--- a/src/tests/testTokenizer.cc
+++ b/src/tests/testTokenizer.cc
@@ -248,6 +248,285 @@ TestTokenizer::testTokenizerInt64()
         CPPUNIT_ASSERT(t.buf().isEmpty());
     }
 
+    // When interpreting octal numbers, standard strtol() and Tokenizer::int64()
+    // treat the leading zero as a part of the digit sequence rather than as a
+    // character used _exclusively_ as a base indicator. Thus, an explicit
+    // octal base cannot produce an invalid octal number: parsing succeeds and
+    // stops right before the first invalid character after the base. This
+    // treatment also makes it difficult to define "shortest valid octal input".
+    // Here, we are just enumerating interesting "short input" octal cases in
+    // four dimensions:
+    // 1. int64(base) argument: forced or auto-detected;
+    // 2. base character ("0") in input: absent or present;
+    // 3. post-base digits in input: absent, valid, or invalid;
+    // 4. input length limits via int64(length) argument: unlimited or limited.
+
+    // forced base; input: no base, no post-base digits, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf(""));
+        CPPUNIT_ASSERT(!t.int64(rv, 8));
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // forced base; input: no base, no post-base digits, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("7"));
+        CPPUNIT_ASSERT(!t.int64(rv, 8, false, 0));
+        CPPUNIT_ASSERT_EQUAL(SBuf("7"), t.buf());
+    }
+
+    // forced base; input: no base, one valid post-base digit, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("4"));
+        const int64_t benchmark = 04;
+        CPPUNIT_ASSERT(t.int64(rv, 8));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // forced base; input: no base, one valid post-base digit, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("46"));
+        const int64_t benchmark = 04;
+        CPPUNIT_ASSERT(t.int64(rv, 8, false, 1));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("6"), t.buf());
+    }
+
+    // forced base; input: no base, one invalid post-base digit, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("8"));
+        CPPUNIT_ASSERT(!t.int64(rv, 8));
+        CPPUNIT_ASSERT_EQUAL(SBuf("8"), t.buf());
+    }
+
+    // forced base; input: no base, one invalid post-base digit, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("80"));
+        CPPUNIT_ASSERT(!t.int64(rv, 8, false, 1));
+        CPPUNIT_ASSERT_EQUAL(SBuf("80"), t.buf());
+    }
+
+    // repeat the above six octal cases, but now with base character in input
+
+    // forced base; input: base, no post-base digits, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0"));
+        const int64_t benchmark = 0;
+        CPPUNIT_ASSERT(t.int64(rv, 8));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // forced base; input: base, no post-base digits, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("07"));
+        const int64_t benchmark = 0;
+        CPPUNIT_ASSERT(t.int64(rv, 8, false, 1));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("7"), t.buf());
+    }
+
+    // forced base; input: base, one valid post-base digit, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("04"));
+        const int64_t benchmark = 04;
+        CPPUNIT_ASSERT(t.int64(rv, 8));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // forced base; input: base, one valid post-base digit, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("046"));
+        const int64_t benchmark = 04;
+        CPPUNIT_ASSERT(t.int64(rv, 8, false, 2));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("6"), t.buf());
+    }
+
+    // forced base; input: base, one invalid post-base digit, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("08"));
+        const int64_t benchmark = 00;
+        CPPUNIT_ASSERT(t.int64(rv, 8));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("8"), t.buf());
+    }
+
+    // forced base; input: base, one invalid post-base digit, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("08"));
+        const int64_t benchmark = 00;
+        CPPUNIT_ASSERT(t.int64(rv, 8, false, 2));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("8"), t.buf());
+    }
+
+    // And now repeat six "with base character in input" octal cases but with
+    // auto-detected base. When octal cases below say "auto-detected base", they
+    // describe int64() base=0 parameter value. Current int64() implementation
+    // does auto-detect base as octal in all of these cases, but that might
+    // change, and some of these cases (e.g., "0") can also be viewed as
+    // non-octal input cases. These cases do not attempt to test base
+    // detection. They focus on other potential problems.
+
+    // auto-detected base; input: base, no post-base digits, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0"));
+        const int64_t benchmark = 00;
+        CPPUNIT_ASSERT(t.int64(rv, 0));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // auto-detected base; input: base, no post-base digits, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("07"));
+        const int64_t benchmark = 0;
+        CPPUNIT_ASSERT(t.int64(rv, 0, false, 1));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("7"), t.buf());
+    }
+
+    // auto-detected base; input: base, one valid post-base digit, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("04"));
+        const int64_t benchmark = 04;
+        CPPUNIT_ASSERT(t.int64(rv, 0));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // auto-detected base; input: base, one valid post-base digit, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("046"));
+        const int64_t benchmark = 04;
+        CPPUNIT_ASSERT(t.int64(rv, 0, false, 2));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("6"), t.buf());
+    }
+
+    // auto-detected base; input: base, one invalid post-base digit, unlimited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("08"));
+        const int64_t benchmark = 00;
+        CPPUNIT_ASSERT(t.int64(rv, 0));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("8"), t.buf());
+    }
+
+    // auto-detected base; input: base, one invalid post-base digit, limited
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("08"));
+        const int64_t benchmark = 00;
+        CPPUNIT_ASSERT(t.int64(rv, 0, false, 2));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("8"), t.buf());
+    }
+
+    // this ends four-dimensional enumeration of octal cases described earlier
+
+    // check octal base auto-detection
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0128"));
+        const int64_t benchmark = 012;
+        CPPUNIT_ASSERT(t.int64(rv, 0));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("8"), t.buf());
+    }
+
+    // check that octal base auto-detection is not confused by repeated zeros
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("00000000071"));
+        const int64_t benchmark = 00000000071;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // check that forced octal base is not confused by hex prefix
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0x5"));
+        const int64_t benchmark = 0;
+        CPPUNIT_ASSERT(t.int64(rv, 8));
+        CPPUNIT_ASSERT_EQUAL(benchmark, rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf("x5"), t.buf());
+    }
+
+    // autodetect decimal base in shortest valid input
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("1"));
+        const int64_t benchmark = 1;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT(t.buf().isEmpty());
+    }
+
+    // autodetect hex base in shortest valid input
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0X1"));
+        const int64_t benchmark = 0X1;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT(t.buf().isEmpty());
+    }
+
+    // invalid (when autodetecting base) input matching hex base
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0x"));
+        CPPUNIT_ASSERT(!t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(SBuf("0x"), t.buf());
+    }
+
+    // invalid (when forcing hex base) input matching hex base
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0x"));
+        CPPUNIT_ASSERT(!t.int64(rv, 16));
+        CPPUNIT_ASSERT_EQUAL(SBuf("0x"), t.buf());
+    }
+
+    // invalid (when autodetecting base and limiting) input matching hex base
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0x2"));
+        CPPUNIT_ASSERT(!t.int64(rv, 0, true, 2));
+        CPPUNIT_ASSERT_EQUAL(SBuf("0x2"), t.buf());
+    }
+
+    // invalid (when forcing hex base and limiting) input matching hex base
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("0x3"));
+        CPPUNIT_ASSERT(!t.int64(rv, 16, false, 2));
+        CPPUNIT_ASSERT_EQUAL(SBuf("0x3"), t.buf());
+    }
+
     // API mismatch: don't eat leading space
     {
         int64_t rv;
@@ -264,6 +543,36 @@ TestTokenizer::testTokenizerInt64()
         CPPUNIT_ASSERT_EQUAL(SBuf("  1234"), t.buf());
     }
 
+    // zero corner case: repeated zeros
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("00"));
+        const int64_t benchmark = 00;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // zero corner case: "positive" zero
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("+0"));
+        const int64_t benchmark = +0;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
+    // zero corner case: "negative" zero
+    {
+        int64_t rv;
+        Parser::Tokenizer t(SBuf("-0"));
+        const int64_t benchmark = -0;
+        CPPUNIT_ASSERT(t.int64(rv));
+        CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+        CPPUNIT_ASSERT_EQUAL(SBuf(""), t.buf());
+    }
+
     // trailing spaces
     {
         int64_t rv;