From: Alex Rousskov <rousskov@measurement-factory.com>
Date: Wed, 12 Aug 2015 22:18:22 +0000 (-0600)
Subject: New SBuf and Tokenizer methods to simplify suffix parsing and skipping
X-Git-Tag: SQUID_4_0_1~134
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c571034be79334a2f09b81b8e1723a1886b39700;p=thirdparty%2Fsquid.git

New SBuf and Tokenizer methods to simplify suffix parsing and skipping
(and to make suffix/reverse APIs more similar to prefix/forward ones).

Also reluctantly changed Tokenizer to update parsedSize() when parsing
suffixes, per reviewer request.
---

diff --git a/src/SBuf.cc b/src/SBuf.cc
index fbaffcbaa4..79a04adba1 100644
--- a/src/SBuf.cc
+++ b/src/SBuf.cc
@@ -802,6 +802,48 @@ SBuf::findFirstNotOf(const CharacterSet &set, size_type startPos) const
     return npos;
 }
 
+SBuf::size_type
+SBuf::findLastOf(const CharacterSet &set, size_type endPos) const
+{
+    ++stats.find;
+    // an empty buffer cannot contain a match
+    if (isEmpty())
+        return npos;
+    // clamp endPos (npos or out of range) to the offset of the last byte
+    if (endPos == npos || endPos >= length())
+        endPos = length() - 1;
+    debugs(24, 7, "last of characterset " << set.name << " in id " << id);
+    const char *start = buf();
+    // count down by offset: moving a pointer below start would be undefined
+    for (size_type pos = endPos + 1; pos > 0; --pos) {
+        if (set[start[pos - 1]])
+            return pos - 1;
+    }
+    debugs(24, 7, "not found");
+    return npos;
+}
+
+SBuf::size_type
+SBuf::findLastNotOf(const CharacterSet &set, size_type endPos) const
+{
+    ++stats.find;
+    // an empty buffer has no character outside the set
+    if (isEmpty())
+        return npos;
+    // clamp endPos (npos or out of range) to the offset of the last byte
+    if (endPos == npos || endPos >= length())
+        endPos = length() - 1;
+    debugs(24, 7, "last not of characterset " << set.name << " in id " << id);
+    const char *start = buf();
+    // count down by offset: moving a pointer below start would be undefined
+    for (size_type pos = endPos + 1; pos > 0; --pos) {
+        if (!set[start[pos - 1]])
+            return pos - 1;
+    }
+    debugs(24, 7, "not found");
+    return npos;
+}
+
 /*
  * TODO: borrow a sscanf implementation from Linux or similar?
  * we'd really need a vsnscanf(3)... ? As an alternative, a
diff --git a/src/SBuf.h b/src/SBuf.h
index 31c51cdaf0..dcbe587720 100644
--- a/src/SBuf.h
+++ b/src/SBuf.h
@@ -596,6 +596,16 @@ public:
      */
     size_type findFirstOf(const CharacterSet &set, size_type startPos = 0) const;
 
+    /** Find last occurrence of character of set in SBuf
+     *
+     * Finds the last occurrence of ANY of the characters in the supplied set in
+     * the SBuf, searching backwards from endPos.
+     * \return offset of the match, or npos if no character in the set was found
+     * \param endPos if specified, ignore any occurrences after that position;
+     *   if npos or beyond the last byte, the entire SBuf is searched
+     */
+    size_type findLastOf(const CharacterSet &set, size_type endPos = npos) const;
+
     /** Find first occurrence character NOT in character set
      *
      * \return npos if all characters in the SBuf are from set
@@ -606,6 +616,14 @@ public:
      */
     size_type findFirstNotOf(const CharacterSet &set, size_type startPos = 0) const;
 
+    /** Find last occurrence of a character NOT in the character set
+     *
+     * \return offset of the match, or npos if all searched characters are from set
+     * \param endPos if specified, ignore any occurrences after that position;
+     *   if npos or beyond the last byte, the entire SBuf is searched
+     */
+    size_type findLastNotOf(const CharacterSet &set, size_type endPos = npos) const;
+
     /** sscanf-alike
      *
      * sscanf re-implementation. Non-const, and not \0-clean.
diff --git a/src/parser/Tokenizer.cc b/src/parser/Tokenizer.cc
index 68fcb7cafc..825aecc7a8 100644
--- a/src/parser/Tokenizer.cc
+++ b/src/parser/Tokenizer.cc
@@ -35,6 +35,28 @@ Parser::Tokenizer::success(const SBuf::size_type n)
     return consume(n).length();
 }
 
+/// convenience method: consumes up to n last bytes and returns them
+SBuf
+Parser::Tokenizer::consumeTrailing(const SBuf::size_type n)
+{
+    debugs(24, 5, "consuming " << n << " bytes");
+    // Clamp n (npos or larger than the buffer) to buf_.length() so that the
+    // subtraction below cannot wrap and parsed_ grows only by bytes taken.
+    const SBuf::size_type len = buf_.length();
+    const SBuf::size_type parsed = (n == SBuf::npos || n > len) ? len : n;
+    SBuf result = buf_; // the kept leading bytes go back to buf_ below
+    buf_ = result.consume(len - parsed);
+    parsed_ += parsed;
+    return result;
+}
+
+/// convenience method: consumeTrailing(n) that returns the consumed byte count
+SBuf::size_type
+Parser::Tokenizer::successTrailing(const SBuf::size_type n)
+{
+    return consumeTrailing(n).length();
+}
+
 bool
 Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
 {
@@ -90,8 +112,7 @@ Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, c
     }
     if (!found)
         return false;
-    returnedToken = buf_;
-    buf_ = returnedToken.consume(buf_.length() - found);
+    returnedToken = consumeTrailing(found);
     return true;
 }
 
@@ -129,8 +150,8 @@ Parser::Tokenizer::skipSuffix(const SBuf &tokenToSkip)
         offset = buf_.length() - tokenToSkip.length();
 
     if (buf_.substr(offset, SBuf::npos).cmp(tokenToSkip) == 0) {
-        buf_ = buf_.substr(0,offset);
-        return true;
+        debugs(24, 8, "skipping " << tokenToSkip.length());
+        return successTrailing(tokenToSkip.length());
     }
     return false;
 }
@@ -157,6 +178,32 @@ Parser::Tokenizer::skip(const char tokenChar)
     return false;
 }
 
+bool
+Parser::Tokenizer::skipOneTrailing(const CharacterSet &skippable)
+{
+    if (buf_.isEmpty() || !skippable[buf_[buf_.length()-1]]) {
+        debugs(24, 8, "no match while skipping one-of " << skippable.name);
+        return false; // empty buffer or its last byte is not in skippable
+    }
+    debugs(24, 8, "skipping one-of " << skippable.name);
+    return successTrailing(1); // a nonzero consumed count converts to true
+}
+
+SBuf::size_type
+Parser::Tokenizer::skipAllTrailing(const CharacterSet &skippable)
+{
+    // offset of the last byte to keep; npos when every byte is skippable
+    const SBuf::size_type lastKept = buf_.findLastNotOf(skippable);
+    const SBuf::size_type keepLen = (lastKept == SBuf::npos) ? 0 : lastKept + 1;
+    const SBuf::size_type skipLen = buf_.length() - keepLen;
+    if (skipLen != 0) {
+        debugs(24, 8, "skipping in " << skippable.name << " len " << skipLen);
+        return successTrailing(skipLen);
+    }
+    debugs(24, 8, "no match when trying to skip " << skippable.name);
+    return 0;
+}
+
 /* reworked from compat/strtoll.c */
 bool
 Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf::size_type limit)
diff --git a/src/parser/Tokenizer.h b/src/parser/Tokenizer.h
index 47d4a7515a..762d6bdff5 100644
--- a/src/parser/Tokenizer.h
+++ b/src/parser/Tokenizer.h
@@ -115,6 +115,18 @@ public:
      */
     SBuf::size_type skipAll(const CharacterSet &discardables);
 
+    /** Removes the last character of the buffer if it belongs to the set.
+     *
+     * \return whether a character was removed (false for an empty buffer)
+     */
+    bool skipOneTrailing(const CharacterSet &discardables);
+
+    /** Removes all consecutive trailing characters that belong to the set.
+     *
+     * \returns the number of characters removed (zero if none matched)
+     */
+    SBuf::size_type skipAllTrailing(const CharacterSet &discardables);
+
     /** Extracts an unsigned int64_t at the beginning of the buffer.
      *
      * strtoll(3)-alike function: tries to parse unsigned 64-bit integer
@@ -134,6 +146,8 @@ public:
 protected:
     SBuf consume(const SBuf::size_type n);
     SBuf::size_type success(const SBuf::size_type n);
+    SBuf consumeTrailing(const SBuf::size_type n); ///< consumes up to n last bytes and returns them
+    SBuf::size_type successTrailing(const SBuf::size_type n); ///< consumes up to n last bytes and returns their count
 
     /// reset the buffer and parsed stats to a saved checkpoint
     void undoParse(const SBuf &newBuf, SBuf::size_type cParsed) { buf_ = newBuf; parsed_ = cParsed; }
diff --git a/src/tests/testTokenizer.cc b/src/tests/testTokenizer.cc
index 682f0d01f8..4e526f9882 100644
--- a/src/tests/testTokenizer.cc
+++ b/src/tests/testTokenizer.cc
@@ -128,6 +128,66 @@ testTokenizer::testTokenizerToken()
 
 }
 
+void
+testTokenizer::testTokenizerSuffix()
+{
+    const SBuf canary("This text should not be changed.");
+    // Checks suffix(), skipSuffix(), and skipOneTrailing() by parsing text from its end.
+    Parser::Tokenizer t(text);
+    SBuf s;
+
+    CharacterSet all(whitespace);
+    all += alpha;
+    all += crlf;
+    all += numbers;
+    all.add(':').add('.').add('/');
+
+    // an empty suffix should return false (the full output buffer case)
+    s = canary;
+    const SBuf before = t.remaining();
+    CPPUNIT_ASSERT(!t.suffix(s, all, 0));
+    // ... and a false return value means no parameter changes
+    CPPUNIT_ASSERT_EQUAL(canary, s);
+    // ... and a false return value means no input buffer changes
+    CPPUNIT_ASSERT_EQUAL(before, t.remaining());
+
+    // strip trailing bytes one at a time until no character from crlf remains
+    SBuf::size_type remaining = t.remaining().length();
+    while (t.remaining().findLastOf(crlf) != SBuf::npos) {
+        CPPUNIT_ASSERT(t.remaining().length() > 0);
+        CPPUNIT_ASSERT(t.skipOneTrailing(all));
+        // ensure steady progress: exactly one byte removed per iteration
+        CPPUNIT_ASSERT_EQUAL(remaining, t.remaining().length() + 1);
+        --remaining;
+    }
+
+    // no match (last char is not in the suffix set)
+    CPPUNIT_ASSERT(!t.suffix(s, crlf));
+    CPPUNIT_ASSERT(!t.suffix(s, whitespace));
+
+    // successful suffix tokenization
+    CPPUNIT_ASSERT(t.suffix(s, numbers));
+    CPPUNIT_ASSERT_EQUAL(SBuf("1"), s);
+    CPPUNIT_ASSERT(t.skipSuffix(SBuf("1.")));
+    CPPUNIT_ASSERT(t.skipSuffix(SBuf("/")));
+    CPPUNIT_ASSERT(t.suffix(s, alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("HTTP"), s);
+    CPPUNIT_ASSERT(t.suffix(s, whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf(" "), s);
+
+    // match until the end of the sample (everything left is in the all set)
+    CPPUNIT_ASSERT(t.suffix(s, all));
+    CPPUNIT_ASSERT_EQUAL(SBuf(), t.remaining());
+
+    // an empty buffer does not end with a token
+    s = canary;
+    CPPUNIT_ASSERT(!t.suffix(s, all));
+    CPPUNIT_ASSERT_EQUAL(canary, s); // no parameter changes
+
+    // we cannot skip an empty suffix, even in an empty buffer
+    CPPUNIT_ASSERT(!t.skipSuffix(SBuf()));
+}
+
 void
 testTokenizer::testCharacterSet()
 {
diff --git a/src/tests/testTokenizer.h b/src/tests/testTokenizer.h
index b168f3456c..447ee414aa 100644
--- a/src/tests/testTokenizer.h
+++ b/src/tests/testTokenizer.h
@@ -16,6 +16,7 @@ class testTokenizer : public CPPUNIT_NS::TestFixture
     CPPUNIT_TEST_SUITE( testTokenizer );
     CPPUNIT_TEST ( testCharacterSet );
     CPPUNIT_TEST ( testTokenizerPrefix );
+    CPPUNIT_TEST ( testTokenizerSuffix );
     CPPUNIT_TEST ( testTokenizerSkip );
     CPPUNIT_TEST ( testTokenizerToken );
     CPPUNIT_TEST ( testTokenizerInt64 );
@@ -23,6 +24,7 @@ class testTokenizer : public CPPUNIT_NS::TestFixture
 
 protected:
     void testTokenizerPrefix();
+    void testTokenizerSuffix();
     void testTokenizerSkip();
     void testTokenizerToken();
     void testCharacterSet();