New SBuf and Tokenizer methods to simplify suffix parsing and skipping

author Alex Rousskov <rousskov@measurement-factory.com>

Wed, 12 Aug 2015 22:18:22 +0000 (16:18 -0600)

committer Alex Rousskov <rousskov@measurement-factory.com>

Wed, 12 Aug 2015 22:18:22 +0000 (16:18 -0600)
author Alex Rousskov <rousskov@measurement-factory.com>
Wed, 12 Aug 2015 22:18:22 +0000 (16:18 -0600)
committer Alex Rousskov <rousskov@measurement-factory.com>
Wed, 12 Aug 2015 22:18:22 +0000 (16:18 -0600)
diff --git a/src/SBuf.cc b/src/SBuf.cc

index fbaffcbaa4cf224ba35414b03198c016384a47c0..79a04adba173640c20674316117db3945498bbb8 100644 (file)
--- a/src/SBuf.cc
+++ b/src/SBuf.cc
@@ -802,6 +802,48 @@ SBuf::findFirstNotOf(const CharacterSet &set, size_type startPos) const
      return npos;
  }
  
+SBuf::size_type
+SBuf::findLastOf(const CharacterSet &set, size_type endPos) const
+{
+    ++stats.find;
+
+    if (isEmpty())
+        return npos;
+
+    if (endPos == npos || endPos >= length())
+        endPos = length() - 1;
+
+    debugs(24, 7, "last of characterset " << set.name << " in id " << id);
+    const char *start = buf();
+    for (const char *cur = start + endPos; cur >= start; --cur) {
+        if (set[*cur])
+            return cur - start;
+    }
+    debugs(24, 7, "not found");
+    return npos;
+}
+
+SBuf::size_type
+SBuf::findLastNotOf(const CharacterSet &set, size_type endPos) const
+{
+    ++stats.find;
+
+    if (isEmpty())
+        return npos;
+
+    if (endPos == npos || endPos >= length())
+        endPos = length() - 1;
+
+    debugs(24, 7, "last not of characterset " << set.name << " in id " << id);
+    const char *start = buf();
+    for (const char *cur = start + endPos; cur >= start; --cur) {
+        if (!set[*cur])
+            return cur - start;
+    }
+    debugs(24, 7, "not found");
+    return npos;
+}
+
  /*
   * TODO: borrow a sscanf implementation from Linux or similar?
   * we'd really need a vsnscanf(3)... ? As an alternative, a
diff --git a/src/SBuf.h b/src/SBuf.h

index 31c51cdaf0268dc32493a460063b3d7daec77c3f..dcbe587720adcbe526f602e8a64779239c2f7ac5 100644 (file)
--- a/src/SBuf.h
+++ b/src/SBuf.h
@@ -596,6 +596,16 @@ public:
       */
      size_type findFirstOf(const CharacterSet &set, size_type startPos = 0) const;
  
+    /** Find the last occurrence of a character from set in the SBuf
+     *
+     * Finds the last occurrence of ANY of the characters in the supplied set in
+     * the SBuf.
+     * \return position of that occurrence, or npos if none could be found
+     * \param endPos if specified, ignore any occurrences after that position;
+     *   if npos (or beyond the last position), the entire SBuf is searched
+     */
+    size_type findLastOf(const CharacterSet &set, size_type endPos = npos) const;
+
      /** Find first occurrence character NOT in character set
       *
       * \return npos if all characters in the SBuf are from set
@@ -606,6 +616,14 @@ public:
       */
      size_type findFirstNotOf(const CharacterSet &set, size_type startPos = 0) const;
  
+    /** Find the last occurrence of a character NOT in the character set
+     *
+     * \return position of that occurrence, or npos if all searched characters are from set
+     * \param endPos if specified, ignore any occurrences after that position;
+     *   if npos (or beyond the last position), the entire SBuf is searched
+     */
+    size_type findLastNotOf(const CharacterSet &set, size_type endPos = npos) const;
+
      /** sscanf-alike
       *
       * sscanf re-implementation. Non-const, and not \0-clean.
diff --git a/src/parser/Tokenizer.cc b/src/parser/Tokenizer.cc

index 68fcb7cafc9555ae51210aadc8ac3eaffad0528c..825aecc7a87e966609db578142fce4fbeb505818 100644 (file)
--- a/src/parser/Tokenizer.cc
+++ b/src/parser/Tokenizer.cc
@@ -35,6 +35,28 @@ Parser::Tokenizer::success(const SBuf::size_type n)
      return consume(n).length();
  }
  
+/// convenience method: consumes up to n last bytes and returns them;
+/// n that is npos or exceeds the buffered length consumes the whole buffer
+SBuf
+Parser::Tokenizer::consumeTrailing(const SBuf::size_type n)
+{
+    debugs(24, 5, "consuming " << n << " bytes");
+    // clamp n: keeps the subtraction below from wrapping and parsed_ accurate
+    const SBuf::size_type available = buf_.length();
+    const SBuf::size_type parsed = (n == SBuf::npos || n > available) ? available : n;
+    SBuf result = buf_;
+    buf_ = result.consume(available - parsed); // leading bytes go back to buf_
+    parsed_ += parsed;
+    return result;
+}
+
+/// convenience method: consumes up to n last bytes via consumeTrailing() and returns their count
+SBuf::size_type
+Parser::Tokenizer::successTrailing(const SBuf::size_type n)
+{
+    return consumeTrailing(n).length();
+}
+
  bool
  Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
  {
@@ -90,8 +112,7 @@ Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, c
      }
      if (!found)
          return false;
-    returnedToken = buf_;
-    buf_ = returnedToken.consume(buf_.length() - found);
+    returnedToken = consumeTrailing(found);
      return true;
  }
  
@@ -129,8 +150,8 @@ Parser::Tokenizer::skipSuffix(const SBuf &tokenToSkip)
          offset = buf_.length() - tokenToSkip.length();
  
      if (buf_.substr(offset, SBuf::npos).cmp(tokenToSkip) == 0) {
-        buf_ = buf_.substr(0,offset);
-        return true;
+        debugs(24, 8, "skipping " << tokenToSkip.length());
+        return successTrailing(tokenToSkip.length());
      }
      return false;
  }
@@ -157,6 +178,32 @@ Parser::Tokenizer::skip(const char tokenChar)
      return false;
  }
  
+/// Drops the last buffered byte if (and only if) it belongs to skippable.
+/// An empty buffer never matches.
+bool
+Parser::Tokenizer::skipOneTrailing(const CharacterSet &skippable)
+{
+    const bool matched = !buf_.isEmpty() && skippable[buf_[buf_.length()-1]];
+    debugs(24, 8, (matched ? "skipping one-of " : "no match while skipping one-of ") << skippable.name);
+    // successTrailing() yields the removed byte count; nonzero means true
+    return matched && successTrailing(1);
+}
+
+SBuf::size_type
+Parser::Tokenizer::skipAllTrailing(const CharacterSet &skippable)
+{
+    const SBuf::size_type prefixEnd = buf_.findLastNotOf(skippable);
+    const SBuf::size_type prefixLen = prefixEnd == SBuf::npos ?
+        0 : (prefixEnd + 1);
+    const SBuf::size_type suffixLen = buf_.length() - prefixLen;
+    if (suffixLen == 0) {
+        debugs(24, 8, "no match when trying to skip " << skippable.name);
+        return 0;
+    }
+    debugs(24, 8, "skipping in " << skippable.name << " len " << suffixLen);
+    return successTrailing(suffixLen);
+}
+
  /* reworked from compat/strtoll.c */
  bool
  Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf::size_type limit)
diff --git a/src/parser/Tokenizer.h b/src/parser/Tokenizer.h

index 47d4a7515aafcdd57e53c4a1bf2bab8f8158e315..762d6bdff57e24b23bcba93a6f357517d63af8d8 100644 (file)
--- a/src/parser/Tokenizer.h
+++ b/src/parser/Tokenizer.h
@@ -115,6 +115,18 @@ public:
       */
      SBuf::size_type skipAll(const CharacterSet &discardables);
  
+    /** Removes a single trailing character if it belongs to the set.
+     *
+     * \return whether a character was removed
+     */
+    bool skipOneTrailing(const CharacterSet &discardables);
+
+    /** Removes the longest run of trailing characters that all belong to the set, in any order.
+     *
+     * \returns the number of characters removed (zero if none)
+     */
+    SBuf::size_type skipAllTrailing(const CharacterSet &discardables);
+
      /** Extracts an unsigned int64_t at the beginning of the buffer.
       *
       * strtoll(3)-alike function: tries to parse unsigned 64-bit integer
@@ -134,6 +146,8 @@ public:
  protected:
      SBuf consume(const SBuf::size_type n);
      SBuf::size_type success(const SBuf::size_type n);
+    SBuf consumeTrailing(const SBuf::size_type n); ///< removes up to n last bytes and returns them
+    SBuf::size_type successTrailing(const SBuf::size_type n); ///< removes up to n last bytes and returns their count
  
      /// reset the buffer and parsed stats to a saved checkpoint
      void undoParse(const SBuf &newBuf, SBuf::size_type cParsed) { buf_ = newBuf; parsed_ = cParsed; }
diff --git a/src/tests/testTokenizer.cc b/src/tests/testTokenizer.cc

index 682f0d01f89c6261659c6ed8eb84eb67019377ba..4e526f98820dd2345fd84104a8890abff1c82824 100644 (file)
--- a/src/tests/testTokenizer.cc
+++ b/src/tests/testTokenizer.cc
@@ -128,6 +128,66 @@ testTokenizer::testTokenizerToken()
  
  }
  
+/// Exercises Tokenizer suffix(), skipSuffix(), and skipOneTrailing() on text.
+void
+testTokenizer::testTokenizerSuffix()
+{
+    const SBuf canary("This text should not be changed.");
+    // NOTE(review): text, whitespace, alpha, crlf, and numbers are defined outside this hunk
+    Parser::Tokenizer t(text);
+    SBuf s;
+    CharacterSet all(whitespace);
+    all += alpha;
+    all += crlf;
+    all += numbers;
+    all.add(':').add('.').add('/');
+
+    // an empty suffix should return false (the full output buffer case)
+    s = canary;
+    const SBuf before = t.remaining();
+    CPPUNIT_ASSERT(!t.suffix(s, all, 0));
+    // ... and a false return value means no parameter changes
+    CPPUNIT_ASSERT_EQUAL(canary, s);
+    // ... and a false return value means no input buffer changes
+    CPPUNIT_ASSERT_EQUAL(before, t.remaining());
+
+    // consume suffix until the last CRLF, including that last CRLF
+    SBuf::size_type remaining = t.remaining().length();
+    while (t.remaining().findLastOf(crlf) != SBuf::npos) {
+        CPPUNIT_ASSERT(t.remaining().length() > 0);
+        CPPUNIT_ASSERT(t.skipOneTrailing(all));
+        // ensure steady progress: exactly one byte removed per iteration
+        CPPUNIT_ASSERT_EQUAL(remaining, t.remaining().length() + 1);
+        --remaining;
+    }
+
+    // no match (last char is not in the suffix set)
+    CPPUNIT_ASSERT(!t.suffix(s, crlf));
+    CPPUNIT_ASSERT(!t.suffix(s, whitespace));
+
+    // successful suffix tokenization
+    CPPUNIT_ASSERT(t.suffix(s, numbers));
+    CPPUNIT_ASSERT_EQUAL(SBuf("1"), s);
+    CPPUNIT_ASSERT(t.skipSuffix(SBuf("1.")));
+    CPPUNIT_ASSERT(t.skipSuffix(SBuf("/")));
+    CPPUNIT_ASSERT(t.suffix(s, alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("HTTP"), s);
+    CPPUNIT_ASSERT(t.suffix(s, whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf(" "), s);
+
+    // match until the end of the sample
+    CPPUNIT_ASSERT(t.suffix(s, all));
+    CPPUNIT_ASSERT_EQUAL(SBuf(), t.remaining());
+
+    // an empty buffer does not end with a token
+    s = canary;
+    CPPUNIT_ASSERT(!t.suffix(s, all));
+    CPPUNIT_ASSERT_EQUAL(canary, s); // no parameter changes
+
+    // we cannot skip an empty suffix, even in an empty buffer
+    CPPUNIT_ASSERT(!t.skipSuffix(SBuf()));
+}
+
  void
  testTokenizer::testCharacterSet()
  {
diff --git a/src/tests/testTokenizer.h b/src/tests/testTokenizer.h

index b168f3456c75086e10ae83f830e032b5aecf3752..447ee414aa7cc01ca64775f275f427964747f404 100644 (file)
--- a/src/tests/testTokenizer.h
+++ b/src/tests/testTokenizer.h
@@ -16,6 +16,7 @@ class testTokenizer : public CPPUNIT_NS::TestFixture
      CPPUNIT_TEST_SUITE( testTokenizer );
      CPPUNIT_TEST ( testCharacterSet );
      CPPUNIT_TEST ( testTokenizerPrefix );
+    CPPUNIT_TEST ( testTokenizerSuffix );
      CPPUNIT_TEST ( testTokenizerSkip );
      CPPUNIT_TEST ( testTokenizerToken );
      CPPUNIT_TEST ( testTokenizerInt64 );
@@ -23,6 +24,7 @@ class testTokenizer : public CPPUNIT_NS::TestFixture
  
  protected:
      void testTokenizerPrefix();
+    void testTokenizerSuffix();
      void testTokenizerSkip();
      void testTokenizerToken();
      void testCharacterSet();
author	Alex Rousskov <rousskov@measurement-factory.com>
	Wed, 12 Aug 2015 22:18:22 +0000 (16:18 -0600)
committer	Alex Rousskov <rousskov@measurement-factory.com>
	Wed, 12 Aug 2015 22:18:22 +0000 (16:18 -0600)
src/SBuf.cc		patch \| blob \| blame \| history
src/SBuf.h		patch \| blob \| blame \| history
src/parser/Tokenizer.cc		patch \| blob \| blame \| history
src/parser/Tokenizer.h		patch \| blob \| blame \| history
src/tests/testTokenizer.cc		patch \| blob \| blame \| history
src/tests/testTokenizer.h		patch \| blob \| blame \| history