Tokenizer.cc
SBUF_SOURCE= \
+ $(top_srcdir)/src/base/CharacterSet.h \
$(top_srcdir)/src/SBuf.h \
$(top_srcdir)/src/SBuf.cc \
$(top_srcdir)/src/MemBlob.h \
testTokenizer.cc \
Tokenizer.h
nodist_testTokenizer_SOURCES = \
- $(top_srcdir)/src/base/CharacterSet.h \
$(top_srcdir)/src/tests/testMain.cc \
$(top_srcdir)/src/tests/stub_mem.cc \
$(top_srcdir)/src/tests/stub_debug.cc \
#include "squid.h"
-#include "Tokenizer.h"
-
-namespace Parser {
+#include "parser/Tokenizer.h"
+// strtok(3)-alike: skip any leading delimiters, cut the token at the next
+// delimiter, then skip any trailing delimiters. On failure buf_ is left
+// exactly as it was (restored from a saved copy).
bool
-Tokenizer::token(SBuf &returnedToken, const CharacterSet &whitespace)
+Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
{
- const SBuf::size_type endOfPreWhiteSpace = buf_.findFirstNotOf(whitespace);
- const SBuf::size_type endOfToken = buf_.findFirstOf(whitespace, endOfPreWhiteSpace);
- if (endOfToken == SBuf::npos)
+ SBuf savebuf(buf_); // kept so buf_ can be rolled back if no token is found
+ skip(delimiters);
+ SBuf::size_type tokenLen = buf_.findFirstOf(delimiters); // not found = npos => consume to end
+ if (tokenLen == SBuf::npos && !delimiters['\0']) {
+ // no delimiter found, nor is NUL/EOS/npos acceptable as one
+ buf_ = savebuf;
return false;
- buf_.consume(endOfPreWhiteSpace);
- returnedToken = buf_.consume(endOfToken - endOfPreWhiteSpace);
- skip(whitespace);
+ }
+ SBuf retval = buf_.consume(tokenLen);
+ skip(delimiters);
+ returnedToken = retval;
return true;
}
bool
-Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars)
+Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
{
- SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
+ SBuf::size_type prefixLen = buf_.substr(0,limit).findFirstNotOf(tokenChars);
if (prefixLen == 0)
return false;
returnedToken = buf_.consume(prefixLen);
}
bool
-Tokenizer::skip(const CharacterSet &tokenChars)
+Parser::Tokenizer::skip(const CharacterSet &tokenChars)
{
SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
if (prefixLen == 0)
}
bool
-Tokenizer::skip(const SBuf &tokenToSkip)
+Parser::Tokenizer::skip(const SBuf &tokenToSkip)
{
if (buf_.startsWith(tokenToSkip)) {
buf_.consume(tokenToSkip.length());
}
bool
-Tokenizer::skip(const char tokenChar)
+Parser::Tokenizer::skip(const char tokenChar)
{
if (buf_[0] == tokenChar) {
buf_.consume(1);
}
return false;
}
-} /* namespace Parser */
+
+/* reworked from compat/strtoll.c */
+// Parses an optionally signed integer at the start of buf_ in the given
+// base (0 = autodetect: leading "0x" => 16, leading "0" => 8, else 10),
+// consuming the parsed characters on success. Overflow sets errno to
+// ERANGE and fails without consuming anything.
+bool
+Parser::Tokenizer::int64(int64_t & result, int base)
+{
+ if (buf_.isEmpty())
+ return false;
+
+ //fixme: account for buf_.size()
+ bool neg = false;
+ const char *s = buf_.rawContent();
+ const char *end = buf_.rawContent() + buf_.length();
+
+ // optional sign prefix
+ if (*s == '-') {
+ neg = true;
+ ++s;
+ } else if (*s == '+') {
+ ++s;
+ }
+ if (s >= end) return false;
+ // NOTE(review): when s+1 == end, *(s+1) below reads one byte past the
+ // buffer; the guard looks like it should be (s+1 < end) -- TODO confirm
+ if (( base == 0 || base == 16) && *s == '0' && (s+1 <= end ) &&
+ tolower(*(s+1)) == 'x') {
+ s += 2;
+ base = 16;
+ }
+ if (base == 0) {
+ if ( *s == '0') {
+ base = 8;
+ ++s;
+ } else {
+ base = 10;
+ }
+ }
+ if (s >= end) return false;
+
+ uint64_t cutoff;
+
+ // strtoll(3)-style overflow guard: cutoff/cutlim are the largest
+ // accumulator value / final digit that can be absorbed without
+ // exceeding INT64_MAX (or INT64_MIN when negative)
+ cutoff = neg ? -static_cast<uint64_t>(INT64_MIN) : INT64_MAX;
+ int cutlim = cutoff % static_cast<int64_t>(base);
+ cutoff /= static_cast<uint64_t>(base);
+
+ int any = 0, c;
+ int64_t acc = 0;
+ // NOTE(review): the update expression c = *s++ may read *end (one past
+ // the last byte) just before the s <= end condition fails -- verify
+ // this one-byte over-read is safe for SBuf-backed storage
+ for (c = *s++; s <= end; c = *s++) {
+ if (xisdigit(c)) {
+ c -= '0';
+ } else if (xisalpha(c)) {
+ c -= xisupper(c) ? 'A' - 10 : 'a' - 10;
+ } else {
+ break;
+ }
+ if (c >= base)
+ break;
+ if (any < 0 || static_cast<uint64_t>(acc) > cutoff || (static_cast<uint64_t>(acc) == cutoff && c > cutlim))
+ any = -1; // overflow detected; keep scanning to find the token end
+ else {
+ any = 1;
+ acc *= base;
+ acc += c;
+ }
+ }
+
+ if (any == 0) // nothing was parsed
+ return false;
+ if (any < 0) {
+ acc = neg ? INT64_MIN : INT64_MAX;
+ errno = ERANGE;
+ return false;
+ } else if (neg)
+ acc = -acc;
+
+ result = acc;
+ // -1 because the loop's post-increment left s one past the last
+ // character actually examined
+ buf_.consume(s - buf_.rawContent() -1);
+ return true;
+}
#include "base/CharacterSet.h"
#include "SBuf.h"
+/// Generic protocol-agnostic parsing tools
namespace Parser {
+/**
+ * Lexical processor to tokenize a buffer.
+ *
+ * Allows arbitrary delimiters and token character sets to
+ * be provided by callers.
+ *
+ * All methods start from the beginning of the input buffer.
+ * Methods returning true consume bytes from the buffer.
+ * Methods returning false have no side-effects.
+ */
class Tokenizer {
public:
explicit Tokenizer(const SBuf &inBuf) : buf_(inBuf) {}
- bool atEnd() const { return !buf_.length(); }
+ // return a copy of the current contents of the parse buffer
+ const SBuf buf() const { return buf_; }
+
+ /// whether the end of the buffer has been reached
+ bool atEnd() const { return buf_.isEmpty(); }
+
+ /// the remaining unprocessed section of buffer
const SBuf& remaining() const { return buf_; }
- void reset(const SBuf &newBuf) { buf_ = newBuf; }
- /* The following methods start from the beginning of the input buffer.
- * They return true and consume parsed chars if a non-empty token is found.
- * Otherwise, they return false without any side-effects. */
+ /// reinitialize processing for a new buffer
+ void reset(const SBuf &newBuf) { buf_ = newBuf; }
/** Basic strtok(3):
* Skips all leading delimiters (if any),
- * accumulates all characters up to the first delimiter (a token), and
+ * accumulates all characters up to the next delimiter (a token), and
* skips all trailing delimiters (if any).
- * Want to extract delimiters? Use three prefix() calls instead.
+ *
+ * Want to extract delimiters? Use prefix() instead.
*/
- bool token(SBuf &returnedToken, const CharacterSet &whitespace);
+ bool token(SBuf &returnedToken, const CharacterSet &delimiters);
- /// Accumulates all sequential permitted characters (a token).
- bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars);
+ /** Accumulates all sequential permitted characters up to an optional length limit.
+ *
+ * \retval true one or more characters were found, the sequence (string) is placed in returnedToken
+ * \retval false no characters from the permitted set were found
+ */
+ bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit = SBuf::npos);
- /// Skips all sequential permitted characters (a token).
+ /** skips all sequential characters from the set, in any order
+ *
+ * \return whether one or more characters in the set were found
+ */
bool skip(const CharacterSet &tokenChars);
- /// Skips a given token.
+ /** skips a given character sequence (string)
+ *
+ * \return whether the exact character sequence was found and skipped
+ */
bool skip(const SBuf &tokenToSkip);
- /// Skips a given character (a token).
+ /** skips a given single character
+ *
+ * \return whether the character was found and skipped
+ */
bool skip(const char tokenChar);
+ /** parse a signed int64_t at the beginning of the buffer
+ *
+ * strtoll(3)-alike function: tries to parse a signed 64-bit integer
+ * at the beginning of the parse buffer, in the base specified by the user
+ * or guesstimated; consumes the parsed characters.
+ *
+ * \param result Output value. Not touched if parsing is unsuccessful.
+ * \param base Specify base to do the parsing in, with the same restrictions
+ * as strtoll. Defaults to 0 (meaning guess)
+ *
+ * \return whether the parsing was successful
+ */
+ bool int64(int64_t &result, int base = 0);
+
private:
SBuf buf_; ///< yet unparsed input
};
-
} /* namespace Parser */
+
#endif /* SQUID_PARSER_TOKENIZER_H_ */
#include "squid.h"
-
-#include "testTokenizer.h"
#include "base/CharacterSet.h"
-#include "Tokenizer.h"
+#include "parser/Tokenizer.h"
+#include "testTokenizer.h"
CPPUNIT_TEST_SUITE_REGISTRATION( testTokenizer );
CPPUNIT_ASSERT(t.token(s,whitespace));
CPPUNIT_ASSERT_EQUAL(SBuf("Host:"),s);
- //no separator found
- CPPUNIT_ASSERT(!t.token(s,tab));
}
void
{
}
+
+void
+testTokenizer::testTokenizerInt64()
+{
+ // successful parse in base 10
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("1234"));
+ const int64_t benchmark = 1234;
+ CPPUNIT_ASSERT(t.int64(rv, 10));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ }
+
+ // successful parse, autodetect base (no prefix => decimal)
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("1234"));
+ const int64_t benchmark = 1234;
+ CPPUNIT_ASSERT(t.int64(rv));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ }
+
+ // successful parse, autodetect base (leading 0 => octal)
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("01234"));
+ const int64_t benchmark = 01234;
+ CPPUNIT_ASSERT(t.int64(rv));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ }
+
+ // successful parse, autodetect base (leading 0x => hex)
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("0x12f4"));
+ const int64_t benchmark = 0x12f4;
+ CPPUNIT_ASSERT(t.int64(rv));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ }
+
+ // API mismatch: don't eat leading space
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf(" 1234"));
+ CPPUNIT_ASSERT(!t.int64(rv));
+ }
+
+ // API mismatch: don't eat multiple leading spaces
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf(" 1234"));
+ CPPUNIT_ASSERT(!t.int64(rv));
+ }
+
+ // trailing spaces are left unconsumed
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("1234 foo"));
+ const int64_t benchmark = 1234;
+ CPPUNIT_ASSERT(t.int64(rv));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ CPPUNIT_ASSERT_EQUAL(SBuf(" foo"), t.buf());
+ }
+
+ // trailing nonspaces are left unconsumed
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("1234foo"));
+ const int64_t benchmark = 1234;
+ CPPUNIT_ASSERT(t.int64(rv));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ CPPUNIT_ASSERT_EQUAL(SBuf("foo"), t.buf());
+ }
+
+ // trailing nonspaces; 'f' of "foo" is itself a valid hex digit, so it
+ // is consumed as part of the number, leaving only "oo" behind
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("0x1234foo"));
+ const int64_t benchmark = 0x1234f;
+ CPPUNIT_ASSERT(t.int64(rv));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ CPPUNIT_ASSERT_EQUAL(SBuf("oo"), t.buf());
+ }
+
+ // overflow (value exceeds INT64_MAX) must fail
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("1029397752385698678762234"));
+ CPPUNIT_ASSERT(!t.int64(rv));
+ }
+
+ // buffered sub-string parsing: parsing must respect the SBuf window,
+ // not the underlying shared storage
+ {
+ int64_t rv;
+ SBuf base("1029397752385698678762234");
+ const int64_t benchmark = 22;
+ Parser::Tokenizer t(base.substr(base.length()-4,2));
+ CPPUNIT_ASSERT_EQUAL(SBuf("22"),t.buf());
+ CPPUNIT_ASSERT(t.int64(rv));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ }
+
+ // base-16, prefix
+ {
+ int64_t rv;
+ SBuf base("deadbeefrow");
+ const int64_t benchmark=0xdeadbeef;
+ Parser::Tokenizer t(base);
+ CPPUNIT_ASSERT(t.int64(rv,16));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ CPPUNIT_ASSERT_EQUAL(SBuf("row"),t.buf());
+
+ }
+}
CPPUNIT_TEST ( testTokenizerPrefix );
CPPUNIT_TEST ( testTokenizerSkip );
CPPUNIT_TEST ( testTokenizerToken );
+ CPPUNIT_TEST ( testTokenizerInt64 );
CPPUNIT_TEST_SUITE_END();
protected:
void testTokenizerSkip();
void testTokenizerToken();
void testCharacterSet();
+ void testTokenizerInt64();
};
#endif /* SQUID_TESTTOKENIZER_H_ */