From: Amos Jeffries Date: Sun, 1 Jun 2014 08:57:43 +0000 (-0700) Subject: Import Tokenizer updates X-Git-Tag: merge-candidate-3-v1~506^2~34 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=06af457a81d8dc077b24a79cfd1460312d720a71;p=thirdparty%2Fsquid.git Import Tokenizer updates --- diff --git a/src/parser/Makefile.am b/src/parser/Makefile.am index 8bb9c8678d..97cab1fdee 100644 --- a/src/parser/Makefile.am +++ b/src/parser/Makefile.am @@ -14,6 +14,7 @@ libsquid_parser_la_SOURCES = \ Tokenizer.cc SBUF_SOURCE= \ + $(top_srcdir)/src/base/CharacterSet.h \ $(top_srcdir)/src/SBuf.h \ $(top_srcdir)/src/SBuf.cc \ $(top_srcdir)/src/MemBlob.h \ @@ -32,7 +33,6 @@ testTokenizer_SOURCES = \ testTokenizer.cc \ Tokenizer.h nodist_testTokenizer_SOURCES = \ - $(top_srcdir)/src/base/CharacterSet.h \ $(top_srcdir)/src/tests/testMain.cc \ $(top_srcdir)/src/tests/stub_mem.cc \ $(top_srcdir)/src/tests/stub_debug.cc \ diff --git a/src/parser/Tokenizer.cc b/src/parser/Tokenizer.cc index b76aa1ec67..b3bc3ce5a2 100644 --- a/src/parser/Tokenizer.cc +++ b/src/parser/Tokenizer.cc @@ -1,25 +1,27 @@ #include "squid.h" -#include "Tokenizer.h" - -namespace Parser { +#include "parser/Tokenizer.h" bool -Tokenizer::token(SBuf &returnedToken, const CharacterSet &whitespace) +Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters) { - const SBuf::size_type endOfPreWhiteSpace = buf_.findFirstNotOf(whitespace); - const SBuf::size_type endOfToken = buf_.findFirstOf(whitespace, endOfPreWhiteSpace); - if (endOfToken == SBuf::npos) + SBuf savebuf(buf_); + skip(delimiters); + SBuf::size_type tokenLen = buf_.findFirstOf(delimiters); // not found = npos => consume to end + if (tokenLen == SBuf::npos && !delimiters['\0']) { + // no delimiter found, nor is NUL/EOS/npos acceptible as one + buf_ = savebuf; return false; - buf_.consume(endOfPreWhiteSpace); - returnedToken = buf_.consume(endOfToken - endOfPreWhiteSpace); - skip(whitespace); + } + SBuf retval = buf_.consume(tokenLen); + 
skip(delimiters); + returnedToken = retval; return true; } bool -Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars) +Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit) { - SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars); + SBuf::size_type prefixLen = buf_.substr(0,limit).findFirstNotOf(tokenChars); if (prefixLen == 0) return false; returnedToken = buf_.consume(prefixLen); @@ -27,7 +29,7 @@ Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars) } bool -Tokenizer::skip(const CharacterSet &tokenChars) +Parser::Tokenizer::skip(const CharacterSet &tokenChars) { SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars); if (prefixLen == 0) @@ -37,7 +39,7 @@ Tokenizer::skip(const CharacterSet &tokenChars) } bool -Tokenizer::skip(const SBuf &tokenToSkip) +Parser::Tokenizer::skip(const SBuf &tokenToSkip) { if (buf_.startsWith(tokenToSkip)) { buf_.consume(tokenToSkip.length()); @@ -47,7 +49,7 @@ Tokenizer::skip(const SBuf &tokenToSkip) } bool -Tokenizer::skip(const char tokenChar) +Parser::Tokenizer::skip(const char tokenChar) { if (buf_[0] == tokenChar) { buf_.consume(1); @@ -55,4 +57,78 @@ Tokenizer::skip(const char tokenChar) } return false; } -} /* namespace Parser */ + +/* reworked from compat/strtoll.c */ +bool +Parser::Tokenizer::int64(int64_t & result, int base) +{ + if (buf_.isEmpty()) + return false; + + //fixme: account for buf_.size() + bool neg = false; + const char *s = buf_.rawContent(); + const char *end = buf_.rawContent() + buf_.length(); + + if (*s == '-') { + neg = true; + ++s; + } else if (*s == '+') { + ++s; + } + if (s >= end) return false; + if (( base == 0 || base == 16) && *s == '0' && (s+1 <= end ) && + tolower(*(s+1)) == 'x') { + s += 2; + base = 16; + } + if (base == 0) { + if ( *s == '0') { + base = 8; + ++s; + } else { + base = 10; + } + } + if (s >= end) return false; + + uint64_t cutoff; + + cutoff = neg ? 
-static_cast<uint64_t>(INT64_MIN) : INT64_MAX; + int cutlim = cutoff % static_cast<uint64_t>(base); + cutoff /= static_cast<uint64_t>(base); + + int any = 0, c; + int64_t acc = 0; + for (c = *s++; s <= end; c = *s++) { + if (xisdigit(c)) { + c -= '0'; + } else if (xisalpha(c)) { + c -= xisupper(c) ? 'A' - 10 : 'a' - 10; + } else { + break; + } + if (c >= base) + break; + if (any < 0 || static_cast<uint64_t>(acc) > cutoff || (static_cast<uint64_t>(acc) == cutoff && c > cutlim)) + any = -1; + else { + any = 1; + acc *= base; + acc += c; + } + } + + if (any == 0) // nothing was parsed + return false; + if (any < 0) { + acc = neg ? INT64_MIN : INT64_MAX; + errno = ERANGE; + return false; + } else if (neg) + acc = -acc; + + result = acc; + buf_.consume(s - buf_.rawContent() -1); + return true; +} diff --git a/src/parser/Tokenizer.h b/src/parser/Tokenizer.h index d40869875d..9436e9f19c 100644 --- a/src/parser/Tokenizer.h +++ b/src/parser/Tokenizer.h @@ -4,44 +4,87 @@ #include "base/CharacterSet.h" #include "SBuf.h" +/// Generic protocol-agnostic parsing tools namespace Parser { +/** + * Lexical processor to tokenize a buffer. + * + * Allows arbitrary delimiters and token character sets to + * be provided by callers. + * + * All methods start from the beginning of the input buffer. + * Methods returning true consume bytes from the buffer. + * Methods returning false have no side-effects. + */ class Tokenizer { public: explicit Tokenizer(const SBuf &inBuf) : buf_(inBuf) {} - bool atEnd() const { return !buf_.length(); } + // return a copy of the current contents of the parse buffer + const SBuf buf() const { return buf_; } + + /// whether the end of the buffer has been reached + bool atEnd() const { return buf_.isEmpty(); } + + /// the remaining unprocessed section of buffer const SBuf& remaining() const { return buf_; } - void reset(const SBuf &newBuf) { buf_ = newBuf; } - /* The following methods start from the beginning of the input buffer. - * They return true and consume parsed chars if a non-empty token is found. 
- * Otherwise, they return false without any side-effects. */ + /// reinitialize processing for a new buffer + void reset(const SBuf &newBuf) { buf_ = newBuf; } /** Basic strtok(3): * Skips all leading delimiters (if any), - * accumulates all characters up to the first delimiter (a token), and + * accumulates all characters up to the next delimiter (a token), and * skips all trailing delimiters (if any). - * Want to extract delimiters? Use three prefix() calls instead. + * + * Want to extract delimiters? Use prefix() instead. */ - bool token(SBuf &returnedToken, const CharacterSet &whitespace); + bool token(SBuf &returnedToken, const CharacterSet &delimiters); - /// Accumulates all sequential permitted characters (a token). - bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars); + /** Accumulates all sequential permitted characters up to an optional length limit. + * + * \retval true one or more characters were found, the sequence (string) is placed in returnedToken + * \retval false no characters from the permitted set were found + */ + bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit = SBuf::npos); - /// Skips all sequential permitted characters (a token). + /** skips all sequential characters from the set, in any order + * + * \return whether one or more characters in the set were found + */ bool skip(const CharacterSet &tokenChars); - /// Skips a given token. + /** skips a given character sequence (string) + * + * \return whether the exact character sequence was found and skipped + */ bool skip(const SBuf &tokenToSkip); - /// Skips a given character (a token). 
+ /** skips a given single character + * + * \return whether the character was found and skipped + */ bool skip(const char tokenChar); + /** parse an unsigned int64_t at the beginning of the buffer + * + * strtoll(3)-alike function: tries to parse unsigned 64-bit integer + * at the beginning of the parse buffer, in the base specified by the user + * or guesstimated; consumes the parsed characters. + * + * \param result Output value. Not touched if parsing is unsuccessful. + * \param base Specify base to do the parsing in, with the same restrictions + * as strtoll. Defaults to 0 (meaning guess) + * + * \return whether the parsing was successful + */ + bool int64(int64_t &result, int base = 0); + private: SBuf buf_; ///< yet unparsed input }; - } /* namespace Parser */ + #endif /* SQUID_PARSER_TOKENIZER_H_ */ diff --git a/src/parser/testTokenizer.cc b/src/parser/testTokenizer.cc index 7334d743e5..4e4dfa5ae6 100644 --- a/src/parser/testTokenizer.cc +++ b/src/parser/testTokenizer.cc @@ -1,8 +1,7 @@ #include "squid.h" - -#include "testTokenizer.h" #include "base/CharacterSet.h" -#include "Tokenizer.h" +#include "parser/Tokenizer.h" +#include "testTokenizer.h" CPPUNIT_TEST_SUITE_REGISTRATION( testTokenizer ); @@ -96,8 +95,6 @@ testTokenizer::testTokenizerToken() CPPUNIT_ASSERT(t.token(s,whitespace)); CPPUNIT_ASSERT_EQUAL(SBuf("Host:"),s); - //no separator found - CPPUNIT_ASSERT(!t.token(s,tab)); } void @@ -105,3 +102,117 @@ testTokenizer::testCharacterSet() { } + +void +testTokenizer::testTokenizerInt64() +{ + // successful parse in base 10 + { + int64_t rv; + Parser::Tokenizer t(SBuf("1234")); + const int64_t benchmark = 1234; + CPPUNIT_ASSERT(t.int64(rv, 10)); + CPPUNIT_ASSERT_EQUAL(benchmark,rv); + } + + // successful parse, autodetect base + { + int64_t rv; + Parser::Tokenizer t(SBuf("1234")); + const int64_t benchmark = 1234; + CPPUNIT_ASSERT(t.int64(rv)); + CPPUNIT_ASSERT_EQUAL(benchmark,rv); + } + + // successful parse, autodetect base + { + int64_t rv; + 
Parser::Tokenizer t(SBuf("01234")); + const int64_t benchmark = 01234; + CPPUNIT_ASSERT(t.int64(rv)); + CPPUNIT_ASSERT_EQUAL(benchmark,rv); + } + + // successful parse, autodetect base + { + int64_t rv; + Parser::Tokenizer t(SBuf("0x12f4")); + const int64_t benchmark = 0x12f4; + CPPUNIT_ASSERT(t.int64(rv)); + CPPUNIT_ASSERT_EQUAL(benchmark,rv); + } + + // API mismatch: don't eat leading space + { + int64_t rv; + Parser::Tokenizer t(SBuf(" 1234")); + CPPUNIT_ASSERT(!t.int64(rv)); + } + + // API mismatch: don't eat multiple leading spaces + { + int64_t rv; + Parser::Tokenizer t(SBuf(" 1234")); + CPPUNIT_ASSERT(!t.int64(rv)); + } + + // trailing spaces + { + int64_t rv; + Parser::Tokenizer t(SBuf("1234 foo")); + const int64_t benchmark = 1234; + CPPUNIT_ASSERT(t.int64(rv)); + CPPUNIT_ASSERT_EQUAL(benchmark,rv); + CPPUNIT_ASSERT_EQUAL(SBuf(" foo"), t.buf()); + } + + // trailing nonspaces + { + int64_t rv; + Parser::Tokenizer t(SBuf("1234foo")); + const int64_t benchmark = 1234; + CPPUNIT_ASSERT(t.int64(rv)); + CPPUNIT_ASSERT_EQUAL(benchmark,rv); + CPPUNIT_ASSERT_EQUAL(SBuf("foo"), t.buf()); + } + + // trailing nonspaces + { + int64_t rv; + Parser::Tokenizer t(SBuf("0x1234foo")); + const int64_t benchmark = 0x1234f; + CPPUNIT_ASSERT(t.int64(rv)); + CPPUNIT_ASSERT_EQUAL(benchmark,rv); + CPPUNIT_ASSERT_EQUAL(SBuf("oo"), t.buf()); + } + + // overflow + { + int64_t rv; + Parser::Tokenizer t(SBuf("1029397752385698678762234")); + CPPUNIT_ASSERT(!t.int64(rv)); + } + + // buffered sub-string parsing + { + int64_t rv; + SBuf base("1029397752385698678762234"); + const int64_t benchmark = 22; + Parser::Tokenizer t(base.substr(base.length()-4,2)); + CPPUNIT_ASSERT_EQUAL(SBuf("22"),t.buf()); + CPPUNIT_ASSERT(t.int64(rv)); + CPPUNIT_ASSERT_EQUAL(benchmark,rv); + } + + // base-16, prefix + { + int64_t rv; + SBuf base("deadbeefrow"); + const int64_t benchmark=0xdeadbeef; + Parser::Tokenizer t(base); + CPPUNIT_ASSERT(t.int64(rv,16)); + CPPUNIT_ASSERT_EQUAL(benchmark,rv); + 
CPPUNIT_ASSERT_EQUAL(SBuf("row"),t.buf()); + + } +} diff --git a/src/parser/testTokenizer.h b/src/parser/testTokenizer.h index 22ff87d9da..9089aa75cc 100644 --- a/src/parser/testTokenizer.h +++ b/src/parser/testTokenizer.h @@ -10,6 +10,7 @@ class testTokenizer : public CPPUNIT_NS::TestFixture CPPUNIT_TEST ( testTokenizerPrefix ); CPPUNIT_TEST ( testTokenizerSkip ); CPPUNIT_TEST ( testTokenizerToken ); + CPPUNIT_TEST ( testTokenizerInt64 ); CPPUNIT_TEST_SUITE_END(); protected: @@ -17,6 +18,7 @@ protected: void testTokenizerSkip(); void testTokenizerToken(); void testCharacterSet(); + void testTokenizerInt64(); }; #endif /* SQUID_TESTTOKENIZER_H_ */