Tokenizer.cc
SBUF_SOURCE= \
+ $(top_srcdir)/src/base/CharacterSet.h \
$(top_srcdir)/src/SBuf.h \
$(top_srcdir)/src/SBuf.cc \
$(top_srcdir)/src/MemBlob.h \
testTokenizer.cc \
Tokenizer.h
nodist_testTokenizer_SOURCES = \
- $(top_srcdir)/src/base/CharacterSet.h \
$(top_srcdir)/src/tests/testMain.cc \
$(top_srcdir)/src/tests/stub_mem.cc \
$(top_srcdir)/src/tests/stub_debug.cc \
#include "squid.h"
-#include "Tokenizer.h"
-
-namespace Parser {
+#include "parser/Tokenizer.h"
+// strtok(3)-alike: skip any leading delimiters, cut the token at the next
+// delimiter, then skip any trailing delimiters. On failure buf_ is left
+// exactly as it was (restored from a saved copy).
bool
-Tokenizer::token(SBuf &returnedToken, const CharacterSet &whitespace)
+Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
{
- const SBuf::size_type endOfPreWhiteSpace = buf_.findFirstNotOf(whitespace);
- const SBuf::size_type endOfToken = buf_.findFirstOf(whitespace, endOfPreWhiteSpace);
- if (endOfToken == SBuf::npos)
+ SBuf savebuf(buf_); // kept so buf_ can be rolled back if no token is found
+ skip(delimiters);
+ SBuf::size_type tokenLen = buf_.findFirstOf(delimiters); // not found = npos => consume to end
+ if (tokenLen == SBuf::npos && !delimiters['\0']) {
+ // no delimiter found, nor is NUL/EOS/npos acceptable as one
+ buf_ = savebuf;
return false;
- buf_.consume(endOfPreWhiteSpace);
- returnedToken = buf_.consume(endOfToken - endOfPreWhiteSpace);
- skip(whitespace);
+ }
+ SBuf retval = buf_.consume(tokenLen);
+ skip(delimiters);
+ returnedToken = retval;
return true;
}
bool
-Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars)
+Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
{
- SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
+ SBuf::size_type prefixLen = buf_.substr(0,limit).findFirstNotOf(tokenChars);
if (prefixLen == 0)
return false;
returnedToken = buf_.consume(prefixLen);
}
bool
-Tokenizer::skip(const CharacterSet &tokenChars)
+Parser::Tokenizer::skip(const CharacterSet &tokenChars)
{
SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
if (prefixLen == 0)
}
bool
-Tokenizer::skip(const SBuf &tokenToSkip)
+Parser::Tokenizer::skip(const SBuf &tokenToSkip)
{
if (buf_.startsWith(tokenToSkip)) {
buf_.consume(tokenToSkip.length());
}
bool
-Tokenizer::skip(const char tokenChar)
+Parser::Tokenizer::skip(const char tokenChar)
{
if (buf_[0] == tokenChar) {
buf_.consume(1);
}
return false;
}
-} /* namespace Parser */
+
+/* reworked from compat/strtoll.c */
+// Parses an optionally signed integer at the start of buf_ in the given
+// base (0 = autodetect: leading "0x" => 16, leading "0" => 8, else 10),
+// consuming the parsed characters on success. Overflow sets errno to
+// ERANGE and fails without consuming anything.
+bool
+Parser::Tokenizer::int64(int64_t & result, int base)
+{
+ if (buf_.isEmpty())
+ return false;
+
+ //fixme: account for buf_.size()
+ bool neg = false;
+ const char *s = buf_.rawContent();
+ const char *end = buf_.rawContent() + buf_.length();
+
+ // optional sign prefix
+ if (*s == '-') {
+ neg = true;
+ ++s;
+ } else if (*s == '+') {
+ ++s;
+ }
+ if (s >= end) return false;
+ // NOTE(review): when s+1 == end, *(s+1) below reads one byte past the
+ // buffer; the guard looks like it should be (s+1 < end) -- TODO confirm
+ if (( base == 0 || base == 16) && *s == '0' && (s+1 <= end ) &&
+ tolower(*(s+1)) == 'x') {
+ s += 2;
+ base = 16;
+ }
+ if (base == 0) {
+ if ( *s == '0') {
+ base = 8;
+ ++s;
+ } else {
+ base = 10;
+ }
+ }
+ if (s >= end) return false;
+
+ uint64_t cutoff;
+
+ // strtoll(3)-style overflow guard: cutoff/cutlim are the largest
+ // accumulator value / final digit that can be absorbed without
+ // exceeding INT64_MAX (or INT64_MIN when negative)
+ cutoff = neg ? -static_cast<uint64_t>(INT64_MIN) : INT64_MAX;
+ int cutlim = cutoff % static_cast<int64_t>(base);
+ cutoff /= static_cast<uint64_t>(base);
+
+ int any = 0, c;
+ int64_t acc = 0;
+ // NOTE(review): the update expression c = *s++ may read *end (one past
+ // the last byte) just before the s <= end condition fails -- verify
+ // this one-byte over-read is safe for SBuf-backed storage
+ for (c = *s++; s <= end; c = *s++) {
+ if (xisdigit(c)) {
+ c -= '0';
+ } else if (xisalpha(c)) {
+ c -= xisupper(c) ? 'A' - 10 : 'a' - 10;
+ } else {
+ break;
+ }
+ if (c >= base)
+ break;
+ if (any < 0 || static_cast<uint64_t>(acc) > cutoff || (static_cast<uint64_t>(acc) == cutoff && c > cutlim))
+ any = -1; // overflow detected; keep scanning to find the token end
+ else {
+ any = 1;
+ acc *= base;
+ acc += c;
+ }
+ }
+
+ if (any == 0) // nothing was parsed
+ return false;
+ if (any < 0) {
+ acc = neg ? INT64_MIN : INT64_MAX;
+ errno = ERANGE;
+ return false;
+ } else if (neg)
+ acc = -acc;
+
+ result = acc;
+ // -1 because the loop's post-increment left s one past the last
+ // character actually examined
+ buf_.consume(s - buf_.rawContent() -1);
+ return true;
+}
#include "base/CharacterSet.h"
#include "SBuf.h"
+/// Generic protocol-agnostic parsing tools
namespace Parser {
+/**
+ * Lexical processor to tokenize a buffer.
+ *
+ * Allows arbitrary delimiters and token character sets to
+ * be provided by callers.
+ *
+ * All methods start from the beginning of the input buffer.
+ * Methods returning true consume bytes from the buffer.
+ * Methods returning false have no side-effects.
+ */
class Tokenizer {
public:
explicit Tokenizer(const SBuf &inBuf) : buf_(inBuf) {}
- bool atEnd() const { return !buf_.length(); }
+ // return a copy of the current contents of the parse buffer
+ const SBuf buf() const { return buf_; }
+
+ /// whether the end of the buffer has been reached
+ bool atEnd() const { return buf_.isEmpty(); }
+
+ /// the remaining unprocessed section of buffer
const SBuf& remaining() const { return buf_; }
- void reset(const SBuf &newBuf) { buf_ = newBuf; }
- /* The following methods start from the beginning of the input buffer.
- * They return true and consume parsed chars if a non-empty token is found.
- * Otherwise, they return false without any side-effects. */
+ /// reinitialize processing for a new buffer
+ void reset(const SBuf &newBuf) { buf_ = newBuf; }
/** Basic strtok(3):
* Skips all leading delimiters (if any),
- * accumulates all characters up to the first delimiter (a token), and
+ * accumulates all characters up to the next delimiter (a token), and
* skips all trailing delimiters (if any).
- * Want to extract delimiters? Use three prefix() calls instead.
+ *
+ * Want to extract delimiters? Use prefix() instead.
*/
- bool token(SBuf &returnedToken, const CharacterSet &whitespace);
+ bool token(SBuf &returnedToken, const CharacterSet &delimiters);
- /// Accumulates all sequential permitted characters (a token).
- bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars);
+ /** Accumulates all sequential permitted characters up to an optional length limit.
+ *
+ * \retval true one or more characters were found, the sequence (string) is placed in returnedToken
+ * \retval false no characters from the permitted set were found
+ */
+ bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit = SBuf::npos);
- /// Skips all sequential permitted characters (a token).
+ /** skips all sequential characters from the set, in any order
+ *
+ * \return whether one or more characters in the set were found
+ */
bool skip(const CharacterSet &tokenChars);
- /// Skips a given token.
+ /** skips a given character sequence (string)
+ *
+ * \return whether the exact character sequence was found and skipped
+ */
bool skip(const SBuf &tokenToSkip);
- /// Skips a given character (a token).
+ /** skips a given single character
+ *
+ * \return whether the character was found and skipped
+ */
bool skip(const char tokenChar);
+ /** parse a signed int64_t at the beginning of the buffer
+ *
+ * strtoll(3)-alike function: tries to parse a signed 64-bit integer
+ * at the beginning of the parse buffer, in the base specified by the user
+ * or guesstimated; consumes the parsed characters.
+ *
+ * \param result Output value. Not touched if parsing is unsuccessful.
+ * \param base Specify base to do the parsing in, with the same restrictions
+ * as strtoll. Defaults to 0 (meaning guess)
+ *
+ * \return whether the parsing was successful
+ */
+ bool int64(int64_t &result, int base = 0);
+
private:
SBuf buf_; ///< yet unparsed input
};
-
} /* namespace Parser */
+
#endif /* SQUID_PARSER_TOKENIZER_H_ */
#include "squid.h"
-
-#include "testTokenizer.h"
#include "base/CharacterSet.h"
-#include "Tokenizer.h"
+#include "parser/Tokenizer.h"
+#include "testTokenizer.h"
CPPUNIT_TEST_SUITE_REGISTRATION( testTokenizer );
CPPUNIT_ASSERT(t.token(s,whitespace));
CPPUNIT_ASSERT_EQUAL(SBuf("Host:"),s);
- //no separator found
- CPPUNIT_ASSERT(!t.token(s,tab));
}
void
{
}
+
+void
+testTokenizer::testTokenizerInt64()
+{
+ // successful parse in base 10
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("1234"));
+ const int64_t benchmark = 1234;
+ CPPUNIT_ASSERT(t.int64(rv, 10));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ }
+
+ // successful parse, autodetect base (no prefix => decimal)
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("1234"));
+ const int64_t benchmark = 1234;
+ CPPUNIT_ASSERT(t.int64(rv));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ }
+
+ // successful parse, autodetect base (leading 0 => octal)
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("01234"));
+ const int64_t benchmark = 01234;
+ CPPUNIT_ASSERT(t.int64(rv));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ }
+
+ // successful parse, autodetect base (leading 0x => hex)
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("0x12f4"));
+ const int64_t benchmark = 0x12f4;
+ CPPUNIT_ASSERT(t.int64(rv));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ }
+
+ // API mismatch: don't eat leading space
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf(" 1234"));
+ CPPUNIT_ASSERT(!t.int64(rv));
+ }
+
+ // API mismatch: don't eat multiple leading spaces
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf(" 1234"));
+ CPPUNIT_ASSERT(!t.int64(rv));
+ }
+
+ // trailing spaces are left unconsumed
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("1234 foo"));
+ const int64_t benchmark = 1234;
+ CPPUNIT_ASSERT(t.int64(rv));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ CPPUNIT_ASSERT_EQUAL(SBuf(" foo"), t.buf());
+ }
+
+ // trailing nonspaces are left unconsumed
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("1234foo"));
+ const int64_t benchmark = 1234;
+ CPPUNIT_ASSERT(t.int64(rv));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ CPPUNIT_ASSERT_EQUAL(SBuf("foo"), t.buf());
+ }
+
+ // trailing nonspaces; 'f' of "foo" is itself a valid hex digit, so it
+ // is consumed as part of the number, leaving only "oo" behind
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("0x1234foo"));
+ const int64_t benchmark = 0x1234f;
+ CPPUNIT_ASSERT(t.int64(rv));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ CPPUNIT_ASSERT_EQUAL(SBuf("oo"), t.buf());
+ }
+
+ // overflow (value exceeds INT64_MAX) must fail
+ {
+ int64_t rv;
+ Parser::Tokenizer t(SBuf("1029397752385698678762234"));
+ CPPUNIT_ASSERT(!t.int64(rv));
+ }
+
+ // buffered sub-string parsing: parsing must respect the SBuf window,
+ // not the underlying shared storage
+ {
+ int64_t rv;
+ SBuf base("1029397752385698678762234");
+ const int64_t benchmark = 22;
+ Parser::Tokenizer t(base.substr(base.length()-4,2));
+ CPPUNIT_ASSERT_EQUAL(SBuf("22"),t.buf());
+ CPPUNIT_ASSERT(t.int64(rv));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ }
+
+ // base-16, prefix
+ {
+ int64_t rv;
+ SBuf base("deadbeefrow");
+ const int64_t benchmark=0xdeadbeef;
+ Parser::Tokenizer t(base);
+ CPPUNIT_ASSERT(t.int64(rv,16));
+ CPPUNIT_ASSERT_EQUAL(benchmark,rv);
+ CPPUNIT_ASSERT_EQUAL(SBuf("row"),t.buf());
+
+ }
+}
CPPUNIT_TEST ( testTokenizerPrefix );
CPPUNIT_TEST ( testTokenizerSkip );
CPPUNIT_TEST ( testTokenizerToken );
+ CPPUNIT_TEST ( testTokenizerInt64 );
CPPUNIT_TEST_SUITE_END();
protected:
void testTokenizerSkip();
void testTokenizerToken();
void testCharacterSet();
+ void testTokenizerInt64();
};
#endif /* SQUID_TESTTOKENIZER_H_ */