From: Alex Rousskov
Date: Wed, 12 Aug 2015 22:18:22 +0000 (-0600)
Subject: New SBuf and Tokenizer methods to simplify suffix parsing and skipping
X-Git-Tag: SQUID_4_0_1~134
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=c571034be79334a2f09b81b8e1723a1886b39700;p=thirdparty%2Fsquid.git

New SBuf and Tokenizer methods to simplify suffix parsing and skipping

(and to make suffix/reverse APIs more similar to prefix/forward ones).

Also reluctantly changed Tokenizer to update parsedSize() when parsing
suffixes, per reviewer request.
---

diff --git a/src/SBuf.cc b/src/SBuf.cc
index fbaffcbaa4..79a04adba1 100644
--- a/src/SBuf.cc
+++ b/src/SBuf.cc
@@ -802,6 +802,50 @@ SBuf::findFirstNotOf(const CharacterSet &set, size_type startPos) const
     return npos;
 }
 
+SBuf::size_type
+SBuf::findLastOf(const CharacterSet &set, size_type endPos) const
+{
+    ++stats.find;
+
+    if (isEmpty())
+        return npos;
+
+    if (endPos == npos || endPos >= length())
+        endPos = length() - 1;
+
+    debugs(24, 7, "last of characterset " << set.name << " in id " << id);
+    const char *start = buf();
+    // Iterate by index: decrementing a pointer below start is undefined behavior.
+    for (size_type pos = endPos + 1; pos > 0; --pos) {
+        if (set[start[pos - 1]])
+            return pos - 1;
+    }
+    debugs(24, 7, "not found");
+    return npos;
+}
+
+SBuf::size_type
+SBuf::findLastNotOf(const CharacterSet &set, size_type endPos) const
+{
+    ++stats.find;
+
+    if (isEmpty())
+        return npos;
+
+    if (endPos == npos || endPos >= length())
+        endPos = length() - 1;
+
+    debugs(24, 7, "last not of characterset " << set.name << " in id " << id);
+    const char *start = buf();
+    // Iterate by index: decrementing a pointer below start is undefined behavior.
+    for (size_type pos = endPos + 1; pos > 0; --pos) {
+        if (!set[start[pos - 1]])
+            return pos - 1;
+    }
+    debugs(24, 7, "not found");
+    return npos;
+}
+
 /*
  * TODO: borrow a sscanf implementation from Linux or similar?
  * we'd really need a vsnscanf(3)... ? As an alternative, a
diff --git a/src/SBuf.h b/src/SBuf.h
index 31c51cdaf0..dcbe587720 100644
--- a/src/SBuf.h
+++ b/src/SBuf.h
@@ -596,6 +596,16 @@ public:
      */
     size_type findFirstOf(const CharacterSet &set, size_type startPos = 0) const;
 
+    /** Find last occurrence of character of set in SBuf
+     *
+     * Finds the last occurrence of ANY of the characters in the supplied set in
+     * the SBuf.
+     * \return npos if no character in the set could be found
+     * \param endPos if specified, ignore any occurrences after that position
+     *   if npos, the entire SBuf is searched
+     */
+    size_type findLastOf(const CharacterSet &set, size_type endPos = npos) const;
+
     /** Find first occurrence character NOT in character set
      *
      * \return npos if all characters in the SBuf are from set
@@ -606,6 +616,14 @@ public:
      */
     size_type findFirstNotOf(const CharacterSet &set, size_type startPos = 0) const;
 
+    /** Find last occurrence character NOT in character set
+     *
+     * \return npos if all characters in the SBuf are from set
+     * \param endPos if specified, ignore any occurrences after that position
+     *   if npos, then the entire SBuf is searched
+     */
+    size_type findLastNotOf(const CharacterSet &set, size_type endPos = npos) const;
+
     /** sscanf-alike
      *
      * sscanf re-implementation. Non-const, and not \0-clean.
diff --git a/src/parser/Tokenizer.cc b/src/parser/Tokenizer.cc
index 68fcb7cafc..825aecc7a8 100644
--- a/src/parser/Tokenizer.cc
+++ b/src/parser/Tokenizer.cc
@@ -35,6 +35,28 @@ Parser::Tokenizer::success(const SBuf::size_type n)
     return consume(n).length();
 }
 
+/// convenience method: consumes up to n last bytes and returns them
+SBuf
+Parser::Tokenizer::consumeTrailing(const SBuf::size_type n)
+{
+    debugs(24, 5, "consuming " << n << " bytes");
+
+    // npos or an oversized n means: consume everything from buf_ (and nothing from result).
+    const SBuf::size_type parsed = (n == SBuf::npos || n > buf_.length()) ? buf_.length() : n;
+
+    SBuf result = buf_;
+    buf_ = result.consume(buf_.length() - parsed);
+    parsed_ += parsed;
+    return result;
+}
+
+/// convenience method: consumes up to n last bytes and returns their count
+SBuf::size_type
+Parser::Tokenizer::successTrailing(const SBuf::size_type n)
+{
+    return consumeTrailing(n).length();
+}
+
 bool
 Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
 {
@@ -90,8 +112,7 @@ Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, c
     }
     if (!found)
         return false;
-    returnedToken = buf_;
-    buf_ = returnedToken.consume(buf_.length() - found);
+    returnedToken = consumeTrailing(found);
     return true;
 }
 
@@ -129,8 +150,8 @@ Parser::Tokenizer::skipSuffix(const SBuf &tokenToSkip)
         offset = buf_.length() - tokenToSkip.length();
 
     if (buf_.substr(offset, SBuf::npos).cmp(tokenToSkip) == 0) {
-        buf_ = buf_.substr(0,offset);
-        return true;
+        debugs(24, 8, "skipping " << tokenToSkip.length());
+        return successTrailing(tokenToSkip.length());
     }
     return false;
 }
@@ -157,6 +178,32 @@ Parser::Tokenizer::skip(const char tokenChar)
     return false;
 }
 
+bool
+Parser::Tokenizer::skipOneTrailing(const CharacterSet &skippable)
+{
+    if (!buf_.isEmpty() && skippable[buf_[buf_.length()-1]]) {
+        debugs(24, 8, "skipping one-of " << skippable.name);
+        return successTrailing(1);
+    }
+    debugs(24, 8, "no match while skipping one-of " << skippable.name);
+    return false;
+}
+
+SBuf::size_type
+Parser::Tokenizer::skipAllTrailing(const CharacterSet &skippable)
+{
+    const SBuf::size_type prefixEnd = buf_.findLastNotOf(skippable);
+    const SBuf::size_type prefixLen = prefixEnd == SBuf::npos ?
+                                      0 : (prefixEnd + 1);
+    const SBuf::size_type suffixLen = buf_.length() - prefixLen;
+    if (suffixLen == 0) {
+        debugs(24, 8, "no match when trying to skip " << skippable.name);
+        return 0;
+    }
+    debugs(24, 8, "skipping in " << skippable.name << " len " << suffixLen);
+    return successTrailing(suffixLen);
+}
+
 /* reworked from compat/strtoll.c */
 bool
 Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf::size_type limit)
diff --git a/src/parser/Tokenizer.h b/src/parser/Tokenizer.h
index 47d4a7515a..762d6bdff5 100644
--- a/src/parser/Tokenizer.h
+++ b/src/parser/Tokenizer.h
@@ -115,6 +115,18 @@ public:
      */
     SBuf::size_type skipAll(const CharacterSet &discardables);
 
+    /** Removes a single trailing character from the set.
+     *
+     * \return whether a character was removed
+     */
+    bool skipOneTrailing(const CharacterSet &discardables);
+
+    /** Removes all sequential trailing characters from the set, in any order.
+     *
+     * \returns the number of characters removed
+     */
+    SBuf::size_type skipAllTrailing(const CharacterSet &discardables);
+
     /** Extracts an unsigned int64_t at the beginning of the buffer.
      *
      * strtoll(3)-alike function: tries to parse unsigned 64-bit integer
@@ -134,6 +146,8 @@ public:
 protected:
     SBuf consume(const SBuf::size_type n);
     SBuf::size_type success(const SBuf::size_type n);
+    SBuf consumeTrailing(const SBuf::size_type n);
+    SBuf::size_type successTrailing(const SBuf::size_type n);
 
     /// reset the buffer and parsed stats to a saved checkpoint
     void undoParse(const SBuf &newBuf, SBuf::size_type cParsed) { buf_ = newBuf; parsed_ = cParsed; }
diff --git a/src/tests/testTokenizer.cc b/src/tests/testTokenizer.cc
index 682f0d01f8..4e526f9882 100644
--- a/src/tests/testTokenizer.cc
+++ b/src/tests/testTokenizer.cc
@@ -128,6 +128,66 @@ testTokenizer::testTokenizerToken()
 
 }
 
+void
+testTokenizer::testTokenizerSuffix()
+{
+    const SBuf canary("This text should not be changed.");
+
+    Parser::Tokenizer t(text);
+    SBuf s;
+
+    CharacterSet all(whitespace);
+    all += alpha;
+    all += crlf;
+    all += numbers;
+    all.add(':').add('.').add('/');
+
+    // an empty suffix should return false (the full output buffer case)
+    s = canary;
+    const SBuf before = t.remaining();
+    CPPUNIT_ASSERT(!t.suffix(s, all, 0));
+    // ... and a false return value means no parameter changes
+    CPPUNIT_ASSERT_EQUAL(canary, s);
+    // ... and a false return value means no input buffer changes
+    CPPUNIT_ASSERT_EQUAL(before, t.remaining());
+
+    // consume suffix until the last CRLF, including that last CRLF
+    SBuf::size_type remaining = t.remaining().length();
+    while (t.remaining().findLastOf(crlf) != SBuf::npos) {
+        CPPUNIT_ASSERT(t.remaining().length() > 0);
+        CPPUNIT_ASSERT(t.skipOneTrailing(all));
+        // ensure steady progress
+        CPPUNIT_ASSERT_EQUAL(remaining, t.remaining().length() + 1);
+        --remaining;
+    }
+
+    // no match (last char is not in the suffix set)
+    CPPUNIT_ASSERT(!t.suffix(s, crlf));
+    CPPUNIT_ASSERT(!t.suffix(s, whitespace));
+
+    // successful suffix tokenization
+    CPPUNIT_ASSERT(t.suffix(s, numbers));
+    CPPUNIT_ASSERT_EQUAL(SBuf("1"), s);
+    CPPUNIT_ASSERT(t.skipSuffix(SBuf("1.")));
+    CPPUNIT_ASSERT(t.skipSuffix(SBuf("/")));
+    CPPUNIT_ASSERT(t.suffix(s, alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("HTTP"), s);
+    CPPUNIT_ASSERT(t.suffix(s, whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf(" "), s);
+
+    // match until the end of the sample
+    CPPUNIT_ASSERT(t.suffix(s, all));
+    CPPUNIT_ASSERT_EQUAL(SBuf(), t.remaining());
+
+    // an empty buffer does not end with a token
+    s = canary;
+    CPPUNIT_ASSERT(!t.suffix(s, all));
+    CPPUNIT_ASSERT_EQUAL(canary, s); // no parameter changes
+
+    // we cannot skip an empty suffix, even in an empty buffer
+    CPPUNIT_ASSERT(!t.skipSuffix(SBuf()));
+}
+
 void
 testTokenizer::testCharacterSet()
 {
diff --git a/src/tests/testTokenizer.h b/src/tests/testTokenizer.h
index b168f3456c..447ee414aa 100644
--- a/src/tests/testTokenizer.h
+++ b/src/tests/testTokenizer.h
@@ -16,6 +16,7 @@ class testTokenizer : public CPPUNIT_NS::TestFixture
     CPPUNIT_TEST_SUITE( testTokenizer );
     CPPUNIT_TEST ( testCharacterSet );
     CPPUNIT_TEST ( testTokenizerPrefix );
+    CPPUNIT_TEST ( testTokenizerSuffix );
     CPPUNIT_TEST ( testTokenizerSkip );
     CPPUNIT_TEST ( testTokenizerToken );
     CPPUNIT_TEST ( testTokenizerInt64 );
@@ -23,6 +24,7 @@ class testTokenizer : public CPPUNIT_NS::TestFixture
 
 protected:
     void testTokenizerPrefix();
+    void testTokenizerSuffix();
     void testTokenizerSkip();
     void testTokenizerToken();
     void testCharacterSet();