(and to make suffix/reverse APIs more similar to prefix/forward ones).
Also reluctantly changed Tokenizer to update parsedSize() when parsing
suffixes, per reviewer request.
return npos;
}
+/// Finds the last position in this SBuf that holds a character from the set.
+///
+/// \param set    the characters to look for
+/// \param endPos the last position to examine; npos or any position at or
+///               beyond length() means "examine the entire SBuf"
+/// \return the position of the match, counted from the start of the SBuf
+/// \return npos if the SBuf is empty or no examined character is in the set
+SBuf::size_type
+SBuf::findLastOf(const CharacterSet &set, size_type endPos) const
+{
+    ++stats.find;
+
+    if (isEmpty())
+        return npos;
+
+    if (endPos == npos || endPos >= length())
+        endPos = length() - 1;
+
+    debugs(24, 7, "last of characterset " << set.name << " in id " << id);
+    const char *start = buf();
+    // Scan backwards by index, not by pointer: a "cur >= start" pointer loop
+    // can only terminate by decrementing cur below start, and forming a
+    // pointer before the beginning of the buffer is undefined behavior.
+    // pos is always one past the position being examined, so it never wraps.
+    for (size_type pos = endPos + 1; pos > 0; --pos) {
+        if (set[start[pos - 1]])
+            return pos - 1;
+    }
+    debugs(24, 7, "not found");
+    return npos;
+}
+
+/// Finds the last position in this SBuf that holds a character which is NOT
+/// in the set.
+///
+/// \param set    the characters to skip over
+/// \param endPos the last position to examine; npos or any position at or
+///               beyond length() means "examine the entire SBuf"
+/// \return the position of the match, counted from the start of the SBuf
+/// \return npos if the SBuf is empty or every examined character is in the set
+SBuf::size_type
+SBuf::findLastNotOf(const CharacterSet &set, size_type endPos) const
+{
+    ++stats.find;
+
+    if (isEmpty())
+        return npos;
+
+    if (endPos == npos || endPos >= length())
+        endPos = length() - 1;
+
+    debugs(24, 7, "last not of characterset " << set.name << " in id " << id);
+    const char *start = buf();
+    // Scan backwards by index, not by pointer: a "cur >= start" pointer loop
+    // can only terminate by decrementing cur below start, and forming a
+    // pointer before the beginning of the buffer is undefined behavior.
+    // pos is always one past the position being examined, so it never wraps.
+    for (size_type pos = endPos + 1; pos > 0; --pos) {
+        if (!set[start[pos - 1]])
+            return pos - 1;
+    }
+    debugs(24, 7, "not found");
+    return npos;
+}
+
/*
* TODO: borrow a sscanf implementation from Linux or similar?
* we'd really need a vsnscanf(3)... ? As an alternative, a
*/
size_type findFirstOf(const CharacterSet &set, size_type startPos = 0) const;
+ /** Find last occurrence of character of set in SBuf
+ *
+ * Finds the last occurrence of ANY of the characters in the supplied set in
+ * the SBuf.
+ * \param set the characters to search for
+ * \param endPos if specified, ignore any occurrences after that position
+ * if npos or beyond the end of the SBuf, the entire SBuf is searched
+ * \return the position of the last matching character
+ * \return npos if the SBuf is empty or no character in the set could be found
+ */
+ size_type findLastOf(const CharacterSet &set, size_type endPos = npos) const;
+
/** Find first occurrence of a character NOT in character set
*
* \return npos if all characters in the SBuf are from set
*/
size_type findFirstNotOf(const CharacterSet &set, size_type startPos = 0) const;
+ /** Find last occurrence of a character NOT in character set
+ *
+ * \param set the characters to skip over
+ * \param endPos if specified, ignore any occurrences after that position
+ * if npos or beyond the end of the SBuf, then the entire SBuf is searched
+ * \return the position of the last character that is not in the set
+ * \return npos if the SBuf is empty or all examined characters are from set
+ */
+ size_type findLastNotOf(const CharacterSet &set, size_type endPos = npos) const;
+
/** sscanf-alike
*
* sscanf re-implementation. Non-const, and not \0-clean.
return consume(n).length();
}
+/// convenience method: consumes up to n last bytes and returns them
+///
+/// \param n the number of trailing bytes to remove; SBuf::npos or any value
+///          larger than the buffer means "everything that is left"
+/// \return the removed suffix; parsed_ grows by the length of that suffix
+SBuf
+Parser::Tokenizer::consumeTrailing(const SBuf::size_type n)
+{
+    debugs(24, 5, "consuming " << n << " bytes");
+
+    // Clamp the request to what the buffer actually holds. Without this, an
+    // n exceeding buf_.length() would wrap the subtraction below, leaving
+    // buf_ untouched while still adding n to parsed_.
+    // If n is npos, we consume everything from buf_ (and nothing from result).
+    const SBuf::size_type available = buf_.length();
+    const SBuf::size_type parsed =
+        (n == SBuf::npos || n > available) ? available : n;
+
+    // result starts as the whole buffer; consuming its leading part leaves
+    // exactly the trailing bytes in result and hands the prefix back to buf_.
+    SBuf result = buf_;
+    buf_ = result.consume(available - parsed);
+    parsed_ += parsed;
+    return result;
+}
+
+/// convenience method: removes up to n bytes from the end of the buffer via
+/// consumeTrailing() and reports how many bytes were removed
+SBuf::size_type
+Parser::Tokenizer::successTrailing(const SBuf::size_type n)
+{
+    const SBuf removed = consumeTrailing(n);
+    return removed.length();
+}
+
bool
Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
{
}
if (!found)
return false;
- returnedToken = buf_;
- buf_ = returnedToken.consume(buf_.length() - found);
+ returnedToken = consumeTrailing(found);
return true;
}
offset = buf_.length() - tokenToSkip.length();
if (buf_.substr(offset, SBuf::npos).cmp(tokenToSkip) == 0) {
- buf_ = buf_.substr(0,offset);
- return true;
+ debugs(24, 8, "skipping " << tokenToSkip.length());
+ return successTrailing(tokenToSkip.length());
}
return false;
}
return false;
}
+/// Drops the final character of the buffer if that character belongs to
+/// skippable. Returns whether a character was dropped; an empty buffer
+/// never matches.
+bool
+Parser::Tokenizer::skipOneTrailing(const CharacterSet &skippable)
+{
+    // nothing to do for an empty buffer or a non-matching last character
+    if (buf_.isEmpty() || !skippable[buf_[buf_.length()-1]]) {
+        debugs(24, 8, "no match while skipping one-of " << skippable.name);
+        return false;
+    }
+
+    debugs(24, 8, "skipping one-of " << skippable.name);
+    return successTrailing(1);
+}
+
+/// Drops the longest run of skippable characters found at the end of the
+/// buffer and returns the length of that run (zero when the buffer does not
+/// end with a skippable character).
+SBuf::size_type
+Parser::Tokenizer::skipAllTrailing(const CharacterSet &skippable)
+{
+    // everything up to and including the last non-skippable character stays
+    const SBuf::size_type lastKept = buf_.findLastNotOf(skippable);
+    SBuf::size_type keptLen = 0;
+    if (lastKept != SBuf::npos)
+        keptLen = lastKept + 1;
+
+    const SBuf::size_type trailingLen = buf_.length() - keptLen;
+    if (trailingLen > 0) {
+        debugs(24, 8, "skipping in " << skippable.name << " len " << trailingLen);
+        return successTrailing(trailingLen);
+    }
+
+    debugs(24, 8, "no match when trying to skip " << skippable.name);
+    return 0;
+}
+
/* reworked from compat/strtoll.c */
bool
Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf::size_type limit)
*/
SBuf::size_type skipAll(const CharacterSet &discardables);
+ /** Removes a single trailing character from the set.
+ *
+ * Nothing is removed when the buffer is empty or when its last character
+ * is not in the set.
+ *
+ * \param discardables the set of characters that may be removed
+ * \return whether a character was removed
+ */
+ bool skipOneTrailing(const CharacterSet &discardables);
+
+ /** Removes all sequential trailing characters from the set, in any order.
+ *
+ * Removal stops at the last character of the buffer that is not in the set.
+ *
+ * \param discardables the set of characters that may be removed
+ * \returns the number of characters removed; zero if the buffer does not
+ * end with a character from the set
+ */
+ SBuf::size_type skipAllTrailing(const CharacterSet &discardables);
+
/** Extracts an unsigned int64_t at the beginning of the buffer.
*
* strtoll(3)-alike function: tries to parse unsigned 64-bit integer
protected:
SBuf consume(const SBuf::size_type n);
SBuf::size_type success(const SBuf::size_type n);
+ SBuf consumeTrailing(const SBuf::size_type n);
+ SBuf::size_type successTrailing(const SBuf::size_type n);
/// reset the buffer and parsed stats to a saved checkpoint
void undoParse(const SBuf &newBuf, SBuf::size_type cParsed) { buf_ = newBuf; parsed_ = cParsed; }
}
+/// Exercises the suffix-side Tokenizer operations: suffix(), skipSuffix()
+/// and skipOneTrailing(), including their "no match" paths.
+void
+testTokenizer::testTokenizerSuffix()
+{
+    const SBuf untouched("This text should not be changed.");
+
+    Parser::Tokenizer tok(text);
+    SBuf out;
+
+    // a set combining every character class used by this test
+    CharacterSet everything(whitespace);
+    everything += alpha;
+    everything += crlf;
+    everything += numbers;
+    everything.add(':').add('.').add('/');
+
+    // a zero-length limit cannot yield a suffix (the full output buffer case)
+    out = untouched;
+    const SBuf inputBefore = tok.remaining();
+    CPPUNIT_ASSERT(!tok.suffix(out, everything, 0));
+    // ... a failed call must leave the output parameter alone
+    CPPUNIT_ASSERT_EQUAL(untouched, out);
+    // ... and must leave the input buffer alone
+    CPPUNIT_ASSERT_EQUAL(inputBefore, tok.remaining());
+
+    // strip one trailing character at a time until no CRLF character is
+    // left in the input (the last CRLF itself is stripped as well)
+    for (SBuf::size_type expected = tok.remaining().length();
+            tok.remaining().findLastOf(crlf) != SBuf::npos;
+            --expected) {
+        CPPUNIT_ASSERT(tok.remaining().length() > 0);
+        CPPUNIT_ASSERT(tok.skipOneTrailing(everything));
+        // each iteration must shorten the input by exactly one character
+        CPPUNIT_ASSERT_EQUAL(expected, tok.remaining().length() + 1);
+    }
+
+    // the last character is in neither of these sets, so nothing matches
+    CPPUNIT_ASSERT(!tok.suffix(out, crlf));
+    CPPUNIT_ASSERT(!tok.suffix(out, whitespace));
+
+    // peel recognizable pieces off the end, one at a time
+    CPPUNIT_ASSERT(tok.suffix(out, numbers));
+    CPPUNIT_ASSERT_EQUAL(SBuf("1"), out);
+    CPPUNIT_ASSERT(tok.skipSuffix(SBuf("1.")));
+    CPPUNIT_ASSERT(tok.skipSuffix(SBuf("/")));
+    CPPUNIT_ASSERT(tok.suffix(out, alpha));
+    CPPUNIT_ASSERT_EQUAL(SBuf("HTTP"), out);
+    CPPUNIT_ASSERT(tok.suffix(out, whitespace));
+    CPPUNIT_ASSERT_EQUAL(SBuf(" "), out);
+
+    // the catch-all set swallows whatever is left of the sample
+    CPPUNIT_ASSERT(tok.suffix(out, everything));
+    CPPUNIT_ASSERT_EQUAL(SBuf(), tok.remaining());
+
+    // there is no suffix token to extract from an empty buffer
+    out = untouched;
+    CPPUNIT_ASSERT(!tok.suffix(out, everything));
+    CPPUNIT_ASSERT_EQUAL(untouched, out); // output parameter left alone
+
+    // skipping an empty suffix fails, even when the buffer is empty too
+    CPPUNIT_ASSERT(!tok.skipSuffix(SBuf()));
+}
+
void
testTokenizer::testCharacterSet()
{
CPPUNIT_TEST_SUITE( testTokenizer );
CPPUNIT_TEST ( testCharacterSet );
CPPUNIT_TEST ( testTokenizerPrefix );
+ CPPUNIT_TEST ( testTokenizerSuffix );
CPPUNIT_TEST ( testTokenizerSkip );
CPPUNIT_TEST ( testTokenizerToken );
CPPUNIT_TEST ( testTokenizerInt64 );
protected:
void testTokenizerPrefix();
+ void testTokenizerSuffix();
void testTokenizerSkip();
void testTokenizerToken();
void testCharacterSet();