src/parser/Tokenizer.h

   1 /*
   2  * Copyright (C) 1996-2018 The Squid Software Foundation and contributors
   3  *
   4  * Squid software is distributed under GPLv2+ license and includes
   5  * contributions from numerous individuals and organizations.
   6  * Please see the COPYING and CONTRIBUTORS files for details.
   7  */
   8
   9 #ifndef SQUID_PARSER_TOKENIZER_H_
  10 #define SQUID_PARSER_TOKENIZER_H_
  11
  12 #include "base/CharacterSet.h"
  13 #include "sbuf/SBuf.h"
  14
  15 /// Generic protocol-agnostic parsing tools
  16 namespace Parser
  17 {
  18
  19 /**
  20  * Lexical processor to tokenize a buffer.
  21  *
  22  * Allows arbitrary delimiters and token character sets to
  23  * be provided by callers.
  24  *
  25  * All methods start from the beginning of the input buffer.
  26  * Methods returning true consume bytes from the buffer.
  27  * Methods returning false have no side-effects.
  28  */
  29 class Tokenizer
  30 {
  31 public:
  32     explicit Tokenizer(const SBuf &inBuf) : buf_(inBuf), parsed_(0) {}
  33
  34     /// yet unparsed data
  35     SBuf buf() const { return buf_; }
  36
  37     /// number of parsed bytes, including skipped ones
  38     SBuf::size_type parsedSize() const { return parsed_; }
  39
  40     /// whether the end of the buffer has been reached
  41     bool atEnd() const { return buf_.isEmpty(); }
  42
  43     /// the remaining unprocessed section of buffer
  44     const SBuf& remaining() const { return buf_; }
  45
  46     /// reinitialize processing for a new buffer
  47     void reset(const SBuf &newBuf) { undoParse(newBuf, 0); }
  48
  49     /** Basic strtok(3):
  50      *  Skips all leading delimiters (if any),
  51      *  extracts all characters up to the next delimiter (a token), and
  52      *  skips all trailing delimiters (at least one must be present).
  53      *
  54      *  Want to extract delimiters? Use prefix() instead.
  55      *
  56      *  Note that Tokenizer cannot tell whether the trailing delimiters will
  57      *  continue when/if more input data becomes available later.
  58      *
  59      * \return true if found a non-empty token followed by a delimiter
  60      */
  61     bool token(SBuf &returnedToken, const CharacterSet &delimiters);
  62
  63     /** Extracts all sequential permitted characters up to an optional length limit.
  64      *
  65      *  Note that Tokenizer cannot tell whether the prefix will
  66      *  continue when/if more input data becomes available later.
  67      *
  68      * \retval true one or more characters were found, the sequence (string) is placed in returnedToken
  69      * \retval false no characters from the permitted set were found
  70      */
  71     bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit = SBuf::npos);
  72
  73     /** Extracts all sequential permitted characters up to an optional length limit.
  74      * Operates on the trailing end of the buffer.
  75      *
  76      *  Note that Tokenizer cannot tell whether the buffer will
  77      *  gain more data when/if more input becomes available later.
  78      *
  79      * \retval true one or more characters were found, the sequence (string) is placed in returnedToken
  80      * \retval false no characters from the permitted set were found
  81      */
  82     bool suffix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit = SBuf::npos);
  83
  84     /** skips a given suffix character sequence (string)
  85      * Operates on the trailing end of the buffer.
  86      *
  87      *  Note that Tokenizer cannot tell whether the buffer will
  88      *  gain more data when/if more input becomes available later.
  89      *
  90      * \return whether the exact character sequence was found and skipped
  91      */
  92     bool skipSuffix(const SBuf &tokenToSkip);
  93
  94     /** skips a given character sequence (string)
  95      *
  96      * \return whether the exact character sequence was found and skipped
  97      */
  98     bool skip(const SBuf &tokenToSkip);
  99
 100     /** skips a given single character
 101      *
 102      * \return whether the character was skipped
 103      */
 104     bool skip(const char tokenChar);
 105
 106     /** Skips a single character from the set.
 107      *
 108      * \return whether a character was skipped
 109      */
 110     bool skipOne(const CharacterSet &discardables);
 111
 112     /** Skips all sequential characters from the set, in any order.
 113      *
 114      * \returns the number of skipped characters
 115      */
 116     SBuf::size_type skipAll(const CharacterSet &discardables);
 117
 118     /** Removes a single trailing character from the set.
 119      *
 120      * \return whether a character was removed
 121      */
 122     bool skipOneTrailing(const CharacterSet &discardables);
 123
 124     /** Removes all sequential trailing characters from the set, in any order.
 125      *
 126      * \returns the number of characters removed
 127      */
 128     SBuf::size_type skipAllTrailing(const CharacterSet &discardables);
 129
 130     /** Extracts an unsigned int64_t at the beginning of the buffer.
 131      *
 132      * strtoll(3)-alike function: tries to parse unsigned 64-bit integer
 133      * at the beginning of the parse buffer, in the base specified by the user
 134      * or guesstimated; consumes the parsed characters.
 135      *
 136      * \param result Output value. Not touched if parsing is unsuccessful.
 137      * \param base   Specify base to do the parsing in, with the same restrictions
 138      *               as strtoll. Defaults to 0 (meaning guess)
 139      * \param allowSign Whether to accept a '+' or '-' sign prefix.
 140      * \param limit  Maximum count of characters to convert.
 141      *
 142      * \return whether the parsing was successful
 143      */
 144     bool int64(int64_t &result, int base = 0, bool allowSign = true, SBuf::size_type limit = SBuf::npos);
 145
 146 protected:
 147     SBuf consume(const SBuf::size_type n);
 148     SBuf::size_type success(const SBuf::size_type n);
 149     SBuf consumeTrailing(const SBuf::size_type n);
 150     SBuf::size_type successTrailing(const SBuf::size_type n);
 151
 152     /// reset the buffer and parsed stats to a saved checkpoint
 153     void undoParse(const SBuf &newBuf, SBuf::size_type cParsed) { buf_ = newBuf; parsed_ = cParsed; }
 154
 155 private:
 156     SBuf buf_; ///< yet unparsed input
 157     SBuf::size_type parsed_; ///< bytes successfully parsed, including skipped
 158 };
 159
 160 } /* namespace Parser */
 161
 162 #endif /* SQUID_PARSER_TOKENIZER_H_ */
 163