]> git.ipfire.org Git - thirdparty/squid.git/blame_incremental - src/parser/Tokenizer.h
Simplify appending SBuf to String (#2108)
[thirdparty/squid.git] / src / parser / Tokenizer.h
... / ...
CommitLineData
1/*
2 * Copyright (C) 1996-2025 The Squid Software Foundation and contributors
3 *
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
7 */
8
9#ifndef SQUID_SRC_PARSER_TOKENIZER_H
10#define SQUID_SRC_PARSER_TOKENIZER_H
11
12#include "base/CharacterSet.h"
13#include "sbuf/SBuf.h"
14
15/// Generic protocol-agnostic parsing tools
16namespace Parser
17{
18
19/**
20 * Lexical processor to tokenize a buffer.
21 *
22 * Allows arbitrary delimiters and token character sets to
23 * be provided by callers.
24 *
25 * Methods start from the beginning of the input buffer, except for the suffix and *Trailing methods, which are documented as operating on its trailing end.
26 * Methods returning true consume bytes from the buffer.
27 * Methods returning false have no side-effects.
28 */
29class Tokenizer
30{
31public:
32 explicit Tokenizer(const SBuf &inBuf) : buf_(inBuf), parsed_(0) {} ///< stores inBuf as the input to parse; parsedSize() starts at zero
33
34 /// yet unparsed data, returned by value (remaining() returns the same data by reference)
35 SBuf buf() const { return buf_; }
36
37 /// number of parsed bytes, including skipped ones
38 SBuf::size_type parsedSize() const { return parsed_; }
39
40 /// whether the yet unparsed data is empty, i.e. the end of the buffer has been reached
41 bool atEnd() const { return buf_.isEmpty(); }
42
43 /// the remaining unprocessed section of buffer, by const reference (same data as buf())
44 const SBuf& remaining() const { return buf_; }
45
46 /// reinitialize processing for a new buffer; also resets parsedSize() to zero
47 void reset(const SBuf &newBuf) { undoParse(newBuf, 0); }
48
49 /** Basic strtok(3):
50 * Skips all leading delimiters (if any),
51 * extracts all characters up to the next delimiter (a token), and
52 * skips all trailing delimiters (at least one must be present).
53 *
54 * Want to extract delimiters? Use prefix() instead.
55 *
56 * Note that Tokenizer cannot tell whether the trailing delimiters will
57 * continue when/if more input data becomes available later.
58 *
59 * \return true if found a non-empty token followed by a delimiter
60 */
61 bool token(SBuf &returnedToken, const CharacterSet &delimiters);
62
63 /** Extracts all sequential permitted characters up to an optional length limit.
64 *
65 * Note that Tokenizer cannot tell whether the prefix will
66 * continue when/if more input data becomes available later.
67 *
68 * \retval true one or more characters were found, the sequence (string) is placed in returnedToken
69 * \retval false no characters from the permitted set were found
70 */
71 bool prefix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit = SBuf::npos);
72
73 /** Extracts all sequential permitted characters up to an optional length limit.
74 * Operates on the trailing end of the buffer.
75 *
76 * Note that Tokenizer cannot tell whether the buffer will
77 * gain more data when/if more input becomes available later.
78 *
79 * \retval true one or more characters were found, the sequence (string) is placed in returnedToken
80 * \retval false no characters from the permitted set were found
81 */
82 bool suffix(SBuf &returnedToken, const CharacterSet &tokenChars, SBuf::size_type limit = SBuf::npos);
83
84 /** Skips a given suffix character sequence (string).
85 * Operates on the trailing end of the buffer.
86 *
87 * Note that Tokenizer cannot tell whether the buffer will
88 * gain more data when/if more input becomes available later.
89 *
90 * \return whether the exact character sequence was found and skipped
91 */
92 bool skipSuffix(const SBuf &tokenToSkip);
93
94 /** Skips a given character sequence (string).
95 *
96 * \return whether the exact character sequence was found and skipped
97 */
98 bool skip(const SBuf &tokenToSkip);
99
100 /** Skips a given single character.
101 *
102 * \return whether the character was skipped
103 */
104 bool skip(const char tokenChar);
105
106 /** Skips a single character from the set.
107 *
108 * \return whether a character was skipped
109 */
110 bool skipOne(const CharacterSet &discardables);
111
112 /** Skips all sequential characters from the set, in any order.
113 *
114 * \return the number of skipped characters
115 */
116 SBuf::size_type skipAll(const CharacterSet &discardables);
117
118 /** Skips a given character sequence (string);
119 * does nothing if the sequence is empty.
120 *
121 * \param description names the skipped field; NOTE(review): presumably used for error reporting, as for the throwing wrappers below -- confirm
122 * \throws exception on mismatching prefix or InsufficientInput
123 */
124 void skipRequired(const char *description, const SBuf &tokenToSkip);
125
126 /** Removes a single trailing character from the set.
127 *
128 * \return whether a character was removed
129 */
130 bool skipOneTrailing(const CharacterSet &discardables);
131
132 /** Removes all sequential trailing characters from the set, in any order.
133 *
134 * \return the number of characters removed
135 */
136 SBuf::size_type skipAllTrailing(const CharacterSet &discardables);
137
138 /** Extracts an int64_t (optionally signed, see allowSign) at the beginning of the buffer.
139 *
140 * strtoll(3)-alike function: tries to parse a 64-bit integer
141 * at the beginning of the parse buffer, in the base specified by the user
142 * or guesstimated; consumes the parsed characters.
143 *
144 * \param result Output value. Not touched if parsing is unsuccessful.
145 * \param base Specify base to do the parsing in, with the same restrictions
146 * as strtoll. Defaults to 0 (meaning guess)
147 * \param allowSign Whether to accept a '+' or '-' sign prefix. Defaults to true.
148 * \param limit Maximum count of characters to convert. Defaults to SBuf::npos.
149 *
150 * \return whether the parsing was successful
151 */
152 bool int64(int64_t &result, int base = 0, bool allowSign = true, SBuf::size_type limit = SBuf::npos);
153
154 /*
155 * The methods below mimic their counterparts documented above, but they
156 * throw on errors, including InsufficientInput. The field description
157 * parameter is used for error reporting and debugging.
158 */
159
160 /// prefix() wrapper but throws InsufficientInput if input contains
161 /// nothing but the prefix (i.e. if the prefix is not "terminated")
162 SBuf prefix(const char *description, const CharacterSet &tokenChars, SBuf::size_type limit = SBuf::npos);
163
164 /// int64() wrapper but limited to unsigned decimal integers (for now)
165 int64_t udec64(const char *description, SBuf::size_type limit = SBuf::npos);
166
167protected:
168 SBuf consume(const SBuf::size_type n); ///< NOTE(review): defined elsewhere; presumably removes and returns the first n bytes of buf_ -- confirm
169 SBuf::size_type success(const SBuf::size_type n); ///< NOTE(review): defined elsewhere; presumably records n leading bytes as parsed and returns n -- confirm
170 SBuf consumeTrailing(const SBuf::size_type n); ///< NOTE(review): defined elsewhere; presumably the trailing-end counterpart of consume() -- confirm
171 SBuf::size_type successTrailing(const SBuf::size_type n); ///< NOTE(review): defined elsewhere; presumably the trailing-end counterpart of success() -- confirm
172
173 /// reset the buffer and parsed stats to a saved checkpoint: replaces buf_ with newBuf and parsed_ with cParsed
174 void undoParse(const SBuf &newBuf, SBuf::size_type cParsed) { buf_ = newBuf; parsed_ = cParsed; }
175
176private:
177 SBuf buf_; ///< yet unparsed input
178 SBuf::size_type parsed_; ///< bytes successfully parsed, including skipped
179};
180
181} /* namespace Parser */
181
182#endif /* SQUID_SRC_PARSER_TOKENIZER_H */
183