/*
 * Copyright (C) 1996-2023 The Squid Software Foundation and contributors
 *
 * Squid software is distributed under GPLv2+ license and includes
 * contributions from numerous individuals and organizations.
 * Please see the COPYING and CONTRIBUTORS files for details.
 */

/* DEBUG: section 24    SBuf */

#include "squid.h"
#include "debug/Stream.h"
#include "parser/forward.h"
#include "parser/Tokenizer.h"
#include "sbuf/Stream.h"

#include <cctype>
#include <cerrno>
/// convenience method: consumes up to n bytes, counts, and returns them
SBuf
Parser::Tokenizer::consume(const SBuf::size_type n)
{
    // careful: n may be npos!
    debugs(24, 5, "consuming " << n << " bytes");
    const SBuf result = buf_.consume(n);
    parsed_ += result.length();
    return result;
}
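
// Note on consume() above: SBuf::consume() caps its argument at the buffered
// length, so consume(SBuf::npos) simply takes (and counts) everything left.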

/// convenience method: consume()s up to n bytes and returns their count
SBuf::size_type
Parser::Tokenizer::success(const SBuf::size_type n)
{
    return consume(n).length();
}

/// convenience method: consumes up to n last bytes and returns them
SBuf
Parser::Tokenizer::consumeTrailing(const SBuf::size_type n)
{
    debugs(24, 5, "consuming " << n << " bytes");

    // If n is npos, we consume everything from buf_ (and nothing from result).
    const SBuf::size_type parsed = (n == SBuf::npos) ? buf_.length() : n;

    SBuf result = buf_;
    buf_ = result.consume(buf_.length() - parsed);
    parsed_ += parsed;
    return result;
}
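
// Worked example for consumeTrailing() (hypothetical values): with
// buf_ == "hello", consumeTrailing(3) returns "llo" and keeps "he":
// result starts as a copy of the whole buffer, and result.consume(2)
// moves the two leading bytes back into buf_.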

/// convenience method: consumes up to n last bytes and returns their count
SBuf::size_type
Parser::Tokenizer::successTrailing(const SBuf::size_type n)
{
    return consumeTrailing(n).length();
}

bool
Parser::Tokenizer::token(SBuf &returnedToken, const CharacterSet &delimiters)
{
    const Tokenizer saved(*this);
    skipAll(delimiters);
    const SBuf::size_type tokenLen = buf_.findFirstOf(delimiters); // npos means no terminating delimiter in buf_
    if (tokenLen == SBuf::npos) {
        debugs(24, 8, "no token found for delimiters " << delimiters.name);
        *this = saved;
        return false;
    }
    returnedToken = consume(tokenLen); // cannot be empty
    skipAll(delimiters);
    debugs(24, DBG_DATA, "token found for delimiters " << delimiters.name << ": '" <<
           returnedToken << '\'');
    return true;
}
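
// Usage sketch for token() (hypothetical): tokenizing " foo bar " with
// space-like delimiters (e.g., CharacterSet::WSP) yields "foo" and then
// "bar"; a third call returns false and restores the tokenizer because
// no delimiter-terminated token remains.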

bool
Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
{
    SBuf::size_type prefixLen = buf_.substr(0, limit).findFirstNotOf(tokenChars);
    if (prefixLen == 0) {
        debugs(24, 8, "no prefix for set " << tokenChars.name);
        return false;
    }
    if (prefixLen == SBuf::npos && (atEnd() || limit == 0)) {
        debugs(24, 8, "no char in set " << tokenChars.name << " while looking for prefix");
        return false;
    }
    if (prefixLen == SBuf::npos && limit > 0) {
        debugs(24, 8, "whole haystack matched");
        prefixLen = limit;
    }
    debugs(24, 8, "found with length " << prefixLen);
    returnedToken = consume(prefixLen); // cannot be empty after the npos check
    return true;
}
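
// Usage sketch for prefix() (hypothetical): with buf_ == "123abc",
// prefix(digits, CharacterSet::DIGIT) consumes "123" and leaves "abc"
// buffered; it returns false when the first byte is not in tokenChars.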

SBuf
Parser::Tokenizer::prefix(const char *description, const CharacterSet &tokenChars, const SBuf::size_type limit)
{
    if (atEnd())
        throw InsufficientInput();

    SBuf result;

    if (!prefix(result, tokenChars, limit))
        throw TexcHere(ToSBuf("cannot parse ", description));

    if (atEnd())
        throw InsufficientInput();

    return result;
}
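
// Note on the throwing prefix() overload above: the final atEnd() check
// requires a byte beyond the parsed prefix, because a token ending exactly
// at the buffer edge may be a truncated version of a longer token still
// arriving; the caller gets InsufficientInput instead of a partial result.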

bool
Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
{
    SBuf span = buf_;

    if (limit < buf_.length())
        span.consume(buf_.length() - limit); // ignore the N prefix characters

    // count token characters from the end of the span backwards
    auto i = span.rbegin();
    SBuf::size_type found = 0;
    while (i != span.rend() && tokenChars[*i]) {
        ++i;
        ++found;
    }
    if (!found)
        return false;
    returnedToken = consumeTrailing(found);
    return true;
}
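
// Usage sketch for suffix() (hypothetical): with buf_ == "filename.txt",
// suffix(ext, CharacterSet::ALPHA) consumes "txt" and leaves "filename."
// buffered.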

SBuf::size_type
Parser::Tokenizer::skipAll(const CharacterSet &tokenChars)
{
    const SBuf::size_type prefixLen = buf_.findFirstNotOf(tokenChars);
    if (prefixLen == 0) {
        debugs(24, 8, "no match when trying to skipAll " << tokenChars.name);
        return 0;
    }
    debugs(24, 8, "skipping all in " << tokenChars.name << " len " << prefixLen);
    return success(prefixLen);
}

void
Parser::Tokenizer::skipRequired(const char *description, const SBuf &tokenToSkip)
{
    if (skip(tokenToSkip) || tokenToSkip.isEmpty())
        return;

    // buf_ is a proper prefix of tokenToSkip: more input may complete the match
    if (tokenToSkip.startsWith(buf_))
        throw InsufficientInput();

    throw TextException(ToSBuf("cannot skip ", description), Here());
}

bool
Parser::Tokenizer::skipOne(const CharacterSet &chars)
{
    if (!buf_.isEmpty() && chars[buf_[0]]) {
        debugs(24, 8, "skipping one-of " << chars.name);
        return success(1);
    }
    debugs(24, 8, "no match while skipping one-of " << chars.name);
    return false;
}

bool
Parser::Tokenizer::skipSuffix(const SBuf &tokenToSkip)
{
    if (buf_.length() < tokenToSkip.length())
        return false;

    SBuf::size_type offset = 0;
    if (tokenToSkip.length() < buf_.length())
        offset = buf_.length() - tokenToSkip.length();

    if (buf_.substr(offset, SBuf::npos).cmp(tokenToSkip) == 0) {
        debugs(24, 8, "skipping " << tokenToSkip.length());
        return successTrailing(tokenToSkip.length());
    }
    return false;
}
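
// Usage sketch for skipSuffix() (hypothetical): with buf_ == "data\r\n",
// skipSuffix(SBuf("\r\n")) returns true and leaves "data" buffered.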

bool
Parser::Tokenizer::skip(const SBuf &tokenToSkip)
{
    if (buf_.startsWith(tokenToSkip)) {
        debugs(24, 8, "skipping " << tokenToSkip.length());
        return success(tokenToSkip.length());
    }
    debugs(24, 8, "no match, not skipping '" << tokenToSkip << '\'');
    return false;
}

bool
Parser::Tokenizer::skip(const char tokenChar)
{
    if (!buf_.isEmpty() && buf_[0] == tokenChar) {
        debugs(24, 8, "skipping char '" << tokenChar << '\'');
        return success(1);
    }
    debugs(24, 8, "no match, not skipping char '" << tokenChar << '\'');
    return false;
}

bool
Parser::Tokenizer::skipOneTrailing(const CharacterSet &skippable)
{
    if (!buf_.isEmpty() && skippable[buf_[buf_.length()-1]]) {
        debugs(24, 8, "skipping one-of " << skippable.name);
        return successTrailing(1);
    }
    debugs(24, 8, "no match while skipping one-of " << skippable.name);
    return false;
}

SBuf::size_type
Parser::Tokenizer::skipAllTrailing(const CharacterSet &skippable)
{
    // the last byte not in skippable ends the kept prefix;
    // npos means the entire buffer is skippable
    const SBuf::size_type prefixEnd = buf_.findLastNotOf(skippable);
    const SBuf::size_type prefixLen = prefixEnd == SBuf::npos ?
                                      0 : (prefixEnd + 1);
    const SBuf::size_type suffixLen = buf_.length() - prefixLen;
    if (suffixLen == 0) {
        debugs(24, 8, "no match when trying to skip " << skippable.name);
        return 0;
    }
    debugs(24, 8, "skipping in " << skippable.name << " len " << suffixLen);
    return successTrailing(suffixLen);
}
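
// Usage sketch for skipAllTrailing() (hypothetical): with buf_ == "value   ",
// skipAllTrailing(CharacterSet::WSP) returns 3 and leaves "value" buffered.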

/* reworked from compat/strtoll.c */
bool
Parser::Tokenizer::int64(int64_t &result, int base, bool allowSign, const SBuf::size_type limit)
{
    if (atEnd() || limit == 0)
        return false;

    const SBuf range(buf_.substr(0, limit));

    // XXX: account for buf_.size()
    bool neg = false;
    const char *s = range.rawContent();
    const char *end = range.rawContent() + range.length();

    if (allowSign) {
        if (*s == '-') {
            neg = true;
            ++s;
        } else if (*s == '+') {
            ++s;
        }
        if (s >= end) return false;
    }
    if ((base == 0 || base == 16) && *s == '0' && (s+1 < end) &&
            tolower(*(s+1)) == 'x') {
        s += 2;
        base = 16;
    }
    if (base == 0) {
        if (*s == '0') {
            base = 8;
            ++s;
        } else {
            base = 10;
        }
    }
    if (s >= end) return false;

    // to catch overflow before it happens, compute the largest accumulator
    // value (cutoff) and final digit (cutlim) that can still be accepted
    uint64_t cutoff = neg ? -static_cast<uint64_t>(INT64_MIN) : INT64_MAX;
    const int cutlim = cutoff % static_cast<int64_t>(base);
    cutoff /= static_cast<uint64_t>(base);

    int any = 0, c;
    int64_t acc = 0;
    do {
        c = *s;
        if (xisdigit(c)) {
            c -= '0';
        } else if (xisalpha(c)) {
            c -= xisupper(c) ? 'A' - 10 : 'a' - 10;
        } else {
            break;
        }
        if (c >= base)
            break;
        if (any < 0 || static_cast<uint64_t>(acc) > cutoff || (static_cast<uint64_t>(acc) == cutoff && c > cutlim))
            any = -1; // overflow: remember it, but keep scanning the digits
        else {
            any = 1;
            acc *= base;
            acc += c;
        }
    } while (++s < end);

    if (any == 0) // nothing was parsed
        return false;
    if (any < 0) { // the accumulated value overflowed int64_t
        acc = neg ? INT64_MIN : INT64_MAX;
        errno = ERANGE;
        return false;
    } else if (neg)
        acc = -acc;

    result = acc;
    return success(s - range.rawContent());
}
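
// Worked example for int64() (hypothetical inputs): with base == 0, "0x1a"
// parses as hexadecimal 26 and "017" as octal 15; a decimal just above
// INT64_MAX, such as "9223372036854775808", trips the cutoff/cutlim test,
// sets errno to ERANGE, and fails without consuming anything.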

int64_t
Parser::Tokenizer::udec64(const char *description, const SBuf::size_type limit)
{
    if (atEnd())
        throw InsufficientInput();

    int64_t result = 0;

    // Since we only support unsigned decimals, a parsing failure with a
    // non-empty input always implies invalid/malformed input (or a buggy
    // limit=0 caller). TODO: Support signed and non-decimal integers by
    // refactoring int64() to detect insufficient input.
    if (!int64(result, 10, false, limit))
        throw TexcHere(ToSBuf("cannot parse ", description));

    if (atEnd())
        throw InsufficientInput(); // more digits may be coming

    return result;
}
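
// Usage sketch for udec64() (hypothetical): on "1234 " it returns 1234 and
// leaves " " buffered; on a bare "1234" it throws InsufficientInput because
// the next read could still append more digits.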