]> git.ipfire.org Git - thirdparty/squid.git/blob - src/http/one/RequestParser.cc
RFC 7230 compliant request-line parser based on Tokenizer API
[thirdparty/squid.git] / src / http / one / RequestParser.cc
1 /*
2 * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
3 *
4 * Squid software is distributed under GPLv2+ license and includes
5 * contributions from numerous individuals and organizations.
6 * Please see the COPYING and CONTRIBUTORS files for details.
7 */
8
9 #include "squid.h"
10 #include "Debug.h"
11 #include "http/one/RequestParser.h"
12 #include "http/ProtocolVersion.h"
13 #include "mime_header.h"
14 #include "parser/Tokenizer.h"
15 #include "profiler/Profiler.h"
16 #include "SquidConfig.h"
17
18 Http::One::RequestParser::RequestParser() :
19 Parser(),
20 request_parse_status(Http::scNone),
21 firstLineGarbage_(0)
22 {}
23
24 Http1::Parser::size_type
25 Http::One::RequestParser::firstLineSize() const
26 {
27 // RFC 7230 section 2.6
28 /* method SP request-target SP "HTTP/" DIGIT "." DIGIT CRLF */
29 return method_.image().length() + uri_.length() + 12;
30 }
31
32 /**
33 * Attempt to parse the first line of a new request message.
34 *
35 * Governed by RFC 7230 section 3.5
36 * "
37 * In the interest of robustness, a server that is expecting to receive
38 * and parse a request-line SHOULD ignore at least one empty line (CRLF)
39 * received prior to the request-line.
40 * "
41 *
42 * Parsing state is stored between calls to avoid repeating buffer scans.
43 * If garbage is found the parsing offset is incremented.
44 */
45 void
46 Http::One::RequestParser::skipGarbageLines()
47 {
48 if (Config.onoff.relaxed_header_parser) {
49 if (Config.onoff.relaxed_header_parser < 0 && (buf_[0] == '\r' || buf_[0] == '\n'))
50 debugs(74, DBG_IMPORTANT, "WARNING: Invalid HTTP Request: " <<
51 "CRLF bytes received ahead of request-line. " <<
52 "Ignored due to relaxed_header_parser.");
53 // Be tolerant of prefix empty lines
54 // ie any series of either \n or \r\n with no other characters and no repeated \r
55 while (!buf_.isEmpty() && (buf_[0] == '\n' || (buf_[0] == '\r' && buf_[1] == '\n'))) {
56 buf_.consume(1);
57 }
58 }
59 }
60
61 /// detect and skip the CRLF or LF line terminator
62 /// consume from the tokenizer and return true only if found
63 bool
64 Http::One::RequestParser::skipLineTerminator(::Parser::Tokenizer &tok) const
65 {
66 static const SBuf crlf("\r\n");
67 if (tok.skip(crlf))
68 return true;
69
70 if (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF))
71 return true;
72
73 return false;
74 }
75
76 /**
77 * Attempt to parse the method field out of an HTTP message request-line.
78 *
79 * Governed by:
80 * RFC 1945 section 5.1
81 * RFC 7230 section 2.6, 3.1 and 3.5
82 *
83 * Parsing state is stored between calls. The current implementation uses
84 * checkpoints after each successful request-line field.
85 * The return value tells you whether the parsing is completed or not.
86 *
87 * \retval -1 an error occurred. request_parse_status indicates HTTP status result.
88 * \retval 1 successful parse. method_ is filled and buffer consumed including first delimiter.
89 * \retval 0 more data is needed to complete the parse
90 */
91 int
92 Http::One::RequestParser::parseMethodField(::Parser::Tokenizer &tok, const CharacterSet &WspDelim)
93 {
94 // scan for up to 16 valid method characters.
95 static const size_t maxMethodLength = 16;
96
97 SBuf methodFound;
98
99 // method field is a sequence of TCHAR.
100 // NP: prefix-with-limit returns true if it finds ANY valid chars
101 if (!tok.prefix(methodFound, CharacterSet::TCHAR, maxMethodLength)) {
102 // missing/invalid 'method'.
103 request_parse_status = Http::scBadRequest;
104 debugs(33, 5, "invalid request-line. missing method");
105 return -1;
106 }
107
108 // we may be at the end if we found exactly maxMethodLength bytes
109 if (tok.atEnd()) {
110 debugs(74, 5, "Parser needs more data to find method");
111 return 0;
112 }
113
114 // ... followed by at least one whitespace character.
115 if (!tok.skipOne(WspDelim)) {
116 // non-delimiter found after accepted method bytes means ...
117 if (methodFound.length() == maxMethodLength) {
118 // method longer than acceptible.
119 // RFC 7230 section 3.1.1 mandatory (SHOULD) 501 response
120 request_parse_status = Http::scNotImplemented;
121 debugs(33, 5, "invalid request-line. method too long");
122 } else {
123 // invalid character in the URL
124 // RFC 7230 section 3.1.1 required (SHOULD) 400 response
125 request_parse_status = Http::scBadRequest;
126 debugs(33, 5, "invalid request-line. missing method delimiter");
127 }
128 return -1;
129 }
130 method_ = HttpRequestMethod(methodFound);
131 buf_ = tok.remaining(); // incremental parse checkpoint
132 return 1;
133 }
134
135 int
136 Http::One::RequestParser::parseUriField(::Parser::Tokenizer &tok, const CharacterSet &WspDelim)
137 {
138 // URI field is a sequence of ... what? segments all have different valid charset
139 // go with non-whitespace non-binary characters for now
140 static CharacterSet UriChars("URI-Chars","");
141 if (!UriChars['a']) { // if it needs initializing...
142 /* RFC 3986 section 2:
143 * "
144 * A URI is composed from a limited set of characters consisting of
145 * digits, letters, and a few graphic symbols.
146 * "
147 */
148 // RFC 3986 section 2.1 - percent encoding "%" HEXDIG
149 UriChars.add('%');
150 UriChars += CharacterSet::HEXDIG;
151 // RFC 3986 section 2.2 - reserved characters
152 UriChars += CharacterSet("gen-delims", ":/?#[]@");
153 UriChars += CharacterSet("sub-delims", "!$&'()*+,;=");
154 // RFC 3986 section 2.3 - unreserved characters
155 UriChars += CharacterSet::ALPHA;
156 UriChars += CharacterSet::DIGIT;
157 UriChars += CharacterSet("unreserved", "-._~");
158 }
159
160 /* Arbitrary 64KB URI upper length limit.
161 *
162 * Not quite as arbitrary as it seems though. Old SquidString objects
163 * cannot store strings larger than 64KB, so we must limit until they
164 * have all been replaced with SBuf.
165 *
166 * Not that it matters but RFC 7230 section 3.1.1 requires (RECOMMENDED)
167 * at least 8000 octets for the whole line, including method and version.
168 */
169 const size_t maxUriLength = min(static_cast<size_t>(Config.maxRequestHeaderSize) - firstLineSize(),
170 static_cast<size_t>((64*1024)-1));
171
172 SBuf uriFound;
173 // NP: prefix-with-limit returns true if it finds ANY valid chars
174 if (!tok.prefix(uriFound, UriChars, maxUriLength)) {
175 // else did not find any valid TCHAR
176 debugs(33, 5, "invalid request-line. missing URL");
177 request_parse_status = Http::scBadRequest;
178 return -1;
179 }
180
181 // we may be at the end if we found exactly maxUriLength bytes
182 if (tok.atEnd()) {
183 debugs(74, 5, "Parser needs more data to find URI");
184 return 0;
185 }
186
187 // RFC 1945 - for GET the line terminator may follow URL instead of a delimiter
188 if (method_ == Http::METHOD_GET && skipLineTerminator(tok)) {
189 debugs(33, 5, "HTTP/0.9 syntax request-line detected");
190 msgProtocol_ = Http::ProtocolVersion(0,9);
191 uri_ = uriFound;
192 request_parse_status = Http::scOkay;
193 buf_ = tok.remaining(); // incremental parse checkpoint
194 return 1;
195 }
196
197 // ... followed by at least one whitespace character.
198 if (!tok.skipOne(WspDelim)) {
199 // non-delimiter found after accepted URL bytes means ...
200 if (uriFound.length() == maxUriLength) {
201 // URL longer than acceptible.
202 // RFC 7230 section 3.1.1 mandatory (MUST) 414 response
203 request_parse_status = Http::scUriTooLong;
204 debugs(33, 5, "invalid request-line. URI longer than " << maxUriLength << " bytes");
205 return -1;
206 } else {
207 // invalid non-delimiter character ended the URL
208 // RFC 7230 section 3.1.1 required (SHOULD) 400 response
209 request_parse_status = Http::scBadRequest;
210 debugs(33, 5, "invalid request-line. missing URI delimiter");
211 return -1;
212 }
213 }
214 uri_ = uriFound;
215 buf_ = tok.remaining(); // incremental parse checkpoint
216 return 1;
217 }
218
219 int
220 Http::One::RequestParser::parseHttpVersionField(::Parser::Tokenizer &tok)
221 {
222 // partial match of HTTP/1 magic prefix
223 if (tok.remaining().length() < Http1magic.length() && Http1magic.startsWith(tok.remaining())) {
224 debugs(74, 5, "Parser needs more data to find version");
225 return 0;
226 }
227
228 if (!tok.skip(Http1magic)) {
229 debugs(74, 5, "invalid request-line. not HTTP/1 protocol");
230 request_parse_status = Http::scHttpVersionNotSupported;
231 return -1;
232 }
233
234 if (tok.atEnd()) {
235 debugs(74, 5, "Parser needs more data to find version");
236 return 0;
237 }
238
239 // get the version minor DIGIT
240 SBuf digit;
241 if (!tok.prefix(digit, CharacterSet::DIGIT, 1)) {
242 // non-DIGIT. invalid version number.
243 request_parse_status = Http::scHttpVersionNotSupported;
244 debugs(33, 5, "invalid request-line. non-numeric or too-large HTTP minor version");
245 return -1;
246 }
247
248 if (tok.atEnd()) {
249 debugs(74, 5, "Parser needs more data to find version");
250 return 0;
251 }
252
253 // version is always followed by the terminator
254 if (!skipLineTerminator(tok)) {
255 if (tok.skipOne(CharacterSet::CR) && tok.atEnd()) {
256 debugs(74, 5, "Parser needs more data to find version");
257 return 0;
258 }
259 request_parse_status = Http::scHttpVersionNotSupported;
260 debugs(33, 5, "invalid request-line. garabge before line terminator");
261 return -1;
262 }
263
264 // found version fully AND terminator
265 msgProtocol_ = Http::ProtocolVersion(1, (*digit.rawContent() - '0'));
266 request_parse_status = Http::scOkay;
267 buf_ = tok.remaining(); // incremental parse checkpoint
268 return 1;
269 }
270
271 /**
272 * Attempt to parse the first line of a new request message.
273 *
274 * Governed by:
275 * RFC 1945 section 5.1
276 * RFC 7230 section 2.6, 3.1 and 3.5
277 *
278 * Parsing state is stored between calls. The current implementation uses
279 * checkpoints after each successful request-line field.
280 * The return value tells you whether the parsing is completed or not.
281 *
282 * \retval -1 an error occurred. request_parse_status indicates HTTP status result.
283 * \retval 1 successful parse. member fields contain the request-line items
284 * \retval 0 more data is needed to complete the parse
285 */
286 int
287 Http::One::RequestParser::parseRequestFirstLine()
288 {
289 ::Parser::Tokenizer tok(buf_);
290
291 debugs(74, 5, "parsing possible request: buf.length=" << buf_.length());
292 debugs(74, DBG_DATA, buf_);
293
294 CharacterSet WspDelim = CharacterSet::SP; // strict parse only accepts SP
295
296 if (Config.onoff.relaxed_header_parser) {
297 // RFC 7230 section 3.5
298 // tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C), or bare CR
299 // as whitespace between request-line fields
300 WspDelim += CharacterSet::HTAB
301 + CharacterSet("VT,FF","\x0B\x0C")
302 + CharacterSet::CR;
303 }
304
305 // only search for method if we have not yet found one
306 if (method_ == Http::METHOD_NONE) {
307 const int res = parseMethodField(tok, WspDelim);
308 if (res < 1)
309 return res;
310 // else keep going...
311 }
312
313 // tolerant parser allows multiple whitespace characters between fields
314 if (Config.onoff.relaxed_header_parser) {
315 const size_t garbage = tok.skipAll(WspDelim);
316 if (garbage > 0) {
317 firstLineGarbage_ += garbage;
318 buf_ = tok.remaining(); // re-checkpoint after garbage
319 }
320 }
321 if (tok.atEnd()) {
322 debugs(74, 5, "Parser needs more data");
323 return 0;
324 }
325
326 // only search for request-target (URL) if we have not yet found one
327 if (uri_.isEmpty()) {
328 const int res = parseUriField(tok, WspDelim);
329 if (res < 1 || msgProtocol_.protocol == AnyP::PROTO_HTTP)
330 return res;
331 // else keep going...
332 }
333
334 // tolerant parser allows multiple whitespace characters between fields
335 if (Config.onoff.relaxed_header_parser) {
336 const size_t garbage = tok.skipAll(WspDelim);
337 if (garbage > 0) {
338 firstLineGarbage_ += garbage;
339 buf_ = tok.remaining(); // re-checkpoint after garbage
340 }
341 }
342 if (tok.atEnd()) {
343 debugs(74, 5, "Parser needs more data");
344 return 0;
345 }
346
347 // HTTP/1 version suffix (protocol magic) followed by CR*LF
348 if (msgProtocol_.protocol == AnyP::PROTO_NONE) {
349 return parseHttpVersionField(tok);
350 }
351
352 // If we got here this method has been called too many times
353 request_parse_status = Http::scInternalServerError;
354 debugs(33, 5, "ERROR: Parser already processed request-line");
355 return -1;
356 }
357
358 bool
359 Http::One::RequestParser::parse(const SBuf &aBuf)
360 {
361 buf_ = aBuf;
362 debugs(74, DBG_DATA, "Parse buf={length=" << aBuf.length() << ", data='" << aBuf << "'}");
363
364 // stage 1: locate the request-line
365 if (parsingStage_ == HTTP_PARSE_NONE) {
366 skipGarbageLines();
367
368 // if we hit something before EOS treat it as a message
369 if (!buf_.isEmpty())
370 parsingStage_ = HTTP_PARSE_FIRST;
371 else
372 return false;
373 }
374
375 // stage 2: parse the request-line
376 if (parsingStage_ == HTTP_PARSE_FIRST) {
377 PROF_start(HttpParserParseReqLine);
378 const int retcode = parseRequestFirstLine();
379
380 // first-line (or a look-alike) found successfully.
381 if (retcode > 0) {
382 parsingStage_ = HTTP_PARSE_MIME;
383 }
384
385 debugs(74, 5, "request-line: retval " << retcode << ": line={" << aBuf.length() << ", data='" << aBuf << "'}");
386 debugs(74, 5, "request-line: method: " << method_);
387 debugs(74, 5, "request-line: url: " << uri_);
388 debugs(74, 5, "request-line: proto: " << msgProtocol_);
389 debugs(74, 5, "Parser: bytes processed=" << (aBuf.length()-buf_.length()));
390 PROF_stop(HttpParserParseReqLine);
391
392 // syntax errors already
393 if (retcode < 0) {
394 parsingStage_ = HTTP_PARSE_DONE;
395 return false;
396 }
397 }
398
399 // stage 3: locate the mime header block
400 if (parsingStage_ == HTTP_PARSE_MIME) {
401 // HTTP/1.x request-line is valid and parsing completed.
402 if (msgProtocol_.major == 1) {
403 /* NOTE: HTTP/0.9 requests do not have a mime header block.
404 * So the rest of the code will need to deal with '0'-byte headers
405 * (ie, none, so don't try parsing em)
406 */
407 int64_t mimeHeaderBytes = 0;
408 // XXX: c_str() reallocates. performance regression.
409 if ((mimeHeaderBytes = headersEnd(buf_.c_str(), buf_.length())) == 0) {
410 if (buf_.length()+firstLineSize() >= Config.maxRequestHeaderSize) {
411 debugs(33, 5, "Too large request");
412 request_parse_status = Http::scRequestHeaderFieldsTooLarge;
413 parsingStage_ = HTTP_PARSE_DONE;
414 } else
415 debugs(33, 5, "Incomplete request, waiting for end of headers");
416 return false;
417 }
418 mimeHeaderBlock_ = buf_.consume(mimeHeaderBytes);
419 debugs(74, 5, "mime header (0-" << mimeHeaderBytes << ") {" << mimeHeaderBlock_ << "}");
420
421 } else
422 debugs(33, 3, "Missing HTTP/1.x identifier");
423
424 // NP: we do not do any further stages here yet so go straight to DONE
425 parsingStage_ = HTTP_PARSE_DONE;
426
427 // Squid could handle these headers, but admin does not want to
428 if (messageHeaderSize() >= Config.maxRequestHeaderSize) {
429 debugs(33, 5, "Too large request");
430 request_parse_status = Http::scRequestHeaderFieldsTooLarge;
431 return false;
432 }
433 }
434
435 return !needsMoreData();
436 }
437