/*
 * Copyright (C) 1996-2017 The Squid Software Foundation and contributors
 *
 * Squid software is distributed under GPLv2+ license and includes
 * contributions from numerous individuals and organizations.
 * Please see the COPYING and CONTRIBUTORS files for details.
 */
11 #include "http/one/RequestParser.h"
12 #include "http/one/Tokenizer.h"
13 #include "http/ProtocolVersion.h"
14 #include "profiler/Profiler.h"
15 #include "SquidConfig.h"
17 // the right debugs() level for parsing errors
20 return Config
.onoff
.relaxed_header_parser
< 0 ? DBG_IMPORTANT
: 5;
23 Http::One::RequestParser::RequestParser(bool preserveParsed
) :
25 preserveParsed_(preserveParsed
)
28 Http1::Parser::size_type
29 Http::One::RequestParser::firstLineSize() const
31 // RFC 7230 section 2.6
32 /* method SP request-target SP "HTTP/" DIGIT "." DIGIT CRLF */
33 return method_
.image().length() + uri_
.length() + 12;
37 * Attempt to parse the first line of a new request message.
39 * Governed by RFC 7230 section 3.5
41 * In the interest of robustness, a server that is expecting to receive
42 * and parse a request-line SHOULD ignore at least one empty line (CRLF)
43 * received prior to the request-line.
46 * Parsing state is stored between calls to avoid repeating buffer scans.
47 * If garbage is found the parsing offset is incremented.
50 Http::One::RequestParser::skipGarbageLines()
52 if (Config
.onoff
.relaxed_header_parser
) {
53 if (Config
.onoff
.relaxed_header_parser
< 0 && (buf_
[0] == '\r' || buf_
[0] == '\n'))
54 debugs(74, DBG_IMPORTANT
, "WARNING: Invalid HTTP Request: " <<
55 "CRLF bytes received ahead of request-line. " <<
56 "Ignored due to relaxed_header_parser.");
57 // Be tolerant of prefix empty lines
58 // ie any series of either \n or \r\n with no other characters and no repeated \r
59 while (!buf_
.isEmpty() && (buf_
[0] == '\n' || (buf_
[0] == '\r' && buf_
[1] == '\n'))) {
66 * Attempt to parse the method field out of an HTTP message request-line.
69 * RFC 1945 section 5.1
70 * RFC 7230 section 2.6, 3.1 and 3.5
73 Http::One::RequestParser::parseMethodField(Http1::Tokenizer
&tok
)
75 // method field is a sequence of TCHAR.
76 // Limit to 32 characters to prevent overly long sequences of non-HTTP
77 // being sucked in before mismatch is detected. 32 is itself annoyingly
78 // big but there are methods registered by IANA that reach 17 bytes:
79 // http://www.iana.org/assignments/http-methods
80 static const size_t maxMethodLength
= 32; // TODO: make this configurable?
83 if (!tok
.prefix(methodFound
, CharacterSet::TCHAR
, maxMethodLength
)) {
84 debugs(33, ErrorLevel(), "invalid request-line: missing or malformed method");
85 parseStatusCode
= Http::scBadRequest
;
88 method_
= HttpRequestMethod(methodFound
);
90 if (!skipDelimiter(tok
.skipAll(DelimiterCharacters()), "after method"))
96 /// the characters which truly are valid within URI
97 static const CharacterSet
&
100 /* RFC 3986 section 2:
102 * A URI is composed from a limited set of characters consisting of
103 * digits, letters, and a few graphic symbols.
106 static const CharacterSet UriChars
=
107 CharacterSet("URI-Chars","") +
108 // RFC 3986 section 2.2 - reserved characters
109 CharacterSet("gen-delims", ":/?#[]@") +
110 CharacterSet("sub-delims", "!$&'()*+,;=") +
111 // RFC 3986 section 2.3 - unreserved characters
112 CharacterSet::ALPHA
+
113 CharacterSet::DIGIT
+
114 CharacterSet("unreserved", "-._~") +
115 // RFC 3986 section 2.1 - percent encoding "%" HEXDIG
116 CharacterSet("pct-encoded", "%") +
117 CharacterSet::HEXDIG
;
122 /// characters which Squid will accept in the HTTP request-target (URI)
124 Http::One::RequestParser::RequestTargetCharacters()
126 if (Config
.onoff
.relaxed_header_parser
) {
127 #if USE_HTTP_VIOLATIONS
128 static const CharacterSet RelaxedExtended
=
129 UriValidCharacters() +
130 // accept whitespace (extended), it will be dealt with later
131 DelimiterCharacters() +
132 // RFC 2396 unwise character set which must never be transmitted
133 // in un-escaped form. But many web services do anyway.
134 CharacterSet("RFC2396-unwise","\"\\|^<>`{}") +
135 // UTF-8 because we want to be future-proof
136 CharacterSet("UTF-8", 128, 255);
138 return RelaxedExtended
;
140 static const CharacterSet RelaxedCompliant
=
141 UriValidCharacters() +
142 // accept whitespace (extended), it will be dealt with later.
143 DelimiterCharacters();
145 return RelaxedCompliant
;
149 // strict parse only accepts what the RFC say we can
150 return UriValidCharacters();
154 Http::One::RequestParser::parseUriField(Http1::Tokenizer
&tok
)
156 /* Arbitrary 64KB URI upper length limit.
158 * Not quite as arbitrary as it seems though. Old SquidString objects
159 * cannot store strings larger than 64KB, so we must limit until they
160 * have all been replaced with SBuf.
162 * Not that it matters but RFC 7230 section 3.1.1 requires (RECOMMENDED)
163 * at least 8000 octets for the whole line, including method and version.
165 const size_t maxUriLength
= static_cast<size_t>((64*1024)-1);
168 if (!tok
.prefix(uriFound
, RequestTargetCharacters())) {
169 parseStatusCode
= Http::scBadRequest
;
170 debugs(33, ErrorLevel(), "invalid request-line: missing or malformed URI");
174 if (uriFound
.length() > maxUriLength
) {
175 // RFC 7230 section 3.1.1 mandatory (MUST) 414 response
176 parseStatusCode
= Http::scUriTooLong
;
177 debugs(33, ErrorLevel(), "invalid request-line: " << uriFound
.length() <<
178 "-byte URI exceeds " << maxUriLength
<< "-byte limit");
187 Http::One::RequestParser::parseHttpVersionField(Http1::Tokenizer
&tok
)
189 static const SBuf
http1p0("HTTP/1.0");
190 static const SBuf
http1p1("HTTP/1.1");
191 const auto savedTok
= tok
;
193 // Optimization: Expect (and quickly parse) HTTP/1.1 or HTTP/1.0 in
194 // the vast majority of cases.
195 if (tok
.skipSuffix(http1p1
)) {
196 msgProtocol_
= Http::ProtocolVersion(1, 1);
198 } else if (tok
.skipSuffix(http1p0
)) {
199 msgProtocol_
= Http::ProtocolVersion(1, 0);
202 // RFC 7230 section 2.6:
203 // HTTP-version = HTTP-name "/" DIGIT "." DIGIT
204 static const CharacterSet
period("Decimal point", ".");
205 static const SBuf
proto("HTTP/");
208 if (tok
.suffix(minorDigit
, CharacterSet::DIGIT
) &&
209 tok
.skipOneTrailing(period
) &&
210 tok
.suffix(majorDigit
, CharacterSet::DIGIT
) &&
211 tok
.skipSuffix(proto
)) {
212 const bool multiDigits
= majorDigit
.length() > 1 || minorDigit
.length() > 1;
213 // use '0.0' for unsupported multiple digit version numbers
214 const unsigned int major
= multiDigits
? 0 : (*majorDigit
.rawContent() - '0');
215 const unsigned int minor
= multiDigits
? 0 : (*minorDigit
.rawContent() - '0');
216 msgProtocol_
= Http::ProtocolVersion(major
, minor
);
221 // A GET request might use HTTP/0.9 syntax
222 if (method_
== Http::METHOD_GET
) {
223 // RFC 1945 - no HTTP version field at all
224 tok
= savedTok
; // in case the URI ends with a digit
225 // report this assumption as an error if configured to triage parsing
226 debugs(33, ErrorLevel(), "assuming HTTP/0.9 request-line");
227 msgProtocol_
= Http::ProtocolVersion(0,9);
231 debugs(33, ErrorLevel(), "invalid request-line: not HTTP");
232 parseStatusCode
= Http::scBadRequest
;
237 * Skip characters separating request-line fields.
238 * To handle bidirectional parsing, the caller does the actual skipping and
239 * we just check how many character the caller has skipped.
242 Http::One::RequestParser::skipDelimiter(const size_t count
, const char *where
)
245 debugs(33, ErrorLevel(), "invalid request-line: missing delimiter " << where
);
246 parseStatusCode
= Http::scBadRequest
;
250 // tolerant parser allows multiple whitespace characters between request-line fields
251 if (count
> 1 && !Config
.onoff
.relaxed_header_parser
) {
252 debugs(33, ErrorLevel(), "invalid request-line: too many delimiters " << where
);
253 parseStatusCode
= Http::scBadRequest
;
260 /// Parse CRs at the end of request-line, just before the terminating LF.
262 Http::One::RequestParser::skipTrailingCrs(Http1::Tokenizer
&tok
)
264 if (Config
.onoff
.relaxed_header_parser
) {
265 (void)tok
.skipAllTrailing(CharacterSet::CR
); // optional; multiple OK
267 if (!tok
.skipOneTrailing(CharacterSet::CR
)) {
268 debugs(33, ErrorLevel(), "invalid request-line: missing CR before LF");
269 parseStatusCode
= Http::scBadRequest
;
277 * Attempt to parse the first line of a new request message.
280 * RFC 1945 section 5.1
281 * RFC 7230 section 2.6, 3.1 and 3.5
283 * \retval -1 an error occurred. parseStatusCode indicates HTTP status result.
284 * \retval 1 successful parse. member fields contain the request-line items
285 * \retval 0 more data is needed to complete the parse
288 Http::One::RequestParser::parseRequestFirstLine()
290 debugs(74, 5, "parsing possible request: buf.length=" << buf_
.length());
291 debugs(74, DBG_DATA
, buf_
);
295 // Earlier, skipGarbageLines() took care of any leading LFs (if allowed).
296 // Now, the request line has to end at the first LF.
297 static const CharacterSet lineChars
= CharacterSet::LF
.complement("notLF");
298 ::Parser::Tokenizer
lineTok(buf_
);
299 if (!lineTok
.prefix(line
, lineChars
) || !lineTok
.skip('\n')) {
300 if (buf_
.length() >= Config
.maxRequestHeaderSize
) {
301 /* who should we blame for our failure to parse this line? */
303 Http1::Tokenizer
methodTok(buf_
);
304 if (!parseMethodField(methodTok
))
305 return -1; // blame a bad method (or its delimiter)
307 // assume it is the URI
308 debugs(74, ErrorLevel(), "invalid request-line: URI exceeds " <<
309 Config
.maxRequestHeaderSize
<< "-byte limit");
310 parseStatusCode
= Http::scUriTooLong
;
313 debugs(74, 5, "Parser needs more data");
317 Http1::Tokenizer
tok(line
);
319 if (!parseMethodField(tok
))
322 /* now parse backwards, to leave just the URI */
323 if (!skipTrailingCrs(tok
))
326 if (!parseHttpVersionField(tok
))
329 if (!http0() && !skipDelimiter(tok
.skipAllTrailing(DelimiterCharacters()), "before protocol version"))
332 /* parsed everything before and after the URI */
334 if (!parseUriField(tok
))
338 debugs(33, ErrorLevel(), "invalid request-line: garbage after URI");
339 parseStatusCode
= Http::scBadRequest
;
343 parseStatusCode
= Http::scOkay
;
344 buf_
= lineTok
.remaining(); // incremental parse checkpoint
349 Http::One::RequestParser::parse(const SBuf
&aBuf
)
351 const bool result
= doParse(aBuf
);
352 if (preserveParsed_
) {
353 assert(aBuf
.length() >= remaining().length());
354 parsed_
.append(aBuf
.substr(0, aBuf
.length() - remaining().length())); // newly parsed bytes
360 // raw is not a reference because a reference might point back to our own buf_ or parsed_
362 Http::One::RequestParser::doParse(const SBuf
&aBuf
)
365 debugs(74, DBG_DATA
, "Parse buf={length=" << aBuf
.length() << ", data='" << aBuf
<< "'}");
367 // stage 1: locate the request-line
368 if (parsingStage_
== HTTP_PARSE_NONE
) {
371 // if we hit something before EOS treat it as a message
373 parsingStage_
= HTTP_PARSE_FIRST
;
378 // stage 2: parse the request-line
379 if (parsingStage_
== HTTP_PARSE_FIRST
) {
380 PROF_start(HttpParserParseReqLine
);
381 const int retcode
= parseRequestFirstLine();
383 // first-line (or a look-alike) found successfully.
385 parsingStage_
= HTTP_PARSE_MIME
;
388 debugs(74, 5, "request-line: retval " << retcode
<< ": line={" << aBuf
.length() << ", data='" << aBuf
<< "'}");
389 debugs(74, 5, "request-line: method: " << method_
);
390 debugs(74, 5, "request-line: url: " << uri_
);
391 debugs(74, 5, "request-line: proto: " << msgProtocol_
);
392 debugs(74, 5, "Parser: bytes processed=" << (aBuf
.length()-buf_
.length()));
393 PROF_stop(HttpParserParseReqLine
);
395 // syntax errors already
397 parsingStage_
= HTTP_PARSE_DONE
;
402 // stage 3: locate the mime header block
403 if (parsingStage_
== HTTP_PARSE_MIME
) {
404 // HTTP/1.x request-line is valid and parsing completed.
405 if (!grabMimeBlock("Request", Config
.maxRequestHeaderSize
)) {
406 if (parseStatusCode
== Http::scHeaderTooLarge
)
407 parseStatusCode
= Http::scRequestHeaderFieldsTooLarge
;
412 return !needsMoreData();