src/http/one/RequestParser.cc

   1 /*
   2  * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
   3  *
   4  * Squid software is distributed under GPLv2+ license and includes
   5  * contributions from numerous individuals and organizations.
   6  * Please see the COPYING and CONTRIBUTORS files for details.
   7  */
   8
   9 #include "squid.h"
  10 #include "Debug.h"
  11 #include "http/one/RequestParser.h"
  12 #include "http/ProtocolVersion.h"
  13 #include "mime_header.h"
  14 #include "parser/Tokenizer.h"
  15 #include "profiler/Profiler.h"
  16 #include "SquidConfig.h"
  17
  18 Http::One::RequestParser::RequestParser() :
  19     Parser(),
  20     request_parse_status(Http::scNone),
  21     firstLineGarbage_(0)
  22 {}
  23
  24 Http1::Parser::size_type
  25 Http::One::RequestParser::firstLineSize() const
  26 {
  27     // RFC 7230 section 2.6
  28     /* method SP request-target SP "HTTP/" DIGIT "." DIGIT CRLF */
  29     return method_.image().length() + uri_.length() + 12;
  30 }
  31
  32 /**
  33  * Attempt to parse the first line of a new request message.
  34  *
  35  * Governed by RFC 7230 section 3.5
  36  *  "
  37  *    In the interest of robustness, a server that is expecting to receive
  38  *    and parse a request-line SHOULD ignore at least one empty line (CRLF)
  39  *    received prior to the request-line.
  40  *  "
  41  *
  42  * Parsing state is stored between calls to avoid repeating buffer scans.
  43  * If garbage is found the parsing offset is incremented.
  44  */
  45 void
  46 Http::One::RequestParser::skipGarbageLines()
  47 {
  48     if (Config.onoff.relaxed_header_parser) {
  49         if (Config.onoff.relaxed_header_parser < 0 && (buf_[0] == '\r' || buf_[0] == '\n'))
  50             debugs(74, DBG_IMPORTANT, "WARNING: Invalid HTTP Request: " <<
  51                    "CRLF bytes received ahead of request-line. " <<
  52                    "Ignored due to relaxed_header_parser.");
  53         // Be tolerant of prefix empty lines
  54         // ie any series of either \n or \r\n with no other characters and no repeated \r
  55         while (!buf_.isEmpty() && (buf_[0] == '\n' || (buf_[0] == '\r' && buf_[1] == '\n'))) {
  56             buf_.consume(1);
  57         }
  58     }
  59 }
  60
  61 /**
  62  * Attempt to parse the method field out of an HTTP message request-line.
  63  *
  64  * Governed by:
  65  *  RFC 1945 section 5.1
  66  *  RFC 7230 section 2.6, 3.1 and 3.5
  67  *
  68  * Parsing state is stored between calls. The current implementation uses
  69  * checkpoints after each successful request-line field.
  70  * The return value tells you whether the parsing is completed or not.
  71  *
  72  * \retval -1  an error occurred. request_parse_status indicates HTTP status result.
  73  * \retval  1  successful parse. method_ is filled and buffer consumed including first delimiter.
  74  * \retval  0  more data is needed to complete the parse
  75  */
  76 int
  77 Http::One::RequestParser::parseMethodField(::Parser::Tokenizer &tok, const CharacterSet &WspDelim)
  78 {
  79     // scan for up to 16 valid method characters.
  80     static const size_t maxMethodLength = 16; // TODO: make this configurable?
  81
  82     // method field is a sequence of TCHAR.
  83     SBuf methodFound;
  84     if (tok.prefix(methodFound, CharacterSet::TCHAR, maxMethodLength) && tok.skipOne(WspDelim)) {
  85
  86         method_ = HttpRequestMethod(methodFound);
  87         buf_ = tok.remaining(); // incremental parse checkpoint
  88         return 1;
  89
  90     } else if (tok.atEnd()) {
  91         debugs(74, 5, "Parser needs more data to find method");
  92         return 0;
  93
  94     } // else error(s)
  95
  96     // non-delimiter found after accepted method bytes means ...
  97     if (methodFound.length() == maxMethodLength) {
  98         // method longer than acceptible.
  99         // RFC 7230 section 3.1.1 mandatory (SHOULD) 501 response
 100         request_parse_status = Http::scNotImplemented;
 101         debugs(33, 5, "invalid request-line. method too long");
 102     } else {
 103         // invalid character in the URL
 104         // RFC 7230 section 3.1.1 required (SHOULD) 400 response
 105         request_parse_status = Http::scBadRequest;
 106         debugs(33, 5, "invalid request-line. missing method delimiter");
 107     }
 108     return -1;
 109 }
 110
 111 static CharacterSet
 112 uriValidCharacters()
 113 {
 114     CharacterSet UriChars("URI-Chars","");
 115
 116     /* RFC 3986 section 2:
 117      * "
 118      *   A URI is composed from a limited set of characters consisting of
 119      *   digits, letters, and a few graphic symbols.
 120      * "
 121      */
 122     // RFC 3986 section 2.1 - percent encoding "%" HEXDIG
 123     UriChars.add('%');
 124     UriChars += CharacterSet::HEXDIG;
 125     // RFC 3986 section 2.2 - reserved characters
 126     UriChars += CharacterSet("gen-delims", ":/?#[]@");
 127     UriChars += CharacterSet("sub-delims", "!$&'()*+,;=");
 128     // RFC 3986 section 2.3 - unreserved characters
 129     UriChars += CharacterSet::ALPHA;
 130     UriChars += CharacterSet::DIGIT;
 131     UriChars += CharacterSet("unreserved", "-._~");
 132
 133     return UriChars;
 134 }
 135
 136 int
 137 Http::One::RequestParser::parseUriField(::Parser::Tokenizer &tok)
 138 {
 139     // URI field is a sequence of ... what? segments all have different valid charset
 140     // go with non-whitespace non-binary characters for now
 141     static CharacterSet UriChars = uriValidCharacters();
 142
 143     /* Arbitrary 64KB URI upper length limit.
 144      *
 145      * Not quite as arbitrary as it seems though. Old SquidString objects
 146      * cannot store strings larger than 64KB, so we must limit until they
 147      * have all been replaced with SBuf.
 148      *
 149      * Not that it matters but RFC 7230 section 3.1.1 requires (RECOMMENDED)
 150      * at least 8000 octets for the whole line, including method and version.
 151      */
 152     const size_t maxUriLength = min(static_cast<size_t>(Config.maxRequestHeaderSize) - firstLineSize(),
 153                                     static_cast<size_t>((64*1024)-1));
 154
 155     SBuf uriFound;
 156
 157     // RFC 7230 HTTP/1.x URI are followed by at least one whitespace delimiter
 158     if (tok.prefix(uriFound, UriChars, maxUriLength) && tok.skipOne(CharacterSet::SP)) {
 159         uri_ = uriFound;
 160         buf_ = tok.remaining(); // incremental parse checkpoint
 161         return 1;
 162
 163         // RFC 1945 for GET the line terminator may follow URL instead of a delimiter
 164     } else if (method_ == Http::METHOD_GET && skipLineTerminator(tok)) {
 165         debugs(33, 5, "HTTP/0.9 syntax request-line detected");
 166         msgProtocol_ = Http::ProtocolVersion(0,9);
 167         uri_ = uriFound; // found by successful prefix() call earlier.
 168         request_parse_status = Http::scOkay;
 169         buf_ = tok.remaining(); // incremental parse checkpoint
 170         return 1;
 171
 172     } else if (tok.atEnd()) {
 173         debugs(74, 5, "Parser needs more data to find URI");
 174         return 0;
 175     }
 176
 177     // else errors...
 178
 179     if (uriFound.length() == maxUriLength) {
 180         // RFC 7230 section 3.1.1 mandatory (MUST) 414 response
 181         request_parse_status = Http::scUriTooLong;
 182         debugs(33, 5, "invalid request-line. URI longer than " << maxUriLength << " bytes");
 183     } else {
 184         // RFC 7230 section 3.1.1 required (SHOULD) 400 response
 185         request_parse_status = Http::scBadRequest;
 186         debugs(33, 5, "invalid request-line. missing URI delimiter");
 187     }
 188     return -1;
 189 }
 190
 191 int
 192 Http::One::RequestParser::parseHttpVersionField(::Parser::Tokenizer &tok)
 193 {
 194     // partial match of HTTP/1 magic prefix
 195     if (tok.remaining().length() < Http1magic.length() && Http1magic.startsWith(tok.remaining())) {
 196         debugs(74, 5, "Parser needs more data to find version");
 197         return 0;
 198     }
 199
 200     if (!tok.skip(Http1magic)) {
 201         debugs(74, 5, "invalid request-line. not HTTP/1 protocol");
 202         request_parse_status = Http::scHttpVersionNotSupported;
 203         return -1;
 204     }
 205
 206     if (tok.atEnd()) {
 207         debugs(74, 5, "Parser needs more data to find version");
 208         return 0;
 209     }
 210
 211     // get the version minor DIGIT
 212     SBuf digit;
 213     if (tok.prefix(digit, CharacterSet::DIGIT, 1) && skipLineTerminator(tok)) {
 214
 215         // found version fully AND terminator
 216         msgProtocol_ = Http::ProtocolVersion(1, (*digit.rawContent() - '0'));
 217         request_parse_status = Http::scOkay;
 218         buf_ = tok.remaining(); // incremental parse checkpoint
 219         return 1;
 220
 221     } else if (tok.atEnd() || (tok.skip('\r') && tok.atEnd())) {
 222         debugs(74, 5, "Parser needs more data to find version");
 223         return 0;
 224
 225     } // else error ...
 226
 227     // non-DIGIT. invalid version number.
 228     request_parse_status = Http::scHttpVersionNotSupported;
 229     debugs(33, 5, "invalid request-line. garbage before line terminator");
 230     return -1;
 231 }
 232
 233 /**
 234  * Attempt to parse the first line of a new request message.
 235  *
 236  * Governed by:
 237  *  RFC 1945 section 5.1
 238  *  RFC 7230 section 2.6, 3.1 and 3.5
 239  *
 240  * Parsing state is stored between calls. The current implementation uses
 241  * checkpoints after each successful request-line field.
 242  * The return value tells you whether the parsing is completed or not.
 243  *
 244  * \retval -1  an error occurred. request_parse_status indicates HTTP status result.
 245  * \retval  1  successful parse. member fields contain the request-line items
 246  * \retval  0  more data is needed to complete the parse
 247  */
 248 int
 249 Http::One::RequestParser::parseRequestFirstLine()
 250 {
 251     ::Parser::Tokenizer tok(buf_);
 252
 253     debugs(74, 5, "parsing possible request: buf.length=" << buf_.length());
 254     debugs(74, DBG_DATA, buf_);
 255
 256     // NP: would be static, except it need to change with reconfigure
 257     CharacterSet WspDelim = CharacterSet::SP; // strict parse only accepts SP
 258
 259     if (Config.onoff.relaxed_header_parser) {
 260         // RFC 7230 section 3.5
 261         // tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C), or bare CR
 262         // as whitespace between request-line fields
 263         WspDelim += CharacterSet::HTAB
 264                     + CharacterSet("VT,FF","\x0B\x0C")
 265                     + CharacterSet::CR;
 266     }
 267
 268     // only search for method if we have not yet found one
 269     if (method_ == Http::METHOD_NONE) {
 270         const int res = parseMethodField(tok, WspDelim);
 271         if (res < 1)
 272             return res;
 273         // else keep going...
 274     }
 275
 276     // tolerant parser allows multiple whitespace characters between request-line fields
 277     if (Config.onoff.relaxed_header_parser) {
 278         const size_t garbage = tok.skipAll(WspDelim);
 279         if (garbage > 0) {
 280             firstLineGarbage_ += garbage;
 281             buf_ = tok.remaining(); // re-checkpoint after garbage
 282         }
 283     }
 284     if (tok.atEnd()) {
 285         debugs(74, 5, "Parser needs more data");
 286         return 0;
 287     }
 288
 289     // from here on, we have two possible parse paths: whitespace tolerant, and strict
 290     if (Config.onoff.relaxed_header_parser) {
 291         // whitespace tolerant
 292
 293         // NOTES:
 294         // * this would be static, except WspDelim changes with reconfigure
 295         // * HTTP-version charset is included by uriValidCharacters()
 296         // * terminal CR is included by WspDelim here in relaxed parsing
 297         CharacterSet LfDelim = uriValidCharacters() + WspDelim;
 298
 299         // seek the LF character, then tokenize the line in reverse
 300         SBuf line;
 301         if (tok.prefix(line, LfDelim) && tok.skip('\n')) {
 302             ::Parser::Tokenizer rTok(line);
 303             SBuf nil;
 304             (void)rTok.suffix(nil,CharacterSet::CR); // optional CR in terminator
 305             SBuf digit;
 306             if (rTok.suffix(digit,CharacterSet::DIGIT) && rTok.skipSuffix(Http1magic) && rTok.suffix(nil,WspDelim)) {
 307                 uri_ = rTok.remaining();
 308                 msgProtocol_ = Http::ProtocolVersion(1, (*digit.rawContent() - '0'));
 309                 if (uri_.isEmpty()) {
 310                     debugs(33, 5, "invalid request-line. missing URL");
 311                     request_parse_status = Http::scBadRequest;
 312                     return -1;
 313                 }
 314
 315                 request_parse_status = Http::scOkay;
 316                 buf_ = tok.remaining(); // incremental parse checkpoint
 317                 return 1;
 318
 319             } else if (method_ == Http::METHOD_GET) {
 320                 // RFC 1945 - for GET the line terminator may follow URL instead of a delimiter
 321                 debugs(33, 5, "HTTP/0.9 syntax request-line detected");
 322                 msgProtocol_ = Http::ProtocolVersion(0,9);
 323                 static const SBuf cr("\r",1);
 324                 uri_ = line.trim(cr,false,true);
 325                 request_parse_status = Http::scOkay;
 326                 buf_ = tok.remaining(); // incremental parse checkpoint
 327                 return 1;
 328             }
 329
 330             debugs(33, 5, "invalid request-line. not HTTP");
 331             request_parse_status = Http::scBadRequest;
 332             return -1;
 333         }
 334
 335         debugs(74, 5, "Parser needs more data");
 336         return 0;
 337     }
 338     // else strict non-whitespace tolerant parse
 339
 340     // only search for request-target (URL) if we have not yet found one
 341     if (uri_.isEmpty()) {
 342         const int res = parseUriField(tok);
 343         if (res < 1 || msgProtocol_.protocol == AnyP::PROTO_HTTP)
 344             return res;
 345         // else keep going...
 346     }
 347
 348     if (tok.atEnd()) {
 349         debugs(74, 5, "Parser needs more data");
 350         return 0;
 351     }
 352
 353     // HTTP/1 version suffix (protocol magic) followed by CR*LF
 354     if (msgProtocol_.protocol == AnyP::PROTO_NONE) {
 355         return parseHttpVersionField(tok);
 356     }
 357
 358     // If we got here this method has been called too many times
 359     request_parse_status = Http::scInternalServerError;
 360     debugs(33, 5, "ERROR: Parser already processed request-line");
 361     return -1;
 362 }
 363
 364 bool
 365 Http::One::RequestParser::parse(const SBuf &aBuf)
 366 {
 367     buf_ = aBuf;
 368     debugs(74, DBG_DATA, "Parse buf={length=" << aBuf.length() << ", data='" << aBuf << "'}");
 369
 370     // stage 1: locate the request-line
 371     if (parsingStage_ == HTTP_PARSE_NONE) {
 372         skipGarbageLines();
 373
 374         // if we hit something before EOS treat it as a message
 375         if (!buf_.isEmpty())
 376             parsingStage_ = HTTP_PARSE_FIRST;
 377         else
 378             return false;
 379     }
 380
 381     // stage 2: parse the request-line
 382     if (parsingStage_ == HTTP_PARSE_FIRST) {
 383         PROF_start(HttpParserParseReqLine);
 384         const int retcode = parseRequestFirstLine();
 385
 386         // first-line (or a look-alike) found successfully.
 387         if (retcode > 0) {
 388             parsingStage_ = HTTP_PARSE_MIME;
 389         }
 390
 391         debugs(74, 5, "request-line: retval " << retcode << ": line={" << aBuf.length() << ", data='" << aBuf << "'}");
 392         debugs(74, 5, "request-line: method: " << method_);
 393         debugs(74, 5, "request-line: url: " << uri_);
 394         debugs(74, 5, "request-line: proto: " << msgProtocol_);
 395         debugs(74, 5, "Parser: bytes processed=" << (aBuf.length()-buf_.length()));
 396         PROF_stop(HttpParserParseReqLine);
 397
 398         // syntax errors already
 399         if (retcode < 0) {
 400             parsingStage_ = HTTP_PARSE_DONE;
 401             return false;
 402         }
 403     }
 404
 405     // stage 3: locate the mime header block
 406     if (parsingStage_ == HTTP_PARSE_MIME) {
 407         // HTTP/1.x request-line is valid and parsing completed.
 408         if (msgProtocol_.major == 1) {
 409             /* NOTE: HTTP/0.9 requests do not have a mime header block.
 410              *       So the rest of the code will need to deal with '0'-byte headers
 411              *       (ie, none, so don't try parsing em)
 412              */
 413             int64_t mimeHeaderBytes = 0;
 414             // XXX: c_str() reallocates. performance regression.
 415             if ((mimeHeaderBytes = headersEnd(buf_.c_str(), buf_.length())) == 0) {
 416                 if (buf_.length()+firstLineSize() >= Config.maxRequestHeaderSize) {
 417                     debugs(33, 5, "Too large request");
 418                     request_parse_status = Http::scRequestHeaderFieldsTooLarge;
 419                     parsingStage_ = HTTP_PARSE_DONE;
 420                 } else
 421                     debugs(33, 5, "Incomplete request, waiting for end of headers");
 422                 return false;
 423             }
 424             mimeHeaderBlock_ = buf_.consume(mimeHeaderBytes);
 425             debugs(74, 5, "mime header (0-" << mimeHeaderBytes << ") {" << mimeHeaderBlock_ << "}");
 426
 427         } else
 428             debugs(33, 3, "Missing HTTP/1.x identifier");
 429
 430         // NP: we do not do any further stages here yet so go straight to DONE
 431         parsingStage_ = HTTP_PARSE_DONE;
 432
 433         // Squid could handle these headers, but admin does not want to
 434         if (messageHeaderSize() >= Config.maxRequestHeaderSize) {
 435             debugs(33, 5, "Too large request");
 436             request_parse_status = Http::scRequestHeaderFieldsTooLarge;
 437             return false;
 438         }
 439     }
 440
 441     return !needsMoreData();
 442 }
 443