src/http/one/RequestParser.cc

   1 /*
   2  * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
   3  *
   4  * Squid software is distributed under GPLv2+ license and includes
   5  * contributions from numerous individuals and organizations.
   6  * Please see the COPYING and CONTRIBUTORS files for details.
   7  */
   8
   9 #include "squid.h"
  10 #include "Debug.h"
  11 #include "http/one/RequestParser.h"
  12 #include "http/ProtocolVersion.h"
  13 #include "mime_header.h"
  14 #include "parser/Tokenizer.h"
  15 #include "profiler/Profiler.h"
  16 #include "SquidConfig.h"
  17
  18 Http::One::RequestParser::RequestParser() :
  19     Parser(),
  20     request_parse_status(Http::scNone),
  21     firstLineGarbage_(0)
  22 {}
  23
  24 Http1::Parser::size_type
  25 Http::One::RequestParser::firstLineSize() const
  26 {
  27     // RFC 7230 section 2.6
  28     /* method SP request-target SP "HTTP/" DIGIT "." DIGIT CRLF */
  29     return method_.image().length() + uri_.length() + 12;
  30 }
  31
  32 /**
  33  * Attempt to parse the first line of a new request message.
  34  *
  35  * Governed by RFC 7230 section 3.5
  36  *  "
  37  *    In the interest of robustness, a server that is expecting to receive
  38  *    and parse a request-line SHOULD ignore at least one empty line (CRLF)
  39  *    received prior to the request-line.
  40  *  "
  41  *
  42  * Parsing state is stored between calls to avoid repeating buffer scans.
  43  * If garbage is found the parsing offset is incremented.
  44  */
  45 void
  46 Http::One::RequestParser::skipGarbageLines()
  47 {
  48     if (Config.onoff.relaxed_header_parser) {
  49         if (Config.onoff.relaxed_header_parser < 0 && (buf_[0] == '\r' || buf_[0] == '\n'))
  50             debugs(74, DBG_IMPORTANT, "WARNING: Invalid HTTP Request: " <<
  51                    "CRLF bytes received ahead of request-line. " <<
  52                    "Ignored due to relaxed_header_parser.");
  53         // Be tolerant of prefix empty lines
  54         // ie any series of either \n or \r\n with no other characters and no repeated \r
  55         while (!buf_.isEmpty() && (buf_[0] == '\n' || (buf_[0] == '\r' && buf_[1] == '\n'))) {
  56             buf_.consume(1);
  57         }
  58     }
  59 }
  60
  61 /// detect and skip the CRLF or LF line terminator
  62 /// consume from the tokenizer and return true only if found
  63 bool
  64 Http::One::RequestParser::skipLineTerminator(::Parser::Tokenizer &tok) const
  65 {
  66     static const SBuf crlf("\r\n");
  67     if (tok.skip(crlf))
  68         return true;
  69
  70     if (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF))
  71         return true;
  72
  73     return false;
  74 }
  75
  76 /**
  77  * Attempt to parse the method field out of an HTTP message request-line.
  78  *
  79  * Governed by:
  80  *  RFC 1945 section 5.1
  81  *  RFC 7230 section 2.6, 3.1 and 3.5
  82  *
  83  * Parsing state is stored between calls. The current implementation uses
  84  * checkpoints after each successful request-line field.
  85  * The return value tells you whether the parsing is completed or not.
  86  *
  87  * \retval -1  an error occurred. request_parse_status indicates HTTP status result.
  88  * \retval  1  successful parse. method_ is filled and buffer consumed including first delimiter.
  89  * \retval  0  more data is needed to complete the parse
  90  */
  91 int
  92 Http::One::RequestParser::parseMethodField(::Parser::Tokenizer &tok, const CharacterSet &WspDelim)
  93 {
  94     // scan for up to 16 valid method characters.
  95     static const size_t maxMethodLength = 16;
  96
  97     SBuf methodFound;
  98
  99     // method field is a sequence of TCHAR.
 100     // NP: prefix-with-limit returns true if it finds ANY valid chars
 101     if (!tok.prefix(methodFound, CharacterSet::TCHAR, maxMethodLength)) {
 102         // missing/invalid 'method'.
 103         request_parse_status = Http::scBadRequest;
 104         debugs(33, 5, "invalid request-line. missing method");
 105         return -1;
 106     }
 107
 108     // we may be at the end if we found exactly maxMethodLength bytes
 109     if (tok.atEnd()) {
 110         debugs(74, 5, "Parser needs more data to find method");
 111         return 0;
 112     }
 113
 114     // ... followed by at least one whitespace character.
 115     if (!tok.skipOne(WspDelim)) {
 116         // non-delimiter found after accepted method bytes means ...
 117         if (methodFound.length() == maxMethodLength) {
 118             // method longer than acceptible.
 119             // RFC 7230 section 3.1.1 mandatory (SHOULD) 501 response
 120             request_parse_status = Http::scNotImplemented;
 121             debugs(33, 5, "invalid request-line. method too long");
 122         } else {
 123             // invalid character in the URL
 124             // RFC 7230 section 3.1.1 required (SHOULD) 400 response
 125             request_parse_status = Http::scBadRequest;
 126             debugs(33, 5, "invalid request-line. missing method delimiter");
 127         }
 128         return -1;
 129     }
 130     method_ = HttpRequestMethod(methodFound);
 131     buf_ = tok.remaining(); // incremental parse checkpoint
 132     return 1;
 133 }
 134
 135 int
 136 Http::One::RequestParser::parseUriField(::Parser::Tokenizer &tok, const CharacterSet &WspDelim)
 137 {
 138     // URI field is a sequence of ... what? segments all have different valid charset
 139     // go with non-whitespace non-binary characters for now
 140     static CharacterSet UriChars("URI-Chars","");
 141     if (!UriChars['a']) { // if it needs initializing...
 142         /* RFC 3986 section 2:
 143          * "
 144          *   A URI is composed from a limited set of characters consisting of
 145          *   digits, letters, and a few graphic symbols.
 146          * "
 147          */
 148         // RFC 3986 section 2.1 - percent encoding "%" HEXDIG
 149         UriChars.add('%');
 150         UriChars += CharacterSet::HEXDIG;
 151         // RFC 3986 section 2.2 - reserved characters
 152         UriChars += CharacterSet("gen-delims", ":/?#[]@");
 153         UriChars += CharacterSet("sub-delims", "!$&'()*+,;=");
 154         // RFC 3986 section 2.3 - unreserved characters
 155         UriChars += CharacterSet::ALPHA;
 156         UriChars += CharacterSet::DIGIT;
 157         UriChars += CharacterSet("unreserved", "-._~");
 158     }
 159
 160     /* Arbitrary 64KB URI upper length limit.
 161      *
 162      * Not quite as arbitrary as it seems though. Old SquidString objects
 163      * cannot store strings larger than 64KB, so we must limit until they
 164      * have all been replaced with SBuf.
 165      *
 166      * Not that it matters but RFC 7230 section 3.1.1 requires (RECOMMENDED)
 167      * at least 8000 octets for the whole line, including method and version.
 168      */
 169     const size_t maxUriLength = min(static_cast<size_t>(Config.maxRequestHeaderSize) - firstLineSize(),
 170                                     static_cast<size_t>((64*1024)-1));
 171
 172     SBuf uriFound;
 173     // NP: prefix-with-limit returns true if it finds ANY valid chars
 174     if (!tok.prefix(uriFound, UriChars, maxUriLength)) {
 175         // else did not find any valid TCHAR
 176         debugs(33, 5, "invalid request-line. missing URL");
 177         request_parse_status = Http::scBadRequest;
 178         return -1;
 179     }
 180
 181     // we may be at the end if we found exactly maxUriLength bytes
 182     if (tok.atEnd()) {
 183         debugs(74, 5, "Parser needs more data to find URI");
 184         return 0;
 185     }
 186
 187     // RFC 1945 - for GET the line terminator may follow URL instead of a delimiter
 188     if (method_ == Http::METHOD_GET && skipLineTerminator(tok)) {
 189         debugs(33, 5, "HTTP/0.9 syntax request-line detected");
 190         msgProtocol_ = Http::ProtocolVersion(0,9);
 191         uri_ = uriFound;
 192         request_parse_status = Http::scOkay;
 193         buf_ = tok.remaining(); // incremental parse checkpoint
 194         return 1;
 195     }
 196
 197     // ... followed by at least one whitespace character.
 198     if (!tok.skipOne(WspDelim)) {
 199         // non-delimiter found after accepted URL bytes means ...
 200         if (uriFound.length() == maxUriLength) {
 201             // URL longer than acceptible.
 202             // RFC 7230 section 3.1.1 mandatory (MUST) 414 response
 203             request_parse_status = Http::scUriTooLong;
 204             debugs(33, 5, "invalid request-line. URI longer than " << maxUriLength << " bytes");
 205             return -1;
 206         } else {
 207             // invalid non-delimiter character ended the URL
 208             // RFC 7230 section 3.1.1 required (SHOULD) 400 response
 209             request_parse_status = Http::scBadRequest;
 210             debugs(33, 5, "invalid request-line. missing URI delimiter");
 211             return -1;
 212         }
 213     }
 214     uri_ = uriFound;
 215     buf_ = tok.remaining(); // incremental parse checkpoint
 216     return 1;
 217 }
 218
 219 int
 220 Http::One::RequestParser::parseHttpVersionField(::Parser::Tokenizer &tok)
 221 {
 222     // partial match of HTTP/1 magic prefix
 223     if (tok.remaining().length() < Http1magic.length() && Http1magic.startsWith(tok.remaining())) {
 224         debugs(74, 5, "Parser needs more data to find version");
 225         return 0;
 226     }
 227
 228     if (!tok.skip(Http1magic)) {
 229         debugs(74, 5, "invalid request-line. not HTTP/1 protocol");
 230         request_parse_status = Http::scHttpVersionNotSupported;
 231         return -1;
 232     }
 233
 234     if (tok.atEnd()) {
 235         debugs(74, 5, "Parser needs more data to find version");
 236         return 0;
 237     }
 238
 239     // get the version minor DIGIT
 240     SBuf digit;
 241     if (!tok.prefix(digit, CharacterSet::DIGIT, 1)) {
 242         // non-DIGIT. invalid version number.
 243         request_parse_status = Http::scHttpVersionNotSupported;
 244         debugs(33, 5, "invalid request-line. non-numeric or too-large HTTP minor version");
 245         return -1;
 246     }
 247
 248     if (tok.atEnd()) {
 249         debugs(74, 5, "Parser needs more data to find version");
 250         return 0;
 251     }
 252
 253     // version is always followed by the terminator
 254     if (!skipLineTerminator(tok)) {
 255         if (tok.skipOne(CharacterSet::CR) && tok.atEnd()) {
 256             debugs(74, 5, "Parser needs more data to find version");
 257             return 0;
 258         }
 259         request_parse_status = Http::scHttpVersionNotSupported;
 260         debugs(33, 5, "invalid request-line. garabge before line terminator");
 261         return -1;
 262     }
 263
 264     // found version fully AND terminator
 265     msgProtocol_ = Http::ProtocolVersion(1, (*digit.rawContent() - '0'));
 266     request_parse_status = Http::scOkay;
 267     buf_ = tok.remaining(); // incremental parse checkpoint
 268     return 1;
 269 }
 270
 271 /**
 272  * Attempt to parse the first line of a new request message.
 273  *
 274  * Governed by:
 275  *  RFC 1945 section 5.1
 276  *  RFC 7230 section 2.6, 3.1 and 3.5
 277  *
 278  * Parsing state is stored between calls. The current implementation uses
 279  * checkpoints after each successful request-line field.
 280  * The return value tells you whether the parsing is completed or not.
 281  *
 282  * \retval -1  an error occurred. request_parse_status indicates HTTP status result.
 283  * \retval  1  successful parse. member fields contain the request-line items
 284  * \retval  0  more data is needed to complete the parse
 285  */
 286 int
 287 Http::One::RequestParser::parseRequestFirstLine()
 288 {
 289     ::Parser::Tokenizer tok(buf_);
 290
 291     debugs(74, 5, "parsing possible request: buf.length=" << buf_.length());
 292     debugs(74, DBG_DATA, buf_);
 293
 294     CharacterSet WspDelim = CharacterSet::SP; // strict parse only accepts SP
 295
 296     if (Config.onoff.relaxed_header_parser) {
 297         // RFC 7230 section 3.5
 298         // tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C), or bare CR
 299         // as whitespace between request-line fields
 300         WspDelim += CharacterSet::HTAB
 301                   + CharacterSet("VT,FF","\x0B\x0C")
 302                   + CharacterSet::CR;
 303     }
 304
 305     // only search for method if we have not yet found one
 306     if (method_ == Http::METHOD_NONE) {
 307         const int res = parseMethodField(tok, WspDelim);
 308         if (res < 1)
 309             return res;
 310         // else keep going...
 311     }
 312
 313     // tolerant parser allows multiple whitespace characters between fields
 314     if (Config.onoff.relaxed_header_parser) {
 315         const size_t garbage = tok.skipAll(WspDelim);
 316         if (garbage > 0) {
 317             firstLineGarbage_ += garbage;
 318             buf_ = tok.remaining(); // re-checkpoint after garbage
 319         }
 320     }
 321     if (tok.atEnd()) {
 322         debugs(74, 5, "Parser needs more data");
 323         return 0;
 324     }
 325
 326     // only search for request-target (URL) if we have not yet found one
 327     if (uri_.isEmpty()) {
 328         const int res = parseUriField(tok, WspDelim);
 329         if (res < 1 || msgProtocol_.protocol == AnyP::PROTO_HTTP)
 330             return res;
 331         // else keep going...
 332     }
 333
 334     // tolerant parser allows multiple whitespace characters between fields
 335     if (Config.onoff.relaxed_header_parser) {
 336         const size_t garbage = tok.skipAll(WspDelim);
 337         if (garbage > 0) {
 338             firstLineGarbage_ += garbage;
 339             buf_ = tok.remaining(); // re-checkpoint after garbage
 340         }
 341     }
 342     if (tok.atEnd()) {
 343         debugs(74, 5, "Parser needs more data");
 344         return 0;
 345     }
 346
 347     // HTTP/1 version suffix (protocol magic) followed by CR*LF
 348     if (msgProtocol_.protocol == AnyP::PROTO_NONE) {
 349         return parseHttpVersionField(tok);
 350     }
 351
 352     // If we got here this method has been called too many times
 353     request_parse_status = Http::scInternalServerError;
 354     debugs(33, 5, "ERROR: Parser already processed request-line");
 355     return -1;
 356 }
 357
 358 bool
 359 Http::One::RequestParser::parse(const SBuf &aBuf)
 360 {
 361     buf_ = aBuf;
 362     debugs(74, DBG_DATA, "Parse buf={length=" << aBuf.length() << ", data='" << aBuf << "'}");
 363
 364     // stage 1: locate the request-line
 365     if (parsingStage_ == HTTP_PARSE_NONE) {
 366         skipGarbageLines();
 367
 368         // if we hit something before EOS treat it as a message
 369         if (!buf_.isEmpty())
 370             parsingStage_ = HTTP_PARSE_FIRST;
 371         else
 372             return false;
 373     }
 374
 375     // stage 2: parse the request-line
 376     if (parsingStage_ == HTTP_PARSE_FIRST) {
 377         PROF_start(HttpParserParseReqLine);
 378         const int retcode = parseRequestFirstLine();
 379
 380         // first-line (or a look-alike) found successfully.
 381         if (retcode > 0) {
 382             parsingStage_ = HTTP_PARSE_MIME;
 383         }
 384
 385         debugs(74, 5, "request-line: retval " << retcode << ": line={" << aBuf.length() << ", data='" << aBuf << "'}");
 386         debugs(74, 5, "request-line: method: " << method_);
 387         debugs(74, 5, "request-line: url: " << uri_);
 388         debugs(74, 5, "request-line: proto: " << msgProtocol_);
 389         debugs(74, 5, "Parser: bytes processed=" << (aBuf.length()-buf_.length()));
 390         PROF_stop(HttpParserParseReqLine);
 391
 392         // syntax errors already
 393         if (retcode < 0) {
 394             parsingStage_ = HTTP_PARSE_DONE;
 395             return false;
 396         }
 397     }
 398
 399     // stage 3: locate the mime header block
 400     if (parsingStage_ == HTTP_PARSE_MIME) {
 401         // HTTP/1.x request-line is valid and parsing completed.
 402         if (msgProtocol_.major == 1) {
 403             /* NOTE: HTTP/0.9 requests do not have a mime header block.
 404              *       So the rest of the code will need to deal with '0'-byte headers
 405              *       (ie, none, so don't try parsing em)
 406              */
 407             int64_t mimeHeaderBytes = 0;
 408             // XXX: c_str() reallocates. performance regression.
 409             if ((mimeHeaderBytes = headersEnd(buf_.c_str(), buf_.length())) == 0) {
 410                 if (buf_.length()+firstLineSize() >= Config.maxRequestHeaderSize) {
 411                     debugs(33, 5, "Too large request");
 412                     request_parse_status = Http::scRequestHeaderFieldsTooLarge;
 413                     parsingStage_ = HTTP_PARSE_DONE;
 414                 } else
 415                     debugs(33, 5, "Incomplete request, waiting for end of headers");
 416                 return false;
 417             }
 418             mimeHeaderBlock_ = buf_.consume(mimeHeaderBytes);
 419             debugs(74, 5, "mime header (0-" << mimeHeaderBytes << ") {" << mimeHeaderBlock_ << "}");
 420
 421         } else
 422             debugs(33, 3, "Missing HTTP/1.x identifier");
 423
 424         // NP: we do not do any further stages here yet so go straight to DONE
 425         parsingStage_ = HTTP_PARSE_DONE;
 426
 427         // Squid could handle these headers, but admin does not want to
 428         if (messageHeaderSize() >= Config.maxRequestHeaderSize) {
 429             debugs(33, 5, "Too large request");
 430             request_parse_status = Http::scRequestHeaderFieldsTooLarge;
 431             return false;
 432         }
 433     }
 434
 435     return !needsMoreData();
 436 }
 437