SourceFormat Enforcement

[thirdparty/squid.git] / src / http / one / RequestParser.cc
diff --git a/src/http/one/RequestParser.cc b/src/http/one/RequestParser.cc

index 74c8fccf63df712dcff4eeff9aa088945969f8cd..44482a913fd25f37a5d43ce9ac50c7e2b93c7ae9 100644 (file)
--- a/src/http/one/RequestParser.cc
+++ b/src/http/one/RequestParser.cc
@@ -1,5 +1,5 @@
  /*
- * Copyright (C) 1996-2015 The Squid Software Foundation and contributors
+ * Copyright (C) 1996-2017 The Squid Software Foundation and contributors
   *
   * Squid software is distributed under GPLv2+ license and includes
   * contributions from numerous individuals and organizations.
@@ -9,19 +9,28 @@
  #include "squid.h"
  #include "Debug.h"
  #include "http/one/RequestParser.h"
+#include "http/one/Tokenizer.h"
  #include "http/ProtocolVersion.h"
-#include "mime_header.h"
  #include "profiler/Profiler.h"
  #include "SquidConfig.h"
  
-Http::One::RequestParser::RequestParser() :
+// the right debugs() level for parsing errors
+inline static int
+ErrorLevel() {
+    return Config.onoff.relaxed_header_parser < 0 ? DBG_IMPORTANT : 5;
+}
+
+Http::One::RequestParser::RequestParser(bool preserveParsed) :
      Parser(),
-    request_parse_status(Http::scNone)
+    preserveParsed_(preserveParsed)
+{}
+
+Http1::Parser::size_type
+Http::One::RequestParser::firstLineSize() const
  {
-    req.start = req.end = -1;
-    req.m_start = req.m_end = -1;
-    req.u_start = req.u_end = -1;
-    req.v_start = req.v_end = -1;
+    // RFC 7230 sections 2.6 (HTTP-version) and 3.1.1 (request-line)
+    /* method SP request-target SP "HTTP/" DIGIT "." DIGIT CRLF */
+    return method_.image().length() + uri_.length() + 12;
  }
  
  /**
@@ -51,25 +60,217 @@ Http::One::RequestParser::skipGarbageLines()
              buf_.consume(1);
          }
      }
+}
+
+/**
+ * Attempt to parse the method field out of an HTTP message request-line.
+ *
+ * Governed by:
+ *  RFC 1945 section 5.1
+ *  RFC 7230 section 2.6, 3.1 and 3.5
+ */
+bool
+Http::One::RequestParser::parseMethodField(Http1::Tokenizer &tok)
+{
+    // method field is a sequence of TCHAR.
+    // Limit to 32 characters to prevent overly long sequences of non-HTTP
+    // being sucked in before mismatch is detected. 32 is itself annoyingly
+    // big but there are methods registered by IANA that reach 17 bytes:
+    //  http://www.iana.org/assignments/http-methods
+    static const size_t maxMethodLength = 32; // TODO: make this configurable?
+
+    SBuf methodFound;
+    if (!tok.prefix(methodFound, CharacterSet::TCHAR, maxMethodLength)) {
+        debugs(33, ErrorLevel(), "invalid request-line: missing or malformed method");
+        parseStatusCode = Http::scBadRequest;
+        return false;
+    }
+    method_ = HttpRequestMethod(methodFound);
  
-    /* XXX: this is a Squid-specific tolerance
-     * it appears never to have been relevant outside out unit-tests
-     * because the ConnStateData parser loop starts with consumeWhitespace()
-     * which absorbs any SP HTAB VTAB CR LF characters.
-     * But unit-tests called the HttpParser method directly without that pruning.
+    if (!skipDelimiter(tok.skipAll(DelimiterCharacters()), "after method"))
+        return false;
+
+    return true;
+}
+
+/// the characters which truly are valid within URI
+static const CharacterSet &
+UriValidCharacters()
+{
+    /* RFC 3986 section 2:
+     * "
+     *   A URI is composed from a limited set of characters consisting of
+     *   digits, letters, and a few graphic symbols.
+     * "
       */
+    static const CharacterSet UriChars =
+        CharacterSet("URI-Chars","") +
+        // RFC 3986 section 2.2 - reserved characters
+        CharacterSet("gen-delims", ":/?#[]@") +
+        CharacterSet("sub-delims", "!$&'()*+,;=") +
+        // RFC 3986 section 2.3 - unreserved characters
+        CharacterSet::ALPHA +
+        CharacterSet::DIGIT +
+        CharacterSet("unreserved", "-._~") +
+        // RFC 3986 section 2.1 - percent encoding "%" HEXDIG
+        CharacterSet("pct-encoded", "%") +
+        CharacterSet::HEXDIG;
+
+    return UriChars;
+}
+
+/// characters which Squid will accept in the HTTP request-target (URI)
+const CharacterSet &
+Http::One::RequestParser::RequestTargetCharacters()
+{
+    if (Config.onoff.relaxed_header_parser) {
  #if USE_HTTP_VIOLATIONS
+        static const CharacterSet RelaxedExtended =
+            UriValidCharacters() +
+            // accept whitespace (extended), it will be dealt with later
+            DelimiterCharacters() +
+            // RFC 2396 unwise character set which must never be transmitted
+            // in un-escaped form. But many web services do anyway.
+            CharacterSet("RFC2396-unwise","\"\\|^<>`{}") +
+            // UTF-8 because we want to be future-proof
+            CharacterSet("UTF-8", 128, 255);
+
+        return RelaxedExtended;
+#else
+        static const CharacterSet RelaxedCompliant =
+            UriValidCharacters() +
+            // accept whitespace (extended), it will be dealt with later.
+            DelimiterCharacters();
+
+        return RelaxedCompliant;
+#endif
+    }
+
+    // the strict parser only accepts what the RFC says we can
+    return UriValidCharacters();
+}
+
+bool
+Http::One::RequestParser::parseUriField(Http1::Tokenizer &tok)
+{
+    /* Arbitrary 64KB URI upper length limit.
+     *
+     * Not quite as arbitrary as it seems though. Old SquidString objects
+     * cannot store strings larger than 64KB, so we must limit until they
+     * have all been replaced with SBuf.
+     *
+     * Not that it matters but RFC 7230 section 3.1.1 requires (RECOMMENDED)
+     * at least 8000 octets for the whole line, including method and version.
+     */
+    const size_t maxUriLength = static_cast<size_t>((64*1024)-1);
+
+    SBuf uriFound;
+    if (!tok.prefix(uriFound, RequestTargetCharacters())) {
+        parseStatusCode = Http::scBadRequest;
+        debugs(33, ErrorLevel(), "invalid request-line: missing or malformed URI");
+        return false;
+    }
+
+    if (uriFound.length() > maxUriLength) {
+        // RFC 7230 section 3.1.1 mandatory (MUST) 414 response
+        parseStatusCode = Http::scUriTooLong;
+        debugs(33, ErrorLevel(), "invalid request-line: " << uriFound.length() <<
+               "-byte URI exceeds " << maxUriLength << "-byte limit");
+        return false;
+    }
+
+    uri_ = uriFound;
+    return true;
+}
+
+bool
+Http::One::RequestParser::parseHttpVersionField(Http1::Tokenizer &tok)
+{
+    static const SBuf http1p0("HTTP/1.0");
+    static const SBuf http1p1("HTTP/1.1");
+    const auto savedTok = tok;
+
+    // Optimization: Expect (and quickly parse) HTTP/1.1 or HTTP/1.0 in
+    // the vast majority of cases.
+    if (tok.skipSuffix(http1p1)) {
+        msgProtocol_ = Http::ProtocolVersion(1, 1);
+        return true;
+    } else if (tok.skipSuffix(http1p0)) {
+        msgProtocol_ = Http::ProtocolVersion(1, 0);
+        return true;
+    } else {
+        // RFC 7230 section 2.6:
+        // HTTP-version  = HTTP-name "/" DIGIT "." DIGIT
+        static const CharacterSet period("Decimal point", ".");
+        static const SBuf proto("HTTP/");
+        SBuf majorDigit;
+        SBuf minorDigit;
+        if (tok.suffix(minorDigit, CharacterSet::DIGIT) &&
+                tok.skipOneTrailing(period) &&
+                tok.suffix(majorDigit, CharacterSet::DIGIT) &&
+                tok.skipSuffix(proto)) {
+            const bool multiDigits = majorDigit.length() > 1 || minorDigit.length() > 1;
+            // use '0.0' for unsupported multiple digit version numbers
+            const unsigned int major = multiDigits ? 0 : (*majorDigit.rawContent() - '0');
+            const unsigned int minor = multiDigits ? 0 : (*minorDigit.rawContent() - '0');
+            msgProtocol_ = Http::ProtocolVersion(major, minor);
+            return true;
+        }
+    }
+
+    // A GET request might use HTTP/0.9 syntax
+    if (method_ == Http::METHOD_GET) {
+        // RFC 1945 - no HTTP version field at all
+        tok = savedTok; // in case the URI ends with a digit
+        // report this assumption as an error if configured to triage parsing
+        debugs(33, ErrorLevel(), "assuming HTTP/0.9 request-line");
+        msgProtocol_ = Http::ProtocolVersion(0,9);
+        return true;
+    }
+
+    debugs(33, ErrorLevel(), "invalid request-line: not HTTP");
+    parseStatusCode = Http::scBadRequest;
+    return false;
+}
+
+/**
+ * Skip characters separating request-line fields.
+ * To handle bidirectional parsing, the caller does the actual skipping and
+ * we just check how many characters the caller has skipped.
+ */
+bool
+Http::One::RequestParser::skipDelimiter(const size_t count, const char *where)
+{
+    if (count <= 0) {
+        debugs(33, ErrorLevel(), "invalid request-line: missing delimiter " << where);
+        parseStatusCode = Http::scBadRequest;
+        return false;
+    }
+
+    // tolerant parser allows multiple whitespace characters between request-line fields
+    if (count > 1 && !Config.onoff.relaxed_header_parser) {
+        debugs(33, ErrorLevel(), "invalid request-line: too many delimiters " << where);
+        parseStatusCode = Http::scBadRequest;
+        return false;
+    }
+
+    return true;
+}
+
+/// Parse CRs at the end of request-line, just before the terminating LF.
+bool
+Http::One::RequestParser::skipTrailingCrs(Http1::Tokenizer &tok)
+{
      if (Config.onoff.relaxed_header_parser) {
-        if (Config.onoff.relaxed_header_parser < 0 && buf_[0] == ' ')
-            debugs(74, DBG_IMPORTANT, "WARNING: Invalid HTTP Request: " <<
-                   "Whitespace bytes received ahead of method. " <<
-                   "Ignored due to relaxed_header_parser.");
-        // Be tolerant of prefix spaces (other bytes are valid method values)
-        while (!buf_.isEmpty() && buf_[0] == ' ') {
-            buf_.consume(1);
+        (void)tok.skipAllTrailing(CharacterSet::CR); // optional; multiple OK
+    } else {
+        if (!tok.skipOneTrailing(CharacterSet::CR)) {
+            debugs(33, ErrorLevel(), "invalid request-line: missing CR before LF");
+            parseStatusCode = Http::scBadRequest;
+            return false;
          }
      }
-#endif
+    return true;
  }
  
  /**
@@ -77,212 +278,88 @@ Http::One::RequestParser::skipGarbageLines()
   *
   * Governed by:
   *  RFC 1945 section 5.1
- *  RFC 7230 section 3.1 and 3.5
+ *  RFC 7230 section 2.6, 3.1 and 3.5
   *
- * Parsing state is stored between calls. However the current implementation
- * begins parsing from scratch on every call.
- * The return value tells you whether the parsing state fields are valid or not.
- *
- * \retval -1  an error occurred. request_parse_status indicates HTTP status result.
+ * \retval -1  an error occurred. parseStatusCode indicates HTTP status result.
   * \retval  1  successful parse. member fields contain the request-line items
   * \retval  0  more data is needed to complete the parse
   */
  int
  Http::One::RequestParser::parseRequestFirstLine()
  {
-    int second_word = -1; // track the suspected URI start
-    int first_whitespace = -1, last_whitespace = -1; // track the first and last SP byte
-    int line_end = -1; // tracks the last byte BEFORE terminal \r\n or \n sequence
-
      debugs(74, 5, "parsing possible request: buf.length=" << buf_.length());
      debugs(74, DBG_DATA, buf_);
  
-    // Single-pass parse: (provided we have the whole line anyways)
+    SBuf line;
  
-    req.start = 0;
-    req.end = -1;
-    for (SBuf::size_type i = 0; i < buf_.length(); ++i) {
-        // track first and last whitespace (SP only)
-        if (buf_[i] == ' ') {
-            last_whitespace = i;
-            if (first_whitespace < req.start)
-                first_whitespace = i;
-        }
+    // Earlier, skipGarbageLines() took care of any leading LFs (if allowed).
+    // Now, the request line has to end at the first LF.
+    static const CharacterSet lineChars = CharacterSet::LF.complement("notLF");
+    ::Parser::Tokenizer lineTok(buf_);
+    if (!lineTok.prefix(line, lineChars) || !lineTok.skip('\n')) {
+        if (buf_.length() >= Config.maxRequestHeaderSize) {
+            /* who should we blame for our failure to parse this line? */
  
-        // track next non-SP/non-HT byte after first_whitespace
-        if (second_word < first_whitespace && buf_[i] != ' ' && buf_[i] != '\t') {
-            second_word = i;
-        }
+            Http1::Tokenizer methodTok(buf_);
+            if (!parseMethodField(methodTok))
+                return -1; // blame a bad method (or its delimiter)
  
-        // locate line terminator
-        if (buf_[i] == '\n') {
-            req.end = i;
-            line_end = i - 1;
-            break;
-        }
-        if (i < buf_.length() - 1 && buf_[i] == '\r') {
-            if (Config.onoff.relaxed_header_parser) {
-                if (Config.onoff.relaxed_header_parser < 0 && buf_[i + 1] == '\r')
-                    debugs(74, DBG_IMPORTANT, "WARNING: Invalid HTTP Request: " <<
-                           "Series of carriage-return bytes received prior to line terminator. " <<
-                           "Ignored due to relaxed_header_parser.");
-
-                // Be tolerant of invalid multiple \r prior to terminal \n
-                if (buf_[i + 1] == '\n' || buf_[i + 1] == '\r')
-                    line_end = i - 1;
-                while (i < buf_.length() - 1 && buf_[i + 1] == '\r')
-                    ++i;
-
-                if (buf_[i + 1] == '\n') {
-                    req.end = i + 1;
-                    break;
-                }
-            } else {
-                if (buf_[i + 1] == '\n') {
-                    req.end = i + 1;
-                    line_end = i - 1;
-                    break;
-                }
-            }
-
-            // RFC 7230 section 3.1.1 does not prohibit embeded CR like RFC 2616 used to.
-            // However it does explicitly state an exact syntax which omits un-encoded CR
-            // and defines 400 (Bad Request) as the required action when
-            // handed an invalid request-line.
-            request_parse_status = Http::scBadRequest;
+            // assume it is the URI
+            debugs(74, ErrorLevel(), "invalid request-line: URI exceeds " <<
+                   Config.maxRequestHeaderSize << "-byte limit");
+            parseStatusCode = Http::scUriTooLong;
              return -1;
          }
-    }
-
-    if (req.end == -1) {
-        // DoS protection against long first-line
-        if ((size_t)buf_.length() >= Config.maxRequestHeaderSize) {
-            debugs(33, 5, "Too large request-line");
-            // RFC 7230 section 3.1.1 mandatory 414 response if URL longer than acceptible.
-            request_parse_status = Http::scUriTooLong;
-            return -1;
-        }
-
-        debugs(74, 5, "Parser: retval 0: from " << req.start <<
-               "->" << req.end << ": needs more data to complete first line.");
+        debugs(74, 5, "Parser needs more data");
          return 0;
      }
  
-    // NP: we have now seen EOL, more-data (0) cannot occur.
-    //     From here on any failure is -1, success is 1
+    Http1::Tokenizer tok(line);
  
-    // Input Validation:
-
-    // DoS protection against long first-line
-    if ((size_t)(req.end-req.start) >= Config.maxRequestHeaderSize) {
-        debugs(33, 5, "Too large request-line");
-        request_parse_status = Http::scUriTooLong;
+    if (!parseMethodField(tok))
          return -1;
-    }
-
-    // Process what we now know about the line structure into field offsets
-    // generating HTTP status for any aborts as we go.
  
-    // First non-whitespace = beginning of method
-    if (req.start > line_end) {
-        request_parse_status = Http::scBadRequest;
+    /* now parse backwards, to leave just the URI */
+    if (!skipTrailingCrs(tok))
          return -1;
-    }
-    req.m_start = req.start;
  
-    // First whitespace = end of method
-    if (first_whitespace > line_end || first_whitespace < req.start) {
-        request_parse_status = Http::scBadRequest; // no method
-        return -1;
-    }
-    req.m_end = first_whitespace - 1;
-    if (req.m_end < req.m_start) {
-        request_parse_status = Http::scBadRequest; // missing URI?
+    if (!parseHttpVersionField(tok))
          return -1;
-    }
  
-    /* Set method_ */
-    const SBuf tmp = buf_.substr(req.m_start, req.m_end - req.m_start + 1);
-    method_ = HttpRequestMethod(tmp);
-
-    // First non-whitespace after first SP = beginning of URL+Version
-    if (second_word > line_end || second_word < req.start) {
-        request_parse_status = Http::scBadRequest; // missing URI
+    if (!http0() && !skipDelimiter(tok.skipAllTrailing(DelimiterCharacters()), "before protocol version"))
          return -1;
-    }
-    req.u_start = second_word;
  
-    // RFC 1945: SP and version following URI are optional, marking version 0.9
-    // we identify this by the last whitespace being earlier than URI start
-    if (last_whitespace < second_word && last_whitespace >= req.start) {
-        msgProtocol_ = Http::ProtocolVersion(0,9);
-        req.u_end = line_end;
-        uri_ = buf_.substr(req.u_start, req.u_end - req.u_start + 1);
-        request_parse_status = Http::scOkay; // HTTP/0.9
-        return 1;
-    } else {
-        // otherwise last whitespace is somewhere after end of URI.
-        req.u_end = last_whitespace;
-        // crop any trailing whitespace in the area we think of as URI
-        for (; req.u_end >= req.u_start && xisspace(buf_[req.u_end]); --req.u_end);
-    }
-    if (req.u_end < req.u_start) {
-        request_parse_status = Http::scBadRequest; // missing URI
-        return -1;
-    }
-    uri_ = buf_.substr(req.u_start, req.u_end - req.u_start + 1);
+    /* parsed everything before and after the URI */
  
-    // Last whitespace SP = before start of protocol/version
-    if (last_whitespace >= line_end) {
-        request_parse_status = Http::scBadRequest; // missing version
+    if (!parseUriField(tok))
          return -1;
-    }
-    req.v_start = last_whitespace + 1;
-    req.v_end = line_end;
  
-    /* RFC 7230 section 2.6 : handle unsupported HTTP major versions cleanly. */
-    if ((req.v_end - req.v_start +1) < (int)Http1magic.length() || !buf_.substr(req.v_start, SBuf::npos).startsWith(Http1magic)) {
-        // non-HTTP/1 protocols not supported / implemented.
-        request_parse_status = Http::scHttpVersionNotSupported;
+    if (!tok.atEnd()) {
+        debugs(33, ErrorLevel(), "invalid request-line: garbage after URI");
+        parseStatusCode = Http::scBadRequest;
          return -1;
      }
-    // NP: magic octets include the protocol name and major version DIGIT.
-    msgProtocol_.protocol = AnyP::PROTO_HTTP;
-    msgProtocol_.major = 1;
  
-    int i = req.v_start + Http1magic.length() -1;
+    parseStatusCode = Http::scOkay;
+    buf_ = lineTok.remaining(); // incremental parse checkpoint
+    return 1;
+}
  
-    // catch missing minor part
-    if (++i > line_end) {
-        request_parse_status = Http::scHttpVersionNotSupported;
-        return -1;
-    }
-    /* next should be one or more digits */
-    if (!isdigit(buf_[i])) {
-        request_parse_status = Http::scHttpVersionNotSupported;
-        return -1;
-    }
-    int min = 0;
-    for (; i <= line_end && (isdigit(buf_[i])) && min < 65536; ++i) {
-        min = min * 10;
-        min = min + (buf_[i]) - '0';
-    }
-    // catch too-big values or trailing garbage
-    if (min >= 65536 || i < line_end) {
-        request_parse_status = Http::scHttpVersionNotSupported;
-        return -1;
+bool
+Http::One::RequestParser::parse(const SBuf &aBuf)
+{
+    const bool result = doParse(aBuf);
+    if (preserveParsed_) {
+        assert(aBuf.length() >= remaining().length());
+        parsed_.append(aBuf.substr(0, aBuf.length() - remaining().length())); // newly parsed bytes
      }
-    msgProtocol_.minor = min;
  
-    /*
-     * Rightio - we have all the schtuff. Return true; we've got enough.
-     */
-    request_parse_status = Http::scOkay;
-    return 1;
+    return result;
  }
  
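+// NOTE(review): this note says "raw is not a reference because a reference might point back to our own buf_ or parsed_", yet aBuf below IS a reference -- confirm callers never pass buf_ or parsed_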
  bool
-Http::One::RequestParser::parse(const SBuf &aBuf)
+Http::One::RequestParser::doParse(const SBuf &aBuf)
  {
      buf_ = aBuf;
      debugs(74, DBG_DATA, "Parse buf={length=" << aBuf.length() << ", data='" << aBuf << "'}");
@@ -305,15 +382,13 @@ Http::One::RequestParser::parse(const SBuf &aBuf)
  
          // first-line (or a look-alike) found successfully.
          if (retcode > 0) {
-            buf_.consume(firstLineSize()); // first line bytes including CRLF terminator are now done.
              parsingStage_ = HTTP_PARSE_MIME;
          }
  
-        debugs(74, 5, "request-line: retval " << retcode << ": from " << req.start << "->" << req.end <<
-               " line={" << aBuf.length() << ", data='" << aBuf << "'}");
-        debugs(74, 5, "request-line: method " << req.m_start << "->" << req.m_end << " (" << method_ << ")");
-        debugs(74, 5, "request-line: url " << req.u_start << "->" << req.u_end << " (" << uri_ << ")");
-        debugs(74, 5, "request-line: proto " << req.v_start << "->" << req.v_end << " (" << msgProtocol_ << ")");
+        debugs(74, 5, "request-line: retval " << retcode << ": line={" << aBuf.length() << ", data='" << aBuf << "'}");
+        debugs(74, 5, "request-line: method: " << method_);
+        debugs(74, 5, "request-line: url: " << uri_);
+        debugs(74, 5, "request-line: proto: " << msgProtocol_);
          debugs(74, 5, "Parser: bytes processed=" << (aBuf.length()-buf_.length()));
          PROF_stop(HttpParserParseReqLine);
  
@@ -327,35 +402,9 @@ Http::One::RequestParser::parse(const SBuf &aBuf)
      // stage 3: locate the mime header block
      if (parsingStage_ == HTTP_PARSE_MIME) {
          // HTTP/1.x request-line is valid and parsing completed.
-        if (msgProtocol_.major == 1) {
-            /* NOTE: HTTP/0.9 requests do not have a mime header block.
-             *       So the rest of the code will need to deal with '0'-byte headers
-             *       (ie, none, so don't try parsing em)
-             */
-            int64_t mimeHeaderBytes = 0;
-            // XXX: c_str() reallocates. performance regression.
-            if ((mimeHeaderBytes = headersEnd(buf_.c_str(), buf_.length())) == 0) {
-                if (buf_.length()+firstLineSize() >= Config.maxRequestHeaderSize) {
-                    debugs(33, 5, "Too large request");
-                    request_parse_status = Http::scRequestHeaderFieldsTooLarge;
-                    parsingStage_ = HTTP_PARSE_DONE;
-                } else
-                    debugs(33, 5, "Incomplete request, waiting for end of headers");
-                return false;
-            }
-            mimeHeaderBlock_ = buf_.consume(mimeHeaderBytes);
-            debugs(74, 5, "mime header (0-" << mimeHeaderBytes << ") {" << mimeHeaderBlock_ << "}");
-
-        } else
-            debugs(33, 3, "Missing HTTP/1.x identifier");
-
-        // NP: we do not do any further stages here yet so go straight to DONE
-        parsingStage_ = HTTP_PARSE_DONE;
-
-        // Squid could handle these headers, but admin does not want to
-        if (messageHeaderSize() >= Config.maxRequestHeaderSize) {
-            debugs(33, 5, "Too large request");
-            request_parse_status = Http::scRequestHeaderFieldsTooLarge;
+        if (!grabMimeBlock("Request", Config.maxRequestHeaderSize)) {
+            if (parseStatusCode == Http::scHeaderTooLarge)
+                parseStatusCode = Http::scRequestHeaderFieldsTooLarge;
              return false;
          }
      }