From: Alex Rousskov <rousskov@measurement-factory.com>
Date: Sun, 4 Jun 2017 23:45:03 +0000 (-0600)
Subject: Bug 4492: Chunk extension parser is too pedantic.
X-Git-Tag: M-staged-PR71~143
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=26f0a359;p=thirdparty%2Fsquid.git

Bug 4492: Chunk extension parser is too pedantic.

Support HTTP/1 BWS when parsing HTTP and ICAP chunk extensions.
Per RFC 7230 Errata #4667, HTTP parsers MUST parse BWS in chunk-ext.
Per RFC 3507 and its extensions, ICAP agents generate BWS in chunk-ext.

Also discovered that our DelimiterCharacters() in pedantic mode is too
strict for many use cases: Most HTTP syntax rules allow both SP and HTAB
but pedantic DelimiterCharacters() only allows SP. Added
WhitespaceCharacters() to provide the more general set where it is
needed in new code (including BWS), but did not remove excessive
DelimiterCharacters() use.
---

diff --git a/src/http/one/Parser.cc b/src/http/one/Parser.cc
index 01aa183498..ba233d8481 100644
--- a/src/http/one/Parser.cc
+++ b/src/http/one/Parser.cc
@@ -47,7 +47,13 @@ RelaxedDelimiterCharacters()
     return RelaxedDels;
 }
 
-/// characters used to separate HTTP fields
+const CharacterSet &
+Http::One::Parser::WhitespaceCharacters()
+{
+    return Config.onoff.relaxed_header_parser ?
+           RelaxedDelimiterCharacters() : CharacterSet::WSP;
+}
+
 const CharacterSet &
 Http::One::Parser::DelimiterCharacters()
 {
@@ -259,11 +265,23 @@ Http::One::Parser::getHeaderField(const char *name)
     return NULL;
 }
 
-#if USE_HTTP_VIOLATIONS
 int
-Http::One::Parser::violationLevel() const
+Http::One::ErrorLevel()
 {
     return Config.onoff.relaxed_header_parser < 0 ? DBG_IMPORTANT : 5;
 }
-#endif
 
+// BWS = *( SP / HTAB ) ; WhitespaceCharacters() may relax this RFC 7230 rule
+bool
+Http::One::ParseBws(Tokenizer &tok)
+{
+    if (const auto count = tok.skipAll(Parser::WhitespaceCharacters())) {
+        // Generating BWS is a MUST-level violation so warn about it as needed.
+        debugs(33, ErrorLevel(), "found " << count << " BWS octets");
+        // RFC 7230 says we MUST parse BWS, so we fall through even if
+        // Config.onoff.relaxed_header_parser is off.
+    }
+    // else we successfully "parsed" an empty BWS sequence
+
+    return true;
+}
diff --git a/src/http/one/Parser.h b/src/http/one/Parser.h
index 6112b59d1c..ae518ef4d6 100644
--- a/src/http/one/Parser.h
+++ b/src/http/one/Parser.h
@@ -95,11 +95,6 @@ public:
     /// the remaining unprocessed section of buffer
     const SBuf &remaining() const {return buf_;}
 
-#if USE_HTTP_VIOLATIONS
-    /// the right debugs() level for parsing HTTP violation messages
-    int violationLevel() const;
-#endif
-
     /**
      * HTTP status code resulting from the parse process.
      * to be used on the invalid message handling.
@@ -110,8 +105,16 @@ public:
      */
     Http::StatusCode parseStatusCode = Http::scNone;
 
-    /// the characters which are to be considered valid whitespace
-    /// (WSP / BSP / OWS)
+    /// Whitespace between regular protocol elements.
+    /// Seen in RFCs as OWS, RWS, BWS, SP/HTAB but may be "relaxed" by us.
+    /// See also: DelimiterCharacters().
+    static const CharacterSet &WhitespaceCharacters();
+
+    /// Whitespace between protocol elements in restricted contexts like
+    /// request line, status line, asctime-date, and credentials
+    /// Seen in RFCs as SP but may be "relaxed" by us.
+    /// See also: WhitespaceCharacters().
+    /// XXX: Misnamed and overused.
     static const CharacterSet &DelimiterCharacters();
 
 protected:
@@ -159,6 +162,13 @@ private:
     void unfoldMime();
 };
 
+/// skips and, if needed, warns about RFC 7230 BWS ("bad" whitespace)
+/// \returns true (always; unlike all the skip*() functions)
+bool ParseBws(Tokenizer &tok);
+
+/// the right debugs() level for logging HTTP violation messages
+int ErrorLevel();
+
 } // namespace One
 } // namespace Http
 
diff --git a/src/http/one/RequestParser.cc b/src/http/one/RequestParser.cc
index d92c464092..94d48e9d8d 100644
--- a/src/http/one/RequestParser.cc
+++ b/src/http/one/RequestParser.cc
@@ -14,12 +14,6 @@
 #include "profiler/Profiler.h"
 #include "SquidConfig.h"
 
-// the right debugs() level for parsing errors
-inline static int
-ErrorLevel() {
-    return Config.onoff.relaxed_header_parser < 0 ? DBG_IMPORTANT : 5;
-}
-
 Http1::Parser::size_type
 Http::One::RequestParser::firstLineSize() const
 {
diff --git a/src/http/one/TeChunkedParser.cc b/src/http/one/TeChunkedParser.cc
index d730cdb8d2..42a090776c 100644
--- a/src/http/one/TeChunkedParser.cc
+++ b/src/http/one/TeChunkedParser.cc
@@ -105,10 +105,10 @@ Http::One::TeChunkedParser::parseChunkSize(Http1::Tokenizer &tok)
 }
 
 /**
- * Parses a set of RFC 7230 section 4.1.1 chunk-ext
- * http://tools.ietf.org/html/rfc7230#section-4.1.1
+ * Parses chunk metadata suffix, looking for interesting extensions and/or
+ * getting to the line terminator. RFC 7230 section 4.1.1 and its Errata #4667:
  *
- *   chunk-ext      = *( ";" chunk-ext-name [ "=" chunk-ext-val ] )
+ *   chunk-ext = *( BWS  ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] )
  *   chunk-ext-name = token
  *   chunk-ext-val  = token / quoted-string
  *
@@ -117,17 +117,16 @@ Http::One::TeChunkedParser::parseChunkSize(Http1::Tokenizer &tok)
 bool
 Http::One::TeChunkedParser::parseChunkExtension(Http1::Tokenizer &tok, bool skipKnown)
 {
-    // Bug 4492: IBM_HTTP_Server sends SP padding
-    if (auto n = tok.skipAll(CharacterSet::SP)) {
-        debugs(94, 3, "skipping " << n << " spurious whitespace at start of chunk extension");
-    }
-
     SBuf ext;
     SBuf value;
-    while (tok.skip(';') && tok.prefix(ext, CharacterSet::TCHAR)) {
+    while (
+        ParseBws(tok) && // Bug 4492: IBM_HTTP_Server sends SP after chunk-size
+        tok.skip(';') &&
+        ParseBws(tok) && // Bug 4492: ICAP servers send SP before chunk-ext-name
+        tok.prefix(ext, CharacterSet::TCHAR)) { // chunk-ext-name
 
         // whole value part is optional. if no '=' expect next chunk-ext
-        if (tok.skip('=')) {
+        if (ParseBws(tok) && tok.skip('=') && ParseBws(tok)) {
 
             if (!skipKnown) {
                 if (ext.cmp("use-original-body",17) == 0 && tok.int64(useOriginBody, 10)) {