From: Amos Jeffries <squid3@treenet.co.nz>
Date: Fri, 20 May 2016 08:28:33 +0000 (+1200)
Subject: HTTP/1.1: unfold mime header blocks
X-Git-Tag: SQUID_4_0_11~22
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=00237269110b6d9661457a7f8baa4c6450ded737;p=thirdparty%2Fsquid.git

HTTP/1.1: unfold mime header blocks

Enact the RFC 7230 section 3 requirement that proxies remove obs-fold from
received mime header blocks and drop all lines that start with whitespace
immediately following the request-line.

Also;

* Shuffle the DelimiterCharacters() and RelaxedDelimiters() helper methods
  to Http::One::Parser for use in processing mime.

* Extend the headersEnd() function with a wrapper to avoid SBuf::c_str()
  and to efficiently report whether obs-fold exists without needing to
  re-scan the block.

* Document the mime_header.h API function(s).

* Add Http::One::CrLf() to present a SBuf CRLF constant. This can be used
  instead of the many local string definitions we have now.
  The implementation could perhapse be done better for performance, but not
  done here.
---

diff --git a/src/http/one/Parser.cc b/src/http/one/Parser.cc
index e762957af0..1eb1ea46b8 100644
--- a/src/http/one/Parser.cc
+++ b/src/http/one/Parser.cc
@@ -16,6 +16,12 @@
 /// RFC 7230 section 2.6 - 7 magic octets
 const SBuf Http::One::Parser::Http1magic("HTTP/1.");
 
+const SBuf &Http::One::CrLf()
+{
+    static const SBuf crlf("\r\n");
+    return crlf;
+}
+
 void
 Http::One::Parser::clear()
 {
@@ -25,11 +31,34 @@ Http::One::Parser::clear()
     mimeHeaderBlock_.clear();
 }
 
+/// characters HTTP permits tolerant parsers to accept as delimiters
+static const CharacterSet &
+RelaxedDelimiterCharacters()
+{
+    // RFC 7230 section 3.5
+    // tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C),
+    // or bare CR as whitespace between request-line fields
+    static const CharacterSet RelaxedDels =
+        (CharacterSet::SP +
+         CharacterSet::HTAB +
+         CharacterSet("VT,FF","\x0B\x0C") +
+         CharacterSet::CR).rename("relaxed-WSP");
+
+    return RelaxedDels;
+}
+
+/// characters used to separate HTTP fields
+const CharacterSet &
+Http::One::Parser::DelimiterCharacters()
+{
+    return Config.onoff.relaxed_header_parser ?
+           RelaxedDelimiterCharacters() : CharacterSet::SP;
+}
+
 bool
 Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const
 {
-    static const SBuf crlf("\r\n");
-    if (tok.skip(crlf))
+    if (tok.skip(Http1::CrLf()))
         return true;
 
     if (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF))
@@ -38,6 +67,88 @@ Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const
     return false;
 }
 
+/// all characters except the LF line terminator
+static const CharacterSet &
+LineCharacters()
+{
+    static const CharacterSet line = CharacterSet::LF.complement("non-LF");
+    return line;
+}
+
+/**
+ * Remove invalid lines (if any) from the mime prefix
+ *
+ * RFC 7230 section 3:
+ * "A recipient that receives whitespace between the start-line and
+ * the first header field MUST ... consume each whitespace-preceded
+ * line without further processing of it."
+ *
+ * We need to always use the relaxed delimiters here to prevent
+ * line smuggling through strict parsers.
+ *
+ * Note that 'whitespace' in RFC 7230 includes CR. So that means
+ * sequences of CRLF will be pruned, but not sequences of bare-LF.
+ */
+void
+Http::One::Parser::cleanMimePrefix()
+{
+    Http1::Tokenizer tok(mimeHeaderBlock_);
+    while (tok.skipOne(RelaxedDelimiterCharacters())) {
+        (void)tok.skipAll(LineCharacters()); // optional line content
+        // LF terminator is required.
+        // trust headersEnd() to ensure that we have at least one LF
+        (void)tok.skipOne(CharacterSet::LF);
+    }
+
+    // If mimeHeaderBlock_ had just whitespace line(s) followed by CRLF,
+    // then we skipped everything, including that terminating LF.
+    // Restore the terminating CRLF if needed.
+    if (tok.atEnd())
+        mimeHeaderBlock_ = Http1::CrLf();
+    else
+        mimeHeaderBlock_ = tok.remaining();
+    // now mimeHeaderBlock_ has 0+ fields followed by the LF terminator
+}
+
+/**
+ * Replace obs-fold with a single SP,
+ *
+ * RFC 7230 section 3.2.4
+ * "A server that receives an obs-fold in a request message that is not
+ *  within a message/http container MUST ... replace
+ *  each received obs-fold with one or more SP octets prior to
+ *  interpreting the field value or forwarding the message downstream."
+ *
+ * "A proxy or gateway that receives an obs-fold in a response message
+ *  that is not within a message/http container MUST ... replace each
+ *  received obs-fold with one or more SP octets prior to interpreting
+ *  the field value or forwarding the message downstream."
+ */
+void
+Http::One::Parser::unfoldMime()
+{
+    Http1::Tokenizer tok(mimeHeaderBlock_);
+    const auto szLimit = mimeHeaderBlock_.length();
+    mimeHeaderBlock_.clear();
+    // prevent the mime sender being able to make append() realloc/grow multiple times.
+    mimeHeaderBlock_.reserveSpace(szLimit);
+
+    static const CharacterSet nonCRLF = (CharacterSet::CR + CharacterSet::LF).complement().rename("non-CRLF");
+
+    while (!tok.atEnd()) {
+        const SBuf all(tok.remaining());
+        const auto blobLen = tok.skipAll(nonCRLF); // may not be there
+        const auto crLen = tok.skipAll(CharacterSet::CR); // may not be there
+        const auto lfLen = tok.skipOne(CharacterSet::LF); // may not be there
+
+        if (lfLen && tok.skipAll(CharacterSet::WSP)) { // obs-fold!
+            mimeHeaderBlock_.append(all.substr(0, blobLen));
+            mimeHeaderBlock_.append(' '); // replace one obs-fold with one SP
+        } else
+            mimeHeaderBlock_.append(all.substr(0, blobLen + crLen + lfLen));
+    }
+}
+
 bool
 Http::One::Parser::grabMimeBlock(const char *which, const size_t limit)
 {
@@ -51,8 +162,8 @@ Http::One::Parser::grabMimeBlock(const char *which, const size_t limit)
          *       So the rest of the code will need to deal with '0'-byte headers
          *       (ie, none, so don't try parsing em)
          */
-        // XXX: c_str() reallocates. performance regression.
-        if (SBuf::size_type mimeHeaderBytes = headersEnd(buf_.c_str(), buf_.length())) {
+        bool containsObsFold;
+        if (SBuf::size_type mimeHeaderBytes = headersEnd(buf_, containsObsFold)) {
 
             // Squid could handle these headers, but admin does not want to
             if (firstLineSize() + mimeHeaderBytes >= limit) {
@@ -64,6 +175,10 @@ Http::One::Parser::grabMimeBlock(const char *which, const size_t limit)
             }
 
             mimeHeaderBlock_ = buf_.consume(mimeHeaderBytes);
+            cleanMimePrefix();
+            if (containsObsFold)
+                unfoldMime();
+
             debugs(74, 5, "mime header (0-" << mimeHeaderBytes << ") {" << mimeHeaderBlock_ << "}");
 
         } else { // headersEnd() == 0
@@ -102,12 +217,10 @@ Http::One::Parser::getHeaderField(const char *name)
     debugs(25, 5, "looking for " << name);
 
     // while we can find more LF in the SBuf
-    static CharacterSet iso8859Line = CharacterSet("non-LF",'\0','\n'-1) + CharacterSet(NULL, '\n'+1, (unsigned char)0xFF);
     Http1::Tokenizer tok(mimeHeaderBlock_);
     SBuf p;
-    static const SBuf crlf("\r\n");
 
-    while (tok.prefix(p, iso8859Line)) {
+    while (tok.prefix(p, LineCharacters())) {
         if (!tok.skipOne(CharacterSet::LF)) // move tokenizer past the LF
             break; // error. reached invalid octet or end of buffer insted of an LF ??
 
@@ -120,7 +233,7 @@ Http::One::Parser::getHeaderField(const char *name)
             continue;
 
         // drop any trailing *CR sequence
-        p.trim(crlf, false, true);
+        p.trim(Http1::CrLf(), false, true);
 
         debugs(25, 5, "checking " << p);
         p.consume(namelen + 1);
diff --git a/src/http/one/Parser.h b/src/http/one/Parser.h
index dc57583e6e..d24c8c503f 100644
--- a/src/http/one/Parser.h
+++ b/src/http/one/Parser.h
@@ -111,6 +111,10 @@ protected:
     /// consume from the tokenizer and return true only if found
     bool skipLineTerminator(Http1::Tokenizer &tok) const;
 
+    /// the characters which are to be considered valid whitespace
+    /// (WSP / BSP / OWS)
+    static const CharacterSet &DelimiterCharacters();
+
     /**
      * Scan to find the mime headers block for current message.
      *
@@ -139,6 +143,10 @@ protected:
 
     /// Whether the invalid HTTP as HTTP/0.9 hack expects a mime header block
     bool hackExpectsMime_;
+
+private:
+    void cleanMimePrefix();
+    void unfoldMime();
 };
 
 } // namespace One
diff --git a/src/http/one/RequestParser.cc b/src/http/one/RequestParser.cc
index ed04e85ed2..a1c18942e2 100644
--- a/src/http/one/RequestParser.cc
+++ b/src/http/one/RequestParser.cc
@@ -114,30 +114,6 @@ UriValidCharacters()
     return UriChars;
 }
 
-/// characters HTTP permits tolerant parsers to accept as delimiters
-static const CharacterSet &
-RelaxedDelimiterCharacters()
-{
-    // RFC 7230 section 3.5
-    // tolerant parser MAY accept any of SP, HTAB, VT (%x0B), FF (%x0C),
-    // or bare CR as whitespace between request-line fields
-    static const CharacterSet RelaxedDels =
-        CharacterSet::SP +
-        CharacterSet::HTAB +
-        CharacterSet("VT,FF","\x0B\x0C") +
-        CharacterSet::CR;
-
-    return RelaxedDels;
-}
-
-/// characters used to separate HTTP fields
-const CharacterSet &
-Http::One::RequestParser::DelimiterCharacters()
-{
-    return Config.onoff.relaxed_header_parser ?
-           RelaxedDelimiterCharacters() : CharacterSet::SP;
-}
-
 /// characters which Squid will accept in the HTTP request-target (URI)
 const CharacterSet &
 Http::One::RequestParser::RequestTargetCharacters()
diff --git a/src/http/one/RequestParser.h b/src/http/one/RequestParser.h
index d339e94b3c..107b8cf4f7 100644
--- a/src/http/one/RequestParser.h
+++ b/src/http/one/RequestParser.h
@@ -56,7 +56,6 @@ private:
     bool skipTrailingCrs(Http1::Tokenizer &tok);
 
     bool http0() const {return !msgProtocol_.major;}
-    static const CharacterSet &DelimiterCharacters();
     static const CharacterSet &RequestTargetCharacters();
 
     /// what request method has been found on the first line
diff --git a/src/http/one/forward.h b/src/http/one/forward.h
index 93f39634c6..791dc100e1 100644
--- a/src/http/one/forward.h
+++ b/src/http/one/forward.h
@@ -10,6 +10,7 @@
 #define SQUID_SRC_HTTP_ONE_FORWARD_H
 
 #include "base/RefCount.h"
+#include "sbuf/forward.h"
 
 namespace Http {
 namespace One {
@@ -27,6 +28,9 @@ typedef RefCount<Http::One::RequestParser> RequestParserPointer;
 class ResponseParser;
 typedef RefCount<Http::One::ResponseParser> ResponseParserPointer;
 
+/// CRLF textual representation
+const SBuf &CrLf();
+
 } // namespace One
 } // namespace Http
 
diff --git a/src/mime_header.cc b/src/mime_header.cc
index 746cd120eb..f8f5354e97 100644
--- a/src/mime_header.cc
+++ b/src/mime_header.cc
@@ -13,10 +13,11 @@
 #include "profiler/Profiler.h"
 
 size_t
-headersEnd(const char *mime, size_t l)
+headersEnd(const char *mime, size_t l, bool &containsObsFold)
 {
     size_t e = 0;
     int state = 1;
+    containsObsFold = false;
 
     PROF_start(headersEnd);
 
@@ -35,7 +36,10 @@ headersEnd(const char *mime, size_t l)
                 state = 2;
             else if ('\n' == mime[e])
                 state = 3;
-            else
+            else if (' ' == mime[e] || '\t' == mime[e]) {
+                containsObsFold = true;
+                state = 0;
+            } else
                 state = 0;
 
             break;
diff --git a/src/mime_header.h b/src/mime_header.h
index b57cc0d225..ca176f329c 100644
--- a/src/mime_header.h
+++ b/src/mime_header.h
@@ -11,7 +11,35 @@
 #ifndef SQUID_MIME_HEADER_H_
 #define SQUID_MIME_HEADER_H_
 
-size_t headersEnd(const char *, size_t);
+/**
+ * Scan for the end of mime header block.
+ *
+ * Which is one of the following octet patterns:
+ * - CRLF CRLF, or
+ * - CRLF LF, or
+ * - LF CRLF, or
+ * - LF LF
+ *
+ * Also detects whether a obf-fold pattern exists within the mime block
+ * - CR*LF (SP / HTAB)
+ *
+ * \param containsObsFold will be set to true if obs-fold pattern is found.
+ */
+size_t headersEnd(const char *, size_t, bool &containsObsFold);
+
+inline size_t
+headersEnd(const SBuf &buf, bool &containsObsFold)
+{
+    return headersEnd(buf.rawContent(), buf.length(), containsObsFold);
+}
+
+/// \deprecated caller needs to be fixed to handle obs-fold
+inline size_t
+headersEnd(const char *buf, size_t sz)
+{
+    bool ignored;
+    return headersEnd(buf, sz, ignored);
+}
 
 #endif /* SQUID_MIME_HEADER_H_ */
 
diff --git a/src/tests/stub_mime.cc b/src/tests/stub_mime.cc
index 42a9a5f53e..a284b5eb3a 100644
--- a/src/tests/stub_mime.cc
+++ b/src/tests/stub_mime.cc
@@ -11,5 +11,5 @@
 #define STUB_API "mime.cc"
 #include "tests/STUB.h"
 
-size_t headersEnd(const char *mime, size_t l) STUB_RETVAL(0)
+size_t headersEnd(const char *, size_t, bool &) STUB_RETVAL(0)