From a1b9ec20f3249d0478b2357edad095c1cc3fbcf4 Mon Sep 17 00:00:00 2001
From: Alex Rousskov <rousskov@measurement-factory.com>
Date: Tue, 6 Sep 2016 14:45:20 +1200
Subject: [PATCH] HTTP: validate Content-Length header values

Squid is violating HTTP MUSTs by forwarding messages with
problematic Content-Length values. Some of those bugs were fixed in
trunk r14215. This change handles multiple Content-Length values inside
one header field, negative values, and trailing garbage. Handling the
former required a change in the overall Content-Length interpretation
approach (which is why it was previously left as a TODO).

Squid now passes almost all Co-Advisor tests devoted to this area. We
are not 100% done though: We still need to handle malformed values with
leading signs (e.g., "-0" or "+1"). However, I hope that the remaining
problems are relatively minor. I do not plan on addressing them in the
foreseeable future.

Also improved httpHeaderParseOffset(): Added detection of overflowing
and underflowing integer values; polished malformed value detection code
(Linux strtoll(3) manual page has a good example). The function no
longer considers empty strings valid and reports trailing characters.
The function still accepts leading whitespace and signs. It is still the
wrong approach to HTTP numbers parsing, but further improvements are out
of scope because they are complicated and would require significant
caller rewrites.
---
 src/HttpHeader.cc                    |  80 +++++++---------
 src/HttpHeaderTools.cc               |  25 +++--
 src/HttpHeaderTools.h                |   8 +-
 src/http/ContentLengthInterpreter.cc | 134 +++++++++++++++++++++++++++
 src/http/ContentLengthInterpreter.h  |  57 ++++++++++++
 src/http/Makefile.am                 |   2 +
 src/http/one/Parser.h                |   8 +-
 7 files changed, 254 insertions(+), 60 deletions(-)
 create mode 100644 src/http/ContentLengthInterpreter.cc
 create mode 100644 src/http/ContentLengthInterpreter.h
diff --git a/src/HttpHeader.cc b/src/HttpHeader.cc
index 7275dae788..0197509c23 100644
--- a/src/HttpHeader.cc
+++ b/src/HttpHeader.cc
@@ -12,6 +12,7 @@
 #include "base/EnumIterator.h"
 #include "base64.h"
 #include "globals.h"
+#include "http/ContentLengthInterpreter.h"
 #include "HttpHdrCc.h"
 #include "HttpHdrContRange.h"
 #include "HttpHdrScTarget.h" // also includes HttpHdrSc.h
@@ -320,7 +321,6 @@ HttpHeader::parse(const char *header_start, size_t hdrLen)
 {
     const char *field_ptr = header_start;
     const char *header_end = header_start + hdrLen; // XXX: remove
-    HttpHeaderEntry *e, *e2;
     int warnOnError = (Config.onoff.relaxed_header_parser <= 0 ? DBG_IMPORTANT : 2);
 
     PROF_start(HttpHeaderParse);
@@ -338,6 +338,7 @@ HttpHeader::parse(const char *header_start, size_t hdrLen)
         return 0;
     }
 
+    Http::ContentLengthInterpreter clen(warnOnError);
     /* common format headers are "<name>:[ws]<value>" lines delimited by <CRLF>.
      * continuation lines start with a (single) space or tab */
     while (field_ptr < header_end) {
@@ -419,6 +420,7 @@ HttpHeader::parse(const char *header_start, size_t hdrLen)
             break;      /* terminating blank line */
         }
 
+        HttpHeaderEntry *e;
         if ((e = HttpHeaderEntry::parse(field_start, field_end)) == NULL) {
             debugs(55, warnOnError, "WARNING: unparseable HTTP header field {" <<
                    getStringPrefix(field_start, field_end-field_start) << "}");
@@ -432,45 +434,15 @@ HttpHeader::parse(const char *header_start, size_t hdrLen)
             return 0;
         }
 
-        // XXX: RFC 7230 Section 3.3.3 item #4 requires sending a 502 error in
-        // several cases that we do not yet cover. TODO: Rewrite to cover more.
-        if (e->id == Http::HdrType::CONTENT_LENGTH && (e2 = findEntry(e->id)) != nullptr) {
-            if (e->value != e2->value) {
-                int64_t l1, l2;
-                debugs(55, warnOnError, "WARNING: found two conflicting content-length headers in {" <<
-                       getStringPrefix(header_start, hdrLen) << "}");
-
-                if (!Config.onoff.relaxed_header_parser) {
-                    delete e;
-                    PROF_stop(HttpHeaderParse);
-                    clean();
-                    return 0;
-                }
-
-                if (!httpHeaderParseOffset(e->value.termedBuf(), &l1)) {
-                    debugs(55, DBG_IMPORTANT, "WARNING: Unparseable content-length '" << e->value << "'");
-                    delete e;
-                    continue;
-                } else if (!httpHeaderParseOffset(e2->value.termedBuf(), &l2)) {
-                    debugs(55, DBG_IMPORTANT, "WARNING: Unparseable content-length '" << e2->value << "'");
-                    delById(e2->id);
-                } else {
-                    if (l1 != l2)
-                        conflictingContentLength_ = true;
-                    delete e;
-                    continue;
-                }
-            } else {
-                debugs(55, warnOnError, "NOTICE: found double content-length header");
-                delete e;
+        if (e->id == Http::HdrType::CONTENT_LENGTH && !clen.checkField(e->value)) {
+            delete e;
 
-                if (Config.onoff.relaxed_header_parser)
-                    continue;
+            if (Config.onoff.relaxed_header_parser)
+                continue; // clen has printed any necessary warnings
 
-                PROF_stop(HttpHeaderParse);
-                clean();
-                return 0;
-            }
+            PROF_stop(HttpHeaderParse);
+            clean();
+            return 0;
         }
 
         if (e->id == Http::HdrType::OTHER && stringHasWhitespace(e->name.termedBuf())) {
@@ -488,14 +460,29 @@ HttpHeader::parse(const char *header_start, size_t hdrLen)
         addEntry(e);
     }
 
+    if (clen.headerWideProblem) {
+        debugs(55, warnOnError, "WARNING: " << clen.headerWideProblem <<
+               " Content-Length field values in" <<
+               Raw("header", header_start, hdrLen));
+    }
+
     if (chunked()) {
         // RFC 2616 section 4.4: ignore Content-Length with Transfer-Encoding
+        // RFC 7230 section 3.3.3 #3: Transfer-Encoding overwrites Content-Length
         delById(Http::HdrType::CONTENT_LENGTH);
-        // RFC 7230 section 3.3.3 #4: ignore Content-Length conflicts with Transfer-Encoding
-        conflictingContentLength_ = false;
-    } else if (conflictingContentLength_) {
-        // ensure our callers do not see the conflicting Content-Length value
+        // and clen state becomes irrelevant
+    } else if (clen.sawBad) {
+        // ensure our callers do not accidentally see bad Content-Length values
         delById(Http::HdrType::CONTENT_LENGTH);
+        conflictingContentLength_ = true; // TODO: Rename to badContentLength_.
+    } else if (clen.needsSanitizing) {
+        // RFC 7230 section 3.3.2: MUST either reject or ... [sanitize];
+        // ensure our callers see a clean Content-Length value or none at all
+        delById(Http::HdrType::CONTENT_LENGTH);
+        if (clen.sawGood) {
+            putInt64(Http::HdrType::CONTENT_LENGTH, clen.value);
+            debugs(55, 5, "sanitized Content-Length to be " << clen.value);
+        }
     }
 
     PROF_stop(HttpHeaderParse);
@@ -1479,12 +1466,9 @@ int64_t
 HttpHeaderEntry::getInt64() const
 {
     int64_t val = -1;
-    int ok = httpHeaderParseOffset(value.termedBuf(), &val);
-    httpHeaderNoteParsedEntry(id, value, ok == 0);
-    /* XXX: Should we check ok - ie
-     * return ok ? -1 : value;
-     */
-    return val;
+    const bool ok = httpHeaderParseOffset(value.termedBuf(), &val);
+    httpHeaderNoteParsedEntry(id, value, ok);
+    return val; // remains -1 if !ok (XXX: bad method API)
 }
 
 static void
diff --git a/src/HttpHeaderTools.cc b/src/HttpHeaderTools.cc
index af5c2bd880..7015f1b9b8 100644
--- a/src/HttpHeaderTools.cc
+++ b/src/HttpHeaderTools.cc
@@ -138,18 +138,29 @@ httpHeaderParseInt(const char *start, int *value)
     return 1;
 }
 
-int
-httpHeaderParseOffset(const char *start, int64_t * value)
+bool
+httpHeaderParseOffset(const char *start, int64_t *value, char **endPtr)
 {
+    char *end = nullptr;
     errno = 0;
-    int64_t res = strtoll(start, NULL, 10);
-    if (!res && EINVAL == errno) {   /* maybe not portable? */
-        debugs(66, 7, "failed to parse offset in " << start);
-        return 0;
+    const int64_t res = strtoll(start, &end, 10);
+    if (errno && !res) {
+        debugs(66, 7, "failed to parse malformed offset in " << start);
+        return false;
+    }
+    if (errno == ERANGE && (res == LLONG_MIN || res == LLONG_MAX)) { // no overflow
+        debugs(66, 7, "failed to parse huge offset in " << start);
+        return false;
+    }
+    if (start == end) {
+        debugs(66, 7, "failed to parse empty offset");
+        return false;
     }
     *value = res;
+    if (endPtr)
+        *endPtr = end;
     debugs(66, 7, "offset " << start << " parsed as " << res);
-    return 1;
+    return true;
 }
 
 /**
diff --git a/src/HttpHeaderTools.h b/src/HttpHeaderTools.h
index bec824ec02..24e5bc88c1 100644
--- a/src/HttpHeaderTools.h
+++ b/src/HttpHeaderTools.h
@@ -117,7 +117,13 @@ public:
     bool quoted;
 };
 
-int httpHeaderParseOffset(const char *start, int64_t * off);
+/// A strtoll(10) wrapper that checks for strtoll() failures and other problems.
+/// XXX: This function is not fully compatible with some HTTP syntax rules.
+/// Just like strtoll(), allows whitespace prefix, a sign, and _any_ suffix.
+/// Requires at least one digit to be present.
+/// Sets "off" and "end" arguments if and only if no problems were found.
+/// \return true if and only if no problems were found.
+bool httpHeaderParseOffset(const char *start, int64_t *offPtr, char **endPtr = nullptr);
 
 int httpHeaderHasConnDir(const HttpHeader * hdr, const char *directive);
 int httpHeaderParseInt(const char *start, int *val);
diff --git a/src/http/ContentLengthInterpreter.cc b/src/http/ContentLengthInterpreter.cc
new file mode 100644
index 0000000000..61c1518c8b
--- /dev/null
+++ b/src/http/ContentLengthInterpreter.cc
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) 1996-2016 The Squid Software Foundation and contributors
+ *
+ * Squid software is distributed under GPLv2+ license and includes
+ * contributions from numerous individuals and organizations.
+ * Please see the COPYING and CONTRIBUTORS files for details.
+ */
+
+/* DEBUG: section 55    HTTP Header */
+
+#include "squid.h"
+#include "base/CharacterSet.h"
+#include "Debug.h"
+#include "http/ContentLengthInterpreter.h"
+#include "http/one/Parser.h"
+#include "HttpHeaderTools.h"
+#include "SquidConfig.h"
+#include "SquidString.h"
+#include "StrList.h"
+
+Http::ContentLengthInterpreter::ContentLengthInterpreter(const int aDebugLevel):
+    value(-1),
+    headerWideProblem(nullptr),
+    debugLevel(aDebugLevel),
+    sawBad(false),
+    needsSanitizing(false),
+    sawGood(false)
+{
+}
+
+/// checks whether all characters after the Content-Length are allowed
+bool
+Http::ContentLengthInterpreter::goodSuffix(const char *suffix, const char * const end) const
+{
+    // optimize for the common case that does not need delimiters
+    if (suffix == end)
+        return true;
+
+    for (const CharacterSet &delimiters = Http::One::Parser::DelimiterCharacters();
+            suffix < end; ++suffix) {
+        if (!delimiters[*suffix])
+            return false;
+    }
+    // needsSanitizing = true; // TODO: Always remove trailing whitespace?
+    return true; // including empty suffix
+}
+
+/// handles a single-token Content-Length value
+/// rawValue null-termination requirements are those of httpHeaderParseOffset()
+bool
+Http::ContentLengthInterpreter::checkValue(const char *rawValue, const int valueSize)
+{
+    Must(!sawBad);
+
+    int64_t latestValue = -1;
+    char *suffix = nullptr;
+    // TODO: Handle malformed values with leading signs (e.g., "-0" or "+1").
+    if (!httpHeaderParseOffset(rawValue, &latestValue, &suffix)) {
+        debugs(55, DBG_IMPORTANT, "WARNING: Malformed" << Raw("Content-Length", rawValue, valueSize));
+        sawBad = true;
+        return false;
+    }
+
+    if (latestValue < 0) {
+        debugs(55, debugLevel, "WARNING: Negative" << Raw("Content-Length", rawValue, valueSize));
+        sawBad = true;
+        return false;
+    }
+
+    // check for garbage after the number
+    if (!goodSuffix(suffix, rawValue + valueSize)) {
+        debugs(55, debugLevel, "WARNING: Trailing garbage in" << Raw("Content-Length", rawValue, valueSize));
+        sawBad = true;
+        return false;
+    }
+
+    if (sawGood) {
+        /* we have found at least two, possibly identical values */
+
+        needsSanitizing = true; // replace identical values with a single value
+
+        const bool conflicting = value != latestValue;
+        if (conflicting)
+            headerWideProblem = "Conflicting"; // overwrite any lesser problem
+        else if (!headerWideProblem) // preserve a possibly worse problem
+            headerWideProblem = "Duplicate";
+
+        // with relaxed_header_parser, identical values are permitted
+        sawBad = !Config.onoff.relaxed_header_parser || conflicting;
+        return false; // conflicting or duplicate
+    }
+
+    sawGood = true;
+    value = latestValue;
+    return true;
+}
+
+/// handles Content-Length: a, b, c
+bool
+Http::ContentLengthInterpreter::checkList(const String &list)
+{
+    Must(!sawBad);
+
+    if (!Config.onoff.relaxed_header_parser) {
+        debugs(55, debugLevel, "WARNING: List-like" << Raw("Content-Length", list.rawBuf(), list.size()));
+        sawBad = true;
+        return false;
+    }
+
+    needsSanitizing = true; // remove extra commas (at least)
+
+    const char *pos = nullptr;
+    const char *item = nullptr;;
+    int ilen = -1;
+    while (strListGetItem(&list, ',', &item, &ilen, &pos)) {
+        if (!checkValue(item, ilen) && sawBad)
+            break;
+        // keep going after a duplicate value to find conflicting ones
+    }
+    return false; // no need to keep this list field; it will be sanitized away
+}
+
+bool
+Http::ContentLengthInterpreter::checkField(const String &rawValue)
+{
+    if (sawBad)
+        return false; // one rotten apple is enough to spoil all of them
+
+    // TODO: Optimize by always parsing the first integer first.
+    return rawValue.pos(',') ?
+           checkList(rawValue) :
+           checkValue(rawValue.rawBuf(), rawValue.size());
+}
+
diff --git a/src/http/ContentLengthInterpreter.h b/src/http/ContentLengthInterpreter.h
new file mode 100644
index 0000000000..128c34d885
--- /dev/null
+++ b/src/http/ContentLengthInterpreter.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 1996-2016 The Squid Software Foundation and contributors
+ *
+ * Squid software is distributed under GPLv2+ license and includes
+ * contributions from numerous individuals and organizations.
+ * Please see the COPYING and CONTRIBUTORS files for details.
+ */
+
+#ifndef SQUID_SRC_HTTP_CONTENTLENGTH_INTERPRETER_H
+#define SQUID_SRC_HTTP_CONTENTLENGTH_INTERPRETER_H
+
+class String;
+
+namespace Http
+{
+
+/// Finds the intended Content-Length value while parsing message-header fields.
+/// Deals with complications such as value lists and/or repeated fields.
+class ContentLengthInterpreter
+{
+public:
+    explicit ContentLengthInterpreter(const int aDebugLevel);
+
+    /// updates history based on the given message-header field
+    /// \return true iff the field should be added/remembered for future use
+    bool checkField(const String &field);
+
+    /// intended Content-Length value if sawGood is set and sawBad is not set
+    /// meaningless otherwise
+    int64_t value;
+
+    /* for debugging (declared here to minimize padding) */
+    const char *headerWideProblem; ///< worst header-wide problem found (or nil)
+    const int debugLevel; ///< debugging level for certain warnings
+
+    /// whether a malformed Content-Length value was present
+    bool sawBad;
+
+    /// whether all remembered fields should be removed
+    /// removed fields ought to be replaced with the intended value (if known)
+    /// irrelevant if sawBad is set
+    bool needsSanitizing;
+
+    /// whether a valid field value was present, possibly among problematic ones
+    /// irrelevant if sawBad is set
+    bool sawGood;
+
+protected:
+    bool goodSuffix(const char *suffix, const char * const end) const;
+    bool checkValue(const char *start, const int size);
+    bool checkList(const String &list);
+};
+
+} // namespace Http
+
+#endif /* SQUID_SRC_HTTP_CONTENTLENGTH_INTERPRETER_H */
+
diff --git a/src/http/Makefile.am b/src/http/Makefile.am
index b3071bb162..43ea4fa5b8 100644
--- a/src/http/Makefile.am
+++ b/src/http/Makefile.am
@@ -14,6 +14,8 @@ DIST_SUBDIRS = one url_rewriters
 noinst_LTLIBRARIES = libhttp.la
 
 libhttp_la_SOURCES = \
+	ContentLengthInterpreter.cc \
+	ContentLengthInterpreter.h \
 	forward.h \
 	MethodType.cc \
 	MethodType.h \
diff --git a/src/http/one/Parser.h b/src/http/one/Parser.h
index b1e819a945..cdf0c3a605 100644
--- a/src/http/one/Parser.h
+++ b/src/http/one/Parser.h
@@ -106,6 +106,10 @@ public:
      */
     Http::StatusCode parseStatusCode;
 
+    /// the characters which are to be considered valid whitespace
+    /// (WSP / BSP / OWS)
+    static const CharacterSet &DelimiterCharacters();
+
 protected:
     /**
      * detect and skip the CRLF or (if tolerant) LF line terminator
@@ -117,10 +121,6 @@ protected:
      */
     bool skipLineTerminator(Http1::Tokenizer &tok) const;
 
-    /// the characters which are to be considered valid whitespace
-    /// (WSP / BSP / OWS)
-    static const CharacterSet &DelimiterCharacters();
-
     /**
      * Scan to find the mime headers block for current message.
      *
-- 
2.39.5