Fix incremental parsing of chunked quoted extensions (#310)

author Eduard Bagdasaryan <eduard.bagdasaryan@measurement-factory.com>

Mon, 18 Mar 2019 17:48:21 +0000 (17:48 +0000)

committer Squid Anubis <squid-anubis@squid-cache.org>

Mon, 18 Mar 2019 17:48:25 +0000 (17:48 +0000)
author Eduard Bagdasaryan <eduard.bagdasaryan@measurement-factory.com>
Mon, 18 Mar 2019 17:48:21 +0000 (17:48 +0000)
committer Squid Anubis <squid-anubis@squid-cache.org>
Mon, 18 Mar 2019 17:48:25 +0000 (17:48 +0000)
diff --git a/src/adaptation/icap/ModXact.cc b/src/adaptation/icap/ModXact.cc

index 3a095a0bb295b042565f84fcad755b4a40601466..78a374013148d8d7c6db417881d5b331ca01e605 100644 (file)
--- a/src/adaptation/icap/ModXact.cc
+++ b/src/adaptation/icap/ModXact.cc
@@ -26,10 +26,11 @@
  #include "comm/Connection.h"
  #include "err_detail_type.h"
  #include "http/ContentLengthInterpreter.h"
-#include "http/one/TeChunkedParser.h"
  #include "HttpHeaderTools.h"
  #include "HttpReply.h"
  #include "MasterXaction.h"
+#include "parser/Tokenizer.h"
+#include "sbuf/Stream.h"
  #include "SquidTime.h"
  
  // flow and terminology:
@@ -43,6 +44,8 @@ CBDATA_NAMESPACED_CLASS_INIT(Adaptation::Icap, ModXactLauncher);
  
  static const size_t TheBackupLimit = BodyPipe::MaxCapacity;
  
+const SBuf Adaptation::Icap::ChunkExtensionValueParser::UseOriginalBodyName("use-original-body");
+
  Adaptation::Icap::ModXact::State::State()
  {
      memset(this, 0, sizeof(*this));
@@ -1145,6 +1148,7 @@ void Adaptation::Icap::ModXact::decideOnParsingBody()
          state.parsing = State::psBody;
          replyHttpBodySize = 0;
          bodyParser = new Http1::TeChunkedParser;
+        bodyParser->parseExtensionValuesWith(&extensionParser);
          makeAdaptedBodyPipe("adapted response from the ICAP server");
          Must(state.sending == State::sendingAdapted);
      } else {
@@ -1182,8 +1186,8 @@ void Adaptation::Icap::ModXact::parseBody()
      }
  
      if (parsed) {
-        if (state.readyForUob && bodyParser->useOriginBody >= 0)
-            prepPartialBodyEchoing(static_cast<uint64_t>(bodyParser->useOriginBody));
+        if (state.readyForUob && extensionParser.sawUseOriginalBody())
+            prepPartialBodyEchoing(extensionParser.useOriginalBody());
          else
              stopSending(true); // the parser succeeds only if all parsed data fits
          if (trailerParser)
@@ -2074,3 +2078,14 @@ bool Adaptation::Icap::TrailerParser::parse(const char *buf, int len, int atEnd,
      return parsed > 0;
  }
  
+void
+Adaptation::Icap::ChunkExtensionValueParser::parse(Tokenizer &tok, const SBuf &extName)
+{
+    if (extName == UseOriginalBodyName) {
+        useOriginalBody_ = tok.udec64("use-original-body");
+        assert(useOriginalBody_ >= 0);
+    } else {
+        Ignore(tok, extName);
+    }
+}
+
diff --git a/src/adaptation/icap/ModXact.h b/src/adaptation/icap/ModXact.h

index 6aa1e9f2a1871a77e19057fb59b5eda0d60320aa..830938bb932116c38ea9fde7156bd14f66436745 100644 (file)
--- a/src/adaptation/icap/ModXact.h
+++ b/src/adaptation/icap/ModXact.h
@@ -15,6 +15,7 @@
  #include "adaptation/icap/Xaction.h"
  #include "BodyPipe.h"
  #include "http/one/forward.h"
+#include "http/one/TeChunkedParser.h"
  
  /*
   * ICAPModXact implements ICAP REQMOD and RESPMOD transaction using
@@ -120,6 +121,23 @@ public:
      size_t hdr_sz; // pedantic XXX: wrong type dictated by HttpHeader::parse() API
  };
  
+/// handles ICAP-specific chunk extensions supported by Squid
+class ChunkExtensionValueParser: public Http1::ChunkExtensionValueParser
+{
+public:
+    /* Http1::ChunkExtensionValueParser API */
+    virtual void parse(Tokenizer &tok, const SBuf &extName) override;
+
+    bool sawUseOriginalBody() const { return useOriginalBody_ >= 0; }
+    uint64_t useOriginalBody() const { assert(sawUseOriginalBody()); return static_cast<uint64_t>(useOriginalBody_); }
+
+private:
+    static const SBuf UseOriginalBodyName;
+
+    /// the value of the parsed use-original-body chunk extension (or -1)
+    int64_t useOriginalBody_ = -1;
+};
+
  class ModXact: public Xaction, public BodyProducer, public BodyConsumer
  {
      CBDATA_CLASS(ModXact);
@@ -301,6 +319,8 @@ private:
  
      TrailerParser *trailerParser;
  
+    ChunkExtensionValueParser extensionParser;
+
      class State
      {
  
diff --git a/src/http/one/Parser.cc b/src/http/one/Parser.cc

index 069bdacee0a05596ccad33d0386795a816b7a753..8b262a99dc724d4bd6c5ad0f3c0a3bac94709d49 100644 (file)
--- a/src/http/one/Parser.cc
+++ b/src/http/one/Parser.cc
@@ -7,10 +7,11 @@
   */
  
  #include "squid.h"
+#include "base/CharacterSet.h"
  #include "Debug.h"
  #include "http/one/Parser.h"
-#include "http/one/Tokenizer.h"
  #include "mime_header.h"
+#include "parser/Tokenizer.h"
  #include "SquidConfig.h"
  
  /// RFC 7230 section 2.6 - 7 magic octets
@@ -61,20 +62,19 @@ Http::One::Parser::DelimiterCharacters()
             RelaxedDelimiterCharacters() : CharacterSet::SP;
  }
  
-bool
-Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const
+void
+Http::One::Parser::skipLineTerminator(Tokenizer &tok) const
  {
      if (tok.skip(Http1::CrLf()))
-        return true;
+        return;
  
      if (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF))
-        return true;
+        return;
  
      if (tok.atEnd() || (tok.remaining().length() == 1 && tok.remaining().at(0) == '\r'))
-        return false; // need more data
+        throw InsufficientInput();
  
      throw TexcHere("garbage instead of CRLF line terminator");
-    return false; // unreachable, but make naive compilers happy
  }
  
  /// all characters except the LF line terminator
@@ -102,7 +102,7 @@ LineCharacters()
  void
  Http::One::Parser::cleanMimePrefix()
  {
-    Http1::Tokenizer tok(mimeHeaderBlock_);
+    Tokenizer tok(mimeHeaderBlock_);
      while (tok.skipOne(RelaxedDelimiterCharacters())) {
          (void)tok.skipAll(LineCharacters()); // optional line content
          // LF terminator is required.
@@ -137,7 +137,7 @@ Http::One::Parser::cleanMimePrefix()
  void
  Http::One::Parser::unfoldMime()
  {
-    Http1::Tokenizer tok(mimeHeaderBlock_);
+    Tokenizer tok(mimeHeaderBlock_);
      const auto szLimit = mimeHeaderBlock_.length();
      mimeHeaderBlock_.clear();
      // prevent the mime sender being able to make append() realloc/grow multiple times.
@@ -227,7 +227,7 @@ Http::One::Parser::getHeaderField(const char *name)
      debugs(25, 5, "looking for " << name);
  
      // while we can find more LF in the SBuf
-    Http1::Tokenizer tok(mimeHeaderBlock_);
+    Tokenizer tok(mimeHeaderBlock_);
      SBuf p;
  
      while (tok.prefix(p, LineCharacters())) {
@@ -249,7 +249,7 @@ Http::One::Parser::getHeaderField(const char *name)
          p.consume(namelen + 1);
  
          // TODO: optimize SBuf::trim to take CharacterSet directly
-        Http1::Tokenizer t(p);
+        Tokenizer t(p);
          t.skipAll(CharacterSet::WSP);
          p = t.remaining();
  
@@ -272,10 +272,15 @@ Http::One::ErrorLevel()
  }
  
  // BWS = *( SP / HTAB ) ; WhitespaceCharacters() may relax this RFC 7230 rule
-bool
-Http::One::ParseBws(Tokenizer &tok)
+void
+Http::One::ParseBws(Parser::Tokenizer &tok)
  {
-    if (const auto count = tok.skipAll(Parser::WhitespaceCharacters())) {
+    const auto count = tok.skipAll(Parser::WhitespaceCharacters());
+
+    if (tok.atEnd())
+        throw InsufficientInput(); // even if count is positive
+
+    if (count) {
          // Generating BWS is a MUST-level violation so warn about it as needed.
          debugs(33, ErrorLevel(), "found " << count << " BWS octets");
          // RFC 7230 says we MUST parse BWS, so we fall through even if
@@ -283,6 +288,6 @@ Http::One::ParseBws(Tokenizer &tok)
      }
      // else we successfully "parsed" an empty BWS sequence
  
-    return true;
+    // success: no more BWS characters expected
  }
  
diff --git a/src/http/one/Parser.h b/src/http/one/Parser.h

index 202c52d54274fee720fe61f9ca1d3d946d4512ec..f449e8334c0280eb1b6dc41914ed1b8a3623fbf9 100644 (file)
--- a/src/http/one/Parser.h
+++ b/src/http/one/Parser.h
@@ -12,6 +12,7 @@
  #include "anyp/ProtocolVersion.h"
  #include "http/one/forward.h"
  #include "http/StatusCode.h"
+#include "parser/forward.h"
  #include "sbuf/SBuf.h"
  
  namespace Http {
@@ -40,6 +41,7 @@ class Parser : public RefCountable
  {
  public:
      typedef SBuf::size_type size_type;
+    typedef ::Parser::Tokenizer Tokenizer;
  
      Parser() = default;
      Parser(const Parser &) = default;
@@ -122,11 +124,11 @@ protected:
       * detect and skip the CRLF or (if tolerant) LF line terminator
       * consume from the tokenizer.
       *
-     * throws if non-terminator is detected.
+     * \throws exception on bad input or InsufficientInput when more data is needed.
       * \retval true only if line terminator found.
       * \retval false incomplete or missing line terminator, need more data.
       */
-    bool skipLineTerminator(Http1::Tokenizer &tok) const;
+    void skipLineTerminator(Tokenizer &) const;
  
      /**
       * Scan to find the mime headers block for current message.
@@ -163,8 +165,8 @@ private:
  };
  
  /// skips and, if needed, warns about RFC 7230 BWS ("bad" whitespace)
-/// \returns true (always; unlike all the skip*() functions)
-bool ParseBws(Tokenizer &tok);
+/// \throws InsufficientInput when the end of BWS cannot be confirmed
+void ParseBws(Parser::Tokenizer &);
  
  /// the right debugs() level for logging HTTP violation messages
  int ErrorLevel();
diff --git a/src/http/one/RequestParser.cc b/src/http/one/RequestParser.cc

index 5dd48011c5140e8e19861ca9ce819451f1518162..81c6d20845e3b65b54673471833aee7fb43a490c 100644 (file)
--- a/src/http/one/RequestParser.cc
+++ b/src/http/one/RequestParser.cc
@@ -9,8 +9,8 @@
  #include "squid.h"
  #include "Debug.h"
  #include "http/one/RequestParser.h"
-#include "http/one/Tokenizer.h"
  #include "http/ProtocolVersion.h"
+#include "parser/Tokenizer.h"
  #include "profiler/Profiler.h"
  #include "SquidConfig.h"
  
@@ -59,7 +59,7 @@ Http::One::RequestParser::skipGarbageLines()
   *  RFC 7230 section 2.6, 3.1 and 3.5
   */
  bool
-Http::One::RequestParser::parseMethodField(Http1::Tokenizer &tok)
+Http::One::RequestParser::parseMethodField(Tokenizer &tok)
  {
      // method field is a sequence of TCHAR.
      // Limit to 32 characters to prevent overly long sequences of non-HTTP
@@ -140,7 +140,7 @@ Http::One::RequestParser::RequestTargetCharacters()
  }
  
  bool
-Http::One::RequestParser::parseUriField(Http1::Tokenizer &tok)
+Http::One::RequestParser::parseUriField(Tokenizer &tok)
  {
      /* Arbitrary 64KB URI upper length limit.
       *
@@ -173,7 +173,7 @@ Http::One::RequestParser::parseUriField(Http1::Tokenizer &tok)
  }
  
  bool
-Http::One::RequestParser::parseHttpVersionField(Http1::Tokenizer &tok)
+Http::One::RequestParser::parseHttpVersionField(Tokenizer &tok)
  {
      static const SBuf http1p0("HTTP/1.0");
      static const SBuf http1p1("HTTP/1.1");
@@ -248,7 +248,7 @@ Http::One::RequestParser::skipDelimiter(const size_t count, const char *where)
  
  /// Parse CRs at the end of request-line, just before the terminating LF.
  bool
-Http::One::RequestParser::skipTrailingCrs(Http1::Tokenizer &tok)
+Http::One::RequestParser::skipTrailingCrs(Tokenizer &tok)
  {
      if (Config.onoff.relaxed_header_parser) {
          (void)tok.skipAllTrailing(CharacterSet::CR); // optional; multiple OK
@@ -284,12 +284,12 @@ Http::One::RequestParser::parseRequestFirstLine()
      // Earlier, skipGarbageLines() took care of any leading LFs (if allowed).
      // Now, the request line has to end at the first LF.
      static const CharacterSet lineChars = CharacterSet::LF.complement("notLF");
-    ::Parser::Tokenizer lineTok(buf_);
+    Tokenizer lineTok(buf_);
      if (!lineTok.prefix(line, lineChars) || !lineTok.skip('\n')) {
          if (buf_.length() >= Config.maxRequestHeaderSize) {
              /* who should we blame for our failure to parse this line? */
  
-            Http1::Tokenizer methodTok(buf_);
+            Tokenizer methodTok(buf_);
              if (!parseMethodField(methodTok))
                  return -1; // blame a bad method (or its delimiter)
  
@@ -303,7 +303,7 @@ Http::One::RequestParser::parseRequestFirstLine()
          return 0;
      }
  
-    Http1::Tokenizer tok(line);
+    Tokenizer tok(line);
  
      if (!parseMethodField(tok))
          return -1;
diff --git a/src/http/one/RequestParser.h b/src/http/one/RequestParser.h

index dd6dc6f1cadff292184158e2e6b477ef45d7bf56..d48f72cba54a4e536511f59a7c5259c3a3d393b2 100644 (file)
--- a/src/http/one/RequestParser.h
+++ b/src/http/one/RequestParser.h
@@ -59,11 +59,11 @@ private:
      bool doParse(const SBuf &aBuf);
  
      /* all these return false and set parseStatusCode on parsing failures */
-    bool parseMethodField(Http1::Tokenizer &);
-    bool parseUriField(Http1::Tokenizer &);
-    bool parseHttpVersionField(Http1::Tokenizer &);
+    bool parseMethodField(Tokenizer &);
+    bool parseUriField(Tokenizer &);
+    bool parseHttpVersionField(Tokenizer &);
      bool skipDelimiter(const size_t count, const char *where);
-    bool skipTrailingCrs(Http1::Tokenizer &tok);
+    bool skipTrailingCrs(Tokenizer &tok);
  
      bool http0() const {return !msgProtocol_.major;}
      static const CharacterSet &RequestTargetCharacters();
diff --git a/src/http/one/ResponseParser.cc b/src/http/one/ResponseParser.cc

index 4dcaafb270e0acdc8b2f47be3e2ea8ad712bdcda..4d09a03cc16b7979005b8f6bc1f0b303b5c01314 100644 (file)
--- a/src/http/one/ResponseParser.cc
+++ b/src/http/one/ResponseParser.cc
@@ -9,8 +9,8 @@
  #include "squid.h"
  #include "Debug.h"
  #include "http/one/ResponseParser.h"
-#include "http/one/Tokenizer.h"
  #include "http/ProtocolVersion.h"
+#include "parser/Tokenizer.h"
  #include "profiler/Profiler.h"
  #include "SquidConfig.h"
  
@@ -47,7 +47,7 @@ Http::One::ResponseParser::firstLineSize() const
  // NP: we found the protocol version and consumed it already.
  // just need the status code and reason phrase
  int
-Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, const CharacterSet &WspDelim)
+Http::One::ResponseParser::parseResponseStatusAndReason(Tokenizer &tok, const CharacterSet &WspDelim)
  {
      if (!completedStatus_) {
          debugs(74, 9, "seek status-code in: " << tok.remaining().substr(0,10) << "...");
@@ -87,14 +87,13 @@ Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, c
      static const CharacterSet phraseChars = CharacterSet::WSP + CharacterSet::VCHAR + CharacterSet::OBSTEXT;
      (void)tok.prefix(reasonPhrase_, phraseChars); // optional, no error if missing
      try {
-        if (skipLineTerminator(tok)) {
-            debugs(74, DBG_DATA, "parse remaining buf={length=" << tok.remaining().length() << ", data='" << tok.remaining() << "'}");
-            buf_ = tok.remaining(); // resume checkpoint
-            return 1;
-        }
+        skipLineTerminator(tok);
+        buf_ = tok.remaining(); // resume checkpoint
+        debugs(74, DBG_DATA, Raw("leftovers", buf_.rawContent(), buf_.length()));
+        return 1;
+    } catch (const InsufficientInput &) {
          reasonPhrase_.clear();
          return 0; // need more to be sure we have it all
-
      } catch (const std::exception &ex) {
          debugs(74, 6, "invalid status-line: " << ex.what());
      }
@@ -119,7 +118,7 @@ Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, c
  int
  Http::One::ResponseParser::parseResponseFirstLine()
  {
-    Http1::Tokenizer tok(buf_);
+    Tokenizer tok(buf_);
  
      const CharacterSet &WspDelim = DelimiterCharacters();
  
diff --git a/src/http/one/ResponseParser.h b/src/http/one/ResponseParser.h

index d80e1721394ed0e30aae89e766be092ae8a9d5af..9ea63df805ff99ac9fda91562ab3a899efd5685e 100644 (file)
--- a/src/http/one/ResponseParser.h
+++ b/src/http/one/ResponseParser.h
@@ -47,7 +47,7 @@ public:
  
  private:
      int parseResponseFirstLine();
-    int parseResponseStatusAndReason(Http1::Tokenizer&, const CharacterSet &);
+    int parseResponseStatusAndReason(Tokenizer&, const CharacterSet &);
  
      /// magic prefix for identifying ICY response messages
      static const SBuf IcyMagic;
diff --git a/src/http/one/TeChunkedParser.cc b/src/http/one/TeChunkedParser.cc

index 3d26c6fcb3de793d8c032a04f8149a7f6de75567..d56d467e6d248a720b682a59da0d4da42cdadeef 100644 (file)
--- a/src/http/one/TeChunkedParser.cc
+++ b/src/http/one/TeChunkedParser.cc
@@ -13,10 +13,13 @@
  #include "http/one/Tokenizer.h"
  #include "http/ProtocolVersion.h"
  #include "MemBuf.h"
+#include "parser/Tokenizer.h"
  #include "Parsing.h"
+#include "sbuf/Stream.h"
  #include "SquidConfig.h"
  
-Http::One::TeChunkedParser::TeChunkedParser()
+Http::One::TeChunkedParser::TeChunkedParser():
+    customExtensionValueParser(nullptr)
  {
      // chunked encoding only exists in HTTP/1.1
      Http1::Parser::msgProtocol_ = Http::ProtocolVersion(1,1);
@@ -31,7 +34,11 @@ Http::One::TeChunkedParser::clear()
      buf_.clear();
      theChunkSize = theLeftBodySize = 0;
      theOut = NULL;
-    useOriginBody = -1;
+    // XXX: We do not reset customExtensionValueParser here. Based on the
+    // clear() API description, we must, but it makes little sense and could
+    // break method callers if they appear because some of them may forget to
+    // reset customExtensionValueParser. TODO: Remove Http1::Parser as our
+    // parent class and this unnecessary method with it.
  }
  
  bool
@@ -49,14 +56,14 @@ Http::One::TeChunkedParser::parse(const SBuf &aBuf)
      if (parsingStage_ == Http1::HTTP_PARSE_NONE)
          parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ;
  
-    Http1::Tokenizer tok(buf_);
+    Tokenizer tok(buf_);
  
      // loop for as many chunks as we can
      // use do-while instead of while so that we can incrementally
      // restart in the middle of a chunk/frame
      do {
  
-        if (parsingStage_ == Http1::HTTP_PARSE_CHUNK_EXT && !parseChunkExtension(tok, theChunkSize))
+        if (parsingStage_ == Http1::HTTP_PARSE_CHUNK_EXT && !parseChunkMetadataSuffix(tok))
              return false;
  
          if (parsingStage_ == Http1::HTTP_PARSE_CHUNK && !parseChunkBody(tok))
@@ -80,7 +87,7 @@ Http::One::TeChunkedParser::needsMoreSpace() const
  
  /// RFC 7230 section 4.1 chunk-size
  bool
-Http::One::TeChunkedParser::parseChunkSize(Http1::Tokenizer &tok)
+Http::One::TeChunkedParser::parseChunkSize(Tokenizer &tok)
  {
      Must(theChunkSize <= 0); // Should(), really
  
@@ -104,66 +111,75 @@ Http::One::TeChunkedParser::parseChunkSize(Http1::Tokenizer &tok)
      return false; // should not be reachable
  }
  
-/**
- * Parses chunk metadata suffix, looking for interesting extensions and/or
- * getting to the line terminator. RFC 7230 section 4.1.1 and its Errata #4667:
- *
- *   chunk-ext = *( BWS  ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] )
- *   chunk-ext-name = token
- *   chunk-ext-val  = token / quoted-string
- *
- * ICAP 'use-original-body=N' extension is supported.
- */
+/// Parses "[chunk-ext] CRLF" from RFC 7230 section 4.1.1:
+///   chunk = chunk-size [ chunk-ext ] CRLF chunk-data CRLF
+///   last-chunk = 1*"0" [ chunk-ext ] CRLF
  bool
-Http::One::TeChunkedParser::parseChunkExtension(Http1::Tokenizer &tok, bool skipKnown)
+Http::One::TeChunkedParser::parseChunkMetadataSuffix(Tokenizer &tok)
  {
-    SBuf ext;
-    SBuf value;
-    while (
-        ParseBws(tok) && // Bug 4492: IBM_HTTP_Server sends SP after chunk-size
-        tok.skip(';') &&
-        ParseBws(tok) && // Bug 4492: ICAP servers send SP before chunk-ext-name
-        tok.prefix(ext, CharacterSet::TCHAR)) { // chunk-ext-name
-
-        // whole value part is optional. if no '=' expect next chunk-ext
-        if (ParseBws(tok) && tok.skip('=') && ParseBws(tok)) {
-
-            if (!skipKnown) {
-                if (ext.cmp("use-original-body",17) == 0 && tok.int64(useOriginBody, 10)) {
-                    debugs(94, 3, "Found chunk extension " << ext << "=" << useOriginBody);
-                    buf_ = tok.remaining(); // parse checkpoint
-                    continue;
-                }
-            }
-
-            debugs(94, 5, "skipping unknown chunk extension " << ext);
-
-            // unknown might have a value token or quoted-string
-            if (tok.quotedStringOrToken(value) && !tok.atEnd()) {
-                buf_ = tok.remaining(); // parse checkpoint
-                continue;
-            }
-
-            // otherwise need more data OR corrupt syntax
-            break;
-        }
-
-        if (!tok.atEnd())
-            buf_ = tok.remaining(); // parse checkpoint (unless there might be more token name)
-    }
-
-    if (skipLineTerminator(tok)) {
-        buf_ = tok.remaining(); // checkpoint
-        // non-0 chunk means data, 0-size means optional Trailer follows
+    // Code becomes much simpler when incremental parsing functions throw on
+    // bad or insufficient input, like in the code below. TODO: Expand up.
+    try {
+        parseChunkExtensions(tok); // a possibly empty chunk-ext list
+        skipLineTerminator(tok);
+        buf_ = tok.remaining();
          parsingStage_ = theChunkSize ? Http1::HTTP_PARSE_CHUNK : Http1::HTTP_PARSE_MIME;
          return true;
+    } catch (const InsufficientInput &) {
+        tok.reset(buf_); // backtrack to the last commit point
+        return false;
      }
+    // other exceptions bubble up to kill message parsing
+}
  
-    return false;
+/// Parses the chunk-ext list (RFC 7230 section 4.1.1 and its Errata #4667):
+/// chunk-ext = *( BWS ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] )
+void
+Http::One::TeChunkedParser::parseChunkExtensions(Tokenizer &tok)
+{
+    do {
+        ParseBws(tok); // Bug 4492: IBM_HTTP_Server sends SP after chunk-size
+
+        if (!tok.skip(';'))
+            return; // reached the end of extensions (if any)
+
+        parseOneChunkExtension(tok);
+        buf_ = tok.remaining(); // got one extension
+    } while (true);
+}
+
+void
+Http::One::ChunkExtensionValueParser::Ignore(Tokenizer &tok, const SBuf &extName)
+{
+    const auto ignoredValue = tokenOrQuotedString(tok);
+    debugs(94, 5, extName << " with value " << ignoredValue);
+}
+
+/// Parses a single chunk-ext list element (after the caller skipped its leading ";"):
+/// chunk-ext = *( BWS ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] )
+void
+Http::One::TeChunkedParser::parseOneChunkExtension(Tokenizer &tok)
+{
+    ParseBws(tok); // Bug 4492: ICAP servers send SP before chunk-ext-name
+
+    const auto extName = tok.prefix("chunk-ext-name", CharacterSet::TCHAR);
+
+    ParseBws(tok);
+
+    if (!tok.skip('='))
+        return; // parsed a valueless chunk-ext
+
+    ParseBws(tok);
+
+    // optimization: the only currently supported extension needs last-chunk
+    if (!theChunkSize && customExtensionValueParser)
+        customExtensionValueParser->parse(tok, extName);
+    else
+        ChunkExtensionValueParser::Ignore(tok, extName);
  }
  
  bool
-Http::One::TeChunkedParser::parseChunkBody(Http1::Tokenizer &tok)
+Http::One::TeChunkedParser::parseChunkBody(Tokenizer &tok)
  {
      if (theLeftBodySize > 0) {
          buf_ = tok.remaining(); // sync buffers before buf_ use
@@ -188,17 +204,20 @@ Http::One::TeChunkedParser::parseChunkBody(Http1::Tokenizer &tok)
  }
  
  bool
-Http::One::TeChunkedParser::parseChunkEnd(Http1::Tokenizer &tok)
+Http::One::TeChunkedParser::parseChunkEnd(Tokenizer &tok)
  {
      Must(theLeftBodySize == 0); // Should(), really
  
-    if (skipLineTerminator(tok)) {
+    try {
+        skipLineTerminator(tok);
          buf_ = tok.remaining(); // parse checkpoint
          theChunkSize = 0; // done with the current chunk
          parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ;
          return true;
      }
-
-    return false;
+    catch (const InsufficientInput &) {
+        return false;
+    }
+    // other exceptions bubble up to kill message parsing
  }
  
diff --git a/src/http/one/TeChunkedParser.h b/src/http/one/TeChunkedParser.h

index 517dcd0ed40b1f5e7776cdc5b5aadf75a366197d..09f908c0f8161d58b73dfd7b714f8e92d6b21b2a 100644 (file)
--- a/src/http/one/TeChunkedParser.h
+++ b/src/http/one/TeChunkedParser.h
@@ -18,6 +18,26 @@ namespace Http
  namespace One
  {
  
+using ::Parser::InsufficientInput;
+
+// TODO: Move this class into http/one/ChunkExtensionValueParser.*
+/// A customizable parser of a single chunk extension value (chunk-ext-val).
+/// From RFC 7230 section 4.1.1 and its Errata #4667:
+/// chunk-ext = *( BWS  ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] )
+/// chunk-ext-name = token
+/// chunk-ext-val  = token / quoted-string
+class ChunkExtensionValueParser
+{
+public:
+    typedef ::Parser::Tokenizer Tokenizer;
+
+    /// extracts and ignores the value of a named extension
+    static void Ignore(Tokenizer &tok, const SBuf &extName);
+
+    /// extracts and then interprets (or ignores) the extension value
+    virtual void parse(Tokenizer &tok, const SBuf &extName) = 0;
+};
+
  /**
   * An incremental parser for chunked transfer coding
   * defined in RFC 7230 section 4.1.
@@ -25,7 +45,7 @@ namespace One
   *
   * The parser shovels content bytes from the raw
   * input buffer into the content output buffer, both caller-supplied.
- * Ignores chunk extensions except for ICAP's ieof.
+ * Chunk extensions like use-original-body are handled via parseExtensionValuesWith().
   * Trailers are available via mimeHeader() if wanted.
   */
  class TeChunkedParser : public Http1::Parser
@@ -37,6 +57,10 @@ public:
      /// set the buffer to be used to store decoded chunk data
      void setPayloadBuffer(MemBuf *parsedContent) {theOut = parsedContent;}
  
+    /// Instead of ignoring all chunk extension values, give the supplied
+    /// parser a chance to handle them. Only applied to last-chunk (for now).
+    void parseExtensionValuesWith(ChunkExtensionValueParser *parser) { customExtensionValueParser = parser; }
+
      bool needsMoreSpace() const;
  
      /* Http1::Parser API */
@@ -45,17 +69,20 @@ public:
      virtual Parser::size_type firstLineSize() const {return 0;} // has no meaning with multiple chunks
  
  private:
-    bool parseChunkSize(Http1::Tokenizer &tok);
-    bool parseChunkExtension(Http1::Tokenizer &tok, bool skipKnown);
-    bool parseChunkBody(Http1::Tokenizer &tok);
-    bool parseChunkEnd(Http1::Tokenizer &tok);
+    bool parseChunkSize(Tokenizer &tok);
+    bool parseChunkMetadataSuffix(Tokenizer &);
+    void parseChunkExtensions(Tokenizer &);
+    void parseOneChunkExtension(Tokenizer &);
+    bool parseChunkBody(Tokenizer &tok);
+    bool parseChunkEnd(Tokenizer &tok);
  
      MemBuf *theOut;
      uint64_t theChunkSize;
      uint64_t theLeftBodySize;
  
-public:
-    int64_t useOriginBody;
+    /// An optional plugin for parsing and interpreting custom chunk-ext-val.
+    /// This "visitor" object is owned by our creator.
+    ChunkExtensionValueParser *customExtensionValueParser;
  };
  
  } // namespace One
diff --git a/src/http/one/Tokenizer.cc b/src/http/one/Tokenizer.cc

index 371ed46e0675e0508df4b9926543a453e3a07a69..913397cda9a64fab40181a1ab9ef96d4e23af79b 100644 (file)
--- a/src/http/one/Tokenizer.cc
+++ b/src/http/one/Tokenizer.cc
@@ -8,35 +8,18 @@
  
  #include "squid.h"
  #include "Debug.h"
+#include "http/one/Parser.h"
  #include "http/one/Tokenizer.h"
-
-bool
-Http::One::Tokenizer::quotedString(SBuf &returnedToken, const bool http1p0)
-{
-    checkpoint();
-
-    if (!skip('"'))
-        return false;
-
-    return qdText(returnedToken, http1p0);
-}
-
-bool
-Http::One::Tokenizer::quotedStringOrToken(SBuf &returnedToken, const bool http1p0)
+#include "parser/Tokenizer.h"
+#include "sbuf/Stream.h"
+
+/// Extracts quoted-string after the caller removes the initial '"'.
+/// \param http1p0 whether to prohibit \-escaped characters in quoted strings
+/// \throws InsufficientInput when input ends before the closing '"' (i.e. it is only a quoted-string _prefix_)
+/// \returns extracted quoted string (without quotes and with chars unescaped)
+static SBuf
+parseQuotedStringSuffix(Parser::Tokenizer &tok, const bool http1p0)
  {
-    checkpoint();
-
-    if (!skip('"'))
-        return prefix(returnedToken, CharacterSet::TCHAR);
-
-    return qdText(returnedToken, http1p0);
-}
-
-bool
-Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0)
-{
-    // the initial DQUOTE has been skipped by the caller
-
      /*
       * RFC 1945 - defines qdtext:
       *   inclusive of LWS (which includes CR and LF)
@@ -61,12 +44,17 @@ Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0)
      // best we can do is a conditional reference since http1p0 value may change per-client
      const CharacterSet &tokenChars = (http1p0 ? qdtext1p0 : qdtext1p1);
  
-    for (;;) {
-        SBuf::size_type prefixLen = buf().findFirstNotOf(tokenChars);
-        returnedToken.append(consume(prefixLen));
+    SBuf parsedToken;
+
+    while (!tok.atEnd()) {
+        SBuf qdText;
+        if (tok.prefix(qdText, tokenChars))
+            parsedToken.append(qdText);
+
+        if (!http1p0 && tok.skip('\\')) { // HTTP/1.1 allows quoted-pair, HTTP/1.0 does not
+            if (tok.atEnd())
+                break;
  
-        // HTTP/1.1 allows quoted-pair, HTTP/1.0 does not
-        if (!http1p0 && skip('\\')) {
              /* RFC 7230 section 3.2.6
               *
               * The backslash octet ("\") can be used as a single-octet quoting
@@ -78,32 +66,42 @@ Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0)
               */
              static const CharacterSet qPairChars = CharacterSet::HTAB + CharacterSet::SP + CharacterSet::VCHAR + CharacterSet::OBSTEXT;
              SBuf escaped;
-            if (!prefix(escaped, qPairChars, 1)) {
-                returnedToken.clear();
-                restoreLastCheckpoint();
-                return false;
-            }
-            returnedToken.append(escaped);
+            if (!tok.prefix(escaped, qPairChars, 1))
+                throw TexcHere("invalid escaped character in quoted-pair");
+
+            parsedToken.append(escaped);
              continue;
+        }
  
-        } else if (skip('"')) {
-            break; // done
+        if (tok.skip('"'))
+            return parsedToken; // may be empty
  
-        } else if (atEnd()) {
-            // need more data
-            returnedToken.clear();
-            restoreLastCheckpoint();
-            return false;
-        }
+        if (tok.atEnd())
+            break;
  
-        // else, we have an error
-        debugs(24, 8, "invalid bytes for set " << tokenChars.name);
-        returnedToken.clear();
-        restoreLastCheckpoint();
-        return false;
+        throw TexcHere(ToSBuf("invalid bytes for set ", tokenChars.name));
      }
  
-    // found the whole string
-    return true;
+    throw Http::One::InsufficientInput();
+}
+
+SBuf
+Http::One::tokenOrQuotedString(Parser::Tokenizer &tok, const bool http1p0)
+{
+    if (tok.skip('"'))
+        return parseQuotedStringSuffix(tok, http1p0);
+
+    if (tok.atEnd())
+        throw InsufficientInput();
+
+    SBuf parsedToken;
+    if (!tok.prefix(parsedToken, CharacterSet::TCHAR))
+        throw TexcHere("invalid input while expecting an HTTP token");
+
+    if (tok.atEnd())
+        throw InsufficientInput();
+
+    // got the complete token
+    return parsedToken;
  }
  
diff --git a/src/http/one/Tokenizer.h b/src/http/one/Tokenizer.h

index a29ce5c5d1cc5e696a1273f9456b1c8a59c38e29..cbd7b1c48464aadae1500c28d2b7689e357fe404 100644 (file)
--- a/src/http/one/Tokenizer.h
+++ b/src/http/one/Tokenizer.h
@@ -9,68 +9,47 @@
  #ifndef SQUID_SRC_HTTP_ONE_TOKENIZER_H
  #define SQUID_SRC_HTTP_ONE_TOKENIZER_H
  
-#include "parser/Tokenizer.h"
+#include "parser/forward.h"
+#include "sbuf/forward.h"
  
  namespace Http {
  namespace One {
  
  /**
- * Lexical processor extended to tokenize HTTP/1.x syntax.
+ * Extracts either an HTTP/1 token or quoted-string while dealing with
+ * possibly incomplete input typical for incremental text parsers.
+ * Unescapes escaped characters in HTTP/1.1 quoted strings.
   *
- * \see ::Parser::Tokenizer for more detail
+ * \param http1p0 whether to prohibit \-escaped characters in quoted strings
+ * \throws InsufficientInput as appropriate, including on unterminated tokens
+ * \returns extracted token or quoted string (without quotes)
+ *
+ * Governed by:
+ *  - RFC 1945 section 2.1
+ *  "
+ *    A string of text is parsed as a single word if it is quoted using
+ *    double-quote marks.
+ *
+ *        quoted-string  = ( <"> *(qdtext) <"> )
+ *
+ *        qdtext         = <any CHAR except <"> and CTLs,
+ *                         but including LWS>
+ *
+ *    Single-character quoting using the backslash ("\") character is not
+ *    permitted in HTTP/1.0.
+ *  "
+ *
+ *  - RFC 7230 section 3.2.6
+ *  "
+ *    A string of text is parsed as a single value if it is quoted using
+ *    double-quote marks.
+ *
+ *    quoted-string  = DQUOTE *( qdtext / quoted-pair ) DQUOTE
+ *    qdtext         = HTAB / SP /%x21 / %x23-5B / %x5D-7E / obs-text
+ *    obs-text       = %x80-FF
+ *  "
   */
-class Tokenizer : public ::Parser::Tokenizer
-{
-public:
-    Tokenizer(SBuf &s) : ::Parser::Tokenizer(s), savedStats_(0) {}
-
-    /**
-     * Attempt to parse a quoted-string lexical construct.
-     *
-     * Governed by:
-     *  - RFC 1945 section 2.1
-     *  "
-     *    A string of text is parsed as a single word if it is quoted using
-     *    double-quote marks.
-     *
-     *        quoted-string  = ( <"> *(qdtext) <"> )
-     *
-     *        qdtext         = <any CHAR except <"> and CTLs,
-     *                         but including LWS>
-     *
-     *    Single-character quoting using the backslash ("\") character is not
-     *    permitted in HTTP/1.0.
-     *  "
-     *
-     *  - RFC 7230 section 3.2.6
-     *  "
-     *    A string of text is parsed as a single value if it is quoted using
-     *    double-quote marks.
-     *
-     *    quoted-string  = DQUOTE *( qdtext / quoted-pair ) DQUOTE
-     *    qdtext         = HTAB / SP /%x21 / %x23-5B / %x5D-7E / obs-text
-     *    obs-text       = %x80-FF
-     *  "
-     *
-     * \param escaped HTTP/1.0 does not permit \-escaped characters
-     */
-    bool quotedString(SBuf &value, const bool http1p0 = false);
-
-    /**
-     * Attempt to parse a (token / quoted-string ) lexical construct.
-     */
-    bool quotedStringOrToken(SBuf &value, const bool http1p0 = false);
-
-private:
-    /// parse the internal component of a quote-string, and terminal DQUOTE
-    bool qdText(SBuf &value, const bool http1p0);
-
-    void checkpoint() { savedCheckpoint_ = buf(); savedStats_ = parsedSize(); }
-    void restoreLastCheckpoint() { undoParse(savedCheckpoint_, savedStats_); }
-
-    SBuf savedCheckpoint_;
-    SBuf::size_type savedStats_;
-};
+SBuf tokenOrQuotedString(Parser::Tokenizer &tok, const bool http1p0 = false);
  
  } // namespace One
  } // namespace Http
diff --git a/src/http/one/forward.h b/src/http/one/forward.h

index ab8c7a5db6cdec17c021c77fcfdf652349247de2..c9216abe391560a18fec2e8c27e9420d949c5824 100644 (file)
--- a/src/http/one/forward.h
+++ b/src/http/one/forward.h
@@ -10,6 +10,7 @@
  #define SQUID_SRC_HTTP_ONE_FORWARD_H
  
  #include "base/RefCount.h"
+#include "parser/forward.h"
  #include "sbuf/forward.h"
  
  namespace Http {
@@ -31,6 +32,8 @@ typedef RefCount<Http::One::ResponseParser> ResponseParserPointer;
  /// CRLF textual representation
  const SBuf &CrLf();
  
+using ::Parser::InsufficientInput;
+
  } // namespace One
  } // namespace Http
  
diff --git a/src/parser/BinaryTokenizer.h b/src/parser/BinaryTokenizer.h

index de8369e2ae18ea2c3c8a978dfaa9dd7cfce69149..dac16ead08201b90d16ac868d779fbce3a4ed82c 100644 (file)
--- a/src/parser/BinaryTokenizer.h
+++ b/src/parser/BinaryTokenizer.h
@@ -10,6 +10,7 @@
  #define SQUID_SRC_PARSER_BINARYTOKENIZER_H
  
  #include "ip/forward.h"
+#include "parser/forward.h"
  #include "sbuf/SBuf.h"
  
  namespace Parser
@@ -45,7 +46,7 @@ public:
  class BinaryTokenizer
  {
  public:
-    class InsufficientInput {}; // thrown when a method runs out of data
+    typedef ::Parser::InsufficientInput InsufficientInput;
      typedef uint64_t size_type; // enough for the largest supported offset
  
      BinaryTokenizer();
diff --git a/src/parser/Makefile.am b/src/parser/Makefile.am

index aef32354ca9b74afa56dd5df230dba5c2e8e3c56..c08d1d52aa0d9478a6a9bd5f697345980b0f00f5 100644 (file)
--- a/src/parser/Makefile.am
+++ b/src/parser/Makefile.am
@@ -13,6 +13,7 @@ noinst_LTLIBRARIES = libparser.la
  libparser_la_SOURCES = \
         BinaryTokenizer.h \
         BinaryTokenizer.cc \
+       forward.h \
         Tokenizer.h \
         Tokenizer.cc
  
diff --git a/src/parser/Tokenizer.cc b/src/parser/Tokenizer.cc

index 99f8eb33cb31a8b585540e377a472dd52319c994..0b44e40639608bc08825125322bc22813350e3b2 100644 (file)
--- a/src/parser/Tokenizer.cc
+++ b/src/parser/Tokenizer.cc
@@ -10,7 +10,9 @@
  
  #include "squid.h"
  #include "Debug.h"
+#include "parser/forward.h"
  #include "parser/Tokenizer.h"
+#include "sbuf/Stream.h"
  
  #include <cerrno>
  #if HAVE_CTYPE_H
@@ -96,6 +98,23 @@ Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, c
      return true;
  }
  
+SBuf
+Parser::Tokenizer::prefix(const char *description, const CharacterSet &tokenChars, const SBuf::size_type limit) // throwing variant: returns the prefix instead of a bool
+{
+    if (atEnd()) // empty input means "need more data", not a parse error
+        throw InsufficientInput();
+
+    SBuf result;
+
+    if (!prefix(result, tokenChars, limit)) // delegates to the SBuf&-taking prefix() overload
+        throw TexcHere(ToSBuf("cannot parse ", description));
+
+    if (atEnd()) // nothing follows the prefix yet, so it may continue in bytes not yet received
+        throw InsufficientInput();
+
+    return result;
+}
+
  bool
  Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit)
  {
@@ -283,3 +302,24 @@ Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf:
      return success(s - range.rawContent());
  }
  
+int64_t
+Parser::Tokenizer::udec64(const char *description, const SBuf::size_type limit) // throwing int64() wrapper for unsigned decimal integers
+{
+    if (atEnd()) // empty input means "need more data", not a parse error
+        throw InsufficientInput();
+
+    int64_t result = 0;
+
+    // Since we only support unsigned decimals, a parsing failure with a
+    // non-empty input always implies invalid/malformed input (or a buggy
+    // limit=0 caller). TODO: Support signed and non-decimal integers by
+    // refactoring int64() to detect insufficient input.
+    if (!int64(result, 10, false, limit)) // base 10, allowSign=false
+        throw TexcHere(ToSBuf("cannot parse ", description));
+
+    if (atEnd())
+        throw InsufficientInput(); // more digits may be coming
+
+    return result;
+}
+
diff --git a/src/parser/Tokenizer.h b/src/parser/Tokenizer.h

index f04fd3e2ea2463160dccbfda51efc861c74a8905..6ae81625300f3e570b4f05dbec020fa6e2431d36 100644 (file)
--- a/src/parser/Tokenizer.h
+++ b/src/parser/Tokenizer.h
@@ -143,6 +143,19 @@ public:
       */
      bool int64(int64_t &result, int base = 0, bool allowSign = true, SBuf::size_type limit = SBuf::npos);
  
+    /*
+     * The methods below mimic their counterparts documented above, but they
+     * throw on errors, including InsufficientInput. The field description
+     * parameter is used for error reporting and debugging.
+     */
+
+    /// prefix() wrapper but throws InsufficientInput if input is empty or contains
+    /// nothing but the prefix (i.e. if the prefix is not "terminated"); returns the prefix
+    SBuf prefix(const char *description, const CharacterSet &tokenChars, SBuf::size_type limit = SBuf::npos); // a failed match throws "cannot parse " + description
+
+    /// int64() wrapper but limited to unsigned decimal integers (for now); returns the parsed value
+    int64_t udec64(const char *description, SBuf::size_type limit = SBuf::npos); // throws InsufficientInput if input is empty or ends right after the digits
+
  protected:
      SBuf consume(const SBuf::size_type n);
      SBuf::size_type success(const SBuf::size_type n);
diff --git a/src/parser/forward.h b/src/parser/forward.h

new file mode 100644 (file)

index 0000000..5a95b7a
--- /dev/null
+++ b/src/parser/forward.h
@@ -0,0 +1,22 @@
+/*
+ * Copyright (C) 1996-2019 The Squid Software Foundation and contributors
+ *
+ * Squid software is distributed under GPLv2+ license and includes
+ * contributions from numerous individuals and organizations.
+ * Please see the COPYING and CONTRIBUTORS files for details.
+ */
+
+#ifndef SQUID_PARSER_FORWARD_H
+#define SQUID_PARSER_FORWARD_H
+
+namespace Parser {
+class Tokenizer; // forward declaration only
+class BinaryTokenizer; // forward declaration only; the class body is in parser/BinaryTokenizer.h
+
+// TODO: Move this declaration (to parser/Elements.h) if we need more like it.
+/// thrown by modern "incremental" parsers when they need more data
+class InsufficientInput {}; // empty class: it has no members, so only its type conveys information
+} // namespace Parser
+
+#endif /* SQUID_PARSER_FORWARD_H */
+
author	Eduard Bagdasaryan <eduard.bagdasaryan@measurement-factory.com>
	Mon, 18 Mar 2019 17:48:21 +0000 (17:48 +0000)
committer	Squid Anubis <squid-anubis@squid-cache.org>
	Mon, 18 Mar 2019 17:48:25 +0000 (17:48 +0000)
src/adaptation/icap/ModXact.cc		patch \| blob \| blame \| history
src/adaptation/icap/ModXact.h		patch \| blob \| blame \| history
src/http/one/Parser.cc		patch \| blob \| blame \| history
src/http/one/Parser.h		patch \| blob \| blame \| history
src/http/one/RequestParser.cc		patch \| blob \| blame \| history
src/http/one/RequestParser.h		patch \| blob \| blame \| history
src/http/one/ResponseParser.cc		patch \| blob \| blame \| history
src/http/one/ResponseParser.h		patch \| blob \| blame \| history
src/http/one/TeChunkedParser.cc		patch \| blob \| blame \| history
src/http/one/TeChunkedParser.h		patch \| blob \| blame \| history
src/http/one/Tokenizer.cc		patch \| blob \| blame \| history
src/http/one/Tokenizer.h		patch \| blob \| blame \| history
src/http/one/forward.h		patch \| blob \| blame \| history
src/parser/BinaryTokenizer.h		patch \| blob \| blame \| history
src/parser/Makefile.am		patch \| blob \| blame \| history
src/parser/Tokenizer.cc		patch \| blob \| blame \| history
src/parser/Tokenizer.h		patch \| blob \| blame \| history
src/parser/forward.h	[new file with mode: 0644]	patch \| blob