From 417da4006cf5c97d44e74431b816fc58fec9e270 Mon Sep 17 00:00:00 2001 From: Eduard Bagdasaryan Date: Mon, 18 Mar 2019 17:48:21 +0000 Subject: [PATCH] Fix incremental parsing of chunked quoted extensions (#310) Before this change, incremental parsing of quoted chunked extensions was broken for two reasons: * Http::One::Parser::skipLineTerminator() unexpectedly threw after a partially received quoted chunk extension value. * When Http::One::Tokenizer was unable to parse a quoted extension, it incorrectly restored the input buffer to the beginning of the extension value (instead of the extension itself), thus making further incremental parsing iterations impossible. IMO, the reason for this problem was that Http::One::Tokenizer::qdText() could not distinguish two cases (returning false in both): * the end of the quoted string not yet reached * an input error, e.g., wrong/unexpected character A possible approach could be to improve Http::One::Tokenizer, making it aware of the "needs more data" state. However, to be acceptable, these improvements should be done in the base Parser::Tokenizer class instead. These changes seem to be non-trivial and could be done separately and later. Another approach, used here, is to simplify the complex and error-prone chunked extensions parsing algorithm, fixing incremental parsing bugs while still parsing incrementally in almost all cases. A performance regression could be expected only in relatively rare cases of partially received or malformed extensions. 
Also: * fixed parsing of partial use-original-body extension values * do not treat an invalid use-original-body as an unknown extension * optimization: parse use-original-body extension only in ICAP context (i.e., where it is expected) * improvement: added a new API to TeChunkedParser to specify known chunked extensions list --- src/adaptation/icap/ModXact.cc | 21 ++++- src/adaptation/icap/ModXact.h | 20 +++++ src/http/one/Parser.cc | 35 ++++---- src/http/one/Parser.h | 10 ++- src/http/one/RequestParser.cc | 16 ++-- src/http/one/RequestParser.h | 8 +- src/http/one/ResponseParser.cc | 17 ++-- src/http/one/ResponseParser.h | 2 +- src/http/one/TeChunkedParser.cc | 139 ++++++++++++++++++-------------- src/http/one/TeChunkedParser.h | 41 ++++++++-- src/http/one/Tokenizer.cc | 104 ++++++++++++------------ src/http/one/Tokenizer.h | 89 ++++++++------------ src/http/one/forward.h | 3 + src/parser/BinaryTokenizer.h | 3 +- src/parser/Makefile.am | 1 + src/parser/Tokenizer.cc | 40 +++++++++ src/parser/Tokenizer.h | 13 +++ src/parser/forward.h | 22 +++++ 18 files changed, 364 insertions(+), 220 deletions(-) create mode 100644 src/parser/forward.h diff --git a/src/adaptation/icap/ModXact.cc b/src/adaptation/icap/ModXact.cc index 3a095a0bb2..78a3740131 100644 --- a/src/adaptation/icap/ModXact.cc +++ b/src/adaptation/icap/ModXact.cc @@ -26,10 +26,11 @@ #include "comm/Connection.h" #include "err_detail_type.h" #include "http/ContentLengthInterpreter.h" -#include "http/one/TeChunkedParser.h" #include "HttpHeaderTools.h" #include "HttpReply.h" #include "MasterXaction.h" +#include "parser/Tokenizer.h" +#include "sbuf/Stream.h" #include "SquidTime.h" // flow and terminology: @@ -43,6 +44,8 @@ CBDATA_NAMESPACED_CLASS_INIT(Adaptation::Icap, ModXactLauncher); static const size_t TheBackupLimit = BodyPipe::MaxCapacity; +const SBuf Adaptation::Icap::ChunkExtensionValueParser::UseOriginalBodyName("use-original-body"); + Adaptation::Icap::ModXact::State::State() { memset(this, 0, 
sizeof(*this)); @@ -1145,6 +1148,7 @@ void Adaptation::Icap::ModXact::decideOnParsingBody() state.parsing = State::psBody; replyHttpBodySize = 0; bodyParser = new Http1::TeChunkedParser; + bodyParser->parseExtensionValuesWith(&extensionParser); makeAdaptedBodyPipe("adapted response from the ICAP server"); Must(state.sending == State::sendingAdapted); } else { @@ -1182,8 +1186,8 @@ void Adaptation::Icap::ModXact::parseBody() } if (parsed) { - if (state.readyForUob && bodyParser->useOriginBody >= 0) - prepPartialBodyEchoing(static_cast(bodyParser->useOriginBody)); + if (state.readyForUob && extensionParser.sawUseOriginalBody()) + prepPartialBodyEchoing(extensionParser.useOriginalBody()); else stopSending(true); // the parser succeeds only if all parsed data fits if (trailerParser) @@ -2074,3 +2078,14 @@ bool Adaptation::Icap::TrailerParser::parse(const char *buf, int len, int atEnd, return parsed > 0; } +void +Adaptation::Icap::ChunkExtensionValueParser::parse(Tokenizer &tok, const SBuf &extName) +{ + if (extName == UseOriginalBodyName) { + useOriginalBody_ = tok.udec64("use-original-body"); + assert(useOriginalBody_ >= 0); + } else { + Ignore(tok, extName); + } +} + diff --git a/src/adaptation/icap/ModXact.h b/src/adaptation/icap/ModXact.h index 6aa1e9f2a1..830938bb93 100644 --- a/src/adaptation/icap/ModXact.h +++ b/src/adaptation/icap/ModXact.h @@ -15,6 +15,7 @@ #include "adaptation/icap/Xaction.h" #include "BodyPipe.h" #include "http/one/forward.h" +#include "http/one/TeChunkedParser.h" /* * ICAPModXact implements ICAP REQMOD and RESPMOD transaction using @@ -120,6 +121,23 @@ public: size_t hdr_sz; // pedantic XXX: wrong type dictated by HttpHeader::parse() API }; +/// handles ICAP-specific chunk extensions supported by Squid +class ChunkExtensionValueParser: public Http1::ChunkExtensionValueParser +{ +public: + /* Http1::ChunkExtensionValueParser API */ + virtual void parse(Tokenizer &tok, const SBuf &extName) override; + + bool sawUseOriginalBody() const { 
return useOriginalBody_ >= 0; } + uint64_t useOriginalBody() const { assert(sawUseOriginalBody()); return static_cast(useOriginalBody_); } + +private: + static const SBuf UseOriginalBodyName; + + /// the value of the parsed use-original-body chunk extension (or -1) + int64_t useOriginalBody_ = -1; +}; + class ModXact: public Xaction, public BodyProducer, public BodyConsumer { CBDATA_CLASS(ModXact); @@ -301,6 +319,8 @@ private: TrailerParser *trailerParser; + ChunkExtensionValueParser extensionParser; + class State { diff --git a/src/http/one/Parser.cc b/src/http/one/Parser.cc index 069bdacee0..8b262a99dc 100644 --- a/src/http/one/Parser.cc +++ b/src/http/one/Parser.cc @@ -7,10 +7,11 @@ */ #include "squid.h" +#include "base/CharacterSet.h" #include "Debug.h" #include "http/one/Parser.h" -#include "http/one/Tokenizer.h" #include "mime_header.h" +#include "parser/Tokenizer.h" #include "SquidConfig.h" /// RFC 7230 section 2.6 - 7 magic octets @@ -61,20 +62,19 @@ Http::One::Parser::DelimiterCharacters() RelaxedDelimiterCharacters() : CharacterSet::SP; } -bool -Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const +void +Http::One::Parser::skipLineTerminator(Tokenizer &tok) const { if (tok.skip(Http1::CrLf())) - return true; + return; if (Config.onoff.relaxed_header_parser && tok.skipOne(CharacterSet::LF)) - return true; + return; if (tok.atEnd() || (tok.remaining().length() == 1 && tok.remaining().at(0) == '\r')) - return false; // need more data + throw InsufficientInput(); throw TexcHere("garbage instead of CRLF line terminator"); - return false; // unreachable, but make naive compilers happy } /// all characters except the LF line terminator @@ -102,7 +102,7 @@ LineCharacters() void Http::One::Parser::cleanMimePrefix() { - Http1::Tokenizer tok(mimeHeaderBlock_); + Tokenizer tok(mimeHeaderBlock_); while (tok.skipOne(RelaxedDelimiterCharacters())) { (void)tok.skipAll(LineCharacters()); // optional line content // LF terminator is required. 
@@ -137,7 +137,7 @@ Http::One::Parser::cleanMimePrefix() void Http::One::Parser::unfoldMime() { - Http1::Tokenizer tok(mimeHeaderBlock_); + Tokenizer tok(mimeHeaderBlock_); const auto szLimit = mimeHeaderBlock_.length(); mimeHeaderBlock_.clear(); // prevent the mime sender being able to make append() realloc/grow multiple times. @@ -227,7 +227,7 @@ Http::One::Parser::getHeaderField(const char *name) debugs(25, 5, "looking for " << name); // while we can find more LF in the SBuf - Http1::Tokenizer tok(mimeHeaderBlock_); + Tokenizer tok(mimeHeaderBlock_); SBuf p; while (tok.prefix(p, LineCharacters())) { @@ -249,7 +249,7 @@ Http::One::Parser::getHeaderField(const char *name) p.consume(namelen + 1); // TODO: optimize SBuf::trim to take CharacterSet directly - Http1::Tokenizer t(p); + Tokenizer t(p); t.skipAll(CharacterSet::WSP); p = t.remaining(); @@ -272,10 +272,15 @@ Http::One::ErrorLevel() } // BWS = *( SP / HTAB ) ; WhitespaceCharacters() may relax this RFC 7230 rule -bool -Http::One::ParseBws(Tokenizer &tok) +void +Http::One::ParseBws(Parser::Tokenizer &tok) { - if (const auto count = tok.skipAll(Parser::WhitespaceCharacters())) { + const auto count = tok.skipAll(Parser::WhitespaceCharacters()); + + if (tok.atEnd()) + throw InsufficientInput(); // even if count is positive + + if (count) { // Generating BWS is a MUST-level violation so warn about it as needed. 
debugs(33, ErrorLevel(), "found " << count << " BWS octets"); // RFC 7230 says we MUST parse BWS, so we fall through even if @@ -283,6 +288,6 @@ Http::One::ParseBws(Tokenizer &tok) } // else we successfully "parsed" an empty BWS sequence - return true; + // success: no more BWS characters expected } diff --git a/src/http/one/Parser.h b/src/http/one/Parser.h index 202c52d542..f449e8334c 100644 --- a/src/http/one/Parser.h +++ b/src/http/one/Parser.h @@ -12,6 +12,7 @@ #include "anyp/ProtocolVersion.h" #include "http/one/forward.h" #include "http/StatusCode.h" +#include "parser/forward.h" #include "sbuf/SBuf.h" namespace Http { @@ -40,6 +41,7 @@ class Parser : public RefCountable { public: typedef SBuf::size_type size_type; + typedef ::Parser::Tokenizer Tokenizer; Parser() = default; Parser(const Parser &) = default; @@ -122,11 +124,11 @@ protected: * detect and skip the CRLF or (if tolerant) LF line terminator * consume from the tokenizer. * - * throws if non-terminator is detected. + * \throws exception on bad or InsufficientInput. * \retval true only if line terminator found. * \retval false incomplete or missing line terminator, need more data. */ - bool skipLineTerminator(Http1::Tokenizer &tok) const; + void skipLineTerminator(Tokenizer &) const; /** * Scan to find the mime headers block for current message. 
@@ -163,8 +165,8 @@ private: }; /// skips and, if needed, warns about RFC 7230 BWS ("bad" whitespace) -/// \returns true (always; unlike all the skip*() functions) -bool ParseBws(Tokenizer &tok); +/// \throws InsufficientInput when the end of BWS cannot be confirmed +void ParseBws(Parser::Tokenizer &); /// the right debugs() level for logging HTTP violation messages int ErrorLevel(); diff --git a/src/http/one/RequestParser.cc b/src/http/one/RequestParser.cc index 5dd48011c5..81c6d20845 100644 --- a/src/http/one/RequestParser.cc +++ b/src/http/one/RequestParser.cc @@ -9,8 +9,8 @@ #include "squid.h" #include "Debug.h" #include "http/one/RequestParser.h" -#include "http/one/Tokenizer.h" #include "http/ProtocolVersion.h" +#include "parser/Tokenizer.h" #include "profiler/Profiler.h" #include "SquidConfig.h" @@ -59,7 +59,7 @@ Http::One::RequestParser::skipGarbageLines() * RFC 7230 section 2.6, 3.1 and 3.5 */ bool -Http::One::RequestParser::parseMethodField(Http1::Tokenizer &tok) +Http::One::RequestParser::parseMethodField(Tokenizer &tok) { // method field is a sequence of TCHAR. // Limit to 32 characters to prevent overly long sequences of non-HTTP @@ -140,7 +140,7 @@ Http::One::RequestParser::RequestTargetCharacters() } bool -Http::One::RequestParser::parseUriField(Http1::Tokenizer &tok) +Http::One::RequestParser::parseUriField(Tokenizer &tok) { /* Arbitrary 64KB URI upper length limit. * @@ -173,7 +173,7 @@ Http::One::RequestParser::parseUriField(Http1::Tokenizer &tok) } bool -Http::One::RequestParser::parseHttpVersionField(Http1::Tokenizer &tok) +Http::One::RequestParser::parseHttpVersionField(Tokenizer &tok) { static const SBuf http1p0("HTTP/1.0"); static const SBuf http1p1("HTTP/1.1"); @@ -248,7 +248,7 @@ Http::One::RequestParser::skipDelimiter(const size_t count, const char *where) /// Parse CRs at the end of request-line, just before the terminating LF. 
bool -Http::One::RequestParser::skipTrailingCrs(Http1::Tokenizer &tok) +Http::One::RequestParser::skipTrailingCrs(Tokenizer &tok) { if (Config.onoff.relaxed_header_parser) { (void)tok.skipAllTrailing(CharacterSet::CR); // optional; multiple OK @@ -284,12 +284,12 @@ Http::One::RequestParser::parseRequestFirstLine() // Earlier, skipGarbageLines() took care of any leading LFs (if allowed). // Now, the request line has to end at the first LF. static const CharacterSet lineChars = CharacterSet::LF.complement("notLF"); - ::Parser::Tokenizer lineTok(buf_); + Tokenizer lineTok(buf_); if (!lineTok.prefix(line, lineChars) || !lineTok.skip('\n')) { if (buf_.length() >= Config.maxRequestHeaderSize) { /* who should we blame for our failure to parse this line? */ - Http1::Tokenizer methodTok(buf_); + Tokenizer methodTok(buf_); if (!parseMethodField(methodTok)) return -1; // blame a bad method (or its delimiter) @@ -303,7 +303,7 @@ Http::One::RequestParser::parseRequestFirstLine() return 0; } - Http1::Tokenizer tok(line); + Tokenizer tok(line); if (!parseMethodField(tok)) return -1; diff --git a/src/http/one/RequestParser.h b/src/http/one/RequestParser.h index dd6dc6f1ca..d48f72cba5 100644 --- a/src/http/one/RequestParser.h +++ b/src/http/one/RequestParser.h @@ -59,11 +59,11 @@ private: bool doParse(const SBuf &aBuf); /* all these return false and set parseStatusCode on parsing failures */ - bool parseMethodField(Http1::Tokenizer &); - bool parseUriField(Http1::Tokenizer &); - bool parseHttpVersionField(Http1::Tokenizer &); + bool parseMethodField(Tokenizer &); + bool parseUriField(Tokenizer &); + bool parseHttpVersionField(Tokenizer &); bool skipDelimiter(const size_t count, const char *where); - bool skipTrailingCrs(Http1::Tokenizer &tok); + bool skipTrailingCrs(Tokenizer &tok); bool http0() const {return !msgProtocol_.major;} static const CharacterSet &RequestTargetCharacters(); diff --git a/src/http/one/ResponseParser.cc b/src/http/one/ResponseParser.cc index 
4dcaafb270..4d09a03cc1 100644 --- a/src/http/one/ResponseParser.cc +++ b/src/http/one/ResponseParser.cc @@ -9,8 +9,8 @@ #include "squid.h" #include "Debug.h" #include "http/one/ResponseParser.h" -#include "http/one/Tokenizer.h" #include "http/ProtocolVersion.h" +#include "parser/Tokenizer.h" #include "profiler/Profiler.h" #include "SquidConfig.h" @@ -47,7 +47,7 @@ Http::One::ResponseParser::firstLineSize() const // NP: we found the protocol version and consumed it already. // just need the status code and reason phrase int -Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, const CharacterSet &WspDelim) +Http::One::ResponseParser::parseResponseStatusAndReason(Tokenizer &tok, const CharacterSet &WspDelim) { if (!completedStatus_) { debugs(74, 9, "seek status-code in: " << tok.remaining().substr(0,10) << "..."); @@ -87,14 +87,13 @@ Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, c static const CharacterSet phraseChars = CharacterSet::WSP + CharacterSet::VCHAR + CharacterSet::OBSTEXT; (void)tok.prefix(reasonPhrase_, phraseChars); // optional, no error if missing try { - if (skipLineTerminator(tok)) { - debugs(74, DBG_DATA, "parse remaining buf={length=" << tok.remaining().length() << ", data='" << tok.remaining() << "'}"); - buf_ = tok.remaining(); // resume checkpoint - return 1; - } + skipLineTerminator(tok); + buf_ = tok.remaining(); // resume checkpoint + debugs(74, DBG_DATA, Raw("leftovers", buf_.rawContent(), buf_.length())); + return 1; + } catch (const InsufficientInput &) { reasonPhrase_.clear(); return 0; // need more to be sure we have it all - } catch (const std::exception &ex) { debugs(74, 6, "invalid status-line: " << ex.what()); } @@ -119,7 +118,7 @@ Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, c int Http::One::ResponseParser::parseResponseFirstLine() { - Http1::Tokenizer tok(buf_); + Tokenizer tok(buf_); const CharacterSet &WspDelim = DelimiterCharacters(); 
diff --git a/src/http/one/ResponseParser.h b/src/http/one/ResponseParser.h index d80e172139..9ea63df805 100644 --- a/src/http/one/ResponseParser.h +++ b/src/http/one/ResponseParser.h @@ -47,7 +47,7 @@ public: private: int parseResponseFirstLine(); - int parseResponseStatusAndReason(Http1::Tokenizer&, const CharacterSet &); + int parseResponseStatusAndReason(Tokenizer&, const CharacterSet &); /// magic prefix for identifying ICY response messages static const SBuf IcyMagic; diff --git a/src/http/one/TeChunkedParser.cc b/src/http/one/TeChunkedParser.cc index 3d26c6fcb3..d56d467e6d 100644 --- a/src/http/one/TeChunkedParser.cc +++ b/src/http/one/TeChunkedParser.cc @@ -13,10 +13,13 @@ #include "http/one/Tokenizer.h" #include "http/ProtocolVersion.h" #include "MemBuf.h" +#include "parser/Tokenizer.h" #include "Parsing.h" +#include "sbuf/Stream.h" #include "SquidConfig.h" -Http::One::TeChunkedParser::TeChunkedParser() +Http::One::TeChunkedParser::TeChunkedParser(): + customExtensionValueParser(nullptr) { // chunked encoding only exists in HTTP/1.1 Http1::Parser::msgProtocol_ = Http::ProtocolVersion(1,1); @@ -31,7 +34,11 @@ Http::One::TeChunkedParser::clear() buf_.clear(); theChunkSize = theLeftBodySize = 0; theOut = NULL; - useOriginBody = -1; + // XXX: We do not reset customExtensionValueParser here. Based on the + // clear() API description, we must, but it makes little sense and could + // break method callers if they appear because some of them may forget to + // reset customExtensionValueParser. TODO: Remove Http1::Parser as our + // parent class and this unnecessary method with it. 
} bool @@ -49,14 +56,14 @@ Http::One::TeChunkedParser::parse(const SBuf &aBuf) if (parsingStage_ == Http1::HTTP_PARSE_NONE) parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ; - Http1::Tokenizer tok(buf_); + Tokenizer tok(buf_); // loop for as many chunks as we can // use do-while instead of while so that we can incrementally // restart in the middle of a chunk/frame do { - if (parsingStage_ == Http1::HTTP_PARSE_CHUNK_EXT && !parseChunkExtension(tok, theChunkSize)) + if (parsingStage_ == Http1::HTTP_PARSE_CHUNK_EXT && !parseChunkMetadataSuffix(tok)) return false; if (parsingStage_ == Http1::HTTP_PARSE_CHUNK && !parseChunkBody(tok)) @@ -80,7 +87,7 @@ Http::One::TeChunkedParser::needsMoreSpace() const /// RFC 7230 section 4.1 chunk-size bool -Http::One::TeChunkedParser::parseChunkSize(Http1::Tokenizer &tok) +Http::One::TeChunkedParser::parseChunkSize(Tokenizer &tok) { Must(theChunkSize <= 0); // Should(), really @@ -104,66 +111,75 @@ Http::One::TeChunkedParser::parseChunkSize(Http1::Tokenizer &tok) return false; // should not be reachable } -/** - * Parses chunk metadata suffix, looking for interesting extensions and/or - * getting to the line terminator. RFC 7230 section 4.1.1 and its Errata #4667: - * - * chunk-ext = *( BWS ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] ) - * chunk-ext-name = token - * chunk-ext-val = token / quoted-string - * - * ICAP 'use-original-body=N' extension is supported. 
- */ +/// Parses "[chunk-ext] CRLF" from RFC 7230 section 4.1.1: +/// chunk = chunk-size [ chunk-ext ] CRLF chunk-data CRLF +/// last-chunk = 1*"0" [ chunk-ext ] CRLF bool -Http::One::TeChunkedParser::parseChunkExtension(Http1::Tokenizer &tok, bool skipKnown) +Http::One::TeChunkedParser::parseChunkMetadataSuffix(Tokenizer &tok) { - SBuf ext; - SBuf value; - while ( - ParseBws(tok) && // Bug 4492: IBM_HTTP_Server sends SP after chunk-size - tok.skip(';') && - ParseBws(tok) && // Bug 4492: ICAP servers send SP before chunk-ext-name - tok.prefix(ext, CharacterSet::TCHAR)) { // chunk-ext-name - - // whole value part is optional. if no '=' expect next chunk-ext - if (ParseBws(tok) && tok.skip('=') && ParseBws(tok)) { - - if (!skipKnown) { - if (ext.cmp("use-original-body",17) == 0 && tok.int64(useOriginBody, 10)) { - debugs(94, 3, "Found chunk extension " << ext << "=" << useOriginBody); - buf_ = tok.remaining(); // parse checkpoint - continue; - } - } - - debugs(94, 5, "skipping unknown chunk extension " << ext); - - // unknown might have a value token or quoted-string - if (tok.quotedStringOrToken(value) && !tok.atEnd()) { - buf_ = tok.remaining(); // parse checkpoint - continue; - } - - // otherwise need more data OR corrupt syntax - break; - } - - if (!tok.atEnd()) - buf_ = tok.remaining(); // parse checkpoint (unless there might be more token name) - } - - if (skipLineTerminator(tok)) { - buf_ = tok.remaining(); // checkpoint - // non-0 chunk means data, 0-size means optional Trailer follows + // Code becomes much simpler when incremental parsing functions throw on + // bad or insufficient input, like in the code below. TODO: Expand up. + try { + parseChunkExtensions(tok); // a possibly empty chunk-ext list + skipLineTerminator(tok); + buf_ = tok.remaining(); parsingStage_ = theChunkSize ? 
Http1::HTTP_PARSE_CHUNK : Http1::HTTP_PARSE_MIME; return true; + } catch (const InsufficientInput &) { + tok.reset(buf_); // backtrack to the last commit point + return false; } + // other exceptions bubble up to kill message parsing +} - return false; +/// Parses the chunk-ext list (RFC 7230 section 4.1.1 and its Errata #4667): +/// chunk-ext = *( BWS ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] ) +void +Http::One::TeChunkedParser::parseChunkExtensions(Tokenizer &tok) +{ + do { + ParseBws(tok); // Bug 4492: IBM_HTTP_Server sends SP after chunk-size + + if (!tok.skip(';')) + return; // reached the end of extensions (if any) + + parseOneChunkExtension(tok); + buf_ = tok.remaining(); // got one extension + } while (true); +} + +void +Http::One::ChunkExtensionValueParser::Ignore(Tokenizer &tok, const SBuf &extName) +{ + const auto ignoredValue = tokenOrQuotedString(tok); + debugs(94, 5, extName << " with value " << ignoredValue); +} + +/// Parses a single chunk-ext list element: +/// chunk-ext = *( BWS ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] ) +void +Http::One::TeChunkedParser::parseOneChunkExtension(Tokenizer &tok) +{ + ParseBws(tok); // Bug 4492: ICAP servers send SP before chunk-ext-name + + const auto extName = tok.prefix("chunk-ext-name", CharacterSet::TCHAR); + + ParseBws(tok); + + if (!tok.skip('=')) + return; // parsed a valueless chunk-ext + + ParseBws(tok); + + // optimization: the only currently supported extension needs last-chunk + if (!theChunkSize && customExtensionValueParser) + customExtensionValueParser->parse(tok, extName); + else + ChunkExtensionValueParser::Ignore(tok, extName); } bool -Http::One::TeChunkedParser::parseChunkBody(Http1::Tokenizer &tok) +Http::One::TeChunkedParser::parseChunkBody(Tokenizer &tok) { if (theLeftBodySize > 0) { buf_ = tok.remaining(); // sync buffers before buf_ use @@ -188,17 +204,20 @@ Http::One::TeChunkedParser::parseChunkBody(Http1::Tokenizer &tok) } bool 
-Http::One::TeChunkedParser::parseChunkEnd(Http1::Tokenizer &tok) +Http::One::TeChunkedParser::parseChunkEnd(Tokenizer &tok) { Must(theLeftBodySize == 0); // Should(), really - if (skipLineTerminator(tok)) { + try { + skipLineTerminator(tok); buf_ = tok.remaining(); // parse checkpoint theChunkSize = 0; // done with the current chunk parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ; return true; } - - return false; + catch (const InsufficientInput &) { + return false; + } + // other exceptions bubble up to kill message parsing } diff --git a/src/http/one/TeChunkedParser.h b/src/http/one/TeChunkedParser.h index 517dcd0ed4..09f908c0f8 100644 --- a/src/http/one/TeChunkedParser.h +++ b/src/http/one/TeChunkedParser.h @@ -18,6 +18,26 @@ namespace Http namespace One { +using ::Parser::InsufficientInput; + +// TODO: Move this class into http/one/ChunkExtensionValueParser.* +/// A customizable parser of a single chunk extension value (chunk-ext-val). +/// From RFC 7230 section 4.1.1 and its Errata #4667: +/// chunk-ext = *( BWS ";" BWS chunk-ext-name [ BWS "=" BWS chunk-ext-val ] ) +/// chunk-ext-name = token +/// chunk-ext-val = token / quoted-string +class ChunkExtensionValueParser +{ +public: + typedef ::Parser::Tokenizer Tokenizer; + + /// extracts and ignores the value of a named extension + static void Ignore(Tokenizer &tok, const SBuf &extName); + + /// extracts and then interprets (or ignores) the extension value + virtual void parse(Tokenizer &tok, const SBuf &extName) = 0; +}; + /** * An incremental parser for chunked transfer coding * defined in RFC 7230 section 4.1. @@ -25,7 +45,7 @@ namespace One * * The parser shovels content bytes from the raw * input buffer into the content output buffer, both caller-supplied. - * Ignores chunk extensions except for ICAP's ieof. + * Chunk extensions like use-original-body are handled via parseExtensionValuesWith(). * Trailers are available via mimeHeader() if wanted. 
*/ class TeChunkedParser : public Http1::Parser @@ -37,6 +57,10 @@ public: /// set the buffer to be used to store decoded chunk data void setPayloadBuffer(MemBuf *parsedContent) {theOut = parsedContent;} + /// Instead of ignoring all chunk extension values, give the supplied + /// parser a chance to handle them. Only applied to last-chunk (for now). + void parseExtensionValuesWith(ChunkExtensionValueParser *parser) { customExtensionValueParser = parser; } + bool needsMoreSpace() const; /* Http1::Parser API */ @@ -45,17 +69,20 @@ public: virtual Parser::size_type firstLineSize() const {return 0;} // has no meaning with multiple chunks private: - bool parseChunkSize(Http1::Tokenizer &tok); - bool parseChunkExtension(Http1::Tokenizer &tok, bool skipKnown); - bool parseChunkBody(Http1::Tokenizer &tok); - bool parseChunkEnd(Http1::Tokenizer &tok); + bool parseChunkSize(Tokenizer &tok); + bool parseChunkMetadataSuffix(Tokenizer &); + void parseChunkExtensions(Tokenizer &); + void parseOneChunkExtension(Tokenizer &); + bool parseChunkBody(Tokenizer &tok); + bool parseChunkEnd(Tokenizer &tok); MemBuf *theOut; uint64_t theChunkSize; uint64_t theLeftBodySize; -public: - int64_t useOriginBody; + /// An optional plugin for parsing and interpreting custom chunk-ext-val. + /// This "visitor" object is owned by our creator. 
+ ChunkExtensionValueParser *customExtensionValueParser; }; } // namespace One diff --git a/src/http/one/Tokenizer.cc b/src/http/one/Tokenizer.cc index 371ed46e06..913397cda9 100644 --- a/src/http/one/Tokenizer.cc +++ b/src/http/one/Tokenizer.cc @@ -8,35 +8,18 @@ #include "squid.h" #include "Debug.h" +#include "http/one/Parser.h" #include "http/one/Tokenizer.h" - -bool -Http::One::Tokenizer::quotedString(SBuf &returnedToken, const bool http1p0) -{ - checkpoint(); - - if (!skip('"')) - return false; - - return qdText(returnedToken, http1p0); -} - -bool -Http::One::Tokenizer::quotedStringOrToken(SBuf &returnedToken, const bool http1p0) +#include "parser/Tokenizer.h" +#include "sbuf/Stream.h" + +/// Extracts quoted-string after the caller removes the initial '"'. +/// \param http1p0 whether to prohibit \-escaped characters in quoted strings +/// \throws InsufficientInput when input can be a token _prefix_ +/// \returns extracted quoted string (without quotes and with chars unescaped) +static SBuf +parseQuotedStringSuffix(Parser::Tokenizer &tok, const bool http1p0) { - checkpoint(); - - if (!skip('"')) - return prefix(returnedToken, CharacterSet::TCHAR); - - return qdText(returnedToken, http1p0); -} - -bool -Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0) -{ - // the initial DQUOTE has been skipped by the caller - /* * RFC 1945 - defines qdtext: * inclusive of LWS (which includes CR and LF) @@ -61,12 +44,17 @@ Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0) // best we can do is a conditional reference since http1p0 value may change per-client const CharacterSet &tokenChars = (http1p0 ? 
qdtext1p0 : qdtext1p1); - for (;;) { - SBuf::size_type prefixLen = buf().findFirstNotOf(tokenChars); - returnedToken.append(consume(prefixLen)); + SBuf parsedToken; + + while (!tok.atEnd()) { + SBuf qdText; + if (tok.prefix(qdText, tokenChars)) + parsedToken.append(qdText); + + if (!http1p0 && tok.skip('\\')) { // HTTP/1.1 allows quoted-pair, HTTP/1.0 does not + if (tok.atEnd()) + break; - // HTTP/1.1 allows quoted-pair, HTTP/1.0 does not - if (!http1p0 && skip('\\')) { /* RFC 7230 section 3.2.6 * * The backslash octet ("\") can be used as a single-octet quoting @@ -78,32 +66,42 @@ Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0) */ static const CharacterSet qPairChars = CharacterSet::HTAB + CharacterSet::SP + CharacterSet::VCHAR + CharacterSet::OBSTEXT; SBuf escaped; - if (!prefix(escaped, qPairChars, 1)) { - returnedToken.clear(); - restoreLastCheckpoint(); - return false; - } - returnedToken.append(escaped); + if (!tok.prefix(escaped, qPairChars, 1)) + throw TexcHere("invalid escaped character in quoted-pair"); + + parsedToken.append(escaped); continue; + } - } else if (skip('"')) { - break; // done + if (tok.skip('"')) + return parsedToken; // may be empty - } else if (atEnd()) { - // need more data - returnedToken.clear(); - restoreLastCheckpoint(); - return false; - } + if (tok.atEnd()) + break; - // else, we have an error - debugs(24, 8, "invalid bytes for set " << tokenChars.name); - returnedToken.clear(); - restoreLastCheckpoint(); - return false; + throw TexcHere(ToSBuf("invalid bytes for set ", tokenChars.name)); } - // found the whole string - return true; + throw Http::One::InsufficientInput(); +} + +SBuf +Http::One::tokenOrQuotedString(Parser::Tokenizer &tok, const bool http1p0) +{ + if (tok.skip('"')) + return parseQuotedStringSuffix(tok, http1p0); + + if (tok.atEnd()) + throw InsufficientInput(); + + SBuf parsedToken; + if (!tok.prefix(parsedToken, CharacterSet::TCHAR)) + throw TexcHere("invalid input while expecting an HTTP 
token"); + + if (tok.atEnd()) + throw InsufficientInput(); + + // got the complete token + return parsedToken; } diff --git a/src/http/one/Tokenizer.h b/src/http/one/Tokenizer.h index a29ce5c5d1..cbd7b1c484 100644 --- a/src/http/one/Tokenizer.h +++ b/src/http/one/Tokenizer.h @@ -9,68 +9,47 @@ #ifndef SQUID_SRC_HTTP_ONE_TOKENIZER_H #define SQUID_SRC_HTTP_ONE_TOKENIZER_H -#include "parser/Tokenizer.h" +#include "parser/forward.h" +#include "sbuf/forward.h" namespace Http { namespace One { /** - * Lexical processor extended to tokenize HTTP/1.x syntax. + * Extracts either an HTTP/1 token or quoted-string while dealing with + * possibly incomplete input typical for incremental text parsers. + * Unescapes escaped characters in HTTP/1.1 quoted strings. * - * \see ::Parser::Tokenizer for more detail + * \param http1p0 whether to prohibit \-escaped characters in quoted strings + * \throws InsufficientInput as appropriate, including on unterminated tokens + * \returns extracted token or quoted string (without quotes) + * + * Governed by: + * - RFC 1945 section 2.1 + * " + * A string of text is parsed as a single word if it is quoted using + * double-quote marks. + * + * quoted-string = ( <"> *(qdtext) <"> ) + * + * qdtext = and CTLs, + * but including LWS> + * + * Single-character quoting using the backslash ("\") character is not + * permitted in HTTP/1.0. + * " + * + * - RFC 7230 section 3.2.6 + * " + * A string of text is parsed as a single value if it is quoted using + * double-quote marks. + * + * quoted-string = DQUOTE *( qdtext / quoted-pair ) DQUOTE + * qdtext = HTAB / SP /%x21 / %x23-5B / %x5D-7E / obs-text + * obs-text = %x80-FF + * " */ -class Tokenizer : public ::Parser::Tokenizer -{ -public: - Tokenizer(SBuf &s) : ::Parser::Tokenizer(s), savedStats_(0) {} - - /** - * Attempt to parse a quoted-string lexical construct. 
- * - * Governed by: - * - RFC 1945 section 2.1 - * " - * A string of text is parsed as a single word if it is quoted using - * double-quote marks. - * - * quoted-string = ( <"> *(qdtext) <"> ) - * - * qdtext = and CTLs, - * but including LWS> - * - * Single-character quoting using the backslash ("\") character is not - * permitted in HTTP/1.0. - * " - * - * - RFC 7230 section 3.2.6 - * " - * A string of text is parsed as a single value if it is quoted using - * double-quote marks. - * - * quoted-string = DQUOTE *( qdtext / quoted-pair ) DQUOTE - * qdtext = HTAB / SP /%x21 / %x23-5B / %x5D-7E / obs-text - * obs-text = %x80-FF - * " - * - * \param escaped HTTP/1.0 does not permit \-escaped characters - */ - bool quotedString(SBuf &value, const bool http1p0 = false); - - /** - * Attempt to parse a (token / quoted-string ) lexical construct. - */ - bool quotedStringOrToken(SBuf &value, const bool http1p0 = false); - -private: - /// parse the internal component of a quote-string, and terminal DQUOTE - bool qdText(SBuf &value, const bool http1p0); - - void checkpoint() { savedCheckpoint_ = buf(); savedStats_ = parsedSize(); } - void restoreLastCheckpoint() { undoParse(savedCheckpoint_, savedStats_); } - - SBuf savedCheckpoint_; - SBuf::size_type savedStats_; -}; +SBuf tokenOrQuotedString(Parser::Tokenizer &tok, const bool http1p0 = false); } // namespace One } // namespace Http diff --git a/src/http/one/forward.h b/src/http/one/forward.h index ab8c7a5db6..c9216abe39 100644 --- a/src/http/one/forward.h +++ b/src/http/one/forward.h @@ -10,6 +10,7 @@ #define SQUID_SRC_HTTP_ONE_FORWARD_H #include "base/RefCount.h" +#include "parser/forward.h" #include "sbuf/forward.h" namespace Http { @@ -31,6 +32,8 @@ typedef RefCount ResponseParserPointer; /// CRLF textual representation const SBuf &CrLf(); +using ::Parser::InsufficientInput; + } // namespace One } // namespace Http diff --git a/src/parser/BinaryTokenizer.h b/src/parser/BinaryTokenizer.h index de8369e2ae..dac16ead08 
100644 --- a/src/parser/BinaryTokenizer.h +++ b/src/parser/BinaryTokenizer.h @@ -10,6 +10,7 @@ #define SQUID_SRC_PARSER_BINARYTOKENIZER_H #include "ip/forward.h" +#include "parser/forward.h" #include "sbuf/SBuf.h" namespace Parser @@ -45,7 +46,7 @@ public: class BinaryTokenizer { public: - class InsufficientInput {}; // thrown when a method runs out of data + typedef ::Parser::InsufficientInput InsufficientInput; typedef uint64_t size_type; // enough for the largest supported offset BinaryTokenizer(); diff --git a/src/parser/Makefile.am b/src/parser/Makefile.am index aef32354ca..c08d1d52aa 100644 --- a/src/parser/Makefile.am +++ b/src/parser/Makefile.am @@ -13,6 +13,7 @@ noinst_LTLIBRARIES = libparser.la libparser_la_SOURCES = \ BinaryTokenizer.h \ BinaryTokenizer.cc \ + forward.h \ Tokenizer.h \ Tokenizer.cc diff --git a/src/parser/Tokenizer.cc b/src/parser/Tokenizer.cc index 99f8eb33cb..0b44e40639 100644 --- a/src/parser/Tokenizer.cc +++ b/src/parser/Tokenizer.cc @@ -10,7 +10,9 @@ #include "squid.h" #include "Debug.h" +#include "parser/forward.h" #include "parser/Tokenizer.h" +#include "sbuf/Stream.h" #include #if HAVE_CTYPE_H @@ -96,6 +98,23 @@ Parser::Tokenizer::prefix(SBuf &returnedToken, const CharacterSet &tokenChars, c return true; } +SBuf +Parser::Tokenizer::prefix(const char *description, const CharacterSet &tokenChars, const SBuf::size_type limit) +{ + if (atEnd()) + throw InsufficientInput(); + + SBuf result; + + if (!prefix(result, tokenChars, limit)) + throw TexcHere(ToSBuf("cannot parse ", description)); + + if (atEnd()) + throw InsufficientInput(); + + return result; +} + bool Parser::Tokenizer::suffix(SBuf &returnedToken, const CharacterSet &tokenChars, const SBuf::size_type limit) { @@ -283,3 +302,24 @@ Parser::Tokenizer::int64(int64_t & result, int base, bool allowSign, const SBuf: return success(s - range.rawContent()); } +int64_t +Parser::Tokenizer::udec64(const char *description, const SBuf::size_type limit) +{ + if (atEnd()) + throw 
InsufficientInput(); + + int64_t result = 0; + + // Since we only support unsigned decimals, a parsing failure with a + // non-empty input always implies invalid/malformed input (or a buggy + // limit=0 caller). TODO: Support signed and non-decimal integers by + // refactoring int64() to detect insufficient input. + if (!int64(result, 10, false, limit)) + throw TexcHere(ToSBuf("cannot parse ", description)); + + if (atEnd()) + throw InsufficientInput(); // more digits may be coming + + return result; +} + diff --git a/src/parser/Tokenizer.h b/src/parser/Tokenizer.h index f04fd3e2ea..6ae8162530 100644 --- a/src/parser/Tokenizer.h +++ b/src/parser/Tokenizer.h @@ -143,6 +143,19 @@ public: */ bool int64(int64_t &result, int base = 0, bool allowSign = true, SBuf::size_type limit = SBuf::npos); + /* + * The methods below mimic their counterparts documented above, but they + * throw on errors, including InsufficientInput. The field description + * parameter is used for error reporting and debugging. + */ + + /// prefix() wrapper but throws InsufficientInput if input contains + /// nothing but the prefix (i.e. if the prefix is not "terminated") + SBuf prefix(const char *description, const CharacterSet &tokenChars, SBuf::size_type limit = SBuf::npos); + + /// int64() wrapper but limited to unsigned decimal integers (for now) + int64_t udec64(const char *description, SBuf::size_type limit = SBuf::npos); + protected: SBuf consume(const SBuf::size_type n); SBuf::size_type success(const SBuf::size_type n); diff --git a/src/parser/forward.h b/src/parser/forward.h new file mode 100644 index 0000000000..5a95b7a452 --- /dev/null +++ b/src/parser/forward.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 1996-2019 The Squid Software Foundation and contributors + * + * Squid software is distributed under GPLv2+ license and includes + * contributions from numerous individuals and organizations. + * Please see the COPYING and CONTRIBUTORS files for details. 
+ */ + +#ifndef SQUID_PARSER_FORWARD_H +#define SQUID_PARSER_FORWARD_H + +namespace Parser { +class Tokenizer; +class BinaryTokenizer; + +// TODO: Move this declaration (to parser/Elements.h) if we need more like it. +/// thrown by modern "incremental" parsers when they need more data +class InsufficientInput {}; +} // namespace Parser + +#endif /* SQUID_PARSER_FORWARD_H */ + -- 2.39.2