From: Amos Jeffries Date: Mon, 30 Mar 2015 11:43:06 +0000 (-0700) Subject: Refactor Http1::ChunkedCodingParser to Http1::Parser API with SBuf X-Git-Tag: merge-candidate-3-v1~86^2~3 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=be29ee33a41c010816e9ea2dfc617b85ceab1110;p=thirdparty%2Fsquid.git Refactor Http1::ChunkedCodingParser to Http1::Parser API with SBuf * refactor the parse to accept SBuf input buffer using the Parser API and shift the MemBuf output buffer accesses to a separate API method. * refactor the parsing steps to use Parser methods where available and Tokenizer for lexical tokenising. This removes one temporary data copy and two performance regressions from each chunked message payload processing action. It also removes c-string operations and string termination dependency from chunk parsing. --- diff --git a/src/adaptation/icap/ModXact.cc b/src/adaptation/icap/ModXact.cc index 638bb7f985..b3e1ff41b2 100644 --- a/src/adaptation/icap/ModXact.cc +++ b/src/adaptation/icap/ModXact.cc @@ -1123,15 +1123,9 @@ void Adaptation::Icap::ModXact::parseBody() // the parser will throw on errors BodyPipeCheckout bpc(*adapted.body_pipe); - // XXX: performance regression. SBuf-convert (or Parser-convert?) the chunked decoder. - MemBuf encodedData; - encodedData.init(); - // NP: we must do this instead of pointing encodedData at the SBuf::rawContent - // because chunked decoder uses MemBuf::consume, which shuffles buffer bytes around. - encodedData.append(readBuf.rawContent(), readBuf.length()); - const bool parsed = bodyParser->parse(&encodedData, &bpc.buf); - // XXX: httpChunkDecoder has consumed from MemBuf. 
- readBuf.consume(readBuf.length() - encodedData.contentSize()); + bodyParser->setPayloadBuffer(&bpc.buf); + const bool parsed = bodyParser->parse(readBuf); + readBuf = bodyParser->remaining(); // sync buffers after parse bpc.checkIn(); debugs(93, 5, "have " << readBuf.length() << " body bytes after parsed all: " << parsed); diff --git a/src/client_side.cc b/src/client_side.cc index 6493a76bbe..17f749dcb5 100644 --- a/src/client_side.cc +++ b/src/client_side.cc @@ -3229,25 +3229,23 @@ ConnStateData::handleRequestBodyData() { assert(bodyPipe != NULL); - size_t putSize = 0; - if (in.bodyParser) { // chunked encoding - if (const err_type error = handleChunkedRequestBody(putSize)) { + if (const err_type error = handleChunkedRequestBody()) { abortChunkedRequestBody(error); return false; } } else { // identity encoding debugs(33,5, HERE << "handling plain request body for " << clientConnection); - putSize = bodyPipe->putMoreData(in.buf.c_str(), in.buf.length()); + const size_t putSize = bodyPipe->putMoreData(in.buf.c_str(), in.buf.length()); + if (putSize > 0) + consumeInput(putSize); + if (!bodyPipe->mayNeedMoreData()) { // BodyPipe will clear us automagically when we produced everything bodyPipe = NULL; } } - if (putSize > 0) - consumeInput(putSize); - if (!bodyPipe) { debugs(33,5, HERE << "produced entire request body for " << clientConnection); @@ -3266,7 +3264,7 @@ ConnStateData::handleRequestBodyData() /// parses available chunked encoded body bytes, checks size, returns errors err_type -ConnStateData::handleChunkedRequestBody(size_t &putSize) +ConnStateData::handleChunkedRequestBody() { debugs(33, 7, "chunked from " << clientConnection << ": " << in.buf.length()); @@ -3275,16 +3273,11 @@ ConnStateData::handleChunkedRequestBody(size_t &putSize) if (in.buf.isEmpty()) // nothing to do return ERR_NONE; - MemBuf raw; // Http1::ChunkedCodingParser only works with MemBufs - // add one because MemBuf will assert if it cannot 0-terminate - raw.init(in.buf.length(), 
in.buf.length()+1); - raw.append(in.buf.c_str(), in.buf.length()); - - const mb_size_t wasContentSize = raw.contentSize(); BodyPipeCheckout bpc(*bodyPipe); - const bool parsed = in.bodyParser->parse(&raw, &bpc.buf); + in.bodyParser->setPayloadBuffer(&bpc.buf); + const bool parsed = in.bodyParser->parse(in.buf); + in.buf = in.bodyParser->remaining(); // sync buffers bpc.checkIn(); - putSize = wasContentSize - raw.contentSize(); // dechunk then check: the size limit applies to _dechunked_ content if (clientIsRequestBodyTooLargeForPolicy(bodyPipe->producedSize())) diff --git a/src/client_side.h b/src/client_side.h index 9c17e95142..eea76f2362 100644 --- a/src/client_side.h +++ b/src/client_side.h @@ -420,7 +420,7 @@ protected: void startDechunkingRequest(); void finishDechunkingRequest(bool withSuccess); void abortChunkedRequestBody(const err_type error); - err_type handleChunkedRequestBody(size_t &putSize); + err_type handleChunkedRequestBody(); void startPinnedConnectionMonitoring(); void clientPinnedConnectionRead(const CommIoCbParams &io); diff --git a/src/http.cc b/src/http.cc index c6d4a7b2f9..6a05c38e4a 100644 --- a/src/http.cc +++ b/src/http.cc @@ -1397,15 +1397,9 @@ HttpStateData::decodeAndWriteReplyBody() SQUID_ENTER_THROWING_CODE(); MemBuf decodedData; decodedData.init(); - // XXX: performance regression. SBuf-convert (or Parser-convert?) the chunked decoder. - MemBuf encodedData; - encodedData.init(); - // NP: we must do this instead of pointing encodedData at the SBuf::rawContent - // because chunked decoder uses MemBuf::consume, which shuffles buffer bytes around. - encodedData.append(inBuf.rawContent(), inBuf.length()); - const bool doneParsing = httpChunkDecoder->parse(&encodedData,&decodedData); - // XXX: httpChunkDecoder has consumed from MemBuf. 
- inBuf.consume(inBuf.length() - encodedData.contentSize()); + httpChunkDecoder->setPayloadBuffer(&decodedData); + const bool doneParsing = httpChunkDecoder->parse(inBuf); + inBuf = httpChunkDecoder->remaining(); // sync buffers after parse len = decodedData.contentSize(); data=decodedData.content(); addVirginReplyBody(data, len); diff --git a/src/http/one/ChunkedCodingParser.cc b/src/http/one/ChunkedCodingParser.cc index 60c0d1c394..adcd376fa3 100644 --- a/src/http/one/ChunkedCodingParser.cc +++ b/src/http/one/ChunkedCodingParser.cc @@ -12,6 +12,7 @@ #include "http/one/ChunkedCodingParser.h" #include "http/ProtocolVersion.h" #include "MemBuf.h" +#include "parser/Tokenizer.h" #include "Parsing.h" Http::One::ChunkedCodingParser::ChunkedCodingParser() @@ -26,42 +27,40 @@ void Http::One::ChunkedCodingParser::clear() { parsingStage_ = Http1::HTTP_PARSE_NONE; + buf_.clear(); theChunkSize = theLeftBodySize = 0; - theIn = theOut = NULL; + theOut = NULL; useOriginBody = -1; - inQuoted = inSlashed = false; } bool -Http::One::ChunkedCodingParser::parse(MemBuf *rawData, MemBuf *parsedContent) +Http::One::ChunkedCodingParser::parse(const SBuf &aBuf) { - Must(rawData && parsedContent); - theIn = rawData; - theOut = parsedContent; + buf_ = aBuf; + debugs(74, DBG_DATA, "Parse buf={length=" << aBuf.length() << ", data='" << aBuf << "'}"); + Must(!buf_.isEmpty() && theOut); if (parsingStage_ == Http1::HTTP_PARSE_NONE) parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ; + ::Parser::Tokenizer tok(buf_); + // loop for as many chunks as we can // use do-while instead of while so that we can incrementally // restart in the middle of a chunk/frame do { - if (parsingStage_ == Http1::HTTP_PARSE_CHUNK_EXT) { - if (theChunkSize) - parseUnusedChunkExtension(); - else - parseLastChunkExtension(); - } + if (parsingStage_ == Http1::HTTP_PARSE_CHUNK_EXT && !parseChunkExtension(tok, theChunkSize)) + return false; - if (parsingStage_ == Http1::HTTP_PARSE_CHUNK && !parseChunkBody()) + if (parsingStage_ == 
Http1::HTTP_PARSE_CHUNK && !parseChunkBody(tok)) return false; - if (parsingStage_ == Http1::HTTP_PARSE_MIME && !parseTrailer()) + if (parsingStage_ == Http1::HTTP_PARSE_MIME && !grabMimeBlock("Trailers", 64*1024 /* 64KB max */)) return false; // loop for as many chunks as we can - } while (parsingStage_ == Http1::HTTP_PARSE_CHUNK_SZ && parseChunkSize()); + } while (parsingStage_ == Http1::HTTP_PARSE_CHUNK_SZ && parseChunkSize(tok)); return !needsMoreData() && !needsMoreSpace(); } @@ -73,266 +72,138 @@ Http::One::ChunkedCodingParser::needsMoreSpace() const return parsingStage_ == Http1::HTTP_PARSE_CHUNK && !theOut->hasPotentialSpace(); } +/// RFC 7230 section 4.1 chunk-size bool -Http::One::ChunkedCodingParser::parseChunkSize() +Http::One::ChunkedCodingParser::parseChunkSize(::Parser::Tokenizer &tok) { Must(theChunkSize <= 0); // Should(), really - const char *p = theIn->content(); - while (p < theIn->space() && xisxdigit(*p)) ++p; - if (p >= theIn->space()) { - return false; - } - int64_t size = -1; - if (StringToInt64(theIn->content(), size, &p, 16)) { + if (tok.int64(size, 16, false) && !tok.atEnd()) { if (size < 0) throw TexcHere("negative chunk size"); theChunkSize = theLeftBodySize = size; debugs(94,7, "found chunk: " << theChunkSize); - theIn->consume(p - theIn->content()); + buf_ = tok.remaining(); // parse checkpoint parsingStage_ = Http1::HTTP_PARSE_CHUNK_EXT; return true; - } else - throw TexcHere("corrupted chunk size"); - - return false; -} - -/// Skips a set of RFC 7230 section 4.1.1 chunk-ext -/// http://tools.ietf.org/html/rfc7230#section-4.1.1 -void -Http::One::ChunkedCodingParser::parseUnusedChunkExtension() -{ - size_t crlfBeg = 0; - size_t crlfEnd = 0; - if (findCrlf(crlfBeg, crlfEnd, inQuoted, inSlashed)) { - inQuoted = inSlashed = false; - theIn->consume(crlfEnd); - // non-0 chunk means data, 0-size means optional Trailer follows - parsingStage_ = theChunkSize ? 
Http1::HTTP_PARSE_CHUNK : Http1::HTTP_PARSE_MIME; - } else { - theIn->consume(theIn->contentSize()); - } -} - -bool -Http::One::ChunkedCodingParser::parseChunkBody() -{ - Must(theLeftBodySize > 0); // Should, really - - const size_t availSize = min(theLeftBodySize, (uint64_t)theIn->contentSize()); - const size_t safeSize = min(availSize, (size_t)theOut->potentialSpaceSize()); - - theOut->append(theIn->content(), safeSize); - theIn->consume(safeSize); - theLeftBodySize -= safeSize; - - if (theLeftBodySize == 0) - return parseChunkEnd(); - else - Must(needsMoreData() || needsMoreSpace()); - return true; -} - -bool -Http::One::ChunkedCodingParser::parseChunkEnd() -{ - Must(theLeftBodySize == 0); // Should(), really - - size_t crlfBeg = 0; - size_t crlfEnd = 0; - - if (findCrlf(crlfBeg, crlfEnd)) { - if (crlfBeg != 0) { - throw TexcHere("found data between chunk end and CRLF"); - return false; - } - - theIn->consume(crlfEnd); - theChunkSize = 0; // done with the current chunk - parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ; - return true; + } else if (tok.atEnd()) { + return false; // need more data } - return false; + // else error + throw TexcHere("corrupted chunk size"); + return false; // should not be reachable } /** - * Parse Trailer RFC 7230 section 4.1.2 trailer-part - * http://tools.ietf.org/html/rfc7230#section-4.1.2 + * Parses a set of RFC 7230 section 4.1.1 chunk-ext + * http://tools.ietf.org/html/rfc7230#section-4.1.1 * - * trailer-part = *( header-field CRLF ) + * chunk-ext = *( ";" chunk-ext-name [ "=" chunk-ext-val ] ) + * chunk-ext-name = token + * chunk-ext-val = token / quoted-string * - * Currently Trailer headers are ignored/skipped + * ICAP 'use-original-body=N' extension is supported. 
*/ bool -Http::One::ChunkedCodingParser::parseTrailer() +Http::One::ChunkedCodingParser::parseChunkExtension(::Parser::Tokenizer &tok, bool skipKnown) { - Must(theChunkSize == 0); // Should(), really + // TODO implement a proper quoted-string Tokenizer method + static const CharacterSet qString = CharacterSet("qString","\"\r\n").add('\0').complement(); - while (parsingStage_ == Http1::HTTP_PARSE_MIME) { - if (!parseTrailerHeader()) - return false; - } + SBuf ext; + while (tok.skip(';') && tok.prefix(ext, CharacterSet::TCHAR)) { - return true; -} + // whole value part is optional. if no '=' expect next chunk-ext + if (tok.skip('=')) { -/// skip one RFC 7230 header-field from theIn buffer -bool -Http::One::ChunkedCodingParser::parseTrailerHeader() -{ - size_t crlfBeg = 0; - size_t crlfEnd = 0; + if (!skipKnown) { + if (ext.cmp("use-original-body",17) == 0 && tok.int64(useOriginBody, 10)) { + debugs(94, 3, "Found chunk extension " << ext << "=" << useOriginBody); + buf_ = tok.remaining(); // parse checkpoint + continue; + } + } - if (findCrlf(crlfBeg, crlfEnd)) { + debugs(94, 5, "skipping unknown chunk extension " << ext); -#if TRAILERS_ARE_SUPPORTED - if (crlfBeg > 0) - theTrailer.append(theIn->content(), crlfEnd); -#endif + // unknown might have a value token ... + if (tok.skipAll(CharacterSet::TCHAR) && !tok.atEnd()) { + buf_ = tok.remaining(); // parse checkpoint + continue; + } - theIn->consume(crlfEnd); + // ... or a quoted-string + if (tok.skipOne(CharacterSet::DQUOTE) && tok.skipAll(qString) && tok.skipOne(CharacterSet::DQUOTE)) { + buf_ = tok.remaining(); // parse checkpoint + continue; + } - if (crlfBeg == 0) { - parsingStage_ = Http1::HTTP_PARSE_DONE; + // otherwise need more data OR corrupt syntax + break; } - return true; + if (!tok.atEnd()) + buf_ = tok.remaining(); // parse checkpoint (unless there might be more token name) } - return false; -} - -/// Finds next CRLF. Does not store parsing state. 
-bool -Http::One::ChunkedCodingParser::findCrlf(size_t &crlfBeg, size_t &crlfEnd) -{ - bool quoted = false; - bool slashed = false; - return findCrlf(crlfBeg, crlfEnd, quoted, slashed); -} - -/// Finds next CRLF. Parsing state stored in quoted and slashed -/// parameters. Incremental: can resume when more data is available. -bool -Http::One::ChunkedCodingParser::findCrlf(size_t &crlfBeg, size_t &crlfEnd, bool "ed, bool &slashed) -{ - // XXX: This code was copied, with permission, from another software. - // There is a similar and probably better code inside httpHeaderParse - // but it seems difficult to isolate due to parsing-unrelated bloat. - // Such isolation should probably be done before this class is used - // for handling of traffic "more external" than ICAP. - - const char *buf = theIn->content(); - size_t size = theIn->contentSize(); - - ssize_t crOff = -1; - - for (size_t i = 0; i < size; ++i) { - if (slashed) { - slashed = false; - continue; - } - - const char c = buf[i]; - - // handle quoted strings - if (quoted) { - if (c == '\\') - slashed = true; - else if (c == '"') - quoted = false; - - continue; - } else if (c == '"') { - quoted = true; - crOff = -1; - continue; - } - - if (crOff < 0) { // looking for the first CR or LF - - if (c == '\n') { - crlfBeg = i; - crlfEnd = ++i; - return true; - } - - if (c == '\r') - crOff = i; - } else { // skipping CRs, looking for the first LF - - if (c == '\n') { - crlfBeg = crOff; - crlfEnd = ++i; - return true; - } + if (tok.atEnd()) + return false; - if (c != '\r') - crOff = -1; - } + if (skipLineTerminator(tok)) { + buf_ = tok.remaining(); // checkpoint + // non-0 chunk means data, 0-size means optional Trailer follows + parsingStage_ = theChunkSize ? 
Http1::HTTP_PARSE_CHUNK : Http1::HTTP_PARSE_MIME; + return true; } + throw TexcHere("corrupted chunk extension value"); return false; } -/** - * Parses a set of RFC 7230 section 4.1.1 chunk-ext - * http://tools.ietf.org/html/rfc7230#section-4.1.1 - * - * chunk-ext = *( ";" chunk-ext-name [ "=" chunk-ext-val ] ) - * chunk-ext-name = token - * chunk-ext-val = token / quoted-string - * - * ICAP 'use-original-body=N' extension is supported. - */ -void -Http::One::ChunkedCodingParser::parseLastChunkExtension() +bool +Http::One::ChunkedCodingParser::parseChunkBody(::Parser::Tokenizer &tok) { - size_t crlfBeg = 0; - size_t crlfEnd = 0; + Must(theLeftBodySize > 0); // Should, really - if (!findCrlf(crlfBeg, crlfEnd)) - return; + buf_ = tok.remaining(); // sync buffers before buf_ use - const char *const startExt = theIn->content(); - const char *const endExt = theIn->content() + crlfBeg; + // TODO fix type mismatches and casting for these + const size_t availSize = min(theLeftBodySize, (uint64_t)buf_.length()); + const size_t safeSize = min(availSize, (size_t)theOut->potentialSpaceSize()); - // chunk-extension starts at startExt and ends with LF at endEx - for (const char *p = startExt; p < endExt;) { + theOut->append(buf_.rawContent(), safeSize); + buf_.consume(safeSize); + theLeftBodySize -= safeSize; - while (*p == ' ' || *p == '\t') ++p; // skip spaces before ';' + tok.reset(buf_); // sync buffers after consume() - if (*p++ != ';') // each ext name=value pair is preceded with ';' - break; + if (theLeftBodySize == 0) + return parseChunkEnd(tok); + else + Must(needsMoreData() || needsMoreSpace()); - while (*p == ' ' || *p == '\t') ++p; // skip spaces before name + return true; +} - if (p >= endExt) - break; // malformed extension: ';' without ext name=value pair +bool +Http::One::ChunkedCodingParser::parseChunkEnd(::Parser::Tokenizer &tok) +{ + Must(theLeftBodySize == 0); // Should(), really - const int extSize = endExt - p; - // TODO: we need debugData() stream 
manipulator to dump data - debugs(94,7, "Found chunk extension; size=" << extSize); + if (skipLineTerminator(tok)) { + buf_ = tok.remaining(); // parse checkpoint + theChunkSize = 0; // done with the current chunk + parsingStage_ = Http1::HTTP_PARSE_CHUNK_SZ; + return true; - // TODO: support implied *LWS around '=' - if (extSize > 18 && strncmp(p, "use-original-body=", 18) == 0) { - (void)StringToInt64(p+18, useOriginBody, &p, 10); - debugs(94, 3, HERE << "use-original-body=" << useOriginBody); - break; // remove to support more than just use-original-body - } else { - debugs(94, 5, HERE << "skipping unknown chunk extension"); - // TODO: support quoted-string chunk-ext-val - while (p < endExt && *p != ';') ++p; // skip until the next ';' - } + } else if (!tok.atEnd()) { + throw TexcHere("found data between chunk end and CRLF"); } - theIn->consume(crlfEnd); - - parsingStage_ = theChunkSize ? Http1::HTTP_PARSE_CHUNK : Http1::HTTP_PARSE_MIME; + return false; } diff --git a/src/http/one/ChunkedCodingParser.h b/src/http/one/ChunkedCodingParser.h index 820eac75ae..9b22dcbb85 100644 --- a/src/http/one/ChunkedCodingParser.h +++ b/src/http/one/ChunkedCodingParser.h @@ -26,49 +26,33 @@ namespace One * The parser shovels content bytes from the raw * input buffer into the content output buffer, both caller-supplied. * Ignores chunk extensions except for ICAP's ieof. - * Has a trailer-handling placeholder. + * Trailers are available via mimeHeader() if wanted. */ class ChunkedCodingParser : public Http1::Parser { public: ChunkedCodingParser(); - virtual ~ChunkedCodingParser() {} + virtual ~ChunkedCodingParser() {theOut=NULL;/* we dont own this object */} - /** - \retval true complete success - \retval false needs more data - \throws ?? error. 
- */ - bool parse(MemBuf *rawData, MemBuf *parsedContent); + /// set the buffer to be used to store decoded chunk data + void setPayloadBuffer(MemBuf *parsedContent) {theOut = parsedContent;} bool needsMoreSpace() const; /* Http1::Parser API */ virtual void clear(); - virtual bool parse(const SBuf &) {return false;} // XXX implement - virtual size_type firstLineSize() const {return 0;} // has no meaning with multiple chunks + virtual bool parse(const SBuf &); + virtual Parser::size_type firstLineSize() const {return 0;} // has no meaning with multiple chunks private: - bool parseChunkSize(); - void parseUnusedChunkExtension(); - void parseLastChunkExtension(); - void parseChunkBeg(); - bool parseChunkBody(); - bool parseChunkEnd(); - bool parseTrailer(); - bool parseTrailerHeader(); + bool parseChunkSize(::Parser::Tokenizer &tok); + bool parseChunkExtension(::Parser::Tokenizer &tok, bool skipKnown); + bool parseChunkBody(::Parser::Tokenizer &tok); + bool parseChunkEnd(::Parser::Tokenizer &tok); - bool findCrlf(size_t &crlfBeg, size_t &crlfEnd); - bool findCrlf(size_t &crlfBeg, size_t &crlfEnd, bool "ed, bool &slashed); - -private: - MemBuf *theIn; MemBuf *theOut; - uint64_t theChunkSize; uint64_t theLeftBodySize; - bool inQuoted; ///< stores parsing state for incremental findCrlf - bool inSlashed; ///< stores parsing state for incremental findCrlf public: int64_t useOriginBody;