From: Amos Jeffries Date: Fri, 10 Apr 2015 11:02:44 +0000 (-0700) Subject: Add Http1::Tokenizer class X-Git-Tag: merge-candidate-3-v1~81^2~3 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=f29718b079f1e0a665fd8705b267e6176e9378a5;p=thirdparty%2Fsquid.git Add Http1::Tokenizer class ... for tokenizing HTTP/1.x lexical symbols. Inherits from the protocol agnostic ::Parser::Tokenizer base class. Provides quoted-string and (token / quoted-string) parsing methods with HTTP/1.0 and HTTP/1.1 compliant character sets and \-escaping support. --- diff --git a/src/http/one/Makefile.am b/src/http/one/Makefile.am index ce91c2d89b..a5bfa65bce 100644 --- a/src/http/one/Makefile.am +++ b/src/http/one/Makefile.am @@ -17,4 +17,6 @@ libhttp1_la_SOURCES = \ RequestParser.cc \ RequestParser.h \ ResponseParser.cc \ - ResponseParser.h + ResponseParser.h \ + Tokenizer.cc \ + Tokenizer.h diff --git a/src/http/one/Parser.cc b/src/http/one/Parser.cc index a3c199bbda..7352e606fa 100644 --- a/src/http/one/Parser.cc +++ b/src/http/one/Parser.cc @@ -9,8 +9,8 @@ #include "squid.h" #include "Debug.h" #include "http/one/Parser.h" +#include "http/one/Tokenizer.h" #include "mime_header.h" -#include "parser/Tokenizer.h" #include "SquidConfig.h" /// RFC 7230 section 2.6 - 7 magic octets @@ -26,7 +26,7 @@ } bool -Http::One::Parser::skipLineTerminator(::Parser::Tokenizer &tok) const +Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const { static const SBuf crlf("\r\n"); if (tok.skip(crlf)) @@ -102,7 +102,7 @@ Http::One::Parser::getHeaderField(const char *name) // while we can find more LF in the SBuf static CharacterSet iso8859Line = CharacterSet("non-LF",'\0','\n'-1) + CharacterSet(NULL, '\n'+1, (unsigned char)0xFF); - ::Parser::Tokenizer tok(mimeHeaderBlock_); + Http1::Tokenizer tok(mimeHeaderBlock_); SBuf p; static const SBuf crlf("\r\n"); @@ -125,7 +125,7 @@ Http::One::Parser::getHeaderField(const char *name) p.consume(namelen + 1); // TODO: 
optimize SBuf::trim to take CharacterSet directly - ::Parser::Tokenizer t(p); + Http1::Tokenizer t(p); t.skipAll(CharacterSet::WSP); p = t.remaining(); diff --git a/src/http/one/Parser.h b/src/http/one/Parser.h index 42ddb52201..09d51ec53b 100644 --- a/src/http/one/Parser.h +++ b/src/http/one/Parser.h @@ -14,10 +14,6 @@ #include "http/StatusCode.h" #include "SBuf.h" -namespace Parser { -class Tokenizer; -} - namespace Http { namespace One { @@ -105,7 +101,7 @@ public: protected: /// detect and skip the CRLF or (if tolerant) LF line terminator /// consume from the tokenizer and return true only if found - bool skipLineTerminator(::Parser::Tokenizer &tok) const; + bool skipLineTerminator(Http1::Tokenizer &tok) const; /** * Scan to find the mime headers block for current message. diff --git a/src/http/one/RequestParser.cc b/src/http/one/RequestParser.cc index 8577b7c27b..97545d8b29 100644 --- a/src/http/one/RequestParser.cc +++ b/src/http/one/RequestParser.cc @@ -9,8 +9,8 @@ #include "squid.h" #include "Debug.h" #include "http/one/RequestParser.h" +#include "http/one/Tokenizer.h" #include "http/ProtocolVersion.h" -#include "parser/Tokenizer.h" #include "profiler/Profiler.h" #include "SquidConfig.h" @@ -72,7 +72,7 @@ Http::One::RequestParser::skipGarbageLines() * \retval 0 more data is needed to complete the parse */ int -Http::One::RequestParser::parseMethodField(::Parser::Tokenizer &tok, const CharacterSet &WspDelim) +Http::One::RequestParser::parseMethodField(Http1::Tokenizer &tok, const CharacterSet &WspDelim) { // scan for up to 16 valid method characters. static const size_t maxMethodLength = 16; // TODO: make this configurable? @@ -132,7 +132,7 @@ uriValidCharacters() } int -Http::One::RequestParser::parseUriField(::Parser::Tokenizer &tok) +Http::One::RequestParser::parseUriField(Http1::Tokenizer &tok) { // URI field is a sequence of ... what? 
segments all have different valid charset // go with non-whitespace non-binary characters for now @@ -187,7 +187,7 @@ Http::One::RequestParser::parseUriField(::Parser::Tokenizer &tok) } int -Http::One::RequestParser::parseHttpVersionField(::Parser::Tokenizer &tok) +Http::One::RequestParser::parseHttpVersionField(Http1::Tokenizer &tok) { // partial match of HTTP/1 magic prefix if (tok.remaining().length() < Http1magic.length() && Http1magic.startsWith(tok.remaining())) { @@ -246,7 +246,7 @@ Http::One::RequestParser::parseHttpVersionField(::Parser::Tokenizer &tok) int Http::One::RequestParser::parseRequestFirstLine() { - ::Parser::Tokenizer tok(buf_); + Http1::Tokenizer tok(buf_); debugs(74, 5, "parsing possible request: buf.length=" << buf_.length()); debugs(74, DBG_DATA, buf_); @@ -297,7 +297,7 @@ Http::One::RequestParser::parseRequestFirstLine() // seek the LF character, then tokenize the line in reverse SBuf line; if (tok.prefix(line, LfDelim) && tok.skip('\n')) { - ::Parser::Tokenizer rTok(line); + Http1::Tokenizer rTok(line); SBuf nil; (void)rTok.suffix(nil,CharacterSet::CR); // optional CR in terminator SBuf digit; diff --git a/src/http/one/RequestParser.h b/src/http/one/RequestParser.h index f793ff0578..c48ad5ed57 100644 --- a/src/http/one/RequestParser.h +++ b/src/http/one/RequestParser.h @@ -47,9 +47,9 @@ public: private: void skipGarbageLines(); int parseRequestFirstLine(); - int parseMethodField(::Parser::Tokenizer &, const CharacterSet &); - int parseUriField(::Parser::Tokenizer &); - int parseHttpVersionField(::Parser::Tokenizer &); + int parseMethodField(Http1::Tokenizer &, const CharacterSet &); + int parseUriField(Http1::Tokenizer &); + int parseHttpVersionField(Http1::Tokenizer &); /// what request method has been found on the first line HttpRequestMethod method_; diff --git a/src/http/one/ResponseParser.cc b/src/http/one/ResponseParser.cc index f74360fb72..37bcf71b50 100644 --- a/src/http/one/ResponseParser.cc +++ b/src/http/one/ResponseParser.cc 
@@ -9,8 +9,8 @@ #include "squid.h" #include "Debug.h" #include "http/one/ResponseParser.h" +#include "http/one/Tokenizer.h" #include "http/ProtocolVersion.h" -#include "parser/Tokenizer.h" #include "profiler/Profiler.h" #include "SquidConfig.h" @@ -47,7 +47,7 @@ Http::One::ResponseParser::firstLineSize() const // NP: we found the protocol version and consumed it already. // just need the status code and reason phrase int -Http::One::ResponseParser::parseResponseStatusAndReason(::Parser::Tokenizer &tok, const CharacterSet &WspDelim) +Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, const CharacterSet &WspDelim) { if (!completedStatus_) { debugs(74, 9, "seek status-code in: " << tok.remaining().substr(0,10) << "..."); @@ -121,7 +121,7 @@ Http::One::ResponseParser::parseResponseStatusAndReason(::Parser::Tokenizer &tok int Http::One::ResponseParser::parseResponseFirstLine() { - ::Parser::Tokenizer tok(buf_); + Http1::Tokenizer tok(buf_); CharacterSet WspDelim = CharacterSet::SP; // strict parse only accepts SP diff --git a/src/http/one/ResponseParser.h b/src/http/one/ResponseParser.h index f9356605bd..509819ffaf 100644 --- a/src/http/one/ResponseParser.h +++ b/src/http/one/ResponseParser.h @@ -43,7 +43,7 @@ public: private: int parseResponseFirstLine(); - int parseResponseStatusAndReason(::Parser::Tokenizer&, const CharacterSet &); + int parseResponseStatusAndReason(Http1::Tokenizer&, const CharacterSet &); /// magic prefix for identifying ICY response messages static const SBuf IcyMagic; diff --git a/src/http/one/Tokenizer.cc b/src/http/one/Tokenizer.cc new file mode 100644 index 0000000000..bfab18a7bb --- /dev/null +++ b/src/http/one/Tokenizer.cc @@ -0,0 +1,109 @@ +/* + * Copyright (C) 1996-2015 The Squid Software Foundation and contributors + * + * Squid software is distributed under GPLv2+ license and includes + * contributions from numerous individuals and organizations. + * Please see the COPYING and CONTRIBUTORS files for details. 
+ */ + +#include "squid.h" +#include "Debug.h" +#include "http/one/Tokenizer.h" + +bool +Http::One::Tokenizer::quotedString(SBuf &returnedToken, const bool http1p0) +{ + checkpoint(); + + if (!skip('"')) + return false; + + return qdText(returnedToken, http1p0); +} + +bool +Http::One::Tokenizer::quotedStringOrToken(SBuf &returnedToken, const bool http1p0) +{ + checkpoint(); + + if (!skip('"')) + return prefix(returnedToken, CharacterSet::TCHAR); + + return qdText(returnedToken, http1p0); +} + +bool +Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0) +{ + // the initial DQUOTE has been skipped by the caller + + /* + * RFC 1945 - defines qdtext: + * inclusive of LWS (which includes CR and LF) + * exclusive of 0x80-0xFF + * includes 0x5E ('\') as just a regular character + */ + static const CharacterSet qdtext1p0 = CharacterSet("qdtext (HTTP/1.0)", 0x23, 0x7E) + + CharacterSet("", "!") + + CharacterSet::CR + CharacterSet::LF + CharacterSet::HTAB + CharacterSet::SP; + /* + * RFC 7230 - defines qdtext: + * exclusive of CR and LF + * inclusive of 0x80-0xFF + * includes 0x5E ('\') but only when part of quoted-pair + */ + static const CharacterSet qdtext1p1 = CharacterSet("qdtext (HTTP/1.1)", 0x23, 0x5B) + + CharacterSet("", "!") + + CharacterSet("", 0x5D, 0x7E) + + CharacterSet::HTAB + CharacterSet::SP + + CharacterSet::OBSTEXT; + + // best we can do is a conditional reference since http1p0 value may change per-client + const CharacterSet &tokenChars = (http1p0 ? qdtext1p0 : qdtext1p1); + + for (;;) { + SBuf::size_type prefixLen = buf().findFirstNotOf(tokenChars); + returnedToken.append(consume(prefixLen)); + + // HTTP/1.1 allows quoted-pair, HTTP/1.0 does not + if (!http1p0 && skip('\\')) { + /* RFC 7230 section 3.2.6 + * + * The backslash octet ("\") can be used as a single-octet quoting + * mechanism within quoted-string and comment constructs. 
Recipients + * that process the value of a quoted-string MUST handle a quoted-pair + * as if it were replaced by the octet following the backslash. + * + * quoted-pair = "\" ( HTAB / SP / VCHAR / obs-text ) + */ + static const CharacterSet qPairChars = CharacterSet::HTAB + CharacterSet::SP + CharacterSet::VCHAR + CharacterSet::OBSTEXT; + SBuf escaped; + if (!prefix(escaped, qPairChars, 1)) { + returnedToken.clear(); + restoreLastCheckpoint(); + return false; + } + returnedToken.append(escaped); + continue; + + } else if (skip('"')) { + break; // done + + } else if (atEnd()) { + // need more data + returnedToken.clear(); + restoreLastCheckpoint(); + return false; + } + + // else, we have an error + debugs(24, 8, "invalid bytes for set " << tokenChars.name); + returnedToken.clear(); + restoreLastCheckpoint(); + return false; + } + + // found the whole string + return true; +} + diff --git a/src/http/one/Tokenizer.h b/src/http/one/Tokenizer.h new file mode 100644 index 0000000000..60b276ffee --- /dev/null +++ b/src/http/one/Tokenizer.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) 1996-2015 The Squid Software Foundation and contributors + * + * Squid software is distributed under GPLv2+ license and includes + * contributions from numerous individuals and organizations. + * Please see the COPYING and CONTRIBUTORS files for details. + */ + +#ifndef SQUID_SRC_HTTP_ONE_TOKENIZER_H +#define SQUID_SRC_HTTP_ONE_TOKENIZER_H + +#include "parser/Tokenizer.h" + +namespace Http { +namespace One { + +/** + * Lexical processor extended to tokenize HTTP/1.x syntax. + * + * \see ::Parser::Tokenizer for more detail + */ +class Tokenizer : public ::Parser::Tokenizer +{ +public: + Tokenizer(SBuf &s) : ::Parser::Tokenizer(s) {} + + /** + * Attempt to parse a quoted-string lexical construct. + * + * Governed by: + * - RFC 1945 section 2.1 + * " + * A string of text is parsed as a single word if it is quoted using + * double-quote marks. 
+ * + * quoted-string = ( <"> *(qdtext) <"> ) + * + * qdtext = <any CHAR except <"> and CTLs, + * but including LWS> + * + * Single-character quoting using the backslash ("\") character is not + * permitted in HTTP/1.0. + * " + * + * - RFC 7230 section 3.2.6 + * " + * A string of text is parsed as a single value if it is quoted using + * double-quote marks. + * + * quoted-string = DQUOTE *( qdtext / quoted-pair ) DQUOTE + * qdtext = HTAB / SP / %x21 / %x23-5B / %x5D-7E / obs-text + * obs-text = %x80-FF + * " + * + * \param http1p0 HTTP/1.0 does not permit \-escaped characters + */ + bool quotedString(SBuf &value, const bool http1p0 = false); + + /** + * Attempt to parse a (token / quoted-string ) lexical construct. + */ + bool quotedStringOrToken(SBuf &value, const bool http1p0 = false); + +private: + /// parse the internal component of a quoted-string, and terminal DQUOTE + bool qdText(SBuf &value, const bool http1p0); + + void checkpoint() { savedCheckpoint_ = buf(); savedStats_ = parsedSize(); } + void restoreLastCheckpoint() { undoParse(savedCheckpoint_, savedStats_); } + + SBuf savedCheckpoint_; + SBuf::size_type savedStats_; +}; + +} // namespace One +} // namespace Http + +#endif /* SQUID_SRC_HTTP_ONE_TOKENIZER_H */ + diff --git a/src/http/one/forward.h b/src/http/one/forward.h index d7bf5ced11..fdce927cca 100644 --- a/src/http/one/forward.h +++ b/src/http/one/forward.h @@ -14,6 +14,8 @@ namespace Http { namespace One { +class Tokenizer; + class Parser; typedef RefCount<Parser> ParserPointer; diff --git a/src/parser/Tokenizer.h b/src/parser/Tokenizer.h index e18e76f5d7..47d4a7515a 100644 --- a/src/parser/Tokenizer.h +++ b/src/parser/Tokenizer.h @@ -44,7 +44,7 @@ public: const SBuf& remaining() const { return buf_; } /// reinitialize processing for a new buffer - void reset(const SBuf &newBuf) { buf_ = newBuf; parsed_ = 0; } + void reset(const SBuf &newBuf) { undoParse(newBuf, 0); } /** Basic strtok(3): * Skips all leading delimiters (if any), @@ -135,6 +135,9 @@ protected: SBuf 
consume(const SBuf::size_type n); SBuf::size_type success(const SBuf::size_type n); + /// reset the buffer and parsed stats to a saved checkpoint + void undoParse(const SBuf &newBuf, SBuf::size_type cParsed) { buf_ = newBuf; parsed_ = cParsed; } + private: SBuf buf_; ///< yet unparsed input SBuf::size_type parsed_; ///< bytes successfully parsed, including skipped