From: Amos Jeffries Date: Fri, 10 Apr 2015 11:02:44 +0000 (-0700) Subject: Add Http1::Tokenizer class X-Git-Tag: merge-candidate-3-v1~81^2~3 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=f29718b079f1e0a665fd8705b267e6176e9378a5;p=thirdparty%2Fsquid.git Add Http1::Tokenizer class ... for tokenizing HTTP/1.x lexical symbols. Inherits from the protocol agnostic ::Parser::Tokenizer base class. Provides quoted-string and (token / quoted-string) parsing methods with HTTP/1.0 and HTTP/1.1 compliant character sets and \-escaping support. --- diff --git a/src/http/one/Makefile.am b/src/http/one/Makefile.am index ce91c2d89b..a5bfa65bce 100644 --- a/src/http/one/Makefile.am +++ b/src/http/one/Makefile.am @@ -17,4 +17,6 @@ libhttp1_la_SOURCES = \ RequestParser.cc \ RequestParser.h \ ResponseParser.cc \ - ResponseParser.h + ResponseParser.h \ + Tokenizer.cc \ + Tokenizer.h diff --git a/src/http/one/Parser.cc b/src/http/one/Parser.cc index a3c199bbda..7352e606fa 100644 --- a/src/http/one/Parser.cc +++ b/src/http/one/Parser.cc @@ -9,8 +9,8 @@ #include "squid.h" #include "Debug.h" #include "http/one/Parser.h" +#include "http/one/Tokenizer.h" #include "mime_header.h" -#include "parser/Tokenizer.h" #include "SquidConfig.h" /// RFC 7230 section 2.6 - 7 magic octets @@ -26,7 +26,7 @@ } bool -Http::One::Parser::skipLineTerminator(::Parser::Tokenizer &tok) const +Http::One::Parser::skipLineTerminator(Http1::Tokenizer &tok) const { static const SBuf crlf("\r\n"); if (tok.skip(crlf)) @@ -102,7 +102,7 @@ Http::One::Parser::getHeaderField(const char *name) // while we can find more LF in the SBuf static CharacterSet iso8859Line = CharacterSet("non-LF",'\0','\n'-1) + CharacterSet(NULL, '\n'+1, (unsigned char)0xFF); - ::Parser::Tokenizer tok(mimeHeaderBlock_); + Http1::Tokenizer tok(mimeHeaderBlock_); SBuf p; static const SBuf crlf("\r\n"); @@ -125,7 +125,7 @@ Http::One::Parser::getHeaderField(const char *name) p.consume(namelen + 1); // TODO: 
optimize SBuf::trim to take CharacterSet directly - ::Parser::Tokenizer t(p); + Http1::Tokenizer t(p); t.skipAll(CharacterSet::WSP); p = t.remaining(); diff --git a/src/http/one/Parser.h b/src/http/one/Parser.h index 42ddb52201..09d51ec53b 100644 --- a/src/http/one/Parser.h +++ b/src/http/one/Parser.h @@ -14,10 +14,6 @@ #include "http/StatusCode.h" #include "SBuf.h" -namespace Parser { -class Tokenizer; -} - namespace Http { namespace One { @@ -105,7 +101,7 @@ public: protected: /// detect and skip the CRLF or (if tolerant) LF line terminator /// consume from the tokenizer and return true only if found - bool skipLineTerminator(::Parser::Tokenizer &tok) const; + bool skipLineTerminator(Http1::Tokenizer &tok) const; /** * Scan to find the mime headers block for current message. diff --git a/src/http/one/RequestParser.cc b/src/http/one/RequestParser.cc index 8577b7c27b..97545d8b29 100644 --- a/src/http/one/RequestParser.cc +++ b/src/http/one/RequestParser.cc @@ -9,8 +9,8 @@ #include "squid.h" #include "Debug.h" #include "http/one/RequestParser.h" +#include "http/one/Tokenizer.h" #include "http/ProtocolVersion.h" -#include "parser/Tokenizer.h" #include "profiler/Profiler.h" #include "SquidConfig.h" @@ -72,7 +72,7 @@ Http::One::RequestParser::skipGarbageLines() * \retval 0 more data is needed to complete the parse */ int -Http::One::RequestParser::parseMethodField(::Parser::Tokenizer &tok, const CharacterSet &WspDelim) +Http::One::RequestParser::parseMethodField(Http1::Tokenizer &tok, const CharacterSet &WspDelim) { // scan for up to 16 valid method characters. static const size_t maxMethodLength = 16; // TODO: make this configurable? @@ -132,7 +132,7 @@ uriValidCharacters() } int -Http::One::RequestParser::parseUriField(::Parser::Tokenizer &tok) +Http::One::RequestParser::parseUriField(Http1::Tokenizer &tok) { // URI field is a sequence of ... what? 
segments all have different valid charset // go with non-whitespace non-binary characters for now @@ -187,7 +187,7 @@ Http::One::RequestParser::parseUriField(::Parser::Tokenizer &tok) } int -Http::One::RequestParser::parseHttpVersionField(::Parser::Tokenizer &tok) +Http::One::RequestParser::parseHttpVersionField(Http1::Tokenizer &tok) { // partial match of HTTP/1 magic prefix if (tok.remaining().length() < Http1magic.length() && Http1magic.startsWith(tok.remaining())) { @@ -246,7 +246,7 @@ Http::One::RequestParser::parseHttpVersionField(::Parser::Tokenizer &tok) int Http::One::RequestParser::parseRequestFirstLine() { - ::Parser::Tokenizer tok(buf_); + Http1::Tokenizer tok(buf_); debugs(74, 5, "parsing possible request: buf.length=" << buf_.length()); debugs(74, DBG_DATA, buf_); @@ -297,7 +297,7 @@ Http::One::RequestParser::parseRequestFirstLine() // seek the LF character, then tokenize the line in reverse SBuf line; if (tok.prefix(line, LfDelim) && tok.skip('\n')) { - ::Parser::Tokenizer rTok(line); + Http1::Tokenizer rTok(line); SBuf nil; (void)rTok.suffix(nil,CharacterSet::CR); // optional CR in terminator SBuf digit; diff --git a/src/http/one/RequestParser.h b/src/http/one/RequestParser.h index f793ff0578..c48ad5ed57 100644 --- a/src/http/one/RequestParser.h +++ b/src/http/one/RequestParser.h @@ -47,9 +47,9 @@ public: private: void skipGarbageLines(); int parseRequestFirstLine(); - int parseMethodField(::Parser::Tokenizer &, const CharacterSet &); - int parseUriField(::Parser::Tokenizer &); - int parseHttpVersionField(::Parser::Tokenizer &); + int parseMethodField(Http1::Tokenizer &, const CharacterSet &); + int parseUriField(Http1::Tokenizer &); + int parseHttpVersionField(Http1::Tokenizer &); /// what request method has been found on the first line HttpRequestMethod method_; diff --git a/src/http/one/ResponseParser.cc b/src/http/one/ResponseParser.cc index f74360fb72..37bcf71b50 100644 --- a/src/http/one/ResponseParser.cc +++ b/src/http/one/ResponseParser.cc 
@@ -9,8 +9,8 @@ #include "squid.h" #include "Debug.h" #include "http/one/ResponseParser.h" +#include "http/one/Tokenizer.h" #include "http/ProtocolVersion.h" -#include "parser/Tokenizer.h" #include "profiler/Profiler.h" #include "SquidConfig.h" @@ -47,7 +47,7 @@ Http::One::ResponseParser::firstLineSize() const // NP: we found the protocol version and consumed it already. // just need the status code and reason phrase int -Http::One::ResponseParser::parseResponseStatusAndReason(::Parser::Tokenizer &tok, const CharacterSet &WspDelim) +Http::One::ResponseParser::parseResponseStatusAndReason(Http1::Tokenizer &tok, const CharacterSet &WspDelim) { if (!completedStatus_) { debugs(74, 9, "seek status-code in: " << tok.remaining().substr(0,10) << "..."); @@ -121,7 +121,7 @@ Http::One::ResponseParser::parseResponseStatusAndReason(::Parser::Tokenizer &tok int Http::One::ResponseParser::parseResponseFirstLine() { - ::Parser::Tokenizer tok(buf_); + Http1::Tokenizer tok(buf_); CharacterSet WspDelim = CharacterSet::SP; // strict parse only accepts SP diff --git a/src/http/one/ResponseParser.h b/src/http/one/ResponseParser.h index f9356605bd..509819ffaf 100644 --- a/src/http/one/ResponseParser.h +++ b/src/http/one/ResponseParser.h @@ -43,7 +43,7 @@ public: private: int parseResponseFirstLine(); - int parseResponseStatusAndReason(::Parser::Tokenizer&, const CharacterSet &); + int parseResponseStatusAndReason(Http1::Tokenizer&, const CharacterSet &); /// magic prefix for identifying ICY response messages static const SBuf IcyMagic; diff --git a/src/http/one/Tokenizer.cc b/src/http/one/Tokenizer.cc new file mode 100644 index 0000000000..bfab18a7bb --- /dev/null +++ b/src/http/one/Tokenizer.cc @@ -0,0 +1,109 @@ +/* + * Copyright (C) 1996-2015 The Squid Software Foundation and contributors + * + * Squid software is distributed under GPLv2+ license and includes + * contributions from numerous individuals and organizations. + * Please see the COPYING and CONTRIBUTORS files for details. 
+ */ + +#include "squid.h" +#include "Debug.h" +#include "http/one/Tokenizer.h" + +bool +Http::One::Tokenizer::quotedString(SBuf &returnedToken, const bool http1p0) +{ + checkpoint(); + + if (!skip('"')) + return false; + + return qdText(returnedToken, http1p0); +} + +bool +Http::One::Tokenizer::quotedStringOrToken(SBuf &returnedToken, const bool http1p0) +{ + checkpoint(); + + if (!skip('"')) + return prefix(returnedToken, CharacterSet::TCHAR); + + return qdText(returnedToken, http1p0); +} + +bool +Http::One::Tokenizer::qdText(SBuf &returnedToken, const bool http1p0) +{ + // the initial DQUOTE has been skipped by the caller + + /* + * RFC 1945 - defines qdtext: + * inclusive of LWS (which includes CR and LF) + * exclusive of 0x80-0xFF + * includes 0x5E ('\') as just a regular character + */ + static const CharacterSet qdtext1p0 = CharacterSet("qdtext (HTTP/1.0)", 0x23, 0x7E) + + CharacterSet("", "!") + + CharacterSet::CR + CharacterSet::LF + CharacterSet::HTAB + CharacterSet::SP; + /* + * RFC 7230 - defines qdtext: + * exclusive of CR and LF + * inclusive of 0x80-0xFF + * includes 0x5E ('\') but only when part of quoted-pair + */ + static const CharacterSet qdtext1p1 = CharacterSet("qdtext (HTTP/1.1)", 0x23, 0x5B) + + CharacterSet("", "!") + + CharacterSet("", 0x5D, 0x7E) + + CharacterSet::HTAB + CharacterSet::SP + + CharacterSet::OBSTEXT; + + // best we can do is a conditional reference since http1p0 value may change per-client + const CharacterSet &tokenChars = (http1p0 ? qdtext1p0 : qdtext1p1); + + for (;;) { + SBuf::size_type prefixLen = buf().findFirstNotOf(tokenChars); + returnedToken.append(consume(prefixLen)); + + // HTTP/1.1 allows quoted-pair, HTTP/1.0 does not + if (!http1p0 && skip('\\')) { + /* RFC 7230 section 3.2.6 + * + * The backslash octet ("\") can be used as a single-octet quoting + * mechanism within quoted-string and comment constructs. 
Recipients + * that process the value of a quoted-string MUST handle a quoted-pair + * as if it were replaced by the octet following the backslash. + * + * quoted-pair = "\" ( HTAB / SP / VCHAR / obs-text ) + */ + static const CharacterSet qPairChars = CharacterSet::HTAB + CharacterSet::SP + CharacterSet::VCHAR + CharacterSet::OBSTEXT; + SBuf escaped; + if (!prefix(escaped, qPairChars, 1)) { + returnedToken.clear(); + restoreLastCheckpoint(); + return false; + } + returnedToken.append(escaped); + continue; + + } else if (skip('"')) { + break; // done + + } else if (atEnd()) { + // need more data + returnedToken.clear(); + restoreLastCheckpoint(); + return false; + } + + // else, we have an error + debugs(24, 8, "invalid bytes for set " << tokenChars.name); + returnedToken.clear(); + restoreLastCheckpoint(); + return false; + } + + // found the whole string + return true; +} + diff --git a/src/http/one/Tokenizer.h b/src/http/one/Tokenizer.h new file mode 100644 index 0000000000..60b276ffee --- /dev/null +++ b/src/http/one/Tokenizer.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) 1996-2015 The Squid Software Foundation and contributors + * + * Squid software is distributed under GPLv2+ license and includes + * contributions from numerous individuals and organizations. + * Please see the COPYING and CONTRIBUTORS files for details. + */ + +#ifndef SQUID_SRC_HTTP_ONE_TOKENIZER_H +#define SQUID_SRC_HTTP_ONE_TOKENIZER_H + +#include "parser/Tokenizer.h" + +namespace Http { +namespace One { + +/** + * Lexical processor extended to tokenize HTTP/1.x syntax. + * + * \see ::Parser::Tokenizer for more detail + */ +class Tokenizer : public ::Parser::Tokenizer +{ +public: + Tokenizer(SBuf &s) : ::Parser::Tokenizer(s) {} + + /** + * Attempt to parse a quoted-string lexical construct. + * + * Governed by: + * - RFC 1945 section 2.1 + * " + * A string of text is parsed as a single word if it is quoted using + * double-quote marks. 
+ * + * quoted-string = ( <"> *(qdtext) <"> ) + * + * qdtext = <any CHAR except <"> and CTLs, + * but including LWS> + * + * Single-character quoting using the backslash ("\") character is not + * permitted in HTTP/1.0. + * " + * + * - RFC 7230 section 3.2.6 + * " + * A string of text is parsed as a single value if it is quoted using + * double-quote marks. + * + * quoted-string = DQUOTE *( qdtext / quoted-pair ) DQUOTE + * qdtext = HTAB / SP / %x21 / %x23-5B / %x5D-7E / obs-text + * obs-text = %x80-FF + * " + * + * \param http1p0 HTTP/1.0 does not permit \-escaped characters + */ + bool quotedString(SBuf &value, const bool http1p0 = false); + + /** + * Attempt to parse a (token / quoted-string ) lexical construct. + */ + bool quotedStringOrToken(SBuf &value, const bool http1p0 = false); + +private: + /// parse the internal component of a quoted-string, and terminal DQUOTE + bool qdText(SBuf &value, const bool http1p0); + + void checkpoint() { savedCheckpoint_ = buf(); savedStats_ = parsedSize(); } + void restoreLastCheckpoint() { undoParse(savedCheckpoint_, savedStats_); } + + SBuf savedCheckpoint_; + SBuf::size_type savedStats_; +}; + +} // namespace One +} // namespace Http + +#endif /* SQUID_SRC_HTTP_ONE_TOKENIZER_H */ + diff --git a/src/http/one/forward.h b/src/http/one/forward.h index d7bf5ced11..fdce927cca 100644 --- a/src/http/one/forward.h +++ b/src/http/one/forward.h @@ -14,6 +14,8 @@ namespace Http { namespace One { +class Tokenizer; + class Parser; typedef RefCount<Parser> ParserPointer; diff --git a/src/parser/Tokenizer.h b/src/parser/Tokenizer.h index e18e76f5d7..47d4a7515a 100644 --- a/src/parser/Tokenizer.h +++ b/src/parser/Tokenizer.h @@ -44,7 +44,7 @@ public: const SBuf& remaining() const { return buf_; } /// reinitialize processing for a new buffer - void reset(const SBuf &newBuf) { buf_ = newBuf; parsed_ = 0; } + void reset(const SBuf &newBuf) { undoParse(newBuf, 0); } /** Basic strtok(3): * Skips all leading delimiters (if any), @@ -135,6 +135,9 @@ protected: SBuf 
consume(const SBuf::size_type n); SBuf::size_type success(const SBuf::size_type n); + /// reset the buffer and parsed stats to a saved checkpoint + void undoParse(const SBuf &newBuf, SBuf::size_type cParsed) { buf_ = newBuf; parsed_ = cParsed; } + private: SBuf buf_; ///< yet unparsed input SBuf::size_type parsed_; ///< bytes successfully parsed, including skipped