From: Ben Darnell <ben@bendarnell.com>
Date: Sat, 23 Apr 2016 23:34:23 +0000 (-0400)
Subject: Begin to fix type confusion in HTTPHeaders.
X-Git-Tag: v4.4.0b1~27^2
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=10da549cbebf00a5cd3ad3898f141e56f5ede090;p=thirdparty%2Ftornado.git

Begin to fix type confusion in HTTPHeaders.

Values in HTTPHeaders were typed inconsistently: the class itself
assumed that values were of type str, while many callers assumed they
were bytes. Since headers are practically restricted to ascii, this
would work on python 2 but fail when certain combinations were
encountered on python 3 (notably the combination of GzipContentEncoding
with multiple preexisting Vary headers).

This commit adds a test and fixes the bug, and also adds enough type
annotations that mypy was able to report errors in the old incorrect
code.

Fixes #1670
---

diff --git a/tornado/escape.py b/tornado/escape.py
index e6636f203..7a3b0e034 100644
--- a/tornado/escape.py
+++ b/tornado/escape.py
@@ -37,6 +37,11 @@ else:
     import htmlentitydefs
     import urllib as urllib_parse
 
+try:
+    import typing  # noqa
+except ImportError:
+    pass
+
 
 _XHTML_ESCAPE_RE = re.compile('[&<>"\']')
 _XHTML_ESCAPE_DICT = {'&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;',
@@ -180,6 +185,7 @@ _UTF8_TYPES = (bytes, type(None))
 
 
 def utf8(value):
+    # type: (typing.Union[bytes,unicode_type,None])->typing.Union[bytes,None]
     """Converts a string argument to a byte string.
 
     If the argument is already a byte string or None, it is returned unchanged.
diff --git a/tornado/http1connection.py b/tornado/http1connection.py
index b04cff132..8194f9143 100644
--- a/tornado/http1connection.py
+++ b/tornado/http1connection.py
@@ -30,7 +30,7 @@ from tornado import httputil
 from tornado import iostream
 from tornado.log import gen_log, app_log
 from tornado import stack_context
-from tornado.util import GzipDecompressor
+from tornado.util import GzipDecompressor, PY3
 
 
 class _QuietException(Exception):
@@ -372,7 +372,14 @@ class HTTP1Connection(httputil.HTTPConnection):
             self._expected_content_remaining = int(headers['Content-Length'])
         else:
             self._expected_content_remaining = None
-        lines.extend([utf8(n) + b": " + utf8(v) for n, v in headers.get_all()])
+        # TODO: headers are supposed to be of type str, but we still have some
+        # cases that let bytes slip through. Remove these native_str calls when those
+        # are fixed.
+        header_lines = (native_str(n) + ": " + native_str(v) for n, v in headers.get_all())
+        if PY3:
+            lines.extend(l.encode('latin1') for l in header_lines)
+        else:
+            lines.extend(header_lines)
         for line in lines:
             if b'\n' in line:
                 raise ValueError('Newline in header: ' + repr(line))
diff --git a/tornado/httputil.py b/tornado/httputil.py
index d0901565a..866681adf 100644
--- a/tornado/httputil.py
+++ b/tornado/httputil.py
@@ -59,6 +59,12 @@ except ImportError:
     # on the class definition itself; must go through an assignment.
     SSLError = _SSLError  # type: ignore
 
+try:
+    import typing
+except ImportError:
+    pass
+
+
 # RFC 7230 section 3.5: a recipient MAY recognize a single LF as a line
 # terminator and ignore any preceding CR.
 _CRLF_RE = re.compile(r'\r?\n')
@@ -124,8 +130,8 @@ class HTTPHeaders(collections.MutableMapping):
     Set-Cookie: C=D
     """
     def __init__(self, *args, **kwargs):
-        self._dict = {}
-        self._as_list = {}
+        self._dict = {}  # type: typing.Dict[str, str]
+        self._as_list = {}  # type: typing.Dict[str, typing.List[str]]
         self._last_key = None
         if (len(args) == 1 and len(kwargs) == 0 and
                 isinstance(args[0], HTTPHeaders)):
@@ -139,6 +145,7 @@ class HTTPHeaders(collections.MutableMapping):
     # new public methods
 
     def add(self, name, value):
+        # type: (str, str) -> None
         """Adds a new value for the given key."""
         norm_name = _normalized_headers[name]
         self._last_key = norm_name
@@ -155,6 +162,7 @@ class HTTPHeaders(collections.MutableMapping):
         return self._as_list.get(norm_name, [])
 
     def get_all(self):
+        # type: () -> typing.Iterable[typing.Tuple[str, str]]
         """Returns an iterable of all (name, value) pairs.
 
         If a header has multiple values, multiple pairs will be
@@ -203,6 +211,7 @@ class HTTPHeaders(collections.MutableMapping):
         self._as_list[norm_name] = [value]
 
     def __getitem__(self, name):
+        # type: (str) -> str
         return self._dict[_normalized_headers[name]]
 
     def __delitem__(self, name):
diff --git a/tornado/test/web_test.py b/tornado/test/web_test.py
index fac23a21f..7e4178549 100644
--- a/tornado/test/web_test.py
+++ b/tornado/test/web_test.py
@@ -1506,8 +1506,8 @@ class ErrorHandlerXSRFTest(WebTestCase):
 class GzipTestCase(SimpleHandlerTestCase):
     class Handler(RequestHandler):
         def get(self):
-            if self.get_argument('vary', None):
-                self.set_header('Vary', self.get_argument('vary'))
+            for v in self.get_arguments('vary'):
+                self.add_header('Vary', v)
             # Must write at least MIN_LENGTH bytes to activate compression.
             self.write('hello world' + ('!' * GZipContentEncoding.MIN_LENGTH))
 
@@ -1516,8 +1516,7 @@ class GzipTestCase(SimpleHandlerTestCase):
             gzip=True,
             static_path=os.path.join(os.path.dirname(__file__), 'static'))
 
-    def test_gzip(self):
-        response = self.fetch('/')
+    def assert_compressed(self, response):
         # simple_httpclient renames the content-encoding header;
         # curl_httpclient doesn't.
         self.assertEqual(
@@ -1525,17 +1524,18 @@ class GzipTestCase(SimpleHandlerTestCase):
                 'Content-Encoding',
                 response.headers.get('X-Consumed-Content-Encoding')),
             'gzip')
+
+
+    def test_gzip(self):
+        response = self.fetch('/')
+        self.assert_compressed(response)
         self.assertEqual(response.headers['Vary'], 'Accept-Encoding')
 
     def test_gzip_static(self):
         # The streaming responses in StaticFileHandler have subtle
         # interactions with the gzip output so test this case separately.
         response = self.fetch('/robots.txt')
-        self.assertEqual(
-            response.headers.get(
-                'Content-Encoding',
-                response.headers.get('X-Consumed-Content-Encoding')),
-            'gzip')
+        self.assert_compressed(response)
         self.assertEqual(response.headers['Vary'], 'Accept-Encoding')
 
     def test_gzip_not_requested(self):
@@ -1545,9 +1545,16 @@ class GzipTestCase(SimpleHandlerTestCase):
 
     def test_vary_already_present(self):
         response = self.fetch('/?vary=Accept-Language')
-        self.assertEqual(response.headers['Vary'],
-                         'Accept-Language, Accept-Encoding')
-
+        self.assert_compressed(response)
+        self.assertEqual([s.strip() for s in response.headers['Vary'].split(',')],
+                         ['Accept-Language', 'Accept-Encoding'])
+
+    def test_vary_already_present_multiple(self):
+        # Regression test for https://github.com/tornadoweb/tornado/issues/1670
+        response = self.fetch('/?vary=Accept-Language&vary=Cookie')
+        self.assert_compressed(response)
+        self.assertEqual([s.strip() for s in response.headers['Vary'].split(',')],
+                         ['Accept-Language', 'Cookie', 'Accept-Encoding'])
 
 @wsgi_safe
 class PathArgsInPrepareTest(WebTestCase):
diff --git a/tornado/util.py b/tornado/util.py
index 4283d4e86..d49a84f42 100644
--- a/tornado/util.py
+++ b/tornado/util.py
@@ -33,12 +33,13 @@ else:
 # Aliases for types that are spelled differently in different Python
 # versions. bytes_type is deprecated and no longer used in Tornado
 # itself but is left in case anyone outside Tornado is using it.
-unicode_type = type(u'')
 bytes_type = bytes
 if PY3:
+    unicode_type = str
     basestring_type = str
 else:
-    # The name basestring doesn't exist in py3 so silence flake8.
+    # The names unicode and basestring don't exist in py3 so silence flake8.
+    unicode_type = unicode  # noqa
     basestring_type = basestring  # noqa
 
 
diff --git a/tornado/web.py b/tornado/web.py
index 8f2acfcc9..c9ff2b2d0 100644
--- a/tornado/web.py
+++ b/tornado/web.py
@@ -104,6 +104,11 @@ else:
 
 try:
     import typing  # noqa
+
+    # The following types are accepted by RequestHandler.set_header
+    # and related methods.
+    _HeaderTypes = typing.Union[bytes, unicode_type,
+                                numbers.Integral, datetime.datetime]
 except ImportError:
     pass
 
@@ -164,6 +169,7 @@ class RequestHandler(object):
         self._auto_finish = True
         self._transforms = None  # will be set in _execute
         self._prepared_future = None
+        self._headers = None  # type: httputil.HTTPHeaders
         self.path_args = None
         self.path_kwargs = None
         self.ui = ObjectDict((n, self._ui_method(m)) for n, m in
@@ -318,6 +324,7 @@ class RequestHandler(object):
         return self._status_code
 
     def set_header(self, name, value):
+        # type: (str, _HeaderTypes) -> None
         """Sets the given response header name and value.
 
         If a datetime is given, we automatically format it according to the
@@ -327,6 +334,7 @@ class RequestHandler(object):
         self._headers[name] = self._convert_header_value(value)
 
     def add_header(self, name, value):
+        # type: (str, _HeaderTypes) -> None
         """Adds the given response header and value.
 
         Unlike `set_header`, `add_header` may be called multiple times
@@ -343,13 +351,25 @@ class RequestHandler(object):
         if name in self._headers:
             del self._headers[name]
 
-    _INVALID_HEADER_CHAR_RE = re.compile(br"[\x00-\x1f]")
+    _INVALID_HEADER_CHAR_RE = re.compile(r"[\x00-\x1f]")
 
     def _convert_header_value(self, value):
-        if isinstance(value, bytes):
-            pass
-        elif isinstance(value, unicode_type):
-            value = value.encode('utf-8')
+        # type: (_HeaderTypes) -> str
+
+        # Convert the input value to a str. This type check is a bit
+        # subtle: The bytes case only executes on python 3, and the
+        # unicode case only executes on python 2, because the other
+        # cases are covered by the first match for str.
+        if isinstance(value, str):
+            retval = value
+        elif isinstance(value, bytes):  # py3
+            # Non-ascii characters in headers are not well supported,
+            # but if you pass bytes, use latin1 so they pass through as-is.
+            retval = value.decode('latin1')
+        elif isinstance(value, unicode_type):  # py2
+            # TODO: This is inconsistent with the use of latin1 above,
+            # but it's been that way for a long time. Should it change?
+            retval = escape.utf8(value)
         elif isinstance(value, numbers.Integral):
             # return immediately since we know the converted value will be safe
             return str(value)
@@ -359,9 +379,9 @@ class RequestHandler(object):
             raise TypeError("Unsupported header value %r" % value)
         # If \n is allowed into the header, it is possible to inject
         # additional headers or split the request.
-        if RequestHandler._INVALID_HEADER_CHAR_RE.search(value):
-            raise ValueError("Unsafe header value %r", value)
-        return value
+        if RequestHandler._INVALID_HEADER_CHAR_RE.search(retval):
+            raise ValueError("Unsafe header value %r", retval)
+        return retval
 
     _ARG_DEFAULT = object()
 
@@ -2696,6 +2716,7 @@ class OutputTransform(object):
         pass
 
     def transform_first_chunk(self, status_code, headers, chunk, finishing):
+        # type: (int, httputil.HTTPHeaders, bytes, bool) -> typing.Tuple[int, httputil.HTTPHeaders, bytes]
         return status_code, headers, chunk
 
     def transform_chunk(self, chunk, finishing):
@@ -2736,10 +2757,12 @@ class GZipContentEncoding(OutputTransform):
         return ctype.startswith('text/') or ctype in self.CONTENT_TYPES
 
     def transform_first_chunk(self, status_code, headers, chunk, finishing):
+        # type: (int, httputil.HTTPHeaders, bytes, bool) -> typing.Tuple[int, httputil.HTTPHeaders, bytes]
+        # TODO: can/should this type be inherited from the superclass?
         if 'Vary' in headers:
-            headers['Vary'] += b', Accept-Encoding'
+            headers['Vary'] += ', Accept-Encoding'
         else:
-            headers['Vary'] = b'Accept-Encoding'
+            headers['Vary'] = 'Accept-Encoding'
         if self._gzipping:
             ctype = _unicode(headers.get("Content-Type", "")).split(";")[0]
             self._gzipping = self._compressible_type(ctype) and \