Fix header parsing with unicode newline-like characters.

author Ben Darnell <ben@bendarnell.com>

Sun, 11 Jan 2015 17:40:51 +0000 (12:40 -0500)

committer Ben Darnell <ben@bendarnell.com>

Sun, 11 Jan 2015 17:40:51 +0000 (12:40 -0500)
author Ben Darnell <ben@bendarnell.com>
Sun, 11 Jan 2015 17:40:51 +0000 (12:40 -0500)
committer Ben Darnell <ben@bendarnell.com>
Sun, 11 Jan 2015 17:40:51 +0000 (12:40 -0500)
diff --git a/tornado/httputil.py b/tornado/httputil.py

index 88389fedfe581d68b9e55a8e0ce0d3306d47662c..dfee4f6508dea625606eba1217752a2cee6f7db1 100644 (file)
--- a/tornado/httputil.py
+++ b/tornado/httputil.py
@@ -62,6 +62,11 @@ except ImportError:
          pass
  
  
+# RFC 7230 section 3.5: a recipient MAY recognize a single LF as a line
+# terminator and ignore any preceding CR.
+_CRLF_RE = re.compile(r'\r?\n')
+
+
  class _NormalizedHeaderCache(dict):
      """Dynamic cached mapping of header names to Http-Header-Case.
  
@@ -193,7 +198,7 @@ class HTTPHeaders(dict):
          [('Content-Length', '42'), ('Content-Type', 'text/html')]
          """
          h = cls()
-        for line in headers.splitlines():
+        for line in _CRLF_RE.split(headers):
              if line:
                  h.parse_line(line)
          return h
diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py

index adbd17114a138f31a9c29408bd6ad822bc0a7566..53bb4e7985b0ec71c8bd21ec80314190d49e1c4b 100644 (file)
--- a/tornado/test/httputil_test.py
+++ b/tornado/test/httputil_test.py
@@ -3,7 +3,7 @@
  
  from __future__ import absolute_import, division, print_function, with_statement
  from tornado.httputil import url_concat, parse_multipart_form_data, HTTPHeaders, format_timestamp, HTTPServerRequest, parse_request_start_line
-from tornado.escape import utf8
+from tornado.escape import utf8, native_str
  from tornado.log import gen_log
  from tornado.testing import ExpectLog
  from tornado.test.util import unittest
@@ -228,6 +228,57 @@ Foo: even
                            ("Foo", "bar baz"),
                            ("Foo", "even more lines")])
  
+    def test_unicode_newlines(self):
+        # Ensure that only \r\n (or a bare \n) is recognized as a header
+        # separator, and not the other newline-like unicode characters.
+        # Characters that are likely to be problematic can be found in
+        # http://unicode.org/standard/reports/tr13/tr13-5.html
+        # and cpython's unicodeobject.c (which defines the implementation
+        # of unicode_type.splitlines(), and uses a different list than TR13).
+        newlines = [
+            u'\u001b', # ESCAPE (U+001B; VERTICAL TAB is U+000B)
+            u'\u001c', # FILE SEPARATOR
+            u'\u001d', # GROUP SEPARATOR
+            u'\u001e', # RECORD SEPARATOR
+            u'\u0085', # NEXT LINE
+            u'\u2028', # LINE SEPARATOR
+            u'\u2029', # PARAGRAPH SEPARATOR
+            ]
+        for newline in newlines:
+            # Try the utf8 and latin1 representations of each newline
+            for encoding in ['utf8', 'latin1']:
+                try:
+                    try:
+                        encoded = newline.encode(encoding)
+                    except UnicodeEncodeError:
+                        # Some chars cannot be represented in latin1
+                        continue
+                    data = b'Cookie: foo=' + encoded + b'bar'
+                    # parse() wants a native_str, so decode through latin1
+                    # in the same way the real parser does.
+                    headers = HTTPHeaders.parse(
+                        native_str(data.decode('latin1')))
+                    expected = [('Cookie', 'foo=' +
+                                 native_str(encoded.decode('latin1')) + 'bar')]
+                    self.assertEqual(
+                        expected, list(headers.get_all()))
+                except Exception:
+                    gen_log.warning("failed while trying %r in %s",
+                                    newline, encoding)
+                    raise
+
+    def test_optional_cr(self):
+        # Both CRLF and LF should be accepted as separators. CR should not be
+        # part of the data when followed by LF, but it is a normal char
+        # otherwise (or should bare CR be an error?)
+        headers = HTTPHeaders.parse(
+            'CRLF: crlf\r\nLF: lf\nCR: cr\rMore: more\r\n')
+        self.assertEqual(sorted(headers.get_all()),
+                         [('Cr', 'cr\rMore: more'),
+                          ('Crlf', 'crlf'),
+                          ('Lf', 'lf'),
+                         ])
+
  
  class FormatTimestampTest(unittest.TestCase):
      # Make sure that all the input types are supported.
author	Ben Darnell <ben@bendarnell.com>
	Sun, 11 Jan 2015 17:40:51 +0000 (12:40 -0500)
committer	Ben Darnell <ben@bendarnell.com>
	Sun, 11 Jan 2015 17:40:51 +0000 (12:40 -0500)
tornado/httputil.py		patch \| blob \| blame \| history
tornado/test/httputil_test.py		patch \| blob \| blame \| history