From: Ben Darnell <ben@bendarnell.com>
Date: Sun, 11 Jan 2015 17:40:51 +0000 (-0500)
Subject: Fix header parsing with unicode newline-like characters.
X-Git-Tag: v4.1.0b1~40
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=88cfbc2b3446b29f198f64c7b84d70394f5c711e;p=thirdparty%2Ftornado.git

Fix header parsing with unicode newline-like characters.

We were using Python's unicode.splitlines(), which recognizes
more line-boundary characters than just CR and LF and could cause
problems if those characters appeared in header values.

Fixes #1291.
---

diff --git a/tornado/httputil.py b/tornado/httputil.py
index 88389fedf..dfee4f650 100644
--- a/tornado/httputil.py
+++ b/tornado/httputil.py
@@ -62,6 +62,11 @@ except ImportError:
         pass
 
 
+# RFC 7230 section 3.5: a recipient MAY recognize a single LF as a line
+# terminator and ignore any preceding CR.
+_CRLF_RE = re.compile(r'\r?\n')
+
+
 class _NormalizedHeaderCache(dict):
     """Dynamic cached mapping of header names to Http-Header-Case.
 
@@ -193,7 +198,7 @@ class HTTPHeaders(dict):
         [('Content-Length', '42'), ('Content-Type', 'text/html')]
         """
         h = cls()
-        for line in headers.splitlines():
+        for line in _CRLF_RE.split(headers):
             if line:
                 h.parse_line(line)
         return h
diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py
index adbd17114..53bb4e798 100644
--- a/tornado/test/httputil_test.py
+++ b/tornado/test/httputil_test.py
@@ -3,7 +3,7 @@
 
 from __future__ import absolute_import, division, print_function, with_statement
 from tornado.httputil import url_concat, parse_multipart_form_data, HTTPHeaders, format_timestamp, HTTPServerRequest, parse_request_start_line
-from tornado.escape import utf8
+from tornado.escape import utf8, native_str
 from tornado.log import gen_log
 from tornado.testing import ExpectLog
 from tornado.test.util import unittest
@@ -228,6 +228,57 @@ Foo: even
                           ("Foo", "bar baz"),
                           ("Foo", "even more lines")])
 
+    def test_unicode_newlines(self):
+        # Only CRLF (or a bare LF) may end a header line; the other
+        # newline-like unicode characters must stay inside the header value.
+        # The candidates are the extra line boundaries that cpython's
+        # unicode.splitlines() honors (see unicodeobject.c), which is a
+        # different list than the one in
+        # http://unicode.org/standard/reports/tr13/tr13-5.html
+        newlines = [
+            u'\u000b',  # VERTICAL TAB (u'\u001b' would be ESCAPE)
+            u'\u000c',  # FORM FEED
+            u'\u001c',  # FILE SEPARATOR
+            u'\u001d',  # GROUP SEPARATOR
+            u'\u001e',  # RECORD SEPARATOR
+            u'\u0085',  # NEXT LINE
+            u'\u2028',  # LINE SEPARATOR
+            u'\u2029',  # PARAGRAPH SEPARATOR
+            ]
+        for newline in newlines:
+            # Try the utf8 and latin1 representations of each newline
+            for encoding in ['utf8', 'latin1']:
+                try:
+                    try:
+                        encoded = newline.encode(encoding)
+                    except UnicodeEncodeError:
+                        # Some chars cannot be represented in latin1
+                        continue
+                    data = b'Cookie: foo=' + encoded + b'bar'
+                    # parse() wants a native_str, so decode through latin1
+                    # in the same way the real parser does.
+                    headers = HTTPHeaders.parse(
+                        native_str(data.decode('latin1')))
+                    expected = [('Cookie', 'foo=' +
+                                 native_str(encoded.decode('latin1')) + 'bar')]
+                    self.assertEqual(expected, list(headers.get_all()))
+                except Exception:
+                    gen_log.warning("failed while trying %r in %s",
+                                    newline, encoding)
+                    raise
+
+    def test_optional_cr(self):
+        # A header line ends at CRLF or at a bare LF, and a CR directly in
+        # front of an LF is not part of the data. A CR without a following LF
+        # is a normal char in the value (or should bare CR be an error?)
+        raw = 'CRLF: crlf\r\nLF: lf\nCR: cr\rMore: more\r\n'
+        parsed = HTTPHeaders.parse(raw)
+        expected = [('Cr', 'cr\rMore: more'),
+                    ('Crlf', 'crlf'),
+                    ('Lf', 'lf'),
+                    ]
+        self.assertEqual(sorted(parsed.get_all()), expected)
+
 
 class FormatTimestampTest(unittest.TestCase):
     # Make sure that all the input types are supported.