From: Ben Darnell Date: Sun, 11 Jan 2015 17:40:51 +0000 (-0500) Subject: Fix header parsing with unicode newline-like characters. X-Git-Tag: v4.1.0b1~40 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=88cfbc2b3446b29f198f64c7b84d70394f5c711e;p=thirdparty%2Ftornado.git Fix header parsing with unicode newline-like characters. We were using python's unicode.splitlines(), which recognizes more characters than just CRLF and could cause problems if those characters appeared in header values. Fixes #1291. --- diff --git a/tornado/httputil.py b/tornado/httputil.py index 88389fedf..dfee4f650 100644 --- a/tornado/httputil.py +++ b/tornado/httputil.py @@ -62,6 +62,11 @@ except ImportError: pass +# RFC 7230 section 3.5: a recipient MAY recognize a single LF as a line +# terminator and ignore any preceding CR. +_CRLF_RE = re.compile(r'\r?\n') + + class _NormalizedHeaderCache(dict): """Dynamic cached mapping of header names to Http-Header-Case. @@ -193,7 +198,7 @@ class HTTPHeaders(dict): [('Content-Length', '42'), ('Content-Type', 'text/html')] """ h = cls() - for line in headers.splitlines(): + for line in _CRLF_RE.split(headers): if line: h.parse_line(line) return h diff --git a/tornado/test/httputil_test.py b/tornado/test/httputil_test.py index adbd17114..53bb4e798 100644 --- a/tornado/test/httputil_test.py +++ b/tornado/test/httputil_test.py @@ -3,7 +3,7 @@ from __future__ import absolute_import, division, print_function, with_statement from tornado.httputil import url_concat, parse_multipart_form_data, HTTPHeaders, format_timestamp, HTTPServerRequest, parse_request_start_line -from tornado.escape import utf8 +from tornado.escape import utf8, native_str from tornado.log import gen_log from tornado.testing import ExpectLog from tornado.test.util import unittest @@ -228,6 +228,57 @@ Foo: even ("Foo", "bar baz"), ("Foo", "even more lines")]) + def test_unicode_newlines(self): + # Ensure that only \r\n is recognized as a header separator, and not + # 
the other newline-like unicode characters. + # Characters that are likely to be problematic can be found in + # http://unicode.org/standard/reports/tr13/tr13-5.html + # and cpython's unicodeobject.c (which defines the implementation + # of unicode_type.splitlines(), and uses a different list than TR13). + newlines = [ + u'\u001b', # ESCAPE (note: VERTICAL TAB is U+000B, not U+001B) + u'\u001c', # FILE SEPARATOR + u'\u001d', # GROUP SEPARATOR + u'\u001e', # RECORD SEPARATOR + u'\u0085', # NEXT LINE + u'\u2028', # LINE SEPARATOR + u'\u2029', # PARAGRAPH SEPARATOR + ] + for newline in newlines: + # Try the utf8 and latin1 representations of each newline + for encoding in ['utf8', 'latin1']: + try: + try: + encoded = newline.encode(encoding) + except UnicodeEncodeError: + # Some chars cannot be represented in latin1 + continue + data = b'Cookie: foo=' + encoded + b'bar' + # parse() wants a native_str, so decode through latin1 + # in the same way the real parser does. + headers = HTTPHeaders.parse( + native_str(data.decode('latin1'))) + expected = [('Cookie', 'foo=' + + native_str(encoded.decode('latin1')) + 'bar')] + self.assertEqual( + expected, list(headers.get_all())) + except Exception: + gen_log.warning("failed while trying %r in %s", + newline, encoding) + raise + + def test_optional_cr(self): + # Both CRLF and LF should be accepted as separators. CR should not be + # part of the data when followed by LF, but it is a normal char + # otherwise (or should bare CR be an error?) + headers = HTTPHeaders.parse( + 'CRLF: crlf\r\nLF: lf\nCR: cr\rMore: more\r\n') + self.assertEqual(sorted(headers.get_all()), + [('Cr', 'cr\rMore: more'), + ('Crlf', 'crlf'), + ('Lf', 'lf'), + ]) + class FormatTimestampTest(unittest.TestCase): # Make sure that all the input types are supported.