From: Mikhail Burshteyn <mdburshteyn@gmail.com>
Date: Sat, 4 May 2019 08:16:18 +0000 (+0300)
Subject: Speedup headers parsing
X-Git-Tag: v6.1.0b1~76^2
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=refs%2Fpull%2F2646%2Fhead;p=thirdparty%2Ftornado.git

Speedup headers parsing

Replace `_CRLF_RE.split(headers)` with a plain `headers.split('\n')`, then strip a single trailing `'\r'` from each resulting line.
Add a benchmark to measure the performance impact of the change.
The benchmark results are as follows:
* split only: ~3x faster on CPython, 14-18x faster on PyPy
* full headers parse: ~1.3x faster on CPython, ~3-4.5x faster on PyPy
---

diff --git a/maint/benchmark/parsing_benchmark.py b/maint/benchmark/parsing_benchmark.py
new file mode 100644
index 000000000..d0bfcc895
--- /dev/null
+++ b/maint/benchmark/parsing_benchmark.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+import re
+import timeit
+from enum import Enum
+from typing import Callable
+
+from tornado.httputil import HTTPHeaders
+from tornado.options import define, options, parse_command_line
+
+
+define("benchmark", type=str)
+define("num_runs", type=int, default=1)
+
+
+_CRLF_RE = re.compile(r"\r?\n")
+_TEST_HEADERS = (
+    "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,"
+    "image/apng,*/*;q=0.8,application/signed-exchange;v=b3\r\n"
+    "Accept-Encoding: gzip, deflate, br\r\n"
+    "Accept-Language: ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7\r\n"
+    "Cache-Control: max-age=0\r\n"
+    "Connection: keep-alive\r\n"
+    "Host: example.com\r\n"
+    "Upgrade-Insecure-Requests: 1\r\n"
+    "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36\r\n"
+)
+
+
+def headers_split_re(headers: str) -> None:
+    for line in _CRLF_RE.split(headers):
+        pass
+
+
+def headers_split_simple(headers: str) -> None:
+    for line in headers.split("\n"):
+        if line.endswith("\r"):
+            line = line[:-1]
+
+
+def headers_parse_re(headers: str) -> HTTPHeaders:
+    h = HTTPHeaders()
+    for line in _CRLF_RE.split(headers):
+        if line:
+            h.parse_line(line)
+    return h
+
+
+def headers_parse_simple(headers: str) -> HTTPHeaders:
+    h = HTTPHeaders()
+    for line in headers.split("\n"):
+        if line.endswith("\r"):
+            line = line[:-1]
+        if line:
+            h.parse_line(line)
+    return h
+
+
+def run_headers_split():
+    regex_time = timeit.timeit(lambda: headers_split_re(_TEST_HEADERS), number=100000)
+    print("regex", regex_time)
+
+    simple_time = timeit.timeit(
+        lambda: headers_split_simple(_TEST_HEADERS), number=100000
+    )
+    print("str.split", simple_time)
+
+    print("speedup", regex_time / simple_time)
+
+
+def run_headers_full():
+    regex_time = timeit.timeit(lambda: headers_parse_re(_TEST_HEADERS), number=10000)
+    print("regex", regex_time)
+
+    simple_time = timeit.timeit(
+        lambda: headers_parse_simple(_TEST_HEADERS), number=10000
+    )
+    print("str.split", simple_time)
+
+    print("speedup", regex_time / simple_time)
+
+
+class Benchmark(Enum):
+    def __new__(cls, arg_value: str, func: Callable[[], None]):
+        member = object.__new__(cls)
+        member._value_ = arg_value
+        member.func = func
+        return member
+
+    HEADERS_SPLIT = ("headers-split", run_headers_split)
+    HEADERS_FULL = ("headers-full", run_headers_full)
+
+
+def main():
+    parse_command_line()
+
+    try:
+        func = Benchmark(options.benchmark).func
+    except ValueError:
+        known_benchmarks = [benchmark.value for benchmark in Benchmark]
+        print(
+            "Unknown benchmark: '{}', supported values are: {}"
+            .format(options.benchmark, ", ".join(known_benchmarks))
+        )
+        return
+
+    for _ in range(options.num_runs):
+        func()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tornado/httputil.py b/tornado/httputil.py
index 835327d0a..15f5675e2 100644
--- a/tornado/httputil.py
+++ b/tornado/httputil.py
@@ -62,11 +62,6 @@ if typing.TYPE_CHECKING:
     import unittest  # noqa: F401
 
 
-# RFC 7230 section 3.5: a recipient MAY recognize a single LF as a line
-# terminator and ignore any preceding CR.
-_CRLF_RE = re.compile(r"\r?\n")
-
-
 class _NormalizedHeaderCache(dict):
     """Dynamic cached mapping of header names to Http-Header-Case.
 
@@ -223,7 +218,11 @@ class HTTPHeaders(collections.abc.MutableMapping):
 
         """
         h = cls()
-        for line in _CRLF_RE.split(headers):
+        # RFC 7230 section 3.5: a recipient MAY recognize a single LF as a line
+        # terminator and ignore any preceding CR.
+        for line in headers.split("\n"):
+            if line.endswith("\r"):
+                line = line[:-1]
             if line:
                 h.parse_line(line)
         return h