git.ipfire.org Git - thirdparty/tornado.git/commitdiff
Speedup headers parsing 2646/head
author Mikhail Burshteyn <mdburshteyn@gmail.com>
Sat, 4 May 2019 08:16:18 +0000 (11:16 +0300)
committer Mikhail Burshteyn <mdburshteyn@gmail.com>
Sat, 4 May 2019 08:16:18 +0000 (11:16 +0300)
Replace `_CRLF_RE.split(headers)` with a plain `headers.split('\n')`, followed by a check that strips a trailing `'\r'` from each line.
Add a benchmark to measure the performance impact of the change.
The benchmark results are as follows:
* split only: ~3x faster on CPython, 14-18x faster on PyPy
* full headers parse: ~1.3x faster on CPython, ~3-4.5x faster on PyPy

maint/benchmark/parsing_benchmark.py [new file with mode: 0644]
tornado/httputil.py

diff --git a/maint/benchmark/parsing_benchmark.py b/maint/benchmark/parsing_benchmark.py
new file mode 100644 (file)
index 0000000..d0bfcc8
--- /dev/null
@@ -0,0 +1,112 @@
+#!/usr/bin/env python\r
+import re\r
+import timeit\r
+from enum import Enum\r
+from typing import Callable\r
+\r
+from tornado.httputil import HTTPHeaders\r
+from tornado.options import define, options, parse_command_line\r
+\r
+\r
+define("benchmark", type=str)\r
+define("num_runs", type=int, default=1)\r
+\r
+\r
+_CRLF_RE = re.compile(r"\r?\n")\r
+_TEST_HEADERS = (\r
+    "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,"\r
+    "image/apng,*/*;q=0.8,application/signed-exchange;v=b3\r\n"\r
+    "Accept-Encoding: gzip, deflate, br\r\n"\r
+    "Accept-Language: ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7\r\n"\r
+    "Cache-Control: max-age=0\r\n"\r
+    "Connection: keep-alive\r\n"\r
+    "Host: example.com\r\n"\r
+    "Upgrade-Insecure-Requests: 1\r\n"\r
+    "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "\r
+    "(KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36\r\n"\r
+)\r
+\r
+\r
+def headers_split_re(headers: str) -> None:\r
+    for line in _CRLF_RE.split(headers):\r
+        pass\r
+\r
+\r
+def headers_split_simple(headers: str) -> None:\r
+    for line in headers.split("\n"):\r
+        if line.endswith("\r"):\r
+            line = line[:-1]\r
+\r
+\r
+def headers_parse_re(headers: str) -> HTTPHeaders:\r
+    h = HTTPHeaders()\r
+    for line in _CRLF_RE.split(headers):\r
+        if line:\r
+            h.parse_line(line)\r
+    return h\r
+\r
+\r
+def headers_parse_simple(headers: str) -> HTTPHeaders:\r
+    h = HTTPHeaders()\r
+    for line in headers.split("\n"):\r
+        if line.endswith("\r"):\r
+            line = line[:-1]\r
+        if line:\r
+            h.parse_line(line)\r
+    return h\r
+\r
+\r
+def run_headers_split():\r
+    regex_time = timeit.timeit(lambda: headers_split_re(_TEST_HEADERS), number=100000)\r
+    print("regex", regex_time)\r
+\r
+    simple_time = timeit.timeit(\r
+        lambda: headers_split_simple(_TEST_HEADERS), number=100000\r
+    )\r
+    print("str.split", simple_time)\r
+\r
+    print("speedup", regex_time / simple_time)\r
+\r
+\r
+def run_headers_full():\r
+    regex_time = timeit.timeit(lambda: headers_parse_re(_TEST_HEADERS), number=10000)\r
+    print("regex", regex_time)\r
+\r
+    simple_time = timeit.timeit(\r
+        lambda: headers_parse_simple(_TEST_HEADERS), number=10000\r
+    )\r
+    print("str.split", simple_time)\r
+\r
+    print("speedup", regex_time / simple_time)\r
+\r
+\r
+class Benchmark(Enum):\r
+    def __new__(cls, arg_value: str, func: Callable[[], None]):\r
+        member = object.__new__(cls)\r
+        member._value_ = arg_value\r
+        member.func = func\r
+        return member\r
+\r
+    HEADERS_SPLIT = ("headers-split", run_headers_split)\r
+    HEADERS_FULL = ("headers-full", run_headers_full)\r
+\r
+\r
+def main():\r
+    parse_command_line()\r
+\r
+    try:\r
+        func = Benchmark(options.benchmark).func\r
+    except ValueError:\r
+        known_benchmarks = [benchmark.value for benchmark in Benchmark]\r
+        print(\r
+            "Unknown benchmark: '{}', supported values are: {}"\r
+            .format(options.benchmark, ", ".join(known_benchmarks))\r
+        )\r
+        return\r
+\r
+    for _ in range(options.num_runs):\r
+        func()\r
+\r
+\r
+if __name__ == '__main__':\r
+    main()\r
diff --git a/tornado/httputil.py b/tornado/httputil.py
index 835327d0a1395966ad55e7355189eace7b068b26..15f5675e2920bea26ad9637ad42d365eb1f36505 100644 (file)
@@ -62,11 +62,6 @@ if typing.TYPE_CHECKING:
     import unittest  # noqa: F401
 
 
-# RFC 7230 section 3.5: a recipient MAY recognize a single LF as a line
-# terminator and ignore any preceding CR.
-_CRLF_RE = re.compile(r"\r?\n")
-
-
 class _NormalizedHeaderCache(dict):
     """Dynamic cached mapping of header names to Http-Header-Case.
 
@@ -223,7 +218,11 @@ class HTTPHeaders(collections.abc.MutableMapping):
 
         """
         h = cls()
-        for line in _CRLF_RE.split(headers):
+        # RFC 7230 section 3.5: a recipient MAY recognize a single LF as a line
+        # terminator and ignore any preceding CR.
+        for line in headers.split("\n"):
+            if line.endswith("\r"):
+                line = line[:-1]
             if line:
                 h.parse_line(line)
         return h