From: Mikhail Burshteyn
Date: Sat, 4 May 2019 08:16:18 +0000 (+0300)
Subject: Speedup headers parsing
X-Git-Tag: v6.1.0b1~76^2
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=refs%2Fpull%2F2646%2Fhead;p=thirdparty%2Ftornado.git

Speedup headers parsing

Replace `_CRLF_RE.split(headers)` with a simple `headers.split('\n')`
with an additional check for `'\r'` in each line.

Add benchmark to measure performance impact of the change.
The benchmark results are as follows:
* split only: ~3x faster on CPython, 14-18x faster on PyPy
* full headers parse: ~1.3x faster on CPython, ~3-4.5x faster on PyPy
---

diff --git a/maint/benchmark/parsing_benchmark.py b/maint/benchmark/parsing_benchmark.py
new file mode 100644
index 000000000..d0bfcc895
--- /dev/null
+++ b/maint/benchmark/parsing_benchmark.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+import re
+import timeit
+from enum import Enum
+from typing import Callable
+
+from tornado.httputil import HTTPHeaders
+from tornado.options import define, options, parse_command_line
+
+
+define("benchmark", type=str)
+define("num_runs", type=int, default=1)
+
+
+_CRLF_RE = re.compile(r"\r?\n")
+_TEST_HEADERS = (
+    "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,"
+    "image/apng,*/*;q=0.8,application/signed-exchange;v=b3\r\n"
+    "Accept-Encoding: gzip, deflate, br\r\n"
+    "Accept-Language: ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7\r\n"
+    "Cache-Control: max-age=0\r\n"
+    "Connection: keep-alive\r\n"
+    "Host: example.com\r\n"
+    "Upgrade-Insecure-Requests: 1\r\n"
+    "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36\r\n"
+)
+
+
+def headers_split_re(headers: str) -> None:
+    for line in _CRLF_RE.split(headers):
+        pass
+
+
+def headers_split_simple(headers: str) -> None:
+    for line in headers.split("\n"):
+        if line.endswith("\r"):
+            line = line[:-1]
+
+
+def headers_parse_re(headers: str) -> HTTPHeaders:
+    h = HTTPHeaders()
+    for line in _CRLF_RE.split(headers):
+        if line:
+            h.parse_line(line)
+    return h
+
+
+def headers_parse_simple(headers: str) -> HTTPHeaders:
+    h = HTTPHeaders()
+    for line in headers.split("\n"):
+        if line.endswith("\r"):
+            line = line[:-1]
+        if line:
+            h.parse_line(line)
+    return h
+
+
+def run_headers_split():
+    regex_time = timeit.timeit(lambda: headers_split_re(_TEST_HEADERS), number=100000)
+    print("regex", regex_time)
+
+    simple_time = timeit.timeit(
+        lambda: headers_split_simple(_TEST_HEADERS), number=100000
+    )
+    print("str.split", simple_time)
+
+    print("speedup", regex_time / simple_time)
+
+
+def run_headers_full():
+    regex_time = timeit.timeit(lambda: headers_parse_re(_TEST_HEADERS), number=10000)
+    print("regex", regex_time)
+
+    simple_time = timeit.timeit(
+        lambda: headers_parse_simple(_TEST_HEADERS), number=10000
+    )
+    print("str.split", simple_time)
+
+    print("speedup", regex_time / simple_time)
+
+
+class Benchmark(Enum):
+    def __new__(cls, arg_value: str, func: Callable[[], None]):
+        member = object.__new__(cls)
+        member._value_ = arg_value
+        member.func = func
+        return member
+
+    HEADERS_SPLIT = ("headers-split", run_headers_split)
+    HEADERS_FULL = ("headers-full", run_headers_full)
+
+
+def main():
+    parse_command_line()
+
+    try:
+        func = Benchmark(options.benchmark).func
+    except ValueError:
+        known_benchmarks = [benchmark.value for benchmark in Benchmark]
+        print(
+            "Unknown benchmark: '{}', supported values are: {}"
+            .format(options.benchmark, ", ".join(known_benchmarks))
+        )
+        return
+
+    for _ in range(options.num_runs):
+        func()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tornado/httputil.py b/tornado/httputil.py
index 835327d0a..15f5675e2 100644
--- a/tornado/httputil.py
+++ b/tornado/httputil.py
@@ -62,11 +62,6 @@ if typing.TYPE_CHECKING:
     import unittest  # noqa: F401
 
 
-# RFC 7230 section 3.5: a recipient MAY recognize a single LF as a line
-# terminator and ignore any preceding CR.
-_CRLF_RE = re.compile(r"\r?\n")
-
-
 class _NormalizedHeaderCache(dict):
     """Dynamic cached mapping of header names to Http-Header-Case.
 
@@ -223,7 +218,11 @@ class HTTPHeaders(collections.abc.MutableMapping):
 
         """
         h = cls()
-        for line in _CRLF_RE.split(headers):
+        # RFC 7230 section 3.5: a recipient MAY recognize a single LF as a line
+        # terminator and ignore any preceding CR.
+        for line in headers.split("\n"):
+            if line.endswith("\r"):
+                line = line[:-1]
             if line:
                 h.parse_line(line)
         return h