From: Ben Darnell Date: Thu, 27 Mar 2025 20:22:33 +0000 (-0400) Subject: httputil: Centralize regexes based directly on RFCs X-Git-Tag: v6.5.0b1~16^2~2 X-Git-Url: http://git.ipfire.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=1cfc0d127a287cd732ff9f5419450a7ce1163454;p=thirdparty%2Ftornado.git httputil: Centralize regexes based directly on RFCs This will make it easier to stay in strict conformance with the RFCs. Note that this commit makes a few small semantic changes to response start-line parsing: status codes must be exactly three digits, and control characters are not allowed in reason phrases. --- diff --git a/tornado/httputil.py b/tornado/httputil.py index 536b1455..f0c4ff10 100644 --- a/tornado/httputil.py +++ b/tornado/httputil.py @@ -71,7 +71,32 @@ else: # To be used with str.strip() and related methods. HTTP_WHITESPACE = " \t" -HTTP_TOKEN_RE = re.compile(r"^[!#$%&'*+\-.^_`|~0-9A-Za-z]+$") + +class _ABNF: + """Class that holds a subset of ABNF rules from RFC 9110 and friends. + + Class attributes are re.Pattern objects, with the same name as in the RFC + (with hyphens changed to underscores). Currently contains only the subset + we use (which is why this class is not public). Unfortunately the fields + cannot be alphabetized as they are in the RFCs because of dependencies. + """ + + # RFC 5234 (ABNF) + VCHAR = re.compile(r"[\x21-\x7E]") + + # RFC 9110 (HTTP Semantics) + obs_text = re.compile(r"[\x80-\xFF]") + tchar = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]") + token = re.compile(rf"{tchar.pattern}+") + field_name = token + + # RFC 9112 (HTTP/1.1) + HTTP_version = re.compile(r"HTTP/[0-9]\.[0-9]") + reason_phrase = re.compile(rf"(?:[\t ]|{VCHAR.pattern}|{obs_text.pattern})+") + status_code = re.compile(r"[0-9]{3}") + status_line = re.compile( + rf"({HTTP_version.pattern}) ({status_code.pattern}) ({reason_phrase.pattern})?" + ) @lru_cache(1000) @@ -145,7 +170,7 @@ class HTTPHeaders(StrMutableMapping): def add(self, name: str, value: str) -> None: """Adds a new value for the given key.""" - if not HTTP_TOKEN_RE.match(name): + if not _ABNF.token.fullmatch(name): raise HTTPInputError("Invalid header name %r" % name) norm_name = _normalize_header(name) self._last_key = norm_name @@ -892,9 +917,6 @@ class RequestStartLine(typing.NamedTuple): version: str -_http_version_re = re.compile(r"^HTTP/1\.[0-9]$") - - def parse_request_start_line(line: str) -> RequestStartLine: """Returns a (method, path, version) tuple for an HTTP 1.x request line. @@ -909,10 +931,15 @@ def parse_request_start_line(line: str) -> RequestStartLine: # https://tools.ietf.org/html/rfc7230#section-3.1.1 # invalid request-line SHOULD respond with a 400 (Bad Request) raise HTTPInputError("Malformed HTTP request line") - if not _http_version_re.match(version): + if not _ABNF.HTTP_version.fullmatch(version): raise HTTPInputError( "Malformed HTTP version in HTTP Request-Line: %r" % version ) + if not version.startswith("HTTP/1"): + # HTTP/2 and above doesn't use parse_request_start_line. + # This could be folded into the regex but we don't want to deviate + # from the ABNF in the RFCs. + raise HTTPInputError("Unexpected HTTP version %r" % version) return RequestStartLine(method, path, version) @@ -922,9 +949,6 @@ class ResponseStartLine(typing.NamedTuple): reason: str -_http_response_line_re = re.compile(r"(HTTP/1.[0-9]) ([0-9]+) ([^\r]*)") - - def parse_response_start_line(line: str) -> ResponseStartLine: """Returns a (version, code, reason) tuple for an HTTP 1.x response line. @@ -933,11 +957,14 @@ def parse_response_start_line(line: str) -> ResponseStartLine: >>> parse_response_start_line("HTTP/1.1 200 OK") ResponseStartLine(version='HTTP/1.1', code=200, reason='OK') """ - line = native_str(line) - match = _http_response_line_re.match(line) + match = _ABNF.status_line.fullmatch(line) if not match: raise HTTPInputError("Error parsing response start line") - return ResponseStartLine(match.group(1), int(match.group(2)), match.group(3)) + r = ResponseStartLine(match.group(1), int(match.group(2)), match.group(3)) + if not r.version.startswith("HTTP/1"): + # HTTP/2 and above doesn't use parse_response_start_line. + raise HTTPInputError("Unexpected HTTP version %r" % r.version) + return r # _parseparam and _parse_header are copied and modified from python2.7's cgi.py