git.ipfire.org Git - thirdparty/httpx.git/commitdiff
Differentiate between 'url.host' and 'url.raw_host' (#1590)
author Tom Christie <tom@tomchristie.com>
Fri, 23 Apr 2021 10:00:53 +0000 (11:00 +0100)
committer GitHub <noreply@github.com>
Fri, 23 Apr 2021 10:00:53 +0000 (11:00 +0100)
* Differentiate between 'url.host' and 'url.raw_host'

httpx/_models.py
tests/models/test_url.py

index 357baaca154bb6f23f61c22cae572c1bbc498594..dc8888882130abf9bfd684ee163cb918129dad9e 100644 (file)
@@ -8,6 +8,7 @@ from collections.abc import MutableMapping
 from http.cookiejar import Cookie, CookieJar
 from urllib.parse import parse_qsl, quote, unquote, urlencode
 
+import idna
 import rfc3986
 import rfc3986.exceptions
 
@@ -60,15 +61,16 @@ from ._utils import (
 
 class URL:
     """
-    url = httpx.URL("HTTPS://jo%40email.com:a%20secret@example.com:1234/pa%20th?search=ab#anchorlink")
+    url = httpx.URL("HTTPS://jo%40email.com:a%20secret@müller.de:1234/pa%20th?search=ab#anchorlink")
 
     assert url.scheme == "https"
     assert url.username == "jo@email.com"
     assert url.password == "a secret"
     assert url.userinfo == b"jo%40email.com:a%20secret"
-    assert url.host == "example.com"
+    assert url.host == "müller.de"
+    assert url.raw_host == b"xn--mller-kva.de"
     assert url.port == 1234
-    assert url.netloc == "example.com:1234"
+    assert url.netloc == b"xn--mller-kva.de:1234"
     assert url.path == "/pa th"
     assert url.query == b"?search=ab"
     assert url.raw_path == b"/pa%20th?search=ab"
@@ -76,17 +78,28 @@ class URL:
 
     The components of a URL are broken down like this:
 
-    https://jo%40email.com:a%20secret@example.com:1234/pa%20th?search=ab#anchorlink
-    [scheme][  username  ] [password] [  host  ][port][ path ] [ query ] [fragment]
-            [       userinfo        ] [    netloc    ][    raw_path    ]
+       https://jo%40email.com:a%20secret@müller.de:1234/pa%20th?search=ab#anchorlink
+    [scheme]   [  username  ] [password] [ host ][port][ path ] [ query ] [fragment]
+               [       userinfo        ] [   netloc   ][    raw_path    ]
 
     Note that:
 
     * `url.scheme` is normalized to always be lowercased.
 
-    * `url.host` is normalized to always be lowercased, and is IDNA encoded. For instance:
-       url = httpx.URL("http://中国.icom.museum")
-       assert url.host == "xn--fiqs8s.icom.museum"
+    * `url.host` is normalized to always be lowercased. Internationalized domain
+      names are represented in unicode, without IDNA encoding applied. For instance:
+
+      url = httpx.URL("http://中国.icom.museum")
+      assert url.host == "中国.icom.museum"
+      url = httpx.URL("http://xn--fiqs8s.icom.museum")
+      assert url.host == "中国.icom.museum"
+
+    * `url.raw_host` is normalized to always be lowercased, and is IDNA encoded.
+
+      url = httpx.URL("http://中国.icom.museum")
+      assert url.raw_host == b"xn--fiqs8s.icom.museum"
+      url = httpx.URL("http://xn--fiqs8s.icom.museum")
+      assert url.raw_host == b"xn--fiqs8s.icom.museum"
 
     * `url.userinfo` is raw bytes, without URL escaping. Usually you'll want to work with
       `url.username` and `url.password` instead, which handle the URL escaping.
@@ -150,6 +163,14 @@ class URL:
         """
         return self._uri_reference.scheme or ""
 
+    @property
+    def raw_scheme(self) -> bytes:
+        """
+        The raw bytes representation of the URL scheme, such as b"http", b"https".
+        Always normalised to lowercase.
+        """
+        return self.scheme.encode("ascii")
+
     @property
     def userinfo(self) -> bytes:
         """
@@ -181,7 +202,7 @@ class URL:
     def host(self) -> str:
         """
         The URL host as a string.
-        Always normlized to lowercase, and IDNA encoded.
+        Always normalized to lowercase, with IDNA hosts decoded into unicode.
 
         Examples:
 
@@ -189,18 +210,52 @@ class URL:
         assert url.host == "www.example.org"
 
         url = httpx.URL("http://中国.icom.museum")
-        assert url.host == "xn--fiqs8s.icom.museum"
+        assert url.host == "中国.icom.museum"
+
+        url = httpx.URL("http://xn--fiqs8s.icom.museum")
+        assert url.host == "中国.icom.museum"
 
         url = httpx.URL("https://[::ffff:192.168.0.1]")
         assert url.host == "::ffff:192.168.0.1"
         """
-        host: str = self._uri_reference.host
+        host: str = self._uri_reference.host or ""
+
+        if host and ":" in host and host[0] == "[":
+            # it's an IPv6 address
+            host = host.lstrip("[").rstrip("]")
+
+        if host.startswith("xn--"):
+            host = idna.decode(host)
+
+        return host
+
+    @property
+    def raw_host(self) -> bytes:
+        """
+        The raw bytes representation of the URL host.
+        Always normalized to lowercase, and IDNA encoded.
+
+        Examples:
+
+        url = httpx.URL("http://www.EXAMPLE.org")
+        assert url.raw_host == b"www.example.org"
+
+        url = httpx.URL("http://中国.icom.museum")
+        assert url.raw_host == b"xn--fiqs8s.icom.museum"
+
+        url = httpx.URL("http://xn--fiqs8s.icom.museum")
+        assert url.raw_host == b"xn--fiqs8s.icom.museum"
+
+        url = httpx.URL("https://[::ffff:192.168.0.1]")
+        assert url.raw_host == b"::ffff:192.168.0.1"
+        """
+        host: str = self._uri_reference.host or ""
 
         if host and ":" in host and host[0] == "[":
             # it's an IPv6 address
             host = host.lstrip("[").rstrip("]")
 
-        return host or ""
+        return host.encode("ascii")
 
     @property
     def port(self) -> typing.Optional[int]:
@@ -211,14 +266,17 @@ class URL:
         return int(port) if port else None
 
     @property
-    def netloc(self) -> str:
+    def netloc(self) -> bytes:
         """
-        Either `<host>` or `<host>:<port>` as a string.
-        Always normlized to lowercase, and IDNA encoded.
+        Either `<host>` or `<host>:<port>` as bytes.
+        Always normalized to lowercase, and IDNA encoded.
         """
         host = self._uri_reference.host or ""
         port = self._uri_reference.port
-        return host if port is None else f"{host}:{port}"
+        netloc = host.encode("ascii")
+        if port:
+            netloc = netloc + b":" + str(port).encode("ascii")
+        return netloc
 
     @property
     def path(self) -> str:
@@ -277,8 +335,8 @@ class URL:
         Provides the (scheme, host, port, target) for the outgoing request.
         """
         return (
-            self.scheme.encode("ascii"),
-            self.host.encode("ascii"),
+            self.raw_scheme,
+            self.raw_host,
             self.port,
             self.raw_path,
         )
@@ -293,7 +351,7 @@ class URL:
         # URLs with a fragment portion as not absolute.
         # What we actually care about is if the URL provides
         # a scheme and hostname to which connections should be made.
-        return bool(self.scheme and self.host)
+        return bool(self._uri_reference.scheme and self._uri_reference.host)
 
     @property
     def is_relative_url(self) -> bool:
@@ -321,7 +379,7 @@ class URL:
             "userinfo": bytes,
             "host": str,
             "port": int,
-            "netloc": str,
+            "netloc": bytes,
             "path": str,
             "query": bytes,
             "raw_path": bytes,
@@ -354,12 +412,16 @@ class URL:
                 # it's an IPv6 address, so it should be hidden under bracket
                 host = f"[{host}]"
 
-            kwargs["netloc"] = f"{host}:{port}" if port is not None else host
+            kwargs["netloc"] = (
+                f"{host}:{port}".encode("ascii")
+                if port is not None
+                else host.encode("ascii")
+            )
 
         if "userinfo" in kwargs or "netloc" in kwargs:
             # Consolidate userinfo and netloc into authority.
             userinfo = (kwargs.pop("userinfo", self.userinfo) or b"").decode("ascii")
-            netloc = kwargs.pop("netloc", self.netloc) or ""
+            netloc = (kwargs.pop("netloc", self.netloc) or b"").decode("ascii")
             authority = f"{userinfo}@{netloc}" if userinfo else netloc
             kwargs["authority"] = authority
 
@@ -848,11 +910,10 @@ class Request:
         )
 
         if not has_host and self.url.host:
-            default_port = {"http": 80, "https": 443}.get(self.url.scheme)
-            if self.url.port is None or self.url.port == default_port:
-                host_header = self.url.host.encode("ascii")
-            else:
-                host_header = self.url.netloc.encode("ascii")
+            default_port = {"http": b":80", "https": b":443"}.get(self.url.scheme, b"")
+            host_header = self.url.netloc
+            if host_header.endswith(default_port):
+                host_header = host_header[: -len(default_port)]
             auto_headers.append((b"Host", host_header))
         if not has_content_length and self.method in ("POST", "PUT", "PATCH"):
             auto_headers.append((b"Content-Length", b"0"))
index 2d14afd71c24f3d892371b447d79d07305008969..393503107221bf3bfa186725bd7ec4bfe258deef 100644 (file)
@@ -4,41 +4,53 @@ import httpx
 
 
 @pytest.mark.parametrize(
-    "given,idna,host,scheme,port",
+    "given,idna,host,raw_host,scheme,port",
     [
         (
             "http://中国.icom.museum:80/",
             "http://xn--fiqs8s.icom.museum:80/",
-            "xn--fiqs8s.icom.museum",
+            "中国.icom.museum",
+            b"xn--fiqs8s.icom.museum",
             "http",
             80,
         ),
         (
             "http://Königsgäßchen.de",
             "http://xn--knigsgchen-b4a3dun.de",
-            "xn--knigsgchen-b4a3dun.de",
+            "königsgäßchen.de",
+            b"xn--knigsgchen-b4a3dun.de",
             "http",
             None,
         ),
-        ("https://faß.de", "https://xn--fa-hia.de", "xn--fa-hia.de", "https", None),
+        (
+            "https://faß.de",
+            "https://xn--fa-hia.de",
+            "faß.de",
+            b"xn--fa-hia.de",
+            "https",
+            None,
+        ),
         (
             "https://βόλος.com:443",
             "https://xn--nxasmm1c.com:443",
-            "xn--nxasmm1c.com",
+            "βόλος.com",
+            b"xn--nxasmm1c.com",
             "https",
             443,
         ),
         (
             "http://ශ්‍රී.com:444",
             "http://xn--10cl1a0b660p.com:444",
-            "xn--10cl1a0b660p.com",
+            "ශ්‍රී.com",
+            b"xn--10cl1a0b660p.com",
             "http",
             444,
         ),
         (
             "https://نامه‌ای.com:4433",
             "https://xn--mgba3gch31f060k.com:4433",
-            "xn--mgba3gch31f060k.com",
+            "نامه‌ای.com",
+            b"xn--mgba3gch31f060k.com",
             "https",
             4433,
         ),
@@ -52,10 +64,11 @@ import httpx
         "https_with_custom_port",
     ],
 )
-def test_idna_url(given, idna, host, scheme, port):
+def test_idna_url(given, idna, host, raw_host, scheme, port):
     url = httpx.URL(given)
     assert url == httpx.URL(idna)
     assert url.host == host
+    assert url.raw_host == raw_host
     assert url.scheme == scheme
     assert url.port == port
 
@@ -197,7 +210,7 @@ def test_url_copywith_authority_subcomponents():
 
 def test_url_copywith_netloc():
     copy_with_kwargs = {
-        "netloc": "example.net:444",
+        "netloc": b"example.net:444",
     }
     url = httpx.URL("https://example.org")
     new = url.copy_with(**copy_with_kwargs)
@@ -301,7 +314,7 @@ def test_ipv6_url():
     url = httpx.URL("http://[::ffff:192.168.0.1]:5678/")
 
     assert url.host == "::ffff:192.168.0.1"
-    assert url.netloc == "[::ffff:192.168.0.1]:5678"
+    assert url.netloc == b"[::ffff:192.168.0.1]:5678"
 
 
 @pytest.mark.parametrize(
@@ -317,7 +330,7 @@ def test_ipv6_url_copy_with_host(url_str, new_host):
     url = httpx.URL(url_str).copy_with(host=new_host)
 
     assert url.host == "::ffff:192.168.0.1"
-    assert url.netloc == "[::ffff:192.168.0.1]:1234"
+    assert url.netloc == b"[::ffff:192.168.0.1]:1234"
     assert str(url) == "http://[::ffff:192.168.0.1]:1234"
 
 
@@ -327,5 +340,5 @@ def test_ipv6_url_from_raw_url(host):
     url = httpx.URL(raw_url)
 
     assert url.host == "::ffff:192.168.0.1"
-    assert url.netloc == "[::ffff:192.168.0.1]:443"
+    assert url.netloc == b"[::ffff:192.168.0.1]:443"
     assert str(url) == "https://[::ffff:192.168.0.1]:443/"