]> git.ipfire.org Git - thirdparty/httpx.git/commitdiff
Finesse URL properties (#1285)
authorTom Christie <tom@tomchristie.com>
Mon, 21 Sep 2020 10:35:25 +0000 (11:35 +0100)
committerGitHub <noreply@github.com>
Mon, 21 Sep 2020 10:35:25 +0000 (11:35 +0100)
* url.userinfo should be URL encoded bytes

* Neater copy_with implementation

* Finesse API around URL properties and copy_with

* Docstring for URL, and drop url.authority

* Support url.copy_with(raw_path=...)

* Docstrings on URL methods

* Tweak docstring

httpx/_auth.py
httpx/_models.py
setup.cfg
tests/models/test_requests.py
tests/models/test_url.py

index fdbda9fa97457a39f5cfaa610822c92915791d01..c91ab7b986454720f055e6b46c914e9a6eb3b1b3 100644 (file)
@@ -217,7 +217,7 @@ class DigestAuth(Auth):
 
         A1 = b":".join((self._username, challenge.realm, self._password))
 
-        path = request.url.full_path.encode("utf-8")
+        path = request.url.raw_path
         A2 = b":".join((request.method.encode(), path))
         # TODO: implement auth-int
         HA2 = digest(A2)
index 03a08075f3d240e682e258c9ce43c676d79cd5b6..83e644c2fa14662dc85f34894c8ca6cdd1ea282b 100644 (file)
@@ -62,6 +62,46 @@ from ._utils import (
 
 
 class URL:
+    """
+    url = httpx.URL("HTTPS://jo%40email.com:a%20secret@example.com:1234/pa%20th?search=ab#anchorlink")
+
+    assert url.scheme == "https"
+    assert url.username == "jo@email.com"
+    assert url.password == "a secret"
+    assert url.userinfo == b"jo%40email.com:a%20secret"
+    assert url.host == "example.com"
+    assert url.port == 1234
+    assert url.netloc == "example.com:1234"
+    assert url.path == "/pa th"
+    assert url.query == b"search=ab"
+    assert url.raw_path == b"/pa%20th?search=ab"
+    assert url.fragment == "anchorlink"
+
+    The components of a URL are broken down like this:
+
+    https://jo%40email.com:a%20secret@example.com:1234/pa%20th?search=ab#anchorlink
+    [scheme][  username  ] [password] [  host  ][port][ path ] [ query ] [fragment]
+            [       userinfo        ] [    netloc    ][    raw_path    ]
+
+    Note that:
+
+    * `url.scheme` is normalized to always be lowercased.
+
+    * `url.host` is normalized to always be lowercased, and is IDNA encoded. For instance:
+       url = httpx.URL("http://中国.icom.museum")
+       assert url.host == "xn--fiqs8s.icom.museum"
+
+    * `url.userinfo` is raw bytes, with no URL unescaping applied. Usually you'll want to work
+      with `url.username` and `url.password` instead, which handle the URL unescaping.
+
+    * `url.raw_path` is raw bytes of both the path and query, with no URL unescaping applied.
+      This portion is used as the target when constructing HTTP requests. Usually you'll
+      want to work with `url.path` instead.
+
+    * `url.query` is raw bytes, with no URL unescaping applied. A URL query string can only
+      be properly URL unescaped when decoding the parameter names and values themselves.
+    """
+
     def __init__(
         self, url: typing.Union["URL", str, RawURL] = "", params: QueryParamTypes = None
     ) -> None:
@@ -102,63 +142,134 @@ class URL:
 
     @property
     def scheme(self) -> str:
+        """
+        The URL scheme, such as "http", "https".
+        Always normalised to lowercase.
+        """
         return self._uri_reference.scheme or ""
 
     @property
-    def authority(self) -> str:
-        port_str = self._uri_reference.port
-        default_port_str = {"https": "443", "http": "80"}.get(self.scheme, "")
-        if port_str is None or port_str == default_port_str:
-            return self._uri_reference.host or ""
-        return self._uri_reference.authority or ""
-
-    @property
-    def userinfo(self) -> str:
-        return self._uri_reference.userinfo or ""
+    def userinfo(self) -> bytes:
+        """
+        The URL userinfo as a raw bytestring.
+        For example: b"jo%40email.com:a%20secret".
+        """
+        userinfo = self._uri_reference.userinfo or ""
+        return userinfo.encode("ascii")
 
     @property
     def username(self) -> str:
-        return unquote(self.userinfo.partition(":")[0])
+        """
+        The URL username as a string, with URL decoding applied.
+        For example: "jo@email.com"
+        """
+        userinfo = self._uri_reference.userinfo or ""
+        return unquote(userinfo.partition(":")[0])
 
     @property
     def password(self) -> str:
-        return unquote(self.userinfo.partition(":")[2])
+        """
+        The URL password as a string, with URL decoding applied.
+        For example: "a secret"
+        """
+        userinfo = self._uri_reference.userinfo or ""
+        return unquote(userinfo.partition(":")[2])
 
     @property
     def host(self) -> str:
+        """
+        The URL host as a string.
+        Always normalized to lowercase, and IDNA encoded.
+
+        Examples:
+
+        url = httpx.URL("http://www.EXAMPLE.org")
+        assert url.host == "www.example.org"
+
+        url = httpx.URL("http://中国.icom.museum")
+        assert url.host == "xn--fiqs8s.icom.museum"
+        """
         return self._uri_reference.host or ""
 
     @property
     def port(self) -> typing.Optional[int]:
+        """
+        The URL port as an integer.
+        """
         port = self._uri_reference.port
         return int(port) if port else None
 
+    @property
+    def netloc(self) -> str:
+        """
+        Either `<host>` or `<host>:<port>` as a string.
+        Always normalized to lowercase, and IDNA encoded.
+        """
+        host = self._uri_reference.host or ""
+        port = self._uri_reference.port
+        return host if port is None else f"{host}:{port}"
+
     @property
     def path(self) -> str:
-        return self._uri_reference.path or "/"
+        """
+        The URL path as a string. Excluding the query string, and URL decoded.
+
+        For example:
+
+        url = httpx.URL("https://example.com/pa%20th")
+        assert url.path == "/pa th"
+        """
+        path = self._uri_reference.path or "/"
+        return unquote(path)
 
     @property
-    def query(self) -> str:
-        return self._uri_reference.query or ""
+    def query(self) -> bytes:
+        """
+        The URL query string, as raw bytes, excluding the leading b"?".
+        Note that URL decoding can only be applied on URL query strings
+        at the point of decoding the individual parameter names/values.
+        """
+        query = self._uri_reference.query or ""
+        return query.encode("ascii")
 
     @property
-    def full_path(self) -> str:
-        path = self.path
-        if self.query:
-            path += "?" + self.query
-        return path
+    def raw_path(self) -> bytes:
+        """
+        The complete URL path and query string as raw bytes.
+        Used as the target when constructing HTTP requests.
+
+        For example:
+
+        GET /users?search=some%20text HTTP/1.1
+        Host: www.example.org
+        Connection: close
+        """
+        path = self._uri_reference.path or "/"
+        if self._uri_reference.query is not None:
+            path += "?" + self._uri_reference.query
+        return path.encode("ascii")
 
     @property
     def fragment(self) -> str:
+        """
+        The URL fragment, as used in HTML anchors.
+        As a string, without the leading '#'.
+        """
         return self._uri_reference.fragment or ""
 
     @property
     def raw(self) -> RawURL:
+        """
+        The URL in the raw representation used by the low level
+        transport API. For example, see `httpcore`.
+
+        Provides the (scheme, host, port, target) for the outgoing request.
+        """
         return (
             self.scheme.encode("ascii"),
             self.host.encode("ascii"),
             self.port,
-            self.full_path.encode("ascii"),
+            self.raw_path,
         )
 
     @property
@@ -181,36 +292,93 @@ class URL:
 
     @property
     def is_relative_url(self) -> bool:
+        """
+        Return `False` for absolute URLs such as 'http://example.com/path',
+        and `True` for relative URLs such as '/path'.
+        """
         return not self.is_absolute_url
 
     def copy_with(self, **kwargs: typing.Any) -> "URL":
-        if (
-            "username" in kwargs
-            or "password" in kwargs
-            or "host" in kwargs
-            or "port" in kwargs
-        ):
-            host = kwargs.pop("host", self.host)
-            port = kwargs.pop("port", self.port)
+        """
+        Copy this URL, returning a new URL with some components altered.
+        Accepts the same set of parameters as the components that are made
+        available via properties on the `URL` class.
+
+        For example:
+
+        url = httpx.URL("https://www.example.com").copy_with(username="jo@email.com", password="a secret")
+        assert url == "https://jo%40email.com:a%20secret@www.example.com"
+        """
+        allowed = {
+            "scheme": str,
+            "username": str,
+            "password": str,
+            "userinfo": bytes,
+            "host": str,
+            "port": int,
+            "netloc": str,
+            "path": str,
+            "query": bytes,
+            "raw_path": bytes,
+            "fragment": str,
+        }
+        for key, value in kwargs.items():
+            if key not in allowed:
+                message = f"{key!r} is an invalid keyword argument for copy_with()"
+                raise TypeError(message)
+            if value is not None and not isinstance(value, allowed[key]):
+                expected = allowed[key].__name__
+                seen = type(value).__name__
+                message = f"Argument {key!r} must be {expected} but got {seen}"
+                raise TypeError(message)
+
+        # Replace username, password, userinfo, host, port, netloc with "authority" for rfc3986
+        if "username" in kwargs or "password" in kwargs:
+            # Consolidate username and password into userinfo.
             username = quote(kwargs.pop("username", self.username) or "")
             password = quote(kwargs.pop("password", self.password) or "")
+            userinfo = f"{username}:{password}" if password else username
+            kwargs["userinfo"] = userinfo.encode("ascii")
 
-            authority = host
-            if port is not None:
-                authority += f":{port}"
-            if username:
-                userpass = username
-                if password:
-                    userpass += f":{password}"
-                authority = f"{userpass}@{authority}"
+        if "host" in kwargs or "port" in kwargs:
+            # Consolidate host and port into netloc.
+            host = kwargs.pop("host", self.host) or ""
+            port = kwargs.pop("port", self.port)
+            kwargs["netloc"] = f"{host}:{port}" if port is not None else host
 
+        if "userinfo" in kwargs or "netloc" in kwargs:
+            # Consolidate userinfo and netloc into authority.
+            userinfo = (kwargs.pop("userinfo", self.userinfo) or b"").decode("ascii")
+            netloc = kwargs.pop("netloc", self.netloc) or ""
+            authority = f"{userinfo}@{netloc}" if userinfo else netloc
             kwargs["authority"] = authority
 
+        if "raw_path" in kwargs:
+            raw_path = kwargs.pop("raw_path") or b""
+            path, has_query, query = raw_path.decode("ascii").partition("?")
+            kwargs["path"] = path
+            kwargs["query"] = query if has_query else None
+
+        else:
+            # Ensure path=<url quoted str> for rfc3986
+            if kwargs.get("path") is not None:
+                kwargs["path"] = quote(kwargs["path"])
+
+            # Ensure query=<str> for rfc3986
+            if kwargs.get("query") is not None:
+                kwargs["query"] = kwargs["query"].decode("ascii")
+
         return URL(self._uri_reference.copy_with(**kwargs).unsplit())
 
     def join(self, url: URLTypes) -> "URL":
         """
         Return an absolute URL, using this URL as the base.
+
+        Eg.
+
+        url = httpx.URL("https://www.example.com/test")
+        url = url.join("/new/path")
+        assert url == "https://www.example.com/new/path"
         """
         if self.is_relative_url:
             return URL(url)
@@ -234,9 +402,10 @@ class URL:
         class_name = self.__class__.__name__
         url_str = str(self)
         if self._uri_reference.userinfo:
+            username = quote(self.username)
             url_str = (
                 rfc3986.urlparse(url_str)
-                .copy_with(userinfo=f"{self.username}:[secure]")
+                .copy_with(userinfo=f"{username}:[secure]")
                 .unsplit()
             )
         return f"{class_name}({url_str!r})"
@@ -653,9 +822,13 @@ class Request:
             "content-length" in self.headers or "transfer-encoding" in self.headers
         )
 
-        if not has_host and self.url.authority:
-            host = self.url.copy_with(username=None, password=None).authority
-            auto_headers.append((b"host", host.encode("ascii")))
+        if not has_host and self.url.host:
+            default_port = {"http": 80, "https": 443}.get(self.url.scheme)
+            if self.url.port is None or self.url.port == default_port:
+                host_header = self.url.host.encode("ascii")
+            else:
+                host_header = self.url.netloc.encode("ascii")
+            auto_headers.append((b"host", host_header))
         if not has_content_length and self.method in ("POST", "PUT", "PATCH"):
             auto_headers.append((b"content-length", b"0"))
 
index abf929021eb7e25157a0212f2ba065531d702e38..23185eedce08a773e9dafc1ad8040396ece974c6 100644 (file)
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [flake8]
 ignore = W503, E203, B305
-max-line-length = 88
+max-line-length = 120
 
 [mypy]
 disallow_untyped_defs = True
@@ -14,7 +14,7 @@ check_untyped_defs = True
 profile = black
 combine_as_imports = True
 known_first_party = httpx,tests
-known_third_party = brotli,certifi,chardet,cryptography,httpcore,pytest,rfc3986,setuptools,sniffio,trio,trustme,uvicorn
+known_third_party = brotli,certifi,cryptography,httpcore,pytest,rfc3986,setuptools,sniffio,trio,trustme,uvicorn
 
 [tool:pytest]
 addopts = --cov=httpx --cov=tests -rxXs
index 66ba887626c9000aad4f3ce103d96c5c0e8eba05..c20a11bc138fe6d47cc369bfa4bc6ff62f1a542a 100644 (file)
@@ -172,10 +172,12 @@ def test_url():
     request = httpx.Request("GET", url)
     assert request.url.scheme == "http"
     assert request.url.port is None
-    assert request.url.full_path == "/"
+    assert request.url.path == "/"
+    assert request.url.raw_path == b"/"
 
     url = "https://example.org/abc?foo=bar"
     request = httpx.Request("GET", url)
     assert request.url.scheme == "https"
     assert request.url.port is None
-    assert request.url.full_path == "/abc?foo=bar"
+    assert request.url.path == "/abc"
+    assert request.url.raw_path == b"/abc?foo=bar"
index 8d34a75a790f873469cde22d367cb0156ec3065c..fcd81ba50ca1c879d5fdb01aaade8fa292d7ea0e 100644 (file)
@@ -65,9 +65,9 @@ def test_url():
     assert url.scheme == "https"
     assert url.host == "example.org"
     assert url.port == 123
-    assert url.authority == "example.org:123"
     assert url.path == "/path/to/somewhere"
-    assert url.query == "abc=123"
+    assert url.query == b"abc=123"
+    assert url.raw_path == b"/path/to/somewhere?abc=123"
     assert url.fragment == "anchor"
     assert (
         repr(url) == "URL('https://example.org:123/path/to/somewhere?abc=123#anchor')"
@@ -175,7 +175,7 @@ def test_url_set():
     assert all(url in urls for url in url_set)
 
 
-def test_url_copywith_for_authority():
+def test_url_copywith_authority_subcomponents():
     copy_with_kwargs = {
         "username": "username",
         "password": "password",
@@ -184,12 +184,19 @@ def test_url_copywith_for_authority():
     }
     url = httpx.URL("https://example.org")
     new = url.copy_with(**copy_with_kwargs)
-    for k, v in copy_with_kwargs.items():
-        assert getattr(new, k) == v
     assert str(new) == "https://username:password@example.net:444"
 
 
-def test_url_copywith_for_userinfo():
+def test_url_copywith_netloc():
+    copy_with_kwargs = {
+        "netloc": "example.net:444",
+    }
+    url = httpx.URL("https://example.org")
+    new = url.copy_with(**copy_with_kwargs)
+    assert str(new) == "https://example.net:444"
+
+
+def test_url_copywith_userinfo_subcomponents():
     copy_with_kwargs = {
         "username": "tom@example.org",
         "password": "abc123@ %",
@@ -199,6 +206,51 @@ def test_url_copywith_for_userinfo():
     assert str(new) == "https://tom%40example.org:abc123%40%20%25@example.org"
     assert new.username == "tom@example.org"
     assert new.password == "abc123@ %"
+    assert new.userinfo == b"tom%40example.org:abc123%40%20%25"
+
+
+def test_url_copywith_invalid_component():
+    url = httpx.URL("https://example.org")
+    with pytest.raises(TypeError):
+        url.copy_with(pathh="/incorrect-spelling")
+    with pytest.raises(TypeError):
+        url.copy_with(userinfo="should be bytes")
+
+
+def test_url_copywith_urlencoded_path():
+    url = httpx.URL("https://example.org")
+    url = url.copy_with(path="/path to somewhere")
+    assert url.path == "/path to somewhere"
+    assert url.query == b""
+    assert url.raw_path == b"/path%20to%20somewhere"
+
+
+def test_url_copywith_query():
+    url = httpx.URL("https://example.org")
+    url = url.copy_with(query=b"a=123")
+    assert url.path == "/"
+    assert url.query == b"a=123"
+    assert url.raw_path == b"/?a=123"
+
+
+def test_url_copywith_raw_path():
+    url = httpx.URL("https://example.org")
+    url = url.copy_with(raw_path=b"/some/path")
+    assert url.path == "/some/path"
+    assert url.query == b""
+    assert url.raw_path == b"/some/path"
+
+    url = httpx.URL("https://example.org")
+    url = url.copy_with(raw_path=b"/some/path?")
+    assert url.path == "/some/path"
+    assert url.query == b""
+    assert url.raw_path == b"/some/path?"
+
+    url = httpx.URL("https://example.org")
+    url = url.copy_with(raw_path=b"/some/path?a=123")
+    assert url.path == "/some/path"
+    assert url.query == b"a=123"
+    assert url.raw_path == b"/some/path?a=123"
 
 
 def test_url_invalid():
@@ -212,3 +264,26 @@ def test_url_invalid_type():
 
     with pytest.raises(TypeError):
         httpx.URL(ExternalURLClass())  # type: ignore
+
+
+def test_url_with_empty_query():
+    """
+    URLs with and without a trailing `?` but an empty query component
+    should preserve the information on the raw path.
+    """
+    url = httpx.URL("https://www.example.com/path")
+    assert url.path == "/path"
+    assert url.query == b""
+    assert url.raw_path == b"/path"
+
+    url = httpx.URL("https://www.example.com/path?")
+    assert url.path == "/path"
+    assert url.query == b""
+    assert url.raw_path == b"/path?"
+
+
+def test_url_with_url_encoded_path():
+    url = httpx.URL("https://www.example.com/path%20to%20somewhere")
+    assert url.path == "/path to somewhere"
+    assert url.query == b""
+    assert url.raw_path == b"/path%20to%20somewhere"