]> git.ipfire.org Git - thirdparty/httpx.git/commitdiff
Cleaner no proxy support (#1103) 1118/head
authorTom Christie <tom@tomchristie.com>
Sun, 2 Aug 2020 09:00:45 +0000 (10:00 +0100)
committerGitHub <noreply@github.com>
Sun, 2 Aug 2020 09:00:45 +0000 (10:00 +0100)
* Add internal URLMatcher class

* Use URLMatcher for proxy lookups in transport_for_url

* Docstring

* Pin pytest

* Add support for no-proxies configurations

* Don't call should_not_proxy on each request

* Drop print statements

* Tweak comment

* Tweak comment on domain wildcards

* Update httpx/_utils.py

Co-authored-by: Florimond Manca <florimond.manca@gmail.com>
* Pull test_should_not_be_proxied cases into test_proxies_environ

Co-authored-by: Florimond Manca <florimond.manca@gmail.com>
httpx/_client.py
httpx/_utils.py
tests/client/test_proxies.py
tests/test_utils.py

index bfc4e623b298b1f4129187fb11bd757362ac2d25..4110fd7d9696a6565f74e152512866e633dbdb6b 100644 (file)
@@ -49,7 +49,6 @@ from ._utils import (
     get_environment_proxies,
     get_logger,
     same_origin,
-    should_not_be_proxied,
     warn_deprecated,
 )
 
@@ -95,7 +94,7 @@ class BaseClient:
         if proxies is None:
             if trust_env:
                 return {
-                    key: Proxy(url=url)
+                    key: None if url is None else Proxy(url=url)
                     for key, url in get_environment_proxies().items()
                 }
             return {}
@@ -558,10 +557,9 @@ class Client(BaseClient):
         url = request.url
         enforce_http_url(request)
 
-        if self._proxies and not should_not_be_proxied(url):
-            for pattern, transport in self._proxies.items():
-                if pattern.matches(url):
-                    return self._transport if transport is None else transport
+        for pattern, transport in self._proxies.items():
+            if pattern.matches(url):
+                return self._transport if transport is None else transport
 
         return self._transport
 
@@ -1089,10 +1087,9 @@ class AsyncClient(BaseClient):
         url = request.url
         enforce_http_url(request)
 
-        if self._proxies and not should_not_be_proxied(url):
-            for pattern, transport in self._proxies.items():
-                if pattern.matches(url):
-                    return self._transport if transport is None else transport
+        for pattern, transport in self._proxies.items():
+            if pattern.matches(url):
+                return self._transport if transport is None else transport
 
         return self._transport
 
index 3f869810ac7282fd6e962f5bcafb46b45f4fb430..c2dc6a453b48da85a9c06475fb3a7c313245c957 100644 (file)
@@ -299,42 +299,39 @@ def same_origin(url: "URL", other: "URL") -> bool:
     )
 
 
-def should_not_be_proxied(url: "URL") -> bool:
-    """
-    Return True if url should not be proxied,
-    return False otherwise.
-    """
-    no_proxy = getproxies().get("no")
-    if not no_proxy:
-        return False
-    no_proxy_list = [host.strip() for host in no_proxy.split(",")]
-    for name in no_proxy_list:
-        if name == "*":
-            return True
-        if name:
-            name = name.lstrip(".")  # ignore leading dots
-            name = re.escape(name)
-            pattern = r"(.+\.)?%s$" % name
-            if re.match(pattern, url.host, re.I) or re.match(
-                pattern, url.authority, re.I
-            ):
-                return True
-    return False
-
-
-def get_environment_proxies() -> typing.Dict[str, str]:
+def get_environment_proxies() -> typing.Dict[str, typing.Optional[str]]:
     """Gets proxy information from the environment"""
 
     # urllib.request.getproxies() falls back on System
     # Registry and Config for proxies on Windows and macOS.
     # We don't want to propagate non-HTTP proxies into
     # our configuration such as 'TRAVIS_APT_PROXY'.
-    supported_proxy_schemes = ("http", "https", "all")
-    return {
-        key: val
-        for key, val in getproxies().items()
-        if ("://" in key or key in supported_proxy_schemes)
-    }
+    proxy_info = getproxies()
+    mounts: typing.Dict[str, typing.Optional[str]] = {}
+
+    for scheme in ("http", "https", "all"):
+        if proxy_info.get(scheme):
+            mounts[scheme] = proxy_info[scheme]
+
+    no_proxy_hosts = [host.strip() for host in proxy_info.get("no", "").split(",")]
+    for hostname in no_proxy_hosts:
+        # See https://curl.haxx.se/libcurl/c/CURLOPT_NOPROXY.html for details
+        # on how names in `NO_PROXY` are handled.
+        if hostname == "*":
+            # If NO_PROXY=* is used or if "*" occurs as any one of the comma
+            # separated hostnames, then we should just bypass any information
+            # from HTTP_PROXY, HTTPS_PROXY, ALL_PROXY, and always ignore
+            # proxies.
+            return {}
+        elif hostname:
+            # NO_PROXY=.google.com is marked as "all://*.google.com",
+            #   which disables "www.google.com" but not "google.com"
+            # NO_PROXY=google.com is marked as "all://*google.com",
+            #   which disables "www.google.com" and "google.com".
+            #   (But not "wwwgoogle.com")
+            mounts[f"all://*{hostname}"] = None
+
+    return mounts
 
 
 def to_bytes(value: typing.Union[str, bytes], encoding: str = "utf-8") -> bytes:
@@ -485,13 +482,32 @@ class URLPattern:
         url = URL(pattern)
         self.pattern = pattern
         self.scheme = "" if url.scheme == "all" else url.scheme
-        self.host = url.host
+        self.host = "" if url.host == "*" else url.host
         self.port = url.port
+        if not url.host or url.host == "*":
+            self.host_regex: typing.Optional[typing.Pattern[str]] = None
+        else:
+            if url.host.startswith("*."):
+                # *.example.com should match "www.example.com", but not "example.com"
+                domain = re.escape(url.host[2:])
+                self.host_regex = re.compile(f"^.+\\.{domain}$")
+            elif url.host.startswith("*"):
+                # *example.com should match "www.example.com" and "example.com"
+                domain = re.escape(url.host[1:])
+                self.host_regex = re.compile(f"^(.+\\.)?{domain}$")
+            else:
+                # example.com should match "example.com" but not "www.example.com"
+                domain = re.escape(url.host)
+                self.host_regex = re.compile(f"^{domain}$")
 
     def matches(self, other: "URL") -> bool:
         if self.scheme and self.scheme != other.scheme:
             return False
-        if self.host and self.host != other.host:
+        if (
+            self.host
+            and self.host_regex is not None
+            and not self.host_regex.match(other.host)
+        ):
             return False
         if self.port is not None and self.port != other.port:
             return False
@@ -503,8 +519,11 @@ class URLPattern:
         The priority allows URLPattern instances to be sortable, so that
         we can match from most specific to least specific.
         """
-        port_priority = -1 if self.port is not None else 0
+        # URLs with a port should take priority over URLs without a port.
+        port_priority = 0 if self.port is not None else 1
+        # Longer hostnames should match first.
         host_priority = -len(self.host)
+        # Longer schemes should match first.
         scheme_priority = -len(self.scheme)
         return (port_priority, host_priority, scheme_priority)
 
index 4fc1831943aa32d776da838ceba776c0d2fffbbf..cffa75d5d6105ad633a78c26d8de1e11dcb0e63b 100644 (file)
@@ -56,6 +56,20 @@ PROXY_URL = "http://[::1]"
         ("http://example.com", {}, None),
         ("http://example.com", {"https": PROXY_URL}, None),
         ("http://example.com", {"http://example.net": PROXY_URL}, None),
+        # Using "*" should match any domain name.
+        ("http://example.com", {"http://*": PROXY_URL}, PROXY_URL),
+        ("https://example.com", {"http://*": PROXY_URL}, None),
+        # Using "example.com" should match example.com, but not www.example.com
+        ("http://example.com", {"http://example.com": PROXY_URL}, PROXY_URL),
+        ("http://www.example.com", {"http://example.com": PROXY_URL}, None),
+        # Using "*.example.com" should match www.example.com, but not example.com
+        ("http://example.com", {"http://*.example.com": PROXY_URL}, None),
+        ("http://www.example.com", {"http://*.example.com": PROXY_URL}, PROXY_URL),
+        # Using "*example.com" should match example.com and www.example.com
+        ("http://example.com", {"http://*example.com": PROXY_URL}, PROXY_URL),
+        ("http://www.example.com", {"http://*example.com": PROXY_URL}, PROXY_URL),
+        ("http://wwwexample.com", {"http://*example.com": PROXY_URL}, None),
+        # ...
         ("http://example.com:443", {"http://example.com": PROXY_URL}, PROXY_URL),
         ("http://example.com", {"all": PROXY_URL}, PROXY_URL),
         ("http://example.com", {"all": PROXY_URL, "http://example.com": None}, None),
@@ -134,6 +148,85 @@ def test_unsupported_proxy_scheme():
             {"HTTP_PROXY": "http://example.com", "NO_PROXY": "google.com"},
             None,
         ),
+        # Everything proxied when NO_PROXY is empty/unset
+        (
+            "http://127.0.0.1",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": ""},
+            "http://localhost:123",
+        ),
+        # Not proxied if NO_PROXY matches URL.
+        (
+            "http://127.0.0.1",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "127.0.0.1"},
+            None,
+        ),
+        # Proxied if NO_PROXY scheme does not match URL.
+        (
+            "http://127.0.0.1",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "https://127.0.0.1"},
+            "http://localhost:123",
+        ),
+        # Proxied if NO_PROXY host does not match URL host.
+        (
+            "http://127.0.0.1",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "1.1.1.1"},
+            "http://localhost:123",
+        ),
+        # Not proxied if NO_PROXY matches host domain suffix.
+        (
+            "http://courses.mit.edu",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "mit.edu"},
+            None,
+        ),
+        # Proxied even though NO_PROXY matches host domain *prefix*.
+        (
+            "https://mit.edu.info",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "mit.edu"},
+            "http://localhost:123",
+        ),
+        # Not proxied if one item in NO_PROXY matches host domain suffix.
+        (
+            "https://mit.edu.info",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "mit.edu,edu.info"},
+            None,
+        ),
+        # Not proxied if one item in NO_PROXY matches host domain suffix.
+        # May include whitespace.
+        (
+            "https://mit.edu.info",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "mit.edu, edu.info"},
+            None,
+        ),
+        # Proxied if no items in NO_PROXY match.
+        (
+            "https://mit.edu.info",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "mit.edu,mit.info"},
+            "http://localhost:123",
+        ),
+        # Proxied if NO_PROXY domain doesn't match.
+        (
+            "https://foo.example.com",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "www.example.com"},
+            "http://localhost:123",
+        ),
+        # Not proxied for subdomains matching NO_PROXY, with a leading ".".
+        (
+            "https://www.example1.com",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": ".example1.com"},
+            None,
+        ),
+        # Proxied, because NO_PROXY subdomains only match if "." separated.
+        (
+            "https://www.example2.com",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "ample2.com"},
+            "http://localhost:123",
+        ),
+        # No requests are proxied if NO_PROXY="*" is set.
+        (
+            "https://www.example3.com",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "*"},
+            None,
+        ),
     ],
 )
 @pytest.mark.parametrize("client_class", [httpx.Client, httpx.AsyncClient])
index c2ab33dfe13bf3791eff7daf9285d7247acf8c8c..d564ceb75a26dd5e2914fd81be86749c7f8aebf4 100644 (file)
@@ -15,7 +15,6 @@ from httpx._utils import (
     obfuscate_sensitive_headers,
     parse_header_links,
     same_origin,
-    should_not_be_proxied,
 )
 from tests.utils import override_log_level
 
@@ -223,82 +222,6 @@ def test_obfuscate_sensitive_headers(headers, output):
     assert list(obfuscate_sensitive_headers(bytes_headers)) == bytes_output
 
 
-@pytest.mark.parametrize(
-    ["url", "no_proxy", "expected"],
-    [
-        (
-            "http://127.0.0.1",
-            {"NO_PROXY": ""},
-            False,
-        ),  # everything proxied when no_proxy is empty/unset
-        (
-            "http://127.0.0.1",
-            {"NO_PROXY": "127.0.0.1"},
-            True,
-        ),  # no_proxy as ip case is matched
-        (
-            "http://127.0.0.1",
-            {"NO_PROXY": "https://127.0.0.1"},
-            False,
-        ),  # no_proxy with scheme is ignored
-        (
-            "http://127.0.0.1",
-            {"NO_PROXY": "1.1.1.1"},
-            False,
-        ),  # different no_proxy means its proxied
-        (
-            "http://courses.mit.edu",
-            {"NO_PROXY": "mit.edu"},
-            True,
-        ),  # no_proxy for sub-domain matches
-        (
-            "https://mit.edu.info",
-            {"NO_PROXY": "mit.edu"},
-            False,
-        ),  # domain is actually edu.info, so should be proxied
-        (
-            "https://mit.edu.info",
-            {"NO_PROXY": "mit.edu,edu.info"},
-            True,
-        ),  # list in no_proxy, matches second domain
-        (
-            "https://mit.edu.info",
-            {"NO_PROXY": "mit.edu, edu.info"},
-            True,
-        ),  # list with spaces in no_proxy
-        (
-            "https://mit.edu.info",
-            {"NO_PROXY": "mit.edu,mit.info"},
-            False,
-        ),  # list in no_proxy, without any domain matching
-        (
-            "https://foo.example.com",
-            {"NO_PROXY": "www.example.com"},
-            False,
-        ),  # different subdomains foo vs www means we still proxy
-        (
-            "https://www.example1.com",
-            {"NO_PROXY": ".example1.com"},
-            True,
-        ),  # no_proxy starting with dot
-        (
-            "https://www.example2.com",
-            {"NO_PROXY": "ample2.com"},
-            False,
-        ),  # whole-domain matching
-        (
-            "https://www.example3.com",
-            {"NO_PROXY": "*"},
-            True,
-        ),  # wildcard * means nothing proxied
-    ],
-)
-def test_should_not_be_proxied(url, no_proxy, expected):
-    os.environ.update(no_proxy)
-    parsed_url = httpx.URL(url)
-    assert should_not_be_proxied(parsed_url) == expected
-
-
 def test_same_origin():
     origin1 = httpx.URL("https://example.com")
     origin2 = httpx.URL("HTTPS://EXAMPLE.COM:443")