]> git.ipfire.org Git - thirdparty/httpx.git/commitdiff
Cleaner no proxy support (#1103) 1118/head
authorTom Christie <tom@tomchristie.com>
Sun, 2 Aug 2020 09:00:45 +0000 (10:00 +0100)
committerGitHub <noreply@github.com>
Sun, 2 Aug 2020 09:00:45 +0000 (10:00 +0100)
* Add internal URLMatcher class

* Use URLMatcher for proxy lookups in transport_for_url

* Docstring

* Pin pytest

* Add support for no-proxies configurations

* Don't call should_not_proxy on each request

* Drop print statements

* Tweak comment

* Tweak comment on domain wildcards

* Update httpx/_utils.py

Co-authored-by: Florimond Manca <florimond.manca@gmail.com>
* Pull test_should_not_be_proxied cases into test_proxies_environ

Co-authored-by: Florimond Manca <florimond.manca@gmail.com>
httpx/_client.py
httpx/_utils.py
tests/client/test_proxies.py
tests/test_utils.py

index bfc4e623b298b1f4129187fb11bd757362ac2d25..4110fd7d9696a6565f74e152512866e633dbdb6b 100644 (file)
@@ -49,7 +49,6 @@ from ._utils import (
     get_environment_proxies,
     get_logger,
     same_origin,
-    should_not_be_proxied,
     warn_deprecated,
 )
 
@@ -95,7 +94,7 @@ class BaseClient:
         if proxies is None:
             if trust_env:
                 return {
-                    key: Proxy(url=url)
+                    key: None if url is None else Proxy(url=url)
                     for key, url in get_environment_proxies().items()
                 }
             return {}
@@ -558,10 +557,9 @@ class Client(BaseClient):
         url = request.url
         enforce_http_url(request)
 
-        if self._proxies and not should_not_be_proxied(url):
-            for pattern, transport in self._proxies.items():
-                if pattern.matches(url):
-                    return self._transport if transport is None else transport
+        for pattern, transport in self._proxies.items():
+            if pattern.matches(url):
+                return self._transport if transport is None else transport
 
         return self._transport
 
@@ -1089,10 +1087,9 @@ class AsyncClient(BaseClient):
         url = request.url
         enforce_http_url(request)
 
-        if self._proxies and not should_not_be_proxied(url):
-            for pattern, transport in self._proxies.items():
-                if pattern.matches(url):
-                    return self._transport if transport is None else transport
+        for pattern, transport in self._proxies.items():
+            if pattern.matches(url):
+                return self._transport if transport is None else transport
 
         return self._transport
 
index 3f869810ac7282fd6e962f5bcafb46b45f4fb430..c2dc6a453b48da85a9c06475fb3a7c313245c957 100644 (file)
@@ -299,42 +299,39 @@ def same_origin(url: "URL", other: "URL") -> bool:
     )
 
 
-def should_not_be_proxied(url: "URL") -> bool:
-    """
-    Return True if url should not be proxied,
-    return False otherwise.
-    """
-    no_proxy = getproxies().get("no")
-    if not no_proxy:
-        return False
-    no_proxy_list = [host.strip() for host in no_proxy.split(",")]
-    for name in no_proxy_list:
-        if name == "*":
-            return True
-        if name:
-            name = name.lstrip(".")  # ignore leading dots
-            name = re.escape(name)
-            pattern = r"(.+\.)?%s$" % name
-            if re.match(pattern, url.host, re.I) or re.match(
-                pattern, url.authority, re.I
-            ):
-                return True
-    return False
-
-
-def get_environment_proxies() -> typing.Dict[str, str]:
+def get_environment_proxies() -> typing.Dict[str, typing.Optional[str]]:
     """Gets proxy information from the environment"""
 
     # urllib.request.getproxies() falls back on System
     # Registry and Config for proxies on Windows and macOS.
     # We don't want to propagate non-HTTP proxies into
     # our configuration such as 'TRAVIS_APT_PROXY'.
-    supported_proxy_schemes = ("http", "https", "all")
-    return {
-        key: val
-        for key, val in getproxies().items()
-        if ("://" in key or key in supported_proxy_schemes)
-    }
+    proxy_info = getproxies()
+    mounts: typing.Dict[str, typing.Optional[str]] = {}
+
+    for scheme in ("http", "https", "all"):
+        if proxy_info.get(scheme):
+            mounts[scheme] = proxy_info[scheme]
+
+    no_proxy_hosts = [host.strip() for host in proxy_info.get("no", "").split(",")]
+    for hostname in no_proxy_hosts:
+        # See https://curl.haxx.se/libcurl/c/CURLOPT_NOPROXY.html for details
+        # on how names in `NO_PROXY` are handled.
+        if hostname == "*":
+            # If NO_PROXY=* is used or if "*" occurs as any one of the comma
+            # separated hostnames, then we should just bypass any information
+            # from HTTP_PROXY, HTTPS_PROXY, ALL_PROXY, and always ignore
+            # proxies.
+            return {}
+        elif hostname:
+            # NO_PROXY=.google.com is marked as "all://*.google.com",
+            #   which disables "www.google.com" but not "google.com"
+            # NO_PROXY=google.com is marked as "all://*google.com",
+            #   which disables "www.google.com" and "google.com".
+            #   (But not "wwwgoogle.com")
+            mounts[f"all://*{hostname}"] = None
+
+    return mounts
 
 
 def to_bytes(value: typing.Union[str, bytes], encoding: str = "utf-8") -> bytes:
@@ -485,13 +482,32 @@ class URLPattern:
         url = URL(pattern)
         self.pattern = pattern
         self.scheme = "" if url.scheme == "all" else url.scheme
-        self.host = url.host
+        self.host = "" if url.host == "*" else url.host
         self.port = url.port
+        if not url.host or url.host == "*":
+            self.host_regex: typing.Optional[typing.Pattern[str]] = None
+        else:
+            if url.host.startswith("*."):
+                # *.example.com should match "www.example.com", but not "example.com"
+                domain = re.escape(url.host[2:])
+                self.host_regex = re.compile(f"^.+\\.{domain}$")
+            elif url.host.startswith("*"):
+                # *example.com should match "www.example.com" and "example.com"
+                domain = re.escape(url.host[1:])
+                self.host_regex = re.compile(f"^(.+\\.)?{domain}$")
+            else:
+                # example.com should match "example.com" but not "www.example.com"
+                domain = re.escape(url.host)
+                self.host_regex = re.compile(f"^{domain}$")
 
     def matches(self, other: "URL") -> bool:
         if self.scheme and self.scheme != other.scheme:
             return False
-        if self.host and self.host != other.host:
+        if (
+            self.host
+            and self.host_regex is not None
+            and not self.host_regex.match(other.host)
+        ):
             return False
         if self.port is not None and self.port != other.port:
             return False
@@ -503,8 +519,11 @@ class URLPattern:
         The priority allows URLPattern instances to be sortable, so that
         we can match from most specific to least specific.
         """
-        port_priority = -1 if self.port is not None else 0
+        # URLs with a port should take priority over URLs without a port.
+        port_priority = 0 if self.port is not None else 1
+        # Longer hostnames should match first.
         host_priority = -len(self.host)
+        # Longer schemes should match first.
         scheme_priority = -len(self.scheme)
         return (port_priority, host_priority, scheme_priority)
 
index 4fc1831943aa32d776da838ceba776c0d2fffbbf..cffa75d5d6105ad633a78c26d8de1e11dcb0e63b 100644 (file)
@@ -56,6 +56,20 @@ PROXY_URL = "http://[::1]"
         ("http://example.com", {}, None),
         ("http://example.com", {"https": PROXY_URL}, None),
         ("http://example.com", {"http://example.net": PROXY_URL}, None),
+        # Using "*" should match any domain name.
+        ("http://example.com", {"http://*": PROXY_URL}, PROXY_URL),
+        ("https://example.com", {"http://*": PROXY_URL}, None),
+        # Using "example.com" should match example.com, but not www.example.com
+        ("http://example.com", {"http://example.com": PROXY_URL}, PROXY_URL),
+        ("http://www.example.com", {"http://example.com": PROXY_URL}, None),
+        # Using "*.example.com" should match www.example.com, but not example.com
+        ("http://example.com", {"http://*.example.com": PROXY_URL}, None),
+        ("http://www.example.com", {"http://*.example.com": PROXY_URL}, PROXY_URL),
+        # Using "*example.com" should match example.com and www.example.com
+        ("http://example.com", {"http://*example.com": PROXY_URL}, PROXY_URL),
+        ("http://www.example.com", {"http://*example.com": PROXY_URL}, PROXY_URL),
+        ("http://wwwexample.com", {"http://*example.com": PROXY_URL}, None),
+        # ...
         ("http://example.com:443", {"http://example.com": PROXY_URL}, PROXY_URL),
         ("http://example.com", {"all": PROXY_URL}, PROXY_URL),
         ("http://example.com", {"all": PROXY_URL, "http://example.com": None}, None),
@@ -134,6 +148,85 @@ def test_unsupported_proxy_scheme():
             {"HTTP_PROXY": "http://example.com", "NO_PROXY": "google.com"},
             None,
         ),
+        # Everything proxied when NO_PROXY is empty/unset
+        (
+            "http://127.0.0.1",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": ""},
+            "http://localhost:123",
+        ),
+        # Not proxied if NO_PROXY matches URL.
+        (
+            "http://127.0.0.1",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "127.0.0.1"},
+            None,
+        ),
+        # Proxied if NO_PROXY scheme does not match URL.
+        (
+            "http://127.0.0.1",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "https://127.0.0.1"},
+            "http://localhost:123",
+        ),
+        # Proxied if NO_PROXY host does not match URL host.
+        (
+            "http://127.0.0.1",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "1.1.1.1"},
+            "http://localhost:123",
+        ),
+        # Not proxied if NO_PROXY matches host domain suffix.
+        (
+            "http://courses.mit.edu",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "mit.edu"},
+            None,
+        ),
+        # Proxied even though NO_PROXY matches host domain *prefix*.
+        (
+            "https://mit.edu.info",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "mit.edu"},
+            "http://localhost:123",
+        ),
+        # Not proxied if one item in NO_PROXY matches host domain suffix.
+        (
+            "https://mit.edu.info",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "mit.edu,edu.info"},
+            None,
+        ),
+        # Not proxied if one item in NO_PROXY matches host domain suffix.
+        # May include whitespace.
+        (
+            "https://mit.edu.info",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "mit.edu, edu.info"},
+            None,
+        ),
+        # Proxied if no items in NO_PROXY match.
+        (
+            "https://mit.edu.info",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "mit.edu,mit.info"},
+            "http://localhost:123",
+        ),
+        # Proxied if NO_PROXY domain doesn't match.
+        (
+            "https://foo.example.com",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "www.example.com"},
+            "http://localhost:123",
+        ),
+        # Not proxied for subdomains matching NO_PROXY, with a leading ".".
+        (
+            "https://www.example1.com",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": ".example1.com"},
+            None,
+        ),
+        # Proxied, because NO_PROXY subdomains only match if "." separated.
+        (
+            "https://www.example2.com",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "ample2.com"},
+            "http://localhost:123",
+        ),
+        # No requests are proxied if NO_PROXY="*" is set.
+        (
+            "https://www.example3.com",
+            {"ALL_PROXY": "http://localhost:123", "NO_PROXY": "*"},
+            None,
+        ),
     ],
 )
 @pytest.mark.parametrize("client_class", [httpx.Client, httpx.AsyncClient])
index c2ab33dfe13bf3791eff7daf9285d7247acf8c8c..d564ceb75a26dd5e2914fd81be86749c7f8aebf4 100644 (file)
@@ -15,7 +15,6 @@ from httpx._utils import (
     obfuscate_sensitive_headers,
     parse_header_links,
     same_origin,
-    should_not_be_proxied,
 )
 from tests.utils import override_log_level
 
@@ -223,82 +222,6 @@ def test_obfuscate_sensitive_headers(headers, output):
     assert list(obfuscate_sensitive_headers(bytes_headers)) == bytes_output
 
 
-@pytest.mark.parametrize(
-    ["url", "no_proxy", "expected"],
-    [
-        (
-            "http://127.0.0.1",
-            {"NO_PROXY": ""},
-            False,
-        ),  # everything proxied when no_proxy is empty/unset
-        (
-            "http://127.0.0.1",
-            {"NO_PROXY": "127.0.0.1"},
-            True,
-        ),  # no_proxy as ip case is matched
-        (
-            "http://127.0.0.1",
-            {"NO_PROXY": "https://127.0.0.1"},
-            False,
-        ),  # no_proxy with scheme is ignored
-        (
-            "http://127.0.0.1",
-            {"NO_PROXY": "1.1.1.1"},
-            False,
-        ),  # different no_proxy means its proxied
-        (
-            "http://courses.mit.edu",
-            {"NO_PROXY": "mit.edu"},
-            True,
-        ),  # no_proxy for sub-domain matches
-        (
-            "https://mit.edu.info",
-            {"NO_PROXY": "mit.edu"},
-            False,
-        ),  # domain is actually edu.info, so should be proxied
-        (
-            "https://mit.edu.info",
-            {"NO_PROXY": "mit.edu,edu.info"},
-            True,
-        ),  # list in no_proxy, matches second domain
-        (
-            "https://mit.edu.info",
-            {"NO_PROXY": "mit.edu, edu.info"},
-            True,
-        ),  # list with spaces in no_proxy
-        (
-            "https://mit.edu.info",
-            {"NO_PROXY": "mit.edu,mit.info"},
-            False,
-        ),  # list in no_proxy, without any domain matching
-        (
-            "https://foo.example.com",
-            {"NO_PROXY": "www.example.com"},
-            False,
-        ),  # different subdomains foo vs www means we still proxy
-        (
-            "https://www.example1.com",
-            {"NO_PROXY": ".example1.com"},
-            True,
-        ),  # no_proxy starting with dot
-        (
-            "https://www.example2.com",
-            {"NO_PROXY": "ample2.com"},
-            False,
-        ),  # whole-domain matching
-        (
-            "https://www.example3.com",
-            {"NO_PROXY": "*"},
-            True,
-        ),  # wildcard * means nothing proxied
-    ],
-)
-def test_should_not_be_proxied(url, no_proxy, expected):
-    os.environ.update(no_proxy)
-    parsed_url = httpx.URL(url)
-    assert should_not_be_proxied(parsed_url) == expected
-
-
 def test_same_origin():
     origin1 = httpx.URL("https://example.com")
     origin2 = httpx.URL("HTTPS://EXAMPLE.COM:443")