[3.3] bpo-30500: urllib: Simplify splithost by calling into urlparse. (#1849) (#2292)

author Victor Stinner <victor.stinner@gmail.com>

Wed, 26 Jul 2017 02:43:52 +0000 (04:43 +0200)

committer Ned Deily <nad@python.org>

Wed, 26 Jul 2017 02:43:52 +0000 (22:43 -0400)
author Victor Stinner <victor.stinner@gmail.com>
Wed, 26 Jul 2017 02:43:52 +0000 (04:43 +0200)
committer Ned Deily <nad@python.org>
Wed, 26 Jul 2017 02:43:52 +0000 (22:43 -0400)
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py

index d67cf258b0212331ce604599aa88b4ab580b3c39..af637ffd78ab98c2596e85dbe0e9b8be931f4474 100644 (file)
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -664,6 +664,52 @@ class UrlParseTestCase(unittest.TestCase):
          self.assertEqual(urllib.parse.urlparse(b"x-newscheme://foo.com/stuff?query"),
                           (b'x-newscheme', b'foo.com', b'/stuff', b'', b'query', b''))
  
+    def test_default_scheme(self):
+        # Exercise the scheme parameter of urlparse() and urlsplit()
+        for func in (urllib.parse.urlparse, urllib.parse.urlsplit):
+            result = func("http://example.net/", "ftp")
+            self.assertEqual(result.scheme, "http")
+            result = func(b"http://example.net/", b"ftp")
+            self.assertEqual(result.scheme, b"http")
+            self.assertEqual(func("path", "ftp").scheme, "ftp")
+            self.assertEqual(func("path", scheme="ftp").scheme, "ftp")
+            self.assertEqual(func(b"path", scheme=b"ftp").scheme, b"ftp")
+            self.assertEqual(func("path").scheme, "")
+            self.assertEqual(func(b"path").scheme, b"")
+            self.assertEqual(func(b"path", "").scheme, b"")
+
+    def test_parse_fragments(self):
+        # Exercise the allow_fragments parameter of urlparse() and urlsplit()
+        tests = (
+            ("http:#frag", "path", "frag"),
+            ("//example.net#frag", "path", "frag"),
+            ("index.html#frag", "path", "frag"),
+            (";a=b#frag", "params", "frag"),
+            ("?a=b#frag", "query", "frag"),
+            ("#frag", "path", "frag"),
+            ("abc#@frag", "path", "@frag"),
+            ("//abc#@frag", "path", "@frag"),
+            ("//abc:80#@frag", "path", "@frag"),
+            ("//abc#@frag:80", "path", "@frag:80"),
+        )
+        for url, attr, expected_frag in tests:
+            for func in (urllib.parse.urlparse, urllib.parse.urlsplit):
+                if attr == "params" and func is urllib.parse.urlsplit:
+                    attr = "path"
+                result = func(url, allow_fragments=False)
+                self.assertEqual(result.fragment, "")
+                self.assertTrue(
+                        getattr(result, attr).endswith("#" + expected_frag))
+                self.assertEqual(func(url, "", False).fragment, "")
+
+                result = func(url, allow_fragments=True)
+                self.assertEqual(result.fragment, expected_frag)
+                self.assertFalse(
+                        getattr(result, attr).endswith(expected_frag))
+                self.assertEqual(func(url, "", True).fragment,
+                                 expected_frag)
+                self.assertEqual(func(url).fragment, expected_frag)
+
      def test_mixed_types_rejected(self):
          # Several functions that process either strings or ASCII encoded bytes
          # accept multiple arguments. Check they reject mixed type input
@@ -749,52 +795,6 @@ class UrlParseTestCase(unittest.TestCase):
                                                            errors="ignore")
          self.assertEqual(result, [('key', '\u0141-')])
  
-    def test_splitport(self):
-        splitport = urllib.parse.splitport
-        self.assertEqual(splitport('parrot:88'), ('parrot', '88'))
-        self.assertEqual(splitport('parrot'), ('parrot', None))
-        self.assertEqual(splitport('parrot:'), ('parrot', None))
-        self.assertEqual(splitport('127.0.0.1'), ('127.0.0.1', None))
-        self.assertEqual(splitport('parrot:cheese'), ('parrot:cheese', None))
-
-    def test_splitnport(self):
-        splitnport = urllib.parse.splitnport
-        self.assertEqual(splitnport('parrot:88'), ('parrot', 88))
-        self.assertEqual(splitnport('parrot'), ('parrot', -1))
-        self.assertEqual(splitnport('parrot', 55), ('parrot', 55))
-        self.assertEqual(splitnport('parrot:'), ('parrot', -1))
-        self.assertEqual(splitnport('parrot:', 55), ('parrot', 55))
-        self.assertEqual(splitnport('127.0.0.1'), ('127.0.0.1', -1))
-        self.assertEqual(splitnport('127.0.0.1', 55), ('127.0.0.1', 55))
-        self.assertEqual(splitnport('parrot:cheese'), ('parrot', None))
-        self.assertEqual(splitnport('parrot:cheese', 55), ('parrot', None))
-
-    def test_splitquery(self):
-        # Normal cases are exercised by other tests; ensure that we also
-        # catch cases with no port specified (testcase ensuring coverage)
-        result = urllib.parse.splitquery('http://python.org/fake?foo=bar')
-        self.assertEqual(result, ('http://python.org/fake', 'foo=bar'))
-        result = urllib.parse.splitquery('http://python.org/fake?foo=bar?')
-        self.assertEqual(result, ('http://python.org/fake?foo=bar', ''))
-        result = urllib.parse.splitquery('http://python.org/fake')
-        self.assertEqual(result, ('http://python.org/fake', None))
-
-    def test_splitvalue(self):
-        # Normal cases are exercised by other tests; test pathological cases
-        # with no key/value pairs. (testcase ensuring coverage)
-        result = urllib.parse.splitvalue('foo=bar')
-        self.assertEqual(result, ('foo', 'bar'))
-        result = urllib.parse.splitvalue('foo=')
-        self.assertEqual(result, ('foo', ''))
-        result = urllib.parse.splitvalue('foobar')
-        self.assertEqual(result, ('foobar', None))
-
-    def test_to_bytes(self):
-        result = urllib.parse.to_bytes('http://www.python.org')
-        self.assertEqual(result, 'http://www.python.org')
-        self.assertRaises(UnicodeError, urllib.parse.to_bytes,
-                          'http://www.python.org/medi\u00e6val')
-
      def test_urlencode_sequences(self):
          # Other tests incidentally urlencode things; test non-covered cases:
          # Sequence and object values.
@@ -863,9 +863,162 @@ class UrlParseTestCase(unittest.TestCase):
          self.assertEqual(p1.path, '863-1234')
          self.assertEqual(p1.params, 'phone-context=+1-914-555')
  
+    def test_Quoter_repr(self):
+        quoter = urllib.parse.Quoter(urllib.parse._ALWAYS_SAFE)
+        self.assertIn('Quoter', repr(quoter))
+
+
+class Utility_Tests(unittest.TestCase):
+    """Testcase to test the various utility functions in the urllib."""
+    # In Python 2 this test class was in test_urllib.
+
+    def test_splittype(self):
+        splittype = urllib.parse.splittype
+        self.assertEqual(splittype('type:opaquestring'), ('type', 'opaquestring'))
+        self.assertEqual(splittype('opaquestring'), (None, 'opaquestring'))
+        self.assertEqual(splittype(':opaquestring'), (None, ':opaquestring'))
+        self.assertEqual(splittype('type:'), ('type', ''))
+        self.assertEqual(splittype('type:opaque:string'), ('type', 'opaque:string'))
+
+    def test_splithost(self):
+        splithost = urllib.parse.splithost
+        self.assertEqual(splithost('//www.example.org:80/foo/bar/baz.html'),
+                         ('www.example.org:80', '/foo/bar/baz.html'))
+        self.assertEqual(splithost('//www.example.org:80'),
+                         ('www.example.org:80', ''))
+        self.assertEqual(splithost('/foo/bar/baz.html'),
+                         (None, '/foo/bar/baz.html'))
+
+        # bpo-30500: '#' starts a fragment.
+        self.assertEqual(splithost('//127.0.0.1#@host.com'),
+                         ('127.0.0.1', '/#@host.com'))
+        self.assertEqual(splithost('//127.0.0.1#@host.com:80'),
+                         ('127.0.0.1', '/#@host.com:80'))
+        self.assertEqual(splithost('//127.0.0.1:80#@host.com'),
+                         ('127.0.0.1:80', '/#@host.com'))
+
+        # Empty host is returned as empty string.
+        self.assertEqual(splithost("///file"),
+                         ('', '/file'))
+
+        # Trailing semicolon, question mark and hash symbol are kept.
+        self.assertEqual(splithost("//example.net/file;"),
+                         ('example.net', '/file;'))
+        self.assertEqual(splithost("//example.net/file?"),
+                         ('example.net', '/file?'))
+        self.assertEqual(splithost("//example.net/file#"),
+                         ('example.net', '/file#'))
+
+    def test_splituser(self):
+        splituser = urllib.parse.splituser
+        self.assertEqual(splituser('User:Pass@www.python.org:080'),
+                         ('User:Pass', 'www.python.org:080'))
+        self.assertEqual(splituser('@www.python.org:080'),
+                         ('', 'www.python.org:080'))
+        self.assertEqual(splituser('www.python.org:080'),
+                         (None, 'www.python.org:080'))
+        self.assertEqual(splituser('User:Pass@'),
+                         ('User:Pass', ''))
+        self.assertEqual(splituser('User@example.com:Pass@www.python.org:080'),
+                         ('User@example.com:Pass', 'www.python.org:080'))
+
+    def test_splitpasswd(self):
+        # Some of the password examples are not sensible, but they are
+        # included to conform to RFC 2617 and to address issue4675.
+        splitpasswd = urllib.parse.splitpasswd
+        self.assertEqual(splitpasswd('user:ab'), ('user', 'ab'))
+        self.assertEqual(splitpasswd('user:a\nb'), ('user', 'a\nb'))
+        self.assertEqual(splitpasswd('user:a\tb'), ('user', 'a\tb'))
+        self.assertEqual(splitpasswd('user:a\rb'), ('user', 'a\rb'))
+        self.assertEqual(splitpasswd('user:a\fb'), ('user', 'a\fb'))
+        self.assertEqual(splitpasswd('user:a\vb'), ('user', 'a\vb'))
+        self.assertEqual(splitpasswd('user:a:b'), ('user', 'a:b'))
+        self.assertEqual(splitpasswd('user:a b'), ('user', 'a b'))
+        self.assertEqual(splitpasswd('user 2:ab'), ('user 2', 'ab'))
+        self.assertEqual(splitpasswd('user+1:a+b'), ('user+1', 'a+b'))
+        self.assertEqual(splitpasswd('user:'), ('user', ''))
+        self.assertEqual(splitpasswd('user'), ('user', None))
+        self.assertEqual(splitpasswd(':ab'), ('', 'ab'))
+
+    def test_splitport(self):
+        splitport = urllib.parse.splitport
+        self.assertEqual(splitport('parrot:88'), ('parrot', '88'))
+        self.assertEqual(splitport('parrot'), ('parrot', None))
+        self.assertEqual(splitport('parrot:'), ('parrot', None))
+        self.assertEqual(splitport('127.0.0.1'), ('127.0.0.1', None))
+        self.assertEqual(splitport('parrot:cheese'), ('parrot:cheese', None))
+        self.assertEqual(splitport('[::1]:88'), ('[::1]', '88'))
+        self.assertEqual(splitport('[::1]'), ('[::1]', None))
+        self.assertEqual(splitport(':88'), ('', '88'))
+
+    def test_splitnport(self):
+        splitnport = urllib.parse.splitnport
+        self.assertEqual(splitnport('parrot:88'), ('parrot', 88))
+        self.assertEqual(splitnport('parrot'), ('parrot', -1))
+        self.assertEqual(splitnport('parrot', 55), ('parrot', 55))
+        self.assertEqual(splitnport('parrot:'), ('parrot', -1))
+        self.assertEqual(splitnport('parrot:', 55), ('parrot', 55))
+        self.assertEqual(splitnport('127.0.0.1'), ('127.0.0.1', -1))
+        self.assertEqual(splitnport('127.0.0.1', 55), ('127.0.0.1', 55))
+        self.assertEqual(splitnport('parrot:cheese'), ('parrot', None))
+        self.assertEqual(splitnport('parrot:cheese', 55), ('parrot', None))
+
+    def test_splitquery(self):
+        # Normal cases are exercised by other tests; ensure that we also
+        # catch cases with no port specified (testcase ensuring coverage)
+        splitquery = urllib.parse.splitquery
+        self.assertEqual(splitquery('http://python.org/fake?foo=bar'),
+                         ('http://python.org/fake', 'foo=bar'))
+        self.assertEqual(splitquery('http://python.org/fake?foo=bar?'),
+                         ('http://python.org/fake?foo=bar', ''))
+        self.assertEqual(splitquery('http://python.org/fake'),
+                         ('http://python.org/fake', None))
+        self.assertEqual(splitquery('?foo=bar'), ('', 'foo=bar'))
+
+    def test_splittag(self):
+        splittag = urllib.parse.splittag
+        self.assertEqual(splittag('http://example.com?foo=bar#baz'),
+                         ('http://example.com?foo=bar', 'baz'))
+        self.assertEqual(splittag('http://example.com?foo=bar#'),
+                         ('http://example.com?foo=bar', ''))
+        self.assertEqual(splittag('#baz'), ('', 'baz'))
+        self.assertEqual(splittag('http://example.com?foo=bar'),
+                         ('http://example.com?foo=bar', None))
+        self.assertEqual(splittag('http://example.com?foo=bar#baz#boo'),
+                         ('http://example.com?foo=bar#baz', 'boo'))
+
+    def test_splitattr(self):
+        splitattr = urllib.parse.splitattr
+        self.assertEqual(splitattr('/path;attr1=value1;attr2=value2'),
+                         ('/path', ['attr1=value1', 'attr2=value2']))
+        self.assertEqual(splitattr('/path;'), ('/path', ['']))
+        self.assertEqual(splitattr(';attr1=value1;attr2=value2'),
+                         ('', ['attr1=value1', 'attr2=value2']))
+        self.assertEqual(splitattr('/path'), ('/path', []))
+
+    def test_splitvalue(self):
+        # Normal cases are exercised by other tests; test pathological cases
+        # with no key/value pairs. (testcase ensuring coverage)
+        splitvalue = urllib.parse.splitvalue
+        self.assertEqual(splitvalue('foo=bar'), ('foo', 'bar'))
+        self.assertEqual(splitvalue('foo='), ('foo', ''))
+        self.assertEqual(splitvalue('=bar'), ('', 'bar'))
+        self.assertEqual(splitvalue('foobar'), ('foobar', None))
+        self.assertEqual(splitvalue('foo=bar=baz'), ('foo', 'bar=baz'))
+
+    def test_to_bytes(self):
+        result = urllib.parse.to_bytes('http://www.python.org')
+        self.assertEqual(result, 'http://www.python.org')
+        self.assertRaises(UnicodeError, urllib.parse.to_bytes,
+                          'http://www.python.org/medi\u00e6val')
+
+    def test_unwrap(self):
+        url = urllib.parse.unwrap('<URL:type://host/path>')
+        self.assertEqual(url, 'type://host/path')
+
  
  def test_main():
-    support.run_unittest(UrlParseTestCase)
+    support.run_unittest(UrlParseTestCase, Utility_Tests)
  
  if __name__ == "__main__":
      test_main()
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py

index 975c6ffb9c15a30487542d0927d0c63b8527b19f..4fcf0c0ead12c97ac800c1e9762d4293c017c5d7 100644 (file)
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -860,14 +860,12 @@ def splithost(url):
      """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
      global _hostprog
      if _hostprog is None:
-        import re
-        _hostprog = re.compile('^//([^/?]*)(.*)$')
+        _hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
  
      match = _hostprog.match(url)
      if match:
-        host_port = match.group(1)
-        path = match.group(2)
-        if path and not path.startswith('/'):
+        host_port, path = match.groups()
+        if path and path[0] != '/':
              path = '/' + path
          return host_port, path
      return None, url
diff --git a/Misc/ACKS b/Misc/ACKS

index cc194ab7adda030aa6d3f2e7e48b5a015114eec1..e4c753839512a0f883f1f8e04e3ccb2182580e7c 100644 (file)
--- a/Misc/ACKS
+++ b/Misc/ACKS
@@ -892,6 +892,7 @@ Chad Netzer
  Max Neunhöffer
  George Neville-Neil
  Hieu Nguyen
+Nam Nguyen
  Johannes Nicolai
  Samuel Nicolary
  Jonathan Niehof
diff --git a/Misc/NEWS.d/next/Security/2017-07-11-22-02-51.bpo-30500.wXUrkQ.rst b/Misc/NEWS.d/next/Security/2017-07-11-22-02-51.bpo-30500.wXUrkQ.rst

new file mode 100644 (file)

index 0000000..6570e70
--- /dev/null
+++ b/Misc/NEWS.d/next/Security/2017-07-11-22-02-51.bpo-30500.wXUrkQ.rst
@@ -0,0 +1,4 @@
+Fix urllib.parse.splithost() to correctly parse fragments. For example,
+``splithost('//127.0.0.1#@evil.com/')`` now correctly returns the
+``127.0.0.1`` host, instead of treating ``@evil.com`` as the host in an
+authentication (``login@host``).
author	Victor Stinner <victor.stinner@gmail.com>
	Wed, 26 Jul 2017 02:43:52 +0000 (04:43 +0200)
committer	Ned Deily <nad@python.org>
	Wed, 26 Jul 2017 02:43:52 +0000 (22:43 -0400)
Lib/test/test_urlparse.py		patch \| blob \| blame \| history
Lib/urllib/parse.py		patch \| blob \| blame \| history
Misc/ACKS		patch \| blob \| blame \| history
Misc/NEWS.d/next/Security/2017-07-11-22-02-51.bpo-30500.wXUrkQ.rst	[new file with mode: 0644]	patch \| blob