[3.11] gh-67693: Fix urlunparse() and urlunsplit() for URIs with path starting with multiple slashes and no authority

author Serhiy Storchaka <storchaka@gmail.com>

Wed, 4 Sep 2024 15:42:58 +0000 (18:42 +0300)

committer GitHub <noreply@github.com>

Wed, 4 Sep 2024 15:42:58 +0000 (17:42 +0200)
author Serhiy Storchaka <storchaka@gmail.com>
Wed, 4 Sep 2024 15:42:58 +0000 (18:42 +0300)
committer GitHub <noreply@github.com>
Wed, 4 Sep 2024 15:42:58 +0000 (17:42 +0200)
diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py

index 4fef4303c48dfc21867bb5251d357d177a1dd4c3..2376dad81b2fbc7973623bf90c35c01b09a9303d 100644 (file)
--- a/Lib/test/test_urlparse.py
+++ b/Lib/test/test_urlparse.py
@@ -103,7 +103,9 @@ parse_qs_test_cases = [
  
  class UrlParseTestCase(unittest.TestCase):
  
-    def checkRoundtrips(self, url, parsed, split):
+    def checkRoundtrips(self, url, parsed, split, url2=None):
+        if url2 is None:
+            url2 = url
          result = urllib.parse.urlparse(url)
          self.assertEqual(result, parsed)
          t = (result.scheme, result.netloc, result.path,
@@ -111,7 +113,7 @@ class UrlParseTestCase(unittest.TestCase):
          self.assertEqual(t, parsed)
          # put it back together and it should be the same
          result2 = urllib.parse.urlunparse(result)
-        self.assertEqual(result2, url)
+        self.assertEqual(result2, url2)
          self.assertEqual(result2, result.geturl())
  
          # the result of geturl() is a fixpoint; we can always parse it
@@ -137,7 +139,7 @@ class UrlParseTestCase(unittest.TestCase):
               result.query, result.fragment)
          self.assertEqual(t, split)
          result2 = urllib.parse.urlunsplit(result)
-        self.assertEqual(result2, url)
+        self.assertEqual(result2, url2)
          self.assertEqual(result2, result.geturl())
  
          # check the fixpoint property of re-parsing the result of geturl()
@@ -175,9 +177,39 @@ class UrlParseTestCase(unittest.TestCase):
  
      def test_roundtrips(self):
          str_cases = [
+            ('path/to/file',
+             ('', '', 'path/to/file', '', '', ''),
+             ('', '', 'path/to/file', '', '')),
+            ('/path/to/file',
+             ('', '', '/path/to/file', '', '', ''),
+             ('', '', '/path/to/file', '', '')),
+            ('//path/to/file',
+             ('', 'path', '/to/file', '', '', ''),
+             ('', 'path', '/to/file', '', '')),
+            ('////path/to/file',
+             ('', '', '//path/to/file', '', '', ''),
+             ('', '', '//path/to/file', '', '')),
+            ('scheme:path/to/file',
+             ('scheme', '', 'path/to/file', '', '', ''),
+             ('scheme', '', 'path/to/file', '', '')),
+            ('scheme:/path/to/file',
+             ('scheme', '', '/path/to/file', '', '', ''),
+             ('scheme', '', '/path/to/file', '', '')),
+            ('scheme://path/to/file',
+             ('scheme', 'path', '/to/file', '', '', ''),
+             ('scheme', 'path', '/to/file', '', '')),
+            ('scheme:////path/to/file',
+             ('scheme', '', '//path/to/file', '', '', ''),
+             ('scheme', '', '//path/to/file', '', '')),
              ('file:///tmp/junk.txt',
               ('file', '', '/tmp/junk.txt', '', '', ''),
               ('file', '', '/tmp/junk.txt', '', '')),
+            ('file:////tmp/junk.txt',
+             ('file', '', '//tmp/junk.txt', '', '', ''),
+             ('file', '', '//tmp/junk.txt', '', '')),
+            ('file://///tmp/junk.txt',
+             ('file', '', '///tmp/junk.txt', '', '', ''),
+             ('file', '', '///tmp/junk.txt', '', '')),
              ('imap://mail.python.org/mbox1',
               ('imap', 'mail.python.org', '/mbox1', '', '', ''),
               ('imap', 'mail.python.org', '/mbox1', '', '')),
@@ -208,6 +240,38 @@ class UrlParseTestCase(unittest.TestCase):
          for url, parsed, split in str_cases + bytes_cases:
              self.checkRoundtrips(url, parsed, split)
  
+    def test_roundtrips_normalization(self):
+        str_cases = [
+            ('///path/to/file',
+             '/path/to/file',
+             ('', '', '/path/to/file', '', '', ''),
+             ('', '', '/path/to/file', '', '')),
+            ('scheme:///path/to/file',
+             'scheme:/path/to/file',
+             ('scheme', '', '/path/to/file', '', '', ''),
+             ('scheme', '', '/path/to/file', '', '')),
+            ('file:/tmp/junk.txt',
+             'file:///tmp/junk.txt',
+             ('file', '', '/tmp/junk.txt', '', '', ''),
+             ('file', '', '/tmp/junk.txt', '', '')),
+            ('http:/tmp/junk.txt',
+             'http:///tmp/junk.txt',
+             ('http', '', '/tmp/junk.txt', '', '', ''),
+             ('http', '', '/tmp/junk.txt', '', '')),
+            ('https:/tmp/junk.txt',
+             'https:///tmp/junk.txt',
+             ('https', '', '/tmp/junk.txt', '', '', ''),
+             ('https', '', '/tmp/junk.txt', '', '')),
+        ]
+        def _encode(t):
+            return (t[0].encode('ascii'),
+                    t[1].encode('ascii'),
+                    tuple(x.encode('ascii') for x in t[2]),
+                    tuple(x.encode('ascii') for x in t[3]))
+        bytes_cases = [_encode(x) for x in str_cases]
+        for url, url2, parsed, split in str_cases + bytes_cases:
+            self.checkRoundtrips(url, parsed, split, url2)
+
      def test_http_roundtrips(self):
          # urllib.parse.urlsplit treats 'http:' as an optimized special case,
          # so we test both 'http:' and 'https:' in all the following.
diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py

index 10c302d34c199cba34dd0d20601ff1c4f16401e4..abf1d1b546682ad6e9bfdc254764b6358b72d0c4 100644 (file)
--- a/Lib/urllib/parse.py
+++ b/Lib/urllib/parse.py
@@ -525,7 +525,7 @@ def urlunsplit(components):
      empty query; the RFC states that these are equivalent)."""
      scheme, netloc, url, query, fragment, _coerce_result = (
                                            _coerce_args(*components))
-    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
+    if netloc or (scheme and scheme in uses_netloc) or url[:2] == '//':
          if url and url[:1] != '/': url = '/' + url
          url = '//' + (netloc or '') + url
      if scheme:
diff --git a/Misc/NEWS.d/next/Library/2019-08-27-01-16-50.gh-issue-67693.4NIAiy.rst b/Misc/NEWS.d/next/Library/2019-08-27-01-16-50.gh-issue-67693.4NIAiy.rst

new file mode 100644 (file)

index 0000000..22457df
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2019-08-27-01-16-50.gh-issue-67693.4NIAiy.rst
@@ -0,0 +1,2 @@
+Fix :func:`urllib.parse.urlunparse` and :func:`urllib.parse.urlunsplit` for URIs with path starting with multiple slashes and no authority.
+Based on patch by Ashwin Ramaswami.
author	Serhiy Storchaka <storchaka@gmail.com>
	Wed, 4 Sep 2024 15:42:58 +0000 (18:42 +0300)
committer	GitHub <noreply@github.com>
	Wed, 4 Sep 2024 15:42:58 +0000 (17:42 +0200)
Lib/test/test_urlparse.py		patch \| blob \| blame \| history
Lib/urllib/parse.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2019-08-27-01-16-50.gh-issue-67693.4NIAiy.rst	[new file with mode: 0644]	patch \| blob