]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
GH-123599: `url2pathname()`: handle authority section in file URL (#126844)
authorBarney Gale <barney.gale@gmail.com>
Thu, 10 Apr 2025 19:58:04 +0000 (20:58 +0100)
committerGitHub <noreply@github.com>
Thu, 10 Apr 2025 19:58:04 +0000 (19:58 +0000)
In `urllib.request.url2pathname()`, if the authority resolves to the
current host, discard it. If an authority is present but resolves somewhere
else, then on Windows we return a UNC path (as before), and on other
platforms we raise `URLError`.

Affects `pathlib.Path.from_uri()` in the same way.

Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com>
Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com>
Doc/library/pathlib.rst
Doc/library/urllib.request.rst
Doc/whatsnew/3.14.rst
Lib/pathlib/__init__.py
Lib/test/test_pathlib/test_pathlib.py
Lib/test/test_urllib.py
Lib/urllib/request.py
Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst [new file with mode: 0644]

index 708a16e6bc8c94916aa89916a87802fa3f14fdd4..b82986902861b2ab43f6a51b3ebfac57625d4d69 100644 (file)
@@ -871,6 +871,12 @@ conforming to :rfc:`8089`.
 
    .. versionadded:: 3.13
 
+   .. versionchanged:: next
+      If a URL authority (e.g. a hostname) is present and resolves to a local
+      address, it is discarded. If an authority is present and *doesn't*
+      resolve to a local address, then on Windows a UNC path is returned (as
+      before), and on other platforms a :exc:`ValueError` is raised.
+
 
 .. method:: Path.as_uri()
 
index 8b54e10713e782a855a1e143e1245c7de497306c..edfc249eb43c78ef9a995cb33e9b741e56929141 100644 (file)
@@ -158,16 +158,16 @@ The :mod:`urllib.request` module defines the following functions:
       >>> 'file:' + pathname2url(path)
       'file:///C:/Program%20Files'
 
-   .. versionchanged:: 3.14
-      Paths beginning with a slash are converted to URLs with authority
-      sections. For example, the path ``/etc/hosts`` is converted to
-      the URL ``///etc/hosts``.
-
    .. versionchanged:: 3.14
       Windows drive letters are no longer converted to uppercase, and ``:``
       characters not following a drive letter no longer cause an
       :exc:`OSError` exception to be raised on Windows.
 
+   .. versionchanged:: 3.14
+      Paths beginning with a slash are converted to URLs with authority
+      sections. For example, the path ``/etc/hosts`` is converted to
+      the URL ``///etc/hosts``.
+
 
 .. function:: url2pathname(url)
 
@@ -186,6 +186,13 @@ The :mod:`urllib.request` module defines the following functions:
       characters not following a drive letter no longer cause an
       :exc:`OSError` exception to be raised on Windows.
 
+   .. versionchanged:: next
+      This function calls :func:`socket.gethostbyname` if the URL authority
+      isn't empty or ``localhost``. If the authority resolves to a local IP
+      address then it is discarded; otherwise, on Windows a UNC path is
+      returned (as before), and on other platforms a
+      :exc:`~urllib.error.URLError` is raised.
+
 
 .. function:: getproxies()
 
index 0f15a2a8a8f6afa286655195312d8b54478ec25c..f8cae78b909a0048dbd134e93b3d99d5abee6167 100644 (file)
@@ -1197,6 +1197,25 @@ urllib
   supporting SHA-256 digest authentication as specified in :rfc:`7616`.
   (Contributed by Calvin Bui in :gh:`128193`.)
 
+* Improve standards compliance when parsing and emitting ``file:`` URLs.
+
+  In :func:`urllib.request.url2pathname`:
+
+  - Discard URL authorities that resolve to a local IP address.
+  - Raise :exc:`~urllib.error.URLError` if a URL authority doesn't resolve
+    to ``localhost``, except on Windows where we return a UNC path.
+
+  In :func:`urllib.request.pathname2url`:
+
+  - Include an empty URL authority when a path begins with a slash. For
+    example, the path ``/etc/hosts`` is converted to the URL ``///etc/hosts``.
+
+  On Windows, drive letters are no longer converted to uppercase, and ``:``
+  characters not following a drive letter no longer cause an :exc:`OSError`
+  exception to be raised.
+
+  (Contributed by Barney Gale in :gh:`125866`.)
+
 
 uuid
 ----
index cd28f62ce3baf53a28171727184f657af8b92a4c..43a5440e0132ff5c9656ca233b00dae9c6dbc83b 100644 (file)
@@ -1278,8 +1278,12 @@ class Path(PurePath):
         """Return a new path from the given 'file' URI."""
         if not uri.startswith('file:'):
             raise ValueError(f"URI does not start with 'file:': {uri!r}")
+        from urllib.error import URLError
         from urllib.request import url2pathname
-        path = cls(url2pathname(uri.removeprefix('file:')))
+        try:
+            path = cls(url2pathname(uri.removeprefix('file:')))
+        except URLError as exc:
+            raise ValueError(exc.reason) from None
         if not path.is_absolute():
             raise ValueError(f"URI is not absolute: {uri!r}")
         return path
index b1fcc5f6f0538e895a9993e0e6a9e5670d9e1490..00ec17e21e235fd903d2ffbf61ea8104e5db0d86 100644 (file)
@@ -3285,10 +3285,14 @@ class PathTest(PurePathTest):
     def test_from_uri_posix(self):
         P = self.cls
         self.assertEqual(P.from_uri('file:/foo/bar'), P('/foo/bar'))
-        self.assertEqual(P.from_uri('file://foo/bar'), P('//foo/bar'))
+        self.assertRaises(ValueError, P.from_uri, 'file://foo/bar')
         self.assertEqual(P.from_uri('file:///foo/bar'), P('/foo/bar'))
         self.assertEqual(P.from_uri('file:////foo/bar'), P('//foo/bar'))
         self.assertEqual(P.from_uri('file://localhost/foo/bar'), P('/foo/bar'))
+        if not is_wasi:
+            self.assertEqual(P.from_uri('file://127.0.0.1/foo/bar'), P('/foo/bar'))
+            self.assertEqual(P.from_uri(f'file://{socket.gethostname()}/foo/bar'),
+                             P('/foo/bar'))
         self.assertRaises(ValueError, P.from_uri, 'foo/bar')
         self.assertRaises(ValueError, P.from_uri, '/foo/bar')
         self.assertRaises(ValueError, P.from_uri, '//foo/bar')
index ed23215c4d0ab7e9089e55dc3eec4bb68c5e7f67..ecf429e17811a4a8b4abc083be82aa285b2f7188 100644 (file)
@@ -11,6 +11,7 @@ from test import support
 from test.support import os_helper
 from test.support import socket_helper
 import os
+import socket
 try:
     import ssl
 except ImportError:
@@ -1424,6 +1425,17 @@ class Pathname_Tests(unittest.TestCase):
                          "url2pathname() failed; %s != %s" %
                          (expect, result))
 
+    def test_pathname2url(self):
+        # Test cases common to Windows and POSIX.
+        fn = urllib.request.pathname2url
+        sep = os.path.sep
+        self.assertEqual(fn(''), '')
+        self.assertEqual(fn(sep), '///')
+        self.assertEqual(fn('a'), 'a')
+        self.assertEqual(fn(f'a{sep}b.c'), 'a/b.c')
+        self.assertEqual(fn(f'{sep}a{sep}b.c'), '///a/b.c')
+        self.assertEqual(fn(f'{sep}a{sep}b%#c'), '///a/b%25%23c')
+
     @unittest.skipUnless(sys.platform == 'win32',
                          'test specific to Windows pathnames.')
     def test_pathname2url_win(self):
@@ -1466,12 +1478,9 @@ class Pathname_Tests(unittest.TestCase):
                      'test specific to POSIX pathnames')
     def test_pathname2url_posix(self):
         fn = urllib.request.pathname2url
-        self.assertEqual(fn('/'), '///')
-        self.assertEqual(fn('/a/b.c'), '///a/b.c')
         self.assertEqual(fn('//a/b.c'), '////a/b.c')
         self.assertEqual(fn('///a/b.c'), '/////a/b.c')
         self.assertEqual(fn('////a/b.c'), '//////a/b.c')
-        self.assertEqual(fn('/a/b%#c'), '///a/b%25%23c')
 
     @unittest.skipUnless(os_helper.FS_NONASCII, 'need os_helper.FS_NONASCII')
     def test_pathname2url_nonascii(self):
@@ -1480,11 +1489,25 @@ class Pathname_Tests(unittest.TestCase):
         url = urllib.parse.quote(os_helper.FS_NONASCII, encoding=encoding, errors=errors)
         self.assertEqual(urllib.request.pathname2url(os_helper.FS_NONASCII), url)
 
+    def test_url2pathname(self):
+        # Test cases common to Windows and POSIX.
+        fn = urllib.request.url2pathname
+        sep = os.path.sep
+        self.assertEqual(fn(''), '')
+        self.assertEqual(fn('/'), f'{sep}')
+        self.assertEqual(fn('///'), f'{sep}')
+        self.assertEqual(fn('////'), f'{sep}{sep}')
+        self.assertEqual(fn('foo'), 'foo')
+        self.assertEqual(fn('foo/bar'), f'foo{sep}bar')
+        self.assertEqual(fn('/foo/bar'), f'{sep}foo{sep}bar')
+        self.assertEqual(fn('//localhost/foo/bar'), f'{sep}foo{sep}bar')
+        self.assertEqual(fn('///foo/bar'), f'{sep}foo{sep}bar')
+        self.assertEqual(fn('////foo/bar'), f'{sep}{sep}foo{sep}bar')
+
     @unittest.skipUnless(sys.platform == 'win32',
                          'test specific to Windows pathnames.')
     def test_url2pathname_win(self):
         fn = urllib.request.url2pathname
-        self.assertEqual(fn('/'), '\\')
         self.assertEqual(fn('/C:/'), 'C:\\')
         self.assertEqual(fn("///C|"), 'C:')
         self.assertEqual(fn("///C:"), 'C:')
@@ -1530,11 +1553,13 @@ class Pathname_Tests(unittest.TestCase):
                      'test specific to POSIX pathnames')
     def test_url2pathname_posix(self):
         fn = urllib.request.url2pathname
-        self.assertEqual(fn('/foo/bar'), '/foo/bar')
-        self.assertEqual(fn('//foo/bar'), '//foo/bar')
-        self.assertEqual(fn('///foo/bar'), '/foo/bar')
-        self.assertEqual(fn('////foo/bar'), '//foo/bar')
-        self.assertEqual(fn('//localhost/foo/bar'), '/foo/bar')
+        self.assertRaises(urllib.error.URLError, fn, '//foo/bar')
+        self.assertRaises(urllib.error.URLError, fn, '//localhost:/foo/bar')
+        self.assertRaises(urllib.error.URLError, fn, '//:80/foo/bar')
+        self.assertRaises(urllib.error.URLError, fn, '//:/foo/bar')
+        self.assertRaises(urllib.error.URLError, fn, '//c:80/foo/bar')
+        self.assertEqual(fn('//127.0.0.1/foo/bar'), '/foo/bar')
+        self.assertEqual(fn(f'//{socket.gethostname()}/foo/bar'), '/foo/bar')
 
     @unittest.skipUnless(os_helper.FS_NONASCII, 'need os_helper.FS_NONASCII')
     def test_url2pathname_nonascii(self):
index f22dc56af2f428158289054f4a7a623b090615d2..84c075ec8b359f4131119e2e43ff9e53b7bc086c 100644 (file)
@@ -1450,16 +1450,6 @@ def parse_http_list(s):
     return [part.strip() for part in res]
 
 class FileHandler(BaseHandler):
-    # Use local file or FTP depending on form of URL
-    def file_open(self, req):
-        url = req.selector
-        if url[:2] == '//' and url[2:3] != '/' and (req.host and
-                req.host != 'localhost'):
-            if not req.host in self.get_names():
-                raise URLError("file:// scheme is supported only on localhost")
-        else:
-            return self.open_local_file(req)
-
     # names for the localhost
     names = None
     def get_names(self):
@@ -1476,8 +1466,7 @@ class FileHandler(BaseHandler):
     def open_local_file(self, req):
         import email.utils
         import mimetypes
-        host = req.host
-        filename = req.selector
+        filename = _splittype(req.full_url)[1]
         localfile = url2pathname(filename)
         try:
             stats = os.stat(localfile)
@@ -1487,21 +1476,21 @@ class FileHandler(BaseHandler):
             headers = email.message_from_string(
                 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' %
                 (mtype or 'text/plain', size, modified))
-            if host:
-                host, port = _splitport(host)
-            if not host or \
-                (not port and _safe_gethostbyname(host) in self.get_names()):
-                origurl = 'file:' + pathname2url(localfile)
-                return addinfourl(open(localfile, 'rb'), headers, origurl)
+            origurl = f'file:{pathname2url(localfile)}'
+            return addinfourl(open(localfile, 'rb'), headers, origurl)
         except OSError as exp:
             raise URLError(exp, exp.filename)
-        raise URLError('file not on local host')
 
-def _safe_gethostbyname(host):
+    file_open = open_local_file
+
+def _is_local_authority(authority):
+    if not authority or authority == 'localhost':
+        return True
     try:
-        return socket.gethostbyname(host)
-    except socket.gaierror:
-        return None
+        address = socket.gethostbyname(authority)
+    except (socket.gaierror, AttributeError):
+        return False
+    return address in FileHandler().get_names()
 
 class FTPHandler(BaseHandler):
     def ftp_open(self, req):
@@ -1649,16 +1638,13 @@ class DataHandler(BaseHandler):
 def url2pathname(url):
     """OS-specific conversion from a relative URL of the 'file' scheme
     to a file system path; not recommended for general use."""
-    if url[:3] == '///':
-        # Empty authority section, so the path begins on the third character.
-        url = url[2:]
-    elif url[:12] == '//localhost/':
-        # Skip past 'localhost' authority.
-        url = url[11:]
-
+    authority, url = _splithost(url)
     if os.name == 'nt':
-        if url[:3] == '///':
-            # Skip past extra slash before UNC drive in URL path.
+        if not _is_local_authority(authority):
+            # e.g. file://server/share/file.txt
+            url = '//' + authority + url
+        elif url[:3] == '///':
+            # e.g. file://///server/share/file.txt
             url = url[1:]
         else:
             if url[:1] == '/' and url[2:3] in (':', '|'):
@@ -1668,6 +1654,8 @@ def url2pathname(url):
                 # Older URLs use a pipe after a drive letter
                 url = url[:1] + ':' + url[2:]
         url = url.replace('/', '\\')
+    elif not _is_local_authority(authority):
+        raise URLError("file:// scheme is supported only on localhost")
     encoding = sys.getfilesystemencoding()
     errors = sys.getfilesystemencodeerrors()
     return unquote(url, encoding=encoding, errors=errors)
diff --git a/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst b/Misc/NEWS.d/next/Library/2024-11-14-21-17-48.gh-issue-126838.Yr5vKF.rst
new file mode 100644 (file)
index 0000000..857cc35
--- /dev/null
@@ -0,0 +1,5 @@
+Fix issue where :func:`urllib.request.url2pathname` mishandled file URLs with
+authorities. If an authority is present and resolves to ``localhost``, it is
+now discarded. If an authority is present but *doesn't* resolve to
+``localhost``, then on Windows a UNC path is returned (as before), and on
+other platforms a :exc:`urllib.error.URLError` is now raised.