git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
GH-79634: Accept path-like objects as pathlib glob patterns. (#114017)
author: Barney Gale <barney.gale@gmail.com>
Sat, 20 Jan 2024 02:10:25 +0000 (02:10 +0000)
committer: GitHub <noreply@github.com>
Sat, 20 Jan 2024 02:10:25 +0000 (02:10 +0000)
Allow `os.PathLike` objects to be passed as patterns to `pathlib.Path.glob()` and `rglob()`. (It's already possible to use them in `PurePath.match()`.)

While we're in the area:

- Allow empty glob patterns in `PathBase` (but not `Path`).
- Speed up globbing in `PathBase` by generating paths with trailing slashes only as a final step, rather than for every intermediate directory.
- Simplify and speed up handling of rare patterns involving both `**` and `..` segments.

Doc/library/pathlib.rst
Lib/pathlib/__init__.py
Lib/pathlib/_abc.py
Lib/test/test_pathlib/test_pathlib.py
Lib/test/test_pathlib/test_pathlib_abc.py
Misc/NEWS.d/next/Library/2024-01-12-17-32-36.gh-issue-79634.uTSTRI.rst [new file with mode: 0644]

index be207ca222274e81840cc8241526f8b5ac25f6ca..b924f470e0be0448f006c55fe416ed89262e2bed 100644 (file)
@@ -1036,6 +1036,9 @@ call fails (for example because the path doesn't exist).
       future Python release, patterns with this ending will match both files
       and directories. Add a trailing slash to match only directories.
 
+   .. versionchanged:: 3.13
+      The *pattern* parameter accepts a :term:`path-like object`.
+
 .. method:: Path.group(*, follow_symlinks=True)
 
    Return the name of the group owning the file. :exc:`KeyError` is raised
@@ -1498,6 +1501,9 @@ call fails (for example because the path doesn't exist).
    .. versionchanged:: 3.13
       The *follow_symlinks* parameter was added.
 
+   .. versionchanged:: 3.13
+      The *pattern* parameter accepts a :term:`path-like object`.
+
 .. method:: Path.rmdir()
 
    Remove this directory.  The directory must be empty.
index f14d35bb0038d011732f302a03a58bcdb840be85..b043aed12b3849e92bdeb415552b84c9cc0b6e6f 100644 (file)
@@ -467,6 +467,29 @@ class PurePath(_abc.PurePathBase):
         from urllib.parse import quote_from_bytes
         return prefix + quote_from_bytes(os.fsencode(path))
 
+    @property
+    def _pattern_stack(self):
+        """Stack of path components, to be used with patterns in glob()."""
+        parts = self._tail.copy()
+        pattern = self._raw_path
+        if self.anchor:
+            raise NotImplementedError("Non-relative patterns are unsupported")
+        elif not parts:
+            raise ValueError("Unacceptable pattern: {!r}".format(pattern))
+        elif pattern[-1] in (self.pathmod.sep, self.pathmod.altsep):
+            # GH-65238: pathlib doesn't preserve trailing slash. Add it back.
+            parts.append('')
+        elif parts[-1] == '**':
+            # GH-70303: '**' only matches directories. Add trailing slash.
+            warnings.warn(
+                "Pattern ending '**' will match files and directories in a "
+                "future Python release. Add a trailing slash to match only "
+                "directories and remove this warning.",
+                FutureWarning, 4)
+            parts.append('')
+        parts.reverse()
+        return parts
+
 
 # Subclassing os.PathLike makes isinstance() checks slower,
 # which in turn makes Path construction slower. Register instead!
@@ -580,7 +603,7 @@ class Path(_abc.PathBase, PurePath):
     def _scandir(self):
         return os.scandir(self)
 
-    def _make_child_entry(self, entry, is_dir=False):
+    def _make_child_entry(self, entry):
         # Transform an entry yielded from _scandir() into a path object.
         path_str = entry.name if str(self) == '.' else entry.path
         path = self.with_segments(path_str)
@@ -591,6 +614,8 @@ class Path(_abc.PathBase, PurePath):
         return path
 
     def _make_child_relpath(self, name):
+        if not name:
+            return self
         path_str = str(self)
         tail = self._tail
         if tail:
@@ -611,14 +636,8 @@ class Path(_abc.PathBase, PurePath):
         kind, including directories) matching the given relative pattern.
         """
         sys.audit("pathlib.Path.glob", self, pattern)
-        if pattern.endswith('**'):
-            # GH-70303: '**' only matches directories. Add trailing slash.
-            warnings.warn(
-                "Pattern ending '**' will match files and directories in a "
-                "future Python release. Add a trailing slash to match only "
-                "directories and remove this warning.",
-                FutureWarning, 2)
-            pattern = f'{pattern}/'
+        if not isinstance(pattern, PurePath):
+            pattern = self.with_segments(pattern)
         return _abc.PathBase.glob(
             self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
 
@@ -628,15 +647,9 @@ class Path(_abc.PathBase, PurePath):
         this subtree.
         """
         sys.audit("pathlib.Path.rglob", self, pattern)
-        if pattern.endswith('**'):
-            # GH-70303: '**' only matches directories. Add trailing slash.
-            warnings.warn(
-                "Pattern ending '**' will match files and directories in a "
-                "future Python release. Add a trailing slash to match only "
-                "directories and remove this warning.",
-                FutureWarning, 2)
-            pattern = f'{pattern}/'
-        pattern = f'**/{pattern}'
+        if not isinstance(pattern, PurePath):
+            pattern = self.with_segments(pattern)
+        pattern = '**' / pattern
         return _abc.PathBase.glob(
             self, pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
 
index 48a6c218309385afd5771000bb3e61e624758012..e5eeb4afce2ea9e70e8107b0b82335672ff4ae5c 100644 (file)
@@ -63,6 +63,12 @@ def _compile_pattern(pat, sep, case_sensitive):
     return re.compile(regex, flags=flags).match
 
 
+def _select_special(paths, part):
+    """Yield special literal children of the given paths."""
+    for path in paths:
+        yield path._make_child_relpath(part)
+
+
 def _select_children(parent_paths, dir_only, follow_symlinks, match):
     """Yield direct children of given paths, filtering by name and type."""
     if follow_symlinks is None:
@@ -84,7 +90,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
                     except OSError:
                         continue
                 if match(entry.name):
-                    yield parent_path._make_child_entry(entry, dir_only)
+                    yield parent_path._make_child_entry(entry)
 
 
 def _select_recursive(parent_paths, dir_only, follow_symlinks):
@@ -107,7 +113,7 @@ def _select_recursive(parent_paths, dir_only, follow_symlinks):
                 for entry in entries:
                     try:
                         if entry.is_dir(follow_symlinks=follow_symlinks):
-                            paths.append(path._make_child_entry(entry, dir_only))
+                            paths.append(path._make_child_entry(entry))
                             continue
                     except OSError:
                         pass
@@ -427,6 +433,14 @@ class PurePathBase:
         a drive)."""
         return self.pathmod.isabs(self._raw_path)
 
+    @property
+    def _pattern_stack(self):
+        """Stack of path components, to be used with patterns in glob()."""
+        anchor, parts = self._stack
+        if anchor:
+            raise NotImplementedError("Non-relative patterns are unsupported")
+        return parts
+
     def match(self, path_pattern, *, case_sensitive=None):
         """
         Return True if this path matches the given pattern.
@@ -436,11 +450,10 @@ class PurePathBase:
         if case_sensitive is None:
             case_sensitive = _is_case_sensitive(self.pathmod)
         sep = path_pattern.pathmod.sep
-        pattern_str = str(path_pattern)
         if path_pattern.anchor:
-            pass
+            pattern_str = str(path_pattern)
         elif path_pattern.parts:
-            pattern_str = f'**{sep}{pattern_str}'
+            pattern_str = str('**' / path_pattern)
         else:
             raise ValueError("empty pattern")
         match = _compile_pattern(pattern_str, sep, case_sensitive)
@@ -714,10 +727,8 @@ class PathBase(PurePathBase):
         from contextlib import nullcontext
         return nullcontext(self.iterdir())
 
-    def _make_child_entry(self, entry, is_dir=False):
+    def _make_child_entry(self, entry):
         # Transform an entry yielded from _scandir() into a path object.
-        if is_dir:
-            return entry.joinpath('')
         return entry
 
     def _make_child_relpath(self, name):
@@ -727,57 +738,35 @@ class PathBase(PurePathBase):
         """Iterate over this subtree and yield all existing files (of any
         kind, including directories) matching the given relative pattern.
         """
-        path_pattern = self.with_segments(pattern)
-        if path_pattern.anchor:
-            raise NotImplementedError("Non-relative patterns are unsupported")
-        elif not path_pattern.parts:
-            raise ValueError("Unacceptable pattern: {!r}".format(pattern))
-
-        pattern_parts = list(path_pattern.parts)
-        if not self.pathmod.split(pattern)[1]:
-            # GH-65238: pathlib doesn't preserve trailing slash. Add it back.
-            pattern_parts.append('')
-
+        if not isinstance(pattern, PurePathBase):
+            pattern = self.with_segments(pattern)
         if case_sensitive is None:
             # TODO: evaluate case-sensitivity of each directory in _select_children().
             case_sensitive = _is_case_sensitive(self.pathmod)
 
-        # If symlinks are handled consistently, and the pattern does not
-        # contain '..' components, then we can use a 'walk-and-match' strategy
-        # when expanding '**' wildcards. When a '**' wildcard is encountered,
-        # all following pattern parts are immediately consumed and used to
-        # build a `re.Pattern` object. This pattern is used to filter the
-        # recursive walk. As a result, pattern parts following a '**' wildcard
-        # do not perform any filesystem access, which can be much faster!
-        filter_paths = follow_symlinks is not None and '..' not in pattern_parts
+        stack = pattern._pattern_stack
+        specials = ('', '.', '..')
+        filter_paths = False
         deduplicate_paths = False
         sep = self.pathmod.sep
         paths = iter([self.joinpath('')] if self.is_dir() else [])
-        part_idx = 0
-        while part_idx < len(pattern_parts):
-            part = pattern_parts[part_idx]
-            part_idx += 1
-            if part == '':
-                # Trailing slash.
-                pass
-            elif part == '..':
-                paths = (path._make_child_relpath('..') for path in paths)
+        while stack:
+            part = stack.pop()
+            if part in specials:
+                paths = _select_special(paths, part)
             elif part == '**':
                 # Consume adjacent '**' components.
-                while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**':
-                    part_idx += 1
-
-                if filter_paths and part_idx < len(pattern_parts) and pattern_parts[part_idx] != '':
-                    dir_only = pattern_parts[-1] == ''
-                    paths = _select_recursive(paths, dir_only, follow_symlinks)
+                while stack and stack[-1] == '**':
+                    stack.pop()
 
-                    # Filter out paths that don't match pattern.
-                    prefix_len = len(str(self._make_child_relpath('_'))) - 1
-                    match = _compile_pattern(str(path_pattern), sep, case_sensitive)
-                    paths = (path for path in paths if match(str(path), prefix_len))
-                    return paths
+                # Consume adjacent non-special components and enable post-walk
+                # regex filtering, provided we're treating symlinks consistently.
+                if follow_symlinks is not None:
+                    while stack and stack[-1] not in specials:
+                        filter_paths = True
+                        stack.pop()
 
-                dir_only = part_idx < len(pattern_parts)
+                dir_only = bool(stack)
                 paths = _select_recursive(paths, dir_only, follow_symlinks)
                 if deduplicate_paths:
                     # De-duplicate if we've already seen a '**' component.
@@ -786,9 +775,14 @@ class PathBase(PurePathBase):
             elif '**' in part:
                 raise ValueError("Invalid pattern: '**' can only be an entire path component")
             else:
-                dir_only = part_idx < len(pattern_parts)
+                dir_only = bool(stack)
                 match = _compile_pattern(part, sep, case_sensitive)
                 paths = _select_children(paths, dir_only, follow_symlinks, match)
+        if filter_paths:
+            # Filter out paths that don't match pattern.
+            prefix_len = len(str(self._make_child_relpath('_'))) - 1
+            match = _compile_pattern(str(pattern), sep, case_sensitive)
+            paths = (path for path in paths if match(str(path), prefix_len))
         return paths
 
     def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
@@ -796,8 +790,10 @@ class PathBase(PurePathBase):
         directories) matching the given relative pattern, anywhere in
         this subtree.
         """
-        return self.glob(
-            f'**/{pattern}', case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
+        if not isinstance(pattern, PurePathBase):
+            pattern = self.with_segments(pattern)
+        pattern = '**' / pattern
+        return self.glob(pattern, case_sensitive=case_sensitive, follow_symlinks=follow_symlinks)
 
     def walk(self, top_down=True, on_error=None, follow_symlinks=False):
         """Walk the directory tree from this directory, similar to os.walk()."""
index 61d7939ad140b2c4db817cea5f91677d3318064a..bdbe92369639ef8c771e360f196c9dc42a75a9a1 100644 (file)
@@ -1818,6 +1818,13 @@ class PathTest(test_pathlib_abc.DummyPathTest, PurePathTest):
             list(base.walk())
             list(base.walk(top_down=False))
 
+    def test_glob_empty_pattern(self):
+        p = self.cls('')
+        with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
+            list(p.glob(''))
+        with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
+            list(p.glob('.'))
+
     def test_glob_many_open_files(self):
         depth = 30
         P = self.cls
@@ -1860,6 +1867,22 @@ class PathTest(test_pathlib_abc.DummyPathTest, PurePathTest):
         with self.assertWarns(FutureWarning):
             p.rglob('*/**')
 
+    def test_glob_pathlike(self):
+        P = self.cls
+        p = P(self.base)
+        pattern = "dir*/file*"
+        expect = {p / "dirB/fileB", p / "dirC/fileC"}
+        self.assertEqual(expect, set(p.glob(P(pattern))))
+        self.assertEqual(expect, set(p.glob(FakePath(pattern))))
+
+    def test_rglob_pathlike(self):
+        P = self.cls
+        p = P(self.base, "dirC")
+        pattern = "**/file*"
+        expect = {p / "fileC", p / "dirD/fileD"}
+        self.assertEqual(expect, set(p.rglob(P(pattern))))
+        self.assertEqual(expect, set(p.rglob(FakePath(pattern))))
+
 
 @only_posix
 class PosixPathTest(PathTest, PurePosixPathTest):
index f877c98b7678f49bb62941b66ab41a11447e44ec..199718a8a69c5ad6bb55fe52e904e1a0b3ad1efa 100644 (file)
@@ -1045,9 +1045,12 @@ class DummyPathTest(DummyPurePathTest):
             _check(p.glob("*/"), ["dirA/", "dirB/", "dirC/", "dirE/", "linkB/"])
 
     def test_glob_empty_pattern(self):
-        p = self.cls('')
-        with self.assertRaisesRegex(ValueError, 'Unacceptable pattern'):
-            list(p.glob(''))
+        def _check(glob, expected):
+            self.assertEqual(set(glob), { P(self.base, q) for q in expected })
+        P = self.cls
+        p = P(self.base)
+        _check(p.glob(""), [""])
+        _check(p.glob("."), ["."])
 
     def test_glob_case_sensitive(self):
         P = self.cls
diff --git a/Misc/NEWS.d/next/Library/2024-01-12-17-32-36.gh-issue-79634.uTSTRI.rst b/Misc/NEWS.d/next/Library/2024-01-12-17-32-36.gh-issue-79634.uTSTRI.rst
new file mode 100644 (file)
index 0000000..ba19b52
--- /dev/null
@@ -0,0 +1,2 @@
+Accept :term:`path-like objects <path-like object>` as patterns in
+:meth:`pathlib.Path.glob` and :meth:`~pathlib.Path.rglob`.