]> git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
GH-102613: Improve performance of `pathlib.Path.rglob()` (GH-104244)
authorBarney Gale <barney.gale@gmail.com>
Sun, 7 May 2023 21:12:50 +0000 (22:12 +0100)
committerGitHub <noreply@github.com>
Sun, 7 May 2023 21:12:50 +0000 (22:12 +0100)
Stop de-duplicating results in `_RecursiveWildcardSelector`. A new
`_DoubleRecursiveWildcardSelector` class is introduced which performs
de-duplication, but this is used _only_ for patterns with multiple
non-adjacent `**` segments, such as `path.glob('**/foo/**')`. By avoiding
the use of a set, `PurePath.__hash__()` is not called, and so paths do not
need to be stringified and case-normalised.

Also merge adjacent '**' segments in patterns.

Lib/pathlib.py
Lib/test/test_pathlib.py
Misc/NEWS.d/next/Library/2023-05-06-20-37-46.gh-issue-102613.QZG9iX.rst [new file with mode: 0644]

index 68255aa3e511ecef2734d6907f99fb16e585053a..20ec1ce9d80374ddcd96c60fbeb655464317bda7 100644 (file)
@@ -64,17 +64,25 @@ def _is_case_sensitive(flavour):
 @functools.lru_cache()
 def _make_selector(pattern_parts, flavour, case_sensitive):
     pat = pattern_parts[0]
-    child_parts = pattern_parts[1:]
     if not pat:
         return _TerminatingSelector()
     if pat == '**':
-        cls = _RecursiveWildcardSelector
-    elif pat == '..':
-        cls = _ParentSelector
-    elif '**' in pat:
-        raise ValueError("Invalid pattern: '**' can only be an entire path component")
+        child_parts_idx = 1
+        while child_parts_idx < len(pattern_parts) and pattern_parts[child_parts_idx] == '**':
+            child_parts_idx += 1
+        child_parts = pattern_parts[child_parts_idx:]
+        if '**' in child_parts:
+            cls = _DoubleRecursiveWildcardSelector
+        else:
+            cls = _RecursiveWildcardSelector
     else:
-        cls = _WildcardSelector
+        child_parts = pattern_parts[1:]
+        if pat == '..':
+            cls = _ParentSelector
+        elif '**' in pat:
+            raise ValueError("Invalid pattern: '**' can only be an entire path component")
+        else:
+            cls = _WildcardSelector
     return cls(pat, child_parts, flavour, case_sensitive)
 
 
@@ -183,20 +191,32 @@ class _RecursiveWildcardSelector(_Selector):
 
     def _select_from(self, parent_path, scandir):
         try:
-            yielded = set()
-            try:
-                successor_select = self.successor._select_from
-                for starting_point in self._iterate_directories(parent_path, scandir):
-                    for p in successor_select(starting_point, scandir):
-                        if p not in yielded:
-                            yield p
-                            yielded.add(p)
-            finally:
-                yielded.clear()
+            successor_select = self.successor._select_from
+            for starting_point in self._iterate_directories(parent_path, scandir):
+                for p in successor_select(starting_point, scandir):
+                    yield p
         except PermissionError:
             return
 
 
+class _DoubleRecursiveWildcardSelector(_RecursiveWildcardSelector):
+    """
+    Like _RecursiveWildcardSelector, but also de-duplicates results from
+    successive selectors. This is necessary if the pattern contains
+    multiple non-adjacent '**' segments.
+    """
+
+    def _select_from(self, parent_path, scandir):
+        yielded = set()
+        try:
+            for p in super()._select_from(parent_path, scandir):
+                if p not in yielded:
+                    yield p
+                    yielded.add(p)
+        finally:
+            yielded.clear()
+
+
 #
 # Public API
 #
index e25c77f2ba8af61f2fc6ba6fbd9b25002715f660..ee0ef9a34c385c02f192ae532dfb897d734342ab 100644 (file)
@@ -1853,13 +1853,14 @@ class _BasePathTest(object):
 
     def test_rglob_common(self):
         def _check(glob, expected):
-            self.assertEqual(set(glob), { P(BASE, q) for q in expected })
+            self.assertEqual(sorted(glob), sorted(P(BASE, q) for q in expected))
         P = self.cls
         p = P(BASE)
         it = p.rglob("fileA")
         self.assertIsInstance(it, collections.abc.Iterator)
         _check(it, ["fileA"])
         _check(p.rglob("fileB"), ["dirB/fileB"])
+        _check(p.rglob("**/fileB"), ["dirB/fileB"])
         _check(p.rglob("*/fileA"), [])
         if not os_helper.can_symlink():
             _check(p.rglob("*/fileB"), ["dirB/fileB"])
@@ -1883,9 +1884,12 @@ class _BasePathTest(object):
         _check(p.rglob("*"), ["dirC/fileC", "dirC/novel.txt",
                               "dirC/dirD", "dirC/dirD/fileD"])
         _check(p.rglob("file*"), ["dirC/fileC", "dirC/dirD/fileD"])
+        _check(p.rglob("**/file*"), ["dirC/fileC", "dirC/dirD/fileD"])
+        _check(p.rglob("dir*/**"), ["dirC/dirD"])
         _check(p.rglob("*/*"), ["dirC/dirD/fileD"])
         _check(p.rglob("*/"), ["dirC/dirD"])
         _check(p.rglob(""), ["dirC", "dirC/dirD"])
+        _check(p.rglob("**"), ["dirC", "dirC/dirD"])
         # gh-91616, a re module regression
         _check(p.rglob("*.txt"), ["dirC/novel.txt"])
         _check(p.rglob("*.*"), ["dirC/novel.txt"])
diff --git a/Misc/NEWS.d/next/Library/2023-05-06-20-37-46.gh-issue-102613.QZG9iX.rst b/Misc/NEWS.d/next/Library/2023-05-06-20-37-46.gh-issue-102613.QZG9iX.rst
new file mode 100644 (file)
index 0000000..01f8b94
--- /dev/null
@@ -0,0 +1,3 @@
+Improve performance of :meth:`pathlib.Path.glob` when expanding recursive
+wildcards ("``**``") by merging adjacent wildcards and de-duplicating
+results only when necessary.