GH-102613: Fast recursive globbing in `pathlib.Path.glob()` (GH-104512)

author Barney Gale <barney.gale@gmail.com>

Tue, 6 Jun 2023 22:50:36 +0000 (23:50 +0100)

committer GitHub <noreply@github.com>

Tue, 6 Jun 2023 22:50:36 +0000 (23:50 +0100)
author Barney Gale <barney.gale@gmail.com>
Tue, 6 Jun 2023 22:50:36 +0000 (23:50 +0100)
committer GitHub <noreply@github.com>
Tue, 6 Jun 2023 22:50:36 +0000 (23:50 +0100)
diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst

index 67ef36890d5739e7f92a0949b75df6d5fdd8a6c8..ad801d5d7cdc4b904b23ca951ab523d282b55d48 100644 (file)
--- a/Doc/library/pathlib.rst
+++ b/Doc/library/pathlib.rst
@@ -917,6 +917,14 @@ call fails (for example because the path doesn't exist).
         PosixPath('setup.py'),
         PosixPath('test_pathlib.py')]
  
+   .. note::
+      Using the "``**``" pattern in large directory trees may consume
+      an inordinate amount of time.
+
+   .. tip::
+      Set *follow_symlinks* to ``True`` or ``False`` to improve performance
+      of recursive globbing.
+
     By default, or when the *case_sensitive* keyword-only argument is set to
     ``None``, this method matches paths using platform-specific casing rules:
     typically, case-sensitive on POSIX, and case-insensitive on Windows.
@@ -927,10 +935,6 @@ call fails (for example because the path doesn't exist).
     wildcards. Set *follow_symlinks* to ``True`` to always follow symlinks, or
     ``False`` to treat all symlinks as files.
  
-   .. note::
-      Using the "``**``" pattern in large directory trees may consume
-      an inordinate amount of time.
-
     .. audit-event:: pathlib.Path.glob self,pattern pathlib.Path.glob
  
     .. versionchanged:: 3.11
diff --git a/Lib/pathlib.py b/Lib/pathlib.py

index 62406473b66e4fdf5b59837e1b43cf43116a0860..89c7b1ebdf727f6834d13580b6261f42cefa355f 100644 (file)
--- a/Lib/pathlib.py
+++ b/Lib/pathlib.py
@@ -78,33 +78,12 @@ _SWAP_SEP_AND_NEWLINE = {
  }
  
  
-@functools.lru_cache()
-def _make_selector(pattern_parts, flavour, case_sensitive):
-    pat = pattern_parts[0]
-    if not pat:
-        return _TerminatingSelector()
-    if pat == '**':
-        child_parts_idx = 1
-        while child_parts_idx < len(pattern_parts) and pattern_parts[child_parts_idx] == '**':
-            child_parts_idx += 1
-        child_parts = pattern_parts[child_parts_idx:]
-        if '**' in child_parts:
-            cls = _DoubleRecursiveWildcardSelector
-        else:
-            cls = _RecursiveWildcardSelector
-    else:
-        child_parts = pattern_parts[1:]
-        if pat == '..':
-            cls = _ParentSelector
-        elif '**' in pat:
-            raise ValueError("Invalid pattern: '**' can only be an entire path component")
-        else:
-            cls = _WildcardSelector
-    return cls(pat, child_parts, flavour, case_sensitive)
-
-
  @functools.lru_cache(maxsize=256)
  def _compile_pattern(pat, case_sensitive):
+    """Compile given glob pattern to a re.Pattern object (observing case
+    sensitivity), or None if the pattern should match everything."""
+    if pat == '*':
+        return None
      flags = re.NOFLAG if case_sensitive else re.IGNORECASE
      return re.compile(fnmatch.translate(pat), flags).match
  
@@ -127,7 +106,11 @@ def _compile_pattern_lines(pattern_lines, case_sensitive):
      # Match the start of the path, or just after a path separator
      parts = ['^']
      for part in pattern_lines.splitlines(keepends=True):
-        if part == '**\n':
+        if part == '*\n':
+            part = r'.+\n'
+        elif part == '*':
+            part = r'.+'
+        elif part == '**\n':
              # '**/' component: we use '[\s\S]' rather than '.' so that path
              # separators (i.e. newlines) are matched. The trailing '^' ensures
              # we terminate after a path separator (i.e. on a new line).
@@ -154,114 +137,70 @@ def _compile_pattern_lines(pattern_lines, case_sensitive):
      return re.compile(''.join(parts), flags=flags)
  
  
-class _Selector:
-    """A selector matches a specific glob pattern part against the children
-    of a given path."""
-
-    def __init__(self, child_parts, flavour, case_sensitive):
-        self.child_parts = child_parts
-        if child_parts:
-            self.successor = _make_selector(child_parts, flavour, case_sensitive)
-            self.dironly = True
-        else:
-            self.successor = _TerminatingSelector()
-            self.dironly = False
-
-    def select_from(self, parent_path, follow_symlinks):
-        """Iterate over all child paths of `parent_path` matched by this
-        selector.  This can contain parent_path itself."""
-        path_cls = type(parent_path)
-        scandir = path_cls._scandir
-        if not parent_path.is_dir():
-            return iter([])
-        return self._select_from(parent_path, scandir, follow_symlinks)
-
-
-class _TerminatingSelector:
-
-    def _select_from(self, parent_path, scandir, follow_symlinks):
-        yield parent_path
-
-
-class _ParentSelector(_Selector):
-
-    def __init__(self, name, child_parts, flavour, case_sensitive):
-        _Selector.__init__(self, child_parts, flavour, case_sensitive)
-
-    def _select_from(self,  parent_path, scandir, follow_symlinks):
-        path = parent_path._make_child_relpath('..')
-        for p in self.successor._select_from(path, scandir, follow_symlinks):
-            yield p
-
-
-class _WildcardSelector(_Selector):
-
-    def __init__(self, pat, child_parts, flavour, case_sensitive):
-        _Selector.__init__(self, child_parts, flavour, case_sensitive)
-        if case_sensitive is None:
-            # TODO: evaluate case-sensitivity of each directory in _select_from()
-            case_sensitive = _is_case_sensitive(flavour)
-        self.match = _compile_pattern(pat, case_sensitive)
-
-    def _select_from(self, parent_path, scandir, follow_symlinks):
-        follow_dirlinks = True if follow_symlinks is None else follow_symlinks
+def _select_children(parent_paths, dir_only, follow_symlinks, match):
+    """Yield direct children of given paths, filtering by name and type."""
+    if follow_symlinks is None:
+        follow_symlinks = True
+    for parent_path in parent_paths:
          try:
              # We must close the scandir() object before proceeding to
              # avoid exhausting file descriptors when globbing deep trees.
-            with scandir(parent_path) as scandir_it:
+            with parent_path._scandir() as scandir_it:
                  entries = list(scandir_it)
          except OSError:
              pass
          else:
              for entry in entries:
-                if self.dironly:
+                if dir_only:
                      try:
-                        if not entry.is_dir(follow_symlinks=follow_dirlinks):
+                        if not entry.is_dir(follow_symlinks=follow_symlinks):
                              continue
                      except OSError:
                          continue
                  name = entry.name
-                if self.match(name):
-                    path = parent_path._make_child_relpath(name)
-                    for p in self.successor._select_from(path, scandir, follow_symlinks):
-                        yield p
-
+                if match is None or match(name):
+                    yield parent_path._make_child_relpath(name)
  
-class _RecursiveWildcardSelector(_Selector):
-
-    def __init__(self, pat, child_parts, flavour, case_sensitive):
-        _Selector.__init__(self, child_parts, flavour, case_sensitive)
-
-    def _iterate_directories(self, parent_path, follow_symlinks):
-        yield parent_path
-        for dirpath, dirnames, _ in parent_path.walk(follow_symlinks=follow_symlinks):
-            for dirname in dirnames:
-                yield dirpath._make_child_relpath(dirname)
-
-    def _select_from(self, parent_path, scandir, follow_symlinks):
-        follow_dirlinks = False if follow_symlinks is None else follow_symlinks
-        successor_select = self.successor._select_from
-        for starting_point in self._iterate_directories(parent_path, follow_dirlinks):
-            for p in successor_select(starting_point, scandir, follow_symlinks):
-                yield p
-
-
-class _DoubleRecursiveWildcardSelector(_RecursiveWildcardSelector):
-    """
-    Like _RecursiveWildcardSelector, but also de-duplicates results from
-    successive selectors. This is necessary if the pattern contains
-    multiple non-adjacent '**' segments.
-    """
  
-    def _select_from(self, parent_path, scandir, follow_symlinks):
-        yielded = set()
-        try:
-            for p in super()._select_from(parent_path, scandir, follow_symlinks):
-                if p not in yielded:
-                    yield p
-                    yielded.add(p)
-        finally:
-            yielded.clear()
+def _select_recursive(parent_paths, dir_only, follow_symlinks):
+    """Yield given paths and all their subdirectories, recursively."""
+    if follow_symlinks is None:
+        follow_symlinks = False
+    for parent_path in parent_paths:
+        paths = [parent_path]
+        while paths:
+            path = paths.pop()
+            yield path
+            try:
+                # We must close the scandir() object before proceeding to
+                # avoid exhausting file descriptors when globbing deep trees.
+                with path._scandir() as scandir_it:
+                    entries = list(scandir_it)
+            except OSError:
+                pass
+            else:
+                for entry in entries:
+                    try:
+                        if entry.is_dir(follow_symlinks=follow_symlinks):
+                            paths.append(path._make_child_relpath(entry.name))
+                            continue
+                    except OSError:
+                        pass
+                    if not dir_only:
+                        yield path._make_child_relpath(entry.name)
+
+
+def _select_unique(paths):
+    """Yields the given paths, filtering out duplicates."""
+    yielded = set()
+    try:
+        for path in paths:
+            raw_path = path._raw_path
+            if raw_path not in yielded:
+                yield path
+                yielded.add(raw_path)
+    finally:
+        yielded.clear()
  
  
  #
@@ -1056,19 +995,26 @@ class Path(PurePath):
          return os.scandir(self)
  
      def _make_child_relpath(self, name):
+        sep = self._flavour.sep
+        lines_name = name.replace('\n', sep)
+        lines_str = self._lines
          path_str = str(self)
          tail = self._tail
          if tail:
-            path_str = f'{path_str}{self._flavour.sep}{name}'
+            path_str = f'{path_str}{sep}{name}'
+            lines_str = f'{lines_str}\n{lines_name}'
          elif path_str != '.':
              path_str = f'{path_str}{name}'
+            lines_str = f'{lines_str}{lines_name}'
          else:
              path_str = name
+            lines_str = lines_name
          path = self.with_segments(path_str)
          path._str = path_str
          path._drv = self.drive
          path._root = self.root
          path._tail_cached = tail + [name]
+        path._lines_cached = lines_str
          return path
  
      def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
@@ -1076,16 +1022,7 @@ class Path(PurePath):
          kind, including directories) matching the given relative pattern.
          """
          sys.audit("pathlib.Path.glob", self, pattern)
-        if not pattern:
-            raise ValueError("Unacceptable pattern: {!r}".format(pattern))
-        drv, root, pattern_parts = self._parse_path(pattern)
-        if drv or root:
-            raise NotImplementedError("Non-relative patterns are unsupported")
-        if pattern[-1] in (self._flavour.sep, self._flavour.altsep):
-            pattern_parts.append('')
-        selector = _make_selector(tuple(pattern_parts), self._flavour, case_sensitive)
-        for p in selector.select_from(self, follow_symlinks):
-            yield p
+        return self._glob(pattern, case_sensitive, follow_symlinks)
  
      def rglob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
          """Recursively yield all existing files (of any kind, including
@@ -1093,14 +1030,74 @@ class Path(PurePath):
          this subtree.
          """
          sys.audit("pathlib.Path.rglob", self, pattern)
-        drv, root, pattern_parts = self._parse_path(pattern)
-        if drv or root:
+        return self._glob(f'**/{pattern}', case_sensitive, follow_symlinks)
+
+    def _glob(self, pattern, case_sensitive, follow_symlinks):
+        path_pattern = self.with_segments(pattern)
+        if path_pattern.drive or path_pattern.root:
              raise NotImplementedError("Non-relative patterns are unsupported")
-        if pattern and pattern[-1] in (self._flavour.sep, self._flavour.altsep):
+        elif not path_pattern._tail:
+            raise ValueError("Unacceptable pattern: {!r}".format(pattern))
+
+        pattern_parts = list(path_pattern._tail)
+        if pattern[-1] in (self._flavour.sep, self._flavour.altsep):
+            # GH-65238: pathlib doesn't preserve trailing slash. Add it back.
              pattern_parts.append('')
-        selector = _make_selector(("**",) + tuple(pattern_parts), self._flavour, case_sensitive)
-        for p in selector.select_from(self, follow_symlinks):
-            yield p
+        if pattern_parts[-1] == '**':
+            # GH-70303: '**' only matches directories. Add trailing slash.
+            pattern_parts.append('')
+
+        if case_sensitive is None:
+            # TODO: evaluate case-sensitivity of each directory in _select_children().
+            case_sensitive = _is_case_sensitive(self._flavour)
+
+        # If symlinks are handled consistently, and the pattern does not
+        # contain '..' components, then we can use a 'walk-and-match' strategy
+        # when expanding '**' wildcards. When a '**' wildcard is encountered,
+        # all following pattern parts are immediately consumed and used to
+        # build a `re.Pattern` object. This pattern is used to filter the
+        # recursive walk. As a result, pattern parts following a '**' wildcard
+        # do not perform any filesystem access, which can be much faster!
+        filter_paths = follow_symlinks is not None and '..' not in pattern_parts
+        deduplicate_paths = False
+        paths = iter([self] if self.is_dir() else [])
+        part_idx = 0
+        while part_idx < len(pattern_parts):
+            part = pattern_parts[part_idx]
+            part_idx += 1
+            if part == '':
+                # Trailing slash.
+                pass
+            elif part == '..':
+                paths = (path._make_child_relpath('..') for path in paths)
+            elif part == '**':
+                # Consume adjacent '**' components.
+                while part_idx < len(pattern_parts) and pattern_parts[part_idx] == '**':
+                    part_idx += 1
+
+                if filter_paths and part_idx < len(pattern_parts) and pattern_parts[part_idx] != '':
+                    dir_only = pattern_parts[-1] == ''
+                    paths = _select_recursive(paths, dir_only, follow_symlinks)
+
+                    # Filter out paths that don't match pattern.
+                    prefix_len = len(self._make_child_relpath('_')._lines) - 1
+                    match = _compile_pattern_lines(path_pattern._lines, case_sensitive).match
+                    paths = (path for path in paths if match(path._lines[prefix_len:]))
+                    return paths
+
+                dir_only = part_idx < len(pattern_parts)
+                paths = _select_recursive(paths, dir_only, follow_symlinks)
+                if deduplicate_paths:
+                    # De-duplicate if we've already seen a '**' component.
+                    paths = _select_unique(paths)
+                deduplicate_paths = True
+            elif '**' in part:
+                raise ValueError("Invalid pattern: '**' can only be an entire path component")
+            else:
+                dir_only = part_idx < len(pattern_parts)
+                match = _compile_pattern(part, case_sensitive)
+                paths = _select_children(paths, dir_only, follow_symlinks, match)
+        return paths
  
      def walk(self, top_down=True, on_error=None, follow_symlinks=False):
          """Walk the directory tree from this directory, similar to os.walk()."""
diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py

index da5197dbfacb39cb1164aba7fb2d0bacd8a6837a..1a008e5cea3f0027c1d8bc2f3a6382e36595951a 100644 (file)
--- a/Lib/test/test_pathlib.py
+++ b/Lib/test/test_pathlib.py
@@ -1898,6 +1898,16 @@ class _BasePathTest(object):
          _check(p, "*B/*", ["dirB/fileB", "dirB/linkD", "linkB/fileB", "linkB/linkD"])
          _check(p, "*/fileB", ["dirB/fileB", "linkB/fileB"])
          _check(p, "*/", ["dirA", "dirB", "dirC", "dirE", "linkB"])
+        _check(p, "dir*/*/..", ["dirC/dirD/..", "dirA/linkC/.."])
+        _check(p, "dir*/**/", ["dirA", "dirA/linkC", "dirA/linkC/linkD", "dirB", "dirB/linkD",
+                                 "dirC", "dirC/dirD", "dirE"])
+        _check(p, "dir*/**/..", ["dirA/..", "dirA/linkC/..", "dirB/..",
+                                 "dirC/..", "dirC/dirD/..", "dirE/.."])
+        _check(p, "dir*/*/**/", ["dirA/linkC", "dirA/linkC/linkD", "dirB/linkD", "dirC/dirD"])
+        _check(p, "dir*/*/**/..", ["dirA/linkC/..", "dirC/dirD/.."])
+        _check(p, "dir*/**/fileC", ["dirC/fileC"])
+        _check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD"])
+        _check(p, "*/dirD/**/", ["dirC/dirD"])
  
      @os_helper.skip_unless_symlink
      def test_glob_no_follow_symlinks_common(self):
@@ -1912,6 +1922,14 @@ class _BasePathTest(object):
          _check(p, "*B/*", ["dirB/fileB", "dirB/linkD"])
          _check(p, "*/fileB", ["dirB/fileB"])
          _check(p, "*/", ["dirA", "dirB", "dirC", "dirE"])
+        _check(p, "dir*/*/..", ["dirC/dirD/.."])
+        _check(p, "dir*/**/", ["dirA", "dirB", "dirC", "dirC/dirD", "dirE"])
+        _check(p, "dir*/**/..", ["dirA/..", "dirB/..", "dirC/..", "dirC/dirD/..", "dirE/.."])
+        _check(p, "dir*/*/**/", ["dirC/dirD"])
+        _check(p, "dir*/*/**/..", ["dirC/dirD/.."])
+        _check(p, "dir*/**/fileC", ["dirC/fileC"])
+        _check(p, "dir*/*/../dirD/**/", ["dirC/dirD/../dirD"])
+        _check(p, "*/dirD/**/", ["dirC/dirD"])
  
      def test_rglob_common(self):
          def _check(glob, expected):
diff --git a/Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst b/Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst

new file mode 100644 (file)

index 0000000..841a9e1
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst
@@ -0,0 +1,4 @@
+Improve performance of :meth:`pathlib.Path.glob` when expanding a pattern with
+a non-terminal "``**``" component by filtering walked paths through a regular
+expression, rather than calling :func:`os.scandir` more than once on each
+directory.
author	Barney Gale <barney.gale@gmail.com>
	Tue, 6 Jun 2023 22:50:36 +0000 (23:50 +0100)
committer	GitHub <noreply@github.com>
	Tue, 6 Jun 2023 22:50:36 +0000 (23:50 +0100)
Doc/library/pathlib.rst		patch \| blob \| blame \| history
Lib/pathlib.py		patch \| blob \| blame \| history
Lib/test/test_pathlib.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2023-05-15-18-57-42.gh-issue-102613.YD9yx-.rst	[new file with mode: 0644]	patch \| blob