git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
GH-115060: Speed up `pathlib.Path.glob()` by not scanning literal parts (#117732)
author Barney Gale <barney.gale@gmail.com>
Fri, 12 Apr 2024 21:19:21 +0000 (22:19 +0100)
committer GitHub <noreply@github.com>
Fri, 12 Apr 2024 21:19:21 +0000 (22:19 +0100)
Don't bother calling `os.scandir()` to scan for literal pattern segments,
like `foo` in `foo/*.py`. Instead, append the segment(s) as-is and call
through to the next selector with `exists=False`, which signals that the
path might not exist. Subsequent selectors will call `os.scandir()` or
`os.lstat()` to filter out missing paths as needed.

Lib/glob.py
Lib/pathlib/_abc.py
Lib/test/test_pathlib/test_pathlib_abc.py
Misc/NEWS.d/next/Library/2024-04-10-22-35-24.gh-issue-115060.XEVuOb.rst [new file with mode: 0644]

index b1d2681d687ff731f7a319680d321a35ff7686c8..72cf22299763f04f0b783394e883ba6fc1272bd8 100644 (file)
@@ -331,9 +331,10 @@ class _Globber:
     """Class providing shell-style pattern matching and globbing.
     """
 
-    def __init__(self,  sep, case_sensitive, recursive=False):
+    def __init__(self, sep, case_sensitive, case_pedantic=False, recursive=False):
         self.sep = sep
         self.case_sensitive = case_sensitive
+        self.case_pedantic = case_pedantic
         self.recursive = recursive
 
     # Low-level methods
@@ -373,6 +374,8 @@ class _Globber:
             selector = self.recursive_selector
         elif part in _special_parts:
             selector = self.special_selector
+        elif not self.case_pedantic and magic_check.search(part) is None:
+            selector = self.literal_selector
         else:
             selector = self.wildcard_selector
         return selector(part, parts)
@@ -387,6 +390,23 @@ class _Globber:
             return select_next(path, exists)
         return select_special
 
+    def literal_selector(self, part, parts):
+        """Returns a function that selects a literal descendant of a path.
+        """
+
+        # Optimization: consume and join any subsequent literal parts here,
+        # rather than leaving them for the next selector. This reduces the
+        # number of string concatenation operations and calls to add_slash().
+        while parts and magic_check.search(parts[-1]) is None:
+            part += self.sep + parts.pop()
+
+        select_next = self.selector(parts)
+
+        def select_literal(path, exists=False):
+            path = self.concat_path(self.add_slash(path), part)
+            return select_next(path, exists=False)
+        return select_literal
+
     def wildcard_selector(self, part, parts):
         """Returns a function that selects direct children of a given path,
         filtering by pattern.
index b6cab0d285acd9494e8930fc1d7980e95e96a545..b51ad6f46d292abe3ab9167bed395c7c8b34b9d7 100644 (file)
@@ -686,8 +686,14 @@ class PathBase(PurePathBase):
     def _glob_selector(self, parts, case_sensitive, recurse_symlinks):
         if case_sensitive is None:
             case_sensitive = _is_case_sensitive(self.parser)
+            case_pedantic = False
+        else:
+            # The user has expressed a case sensitivity choice, but we don't
+            # know the case sensitivity of the underlying filesystem, so we
+            # must use scandir() for everything, including non-wildcard parts.
+            case_pedantic = True
         recursive = True if recurse_symlinks else glob._no_recurse_symlinks
-        globber = self._globber(self.parser.sep, case_sensitive, recursive)
+        globber = self._globber(self.parser.sep, case_sensitive, case_pedantic, recursive)
         return globber.selector(parts)
 
     def glob(self, pattern, *, case_sensitive=None, recurse_symlinks=True):
index 336115cf0fead2530e8358aedb4063d66c8e061d..6656b032cde28e9fad574a60b22afa0d18809ff6 100644 (file)
@@ -1429,10 +1429,10 @@ class DummyPath(PathBase):
         return "{}({!r})".format(self.__class__.__name__, self.as_posix())
 
     def stat(self, *, follow_symlinks=True):
-        if follow_symlinks:
-            path = str(self.resolve())
+        if follow_symlinks or self.name in ('', '.', '..'):
+            path = str(self.resolve(strict=True))
         else:
-            path = str(self.parent.resolve() / self.name)
+            path = str(self.parent.resolve(strict=True) / self.name)
         if path in self._files:
             st_mode = stat.S_IFREG
         elif path in self._directories:
@@ -1741,8 +1741,9 @@ class DummyPathTest(DummyPurePathTest):
     def test_glob_posix(self):
         P = self.cls
         p = P(self.base)
+        q = p / "FILEa"
         given = set(p.glob("FILEa"))
-        expect = set()
+        expect = {q} if q.exists() else set()
         self.assertEqual(given, expect)
         self.assertEqual(set(p.glob("FILEa*")), set())
 
@@ -1753,8 +1754,6 @@ class DummyPathTest(DummyPurePathTest):
         self.assertEqual(set(p.glob("FILEa")), { P(self.base, "fileA") })
         self.assertEqual(set(p.glob("*a\\")), { P(self.base, "dirA/") })
         self.assertEqual(set(p.glob("F*a")), { P(self.base, "fileA") })
-        self.assertEqual(set(map(str, p.glob("FILEa"))), {f"{p}\\fileA"})
-        self.assertEqual(set(map(str, p.glob("F*a"))), {f"{p}\\fileA"})
 
     def test_glob_empty_pattern(self):
         P = self.cls
@@ -1857,8 +1856,9 @@ class DummyPathTest(DummyPurePathTest):
     def test_rglob_posix(self):
         P = self.cls
         p = P(self.base, "dirC")
+        q = p / "dirD" / "FILEd"
         given = set(p.rglob("FILEd"))
-        expect = set()
+        expect = {q} if q.exists() else set()
         self.assertEqual(given, expect)
         self.assertEqual(set(p.rglob("FILEd*")), set())
 
@@ -1868,7 +1868,6 @@ class DummyPathTest(DummyPurePathTest):
         p = P(self.base, "dirC")
         self.assertEqual(set(p.rglob("FILEd")), { P(self.base, "dirC/dirD/fileD") })
         self.assertEqual(set(p.rglob("*\\")), { P(self.base, "dirC/dirD/") })
-        self.assertEqual(set(map(str, p.rglob("FILEd"))), {f"{p}\\dirD\\fileD"})
 
     @needs_symlinks
     def test_rglob_recurse_symlinks_common(self):
@@ -1931,7 +1930,11 @@ class DummyPathTest(DummyPurePathTest):
         self.assertEqual(set(p.glob("dirA/../file*")), { P(self.base, "dirA/../fileA") })
         self.assertEqual(set(p.glob("dirA/../file*/..")), set())
         self.assertEqual(set(p.glob("../xyzzy")), set())
-        self.assertEqual(set(p.glob("xyzzy/..")), set())
+        if self.cls.parser is posixpath:
+            self.assertEqual(set(p.glob("xyzzy/..")), set())
+        else:
+            # ".." segments are normalized first on Windows, so this path is stat()able.
+            self.assertEqual(set(p.glob("xyzzy/..")), { P(self.base, "xyzzy", "..") })
         self.assertEqual(set(p.glob("/".join([".."] * 50))), { P(self.base, *[".."] * 50)})
 
     @needs_symlinks
diff --git a/Misc/NEWS.d/next/Library/2024-04-10-22-35-24.gh-issue-115060.XEVuOb.rst b/Misc/NEWS.d/next/Library/2024-04-10-22-35-24.gh-issue-115060.XEVuOb.rst
new file mode 100644 (file)
index 0000000..b5084a0
--- /dev/null
@@ -0,0 +1,2 @@
+Speed up :meth:`pathlib.Path.glob` by not scanning directories for
+non-wildcard pattern segments.