GH-73435: Implement recursive wildcards in `pathlib.PurePath.match()` (#101398)

author Barney Gale <barney.gale@gmail.com>

Tue, 30 May 2023 20:18:09 +0000 (21:18 +0100)

committer GitHub <noreply@github.com>

Tue, 30 May 2023 20:18:09 +0000 (20:18 +0000)
author Barney Gale <barney.gale@gmail.com>
Tue, 30 May 2023 20:18:09 +0000 (21:18 +0100)
committer GitHub <noreply@github.com>
Tue, 30 May 2023 20:18:09 +0000 (20:18 +0000)
diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst

index ee3330f44f47d0d3d4ca88e1c696d33ad2799cbc..67ef36890d5739e7f92a0949b75df6d5fdd8a6c8 100644 (file)
--- a/Doc/library/pathlib.rst
+++ b/Doc/library/pathlib.rst
@@ -569,6 +569,13 @@ Pure paths provide the following methods and properties:
        >>> PurePath('a/b.py').match('/*.py')
        False
  
+   The *pattern* may be another path object; this speeds up matching the same
+   pattern against multiple files::
+
+      >>> pattern = PurePath('*.py')
+      >>> PurePath('a/b.py').match(pattern)
+      True
+
     As with other methods, case-sensitivity follows platform defaults::
  
        >>> PurePosixPath('b.py').match('*.PY')
@@ -581,6 +588,10 @@ Pure paths provide the following methods and properties:
     .. versionadded:: 3.12
        The *case_sensitive* argument.
  
+   .. versionchanged:: 3.13
+      Support for the recursive wildcard "``**``" was added. In previous
+      versions, it acted like the non-recursive wildcard "``*``".
+
  
  .. method:: PurePath.relative_to(other, walk_up=False)
  
diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst

index 8c81ac76a56b46317ec2a011dbe6e693c86891bd..44c0915492dcc09dcae3aab991f33e9ae8b721b7 100644 (file)
--- a/Doc/whatsnew/3.13.rst
+++ b/Doc/whatsnew/3.13.rst
@@ -90,6 +90,9 @@ Improved Modules
  pathlib
  -------
  
+* Add support for recursive wildcards in :meth:`pathlib.PurePath.match`.
+  (Contributed by Barney Gale in :gh:`73435`.)
+
  * Add *follow_symlinks* keyword-only argument to :meth:`pathlib.Path.glob` and
    :meth:`~pathlib.Path.rglob`.
    (Contributed by Barney Gale in :gh:`77609`.)
diff --git a/Lib/pathlib.py b/Lib/pathlib.py

index a57b582a211e06681e82dfd3433ef647673d256b..62406473b66e4fdf5b59837e1b43cf43116a0860 100644 (file)
--- a/Lib/pathlib.py
+++ b/Lib/pathlib.py
@@ -54,6 +54,7 @@ def _ignore_error(exception):
              getattr(exception, 'winerror', None) in _IGNORED_WINERRORS)
  
  
+@functools.cache
  def _is_case_sensitive(flavour):
      return flavour.normcase('Aa') == 'Aa'
  
@@ -61,6 +62,22 @@ def _is_case_sensitive(flavour):
  # Globbing helpers
  #
  
+
+# fnmatch.translate() returns a regular expression that includes a prefix and
+# a suffix, which enable matching newlines and ensure the end of the string is
+# matched, respectively. These features are undesirable for our implementation
+# of PurePath.match(), which represents path separators as newlines and joins
+# pattern segments together. As a workaround, we define a slice object that
+# can remove the prefix and suffix from any translate() result. See the
+# _compile_pattern_lines() function for more details.
+_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
+_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
+_SWAP_SEP_AND_NEWLINE = {
+    '/': str.maketrans({'/': '\n', '\n': '/'}),
+    '\\': str.maketrans({'\\': '\n', '\n': '\\'}),
+}
+
+
  @functools.lru_cache()
  def _make_selector(pattern_parts, flavour, case_sensitive):
      pat = pattern_parts[0]
@@ -92,6 +109,51 @@ def _compile_pattern(pat, case_sensitive):
      return re.compile(fnmatch.translate(pat), flags).match
  
  
+@functools.lru_cache()
+def _compile_pattern_lines(pattern_lines, case_sensitive):
+    """Compile the given pattern lines to an `re.Pattern` object.
+
+    The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with
+    its path separators and newlines swapped (e.g. '**\n*.py'). By using
+    newlines to separate path components, and not setting `re.DOTALL`, we
+    ensure that the `*` wildcard cannot match path separators.
+
+    The returned `re.Pattern` object may have its `match()` method called to
+    match a complete pattern, or `search()` to match from the right. The
+    argument supplied to these methods must also have its path separators and
+    newlines swapped.
+    """
+
+    # Match the start of the path, or just after a path separator
+    parts = ['^']
+    for part in pattern_lines.splitlines(keepends=True):
+        if part == '**\n':
+            # '**/' component: we use '[\s\S]' rather than '.' so that path
+            # separators (i.e. newlines) are matched. The trailing '^' ensures
+            # we terminate after a path separator (i.e. on a new line).
+            part = r'[\s\S]*^'
+        elif part == '**':
+            # '**' component.
+            part = r'[\s\S]*'
+        elif '**' in part:
+            raise ValueError("Invalid pattern: '**' can only be an entire path component")
+        else:
+            # Any other component: pass to fnmatch.translate(). We slice off
+            # the common prefix and suffix added by translate() to ensure that
+            # re.DOTALL is not set, and the end of the string not matched,
+            # respectively. With DOTALL not set, '*' wildcards will not match
+            # path separators, because the '.' characters in the pattern will
+            # not match newlines.
+            part = fnmatch.translate(part)[_FNMATCH_SLICE]
+        parts.append(part)
+    # Match the end of the path, always.
+    parts.append(r'\Z')
+    flags = re.MULTILINE
+    if not case_sensitive:
+        flags |= re.IGNORECASE
+    return re.compile(''.join(parts), flags=flags)
+
+
  class _Selector:
      """A selector matches a specific glob pattern part against the children
      of a given path."""
@@ -276,6 +338,10 @@ class PurePath:
          # to implement comparison methods like `__lt__()`.
          '_parts_normcase_cached',
  
+        # The `_lines_cached` slot stores the string path with path separators
+        # and newlines swapped. This is used to implement `match()`.
+        '_lines_cached',
+
          # The `_hash` slot stores the hash of the case-normalized string
          # path. It's set when `__hash__()` is called for the first time.
          '_hash',
@@ -441,6 +507,16 @@ class PurePath:
              self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep)
              return self._parts_normcase_cached
  
+    @property
+    def _lines(self):
+        # Path with separators and newlines swapped, for pattern matching.
+        try:
+            return self._lines_cached
+        except AttributeError:
+            trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep]
+            self._lines_cached = str(self).translate(trans)
+            return self._lines_cached
+
      def __eq__(self, other):
          if not isinstance(other, PurePath):
              return NotImplemented
@@ -697,23 +773,18 @@ class PurePath:
          """
          Return True if this path matches the given pattern.
          """
+        if not isinstance(path_pattern, PurePath):
+            path_pattern = self.with_segments(path_pattern)
          if case_sensitive is None:
              case_sensitive = _is_case_sensitive(self._flavour)
-        pat = self.with_segments(path_pattern)
-        if not pat.parts:
+        pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
+        if path_pattern.drive or path_pattern.root:
+            return pattern.match(self._lines) is not None
+        elif path_pattern._tail:
+            return pattern.search(self._lines) is not None
+        else:
              raise ValueError("empty pattern")
-        pat_parts = pat.parts
-        parts = self.parts
-        if pat.drive or pat.root:
-            if len(pat_parts) != len(parts):
-                return False
-        elif len(pat_parts) > len(parts):
-            return False
-        for part, pat in zip(reversed(parts), reversed(pat_parts)):
-            match = _compile_pattern(pat, case_sensitive)
-            if not match(part):
-                return False
-        return True
+
  
  # Subclassing os.PathLike makes isinstance() checks slower,
  # which in turn makes Path construction slower. Register instead!
diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py

index 4391d685d3c1264621ba9a46fcb514b73fffa216..076ace3d93085764862102364620d5130431ecc5 100644 (file)
--- a/Lib/test/test_pathlib.py
+++ b/Lib/test/test_pathlib.py
@@ -310,8 +310,30 @@ class _BasePurePathTest(object):
          self.assertFalse(P('/ab.py').match('/a/*.py'))
          self.assertFalse(P('/a/b/c.py').match('/a/*.py'))
          # Multi-part glob-style pattern.
-        self.assertFalse(P('/a/b/c.py').match('/**/*.py'))
+        self.assertTrue(P('a').match('**'))
+        self.assertTrue(P('c.py').match('**'))
+        self.assertTrue(P('a/b/c.py').match('**'))
+        self.assertTrue(P('/a/b/c.py').match('**'))
+        self.assertTrue(P('/a/b/c.py').match('/**'))
+        self.assertTrue(P('/a/b/c.py').match('**/'))
+        self.assertTrue(P('/a/b/c.py').match('/a/**'))
+        self.assertTrue(P('/a/b/c.py').match('**/*.py'))
+        self.assertTrue(P('/a/b/c.py').match('/**/*.py'))
          self.assertTrue(P('/a/b/c.py').match('/a/**/*.py'))
+        self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py'))
+        self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py'))
+        self.assertFalse(P('c.py').match('**/a.py'))
+        self.assertFalse(P('c.py').match('c/**'))
+        self.assertFalse(P('a/b/c.py').match('**/a'))
+        self.assertFalse(P('a/b/c.py').match('**/a/b'))
+        self.assertFalse(P('a/b/c.py').match('**/a/b/c'))
+        self.assertFalse(P('a/b/c.py').match('**/a/b/c.'))
+        self.assertFalse(P('a/b/c.py').match('**/a/b/c./**'))
+        self.assertFalse(P('a/b/c.py').match('**/a/b/c./**'))
+        self.assertFalse(P('a/b/c.py').match('/a/b/c.py/**'))
+        self.assertFalse(P('a/b/c.py').match('/**/a/b/c.py'))
+        self.assertRaises(ValueError, P('a').match, '**a/b/c')
+        self.assertRaises(ValueError, P('a').match, 'a/b/c**')
          # Case-sensitive flag
          self.assertFalse(P('A.py').match('a.PY', case_sensitive=True))
          self.assertTrue(P('A.py').match('a.PY', case_sensitive=False))
diff --git a/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst b/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst

new file mode 100644 (file)

index 0000000..d5a2ae0
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst
@@ -0,0 +1 @@
+Add support for recursive wildcards in :meth:`pathlib.PurePath.match`.
author	Barney Gale <barney.gale@gmail.com>
	Tue, 30 May 2023 20:18:09 +0000 (21:18 +0100)
committer	GitHub <noreply@github.com>
	Tue, 30 May 2023 20:18:09 +0000 (20:18 +0000)
Doc/library/pathlib.rst		patch \| blob \| blame \| history
Doc/whatsnew/3.13.rst		patch \| blob \| blame \| history
Lib/pathlib.py		patch \| blob \| blame \| history
Lib/test/test_pathlib.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst	[new file with mode: 0644]	patch \| blob