GH-72904: Add `glob.translate()` function (#106703)

author Barney Gale <barney.gale@gmail.com>

Mon, 13 Nov 2023 17:15:56 +0000 (17:15 +0000)

committer GitHub <noreply@github.com>

Mon, 13 Nov 2023 17:15:56 +0000 (17:15 +0000)
author Barney Gale <barney.gale@gmail.com>
Mon, 13 Nov 2023 17:15:56 +0000 (17:15 +0000)
committer GitHub <noreply@github.com>
Mon, 13 Nov 2023 17:15:56 +0000 (17:15 +0000)
diff --git a/Doc/library/glob.rst b/Doc/library/glob.rst

index 0e4cfe7ebed797c09fc6d10c990423eec4ed2308..8e76d2d5f1653591fcd99480bea05323399b4f1e 100644 (file)
--- a/Doc/library/glob.rst
+++ b/Doc/library/glob.rst
@@ -145,6 +145,45 @@ default. For example, consider a directory containing :file:`card.gif` and
     >>> glob.glob('.c*')
     ['.card.gif']
  
+
+.. function:: translate(pat, *, recursive=False, include_hidden=False, seps=None)
+
+   Convert the given path specification to a regular expression for use with
+   :func:`re.match`. The path specification can contain shell-style wildcards.
+
+   For example:
+
+      >>> import glob, re
+      >>>
+      >>> regex = glob.translate('**/*.txt', recursive=True, include_hidden=True)
+      >>> regex
+      '(?s:(?:.+/)?[^/]*\\.txt)\\Z'
+      >>> reobj = re.compile(regex)
+      >>> reobj.match('foo/bar/baz.txt')
+      <re.Match object; span=(0, 15), match='foo/bar/baz.txt'>
+
+   Path separators and segments are meaningful to this function, unlike
+   :func:`fnmatch.translate`. By default wildcards do not match path
+   separators, and ``*`` pattern segments match precisely one path segment.
+
+   If *recursive* is true, the pattern segment "``**``" will match any number
+   of path segments, and :exc:`ValueError` is raised if "``**``" occurs in any
+   position other than a full pattern segment.
+
+   If *include_hidden* is true, wildcards can match path segments that start
+   with a dot (``.``).
+
+   A sequence of path separators may be supplied to the *seps* argument. If
+   not given, :data:`os.sep` and :data:`os.altsep` (if available) are used.
+
+   .. seealso::
+
+     :meth:`pathlib.PurePath.match` and :meth:`pathlib.Path.glob` methods,
+     which call this function to implement pattern matching and globbing.
+
+   .. versionadded:: 3.13
+
+
  .. seealso::
  
     Module :mod:`fnmatch`
diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst

index 9f9239a7eeb03683bf7262605c681240cf633452..81e133bb5454ecc67286dca6dfb5282f5c630cd6 100644 (file)
--- a/Doc/whatsnew/3.13.rst
+++ b/Doc/whatsnew/3.13.rst
@@ -183,6 +183,13 @@ doctest
    :attr:`doctest.TestResults.skipped` attributes.
    (Contributed by Victor Stinner in :gh:`108794`.)
  
+glob
+----
+
+* Add :func:`glob.translate` function that converts a path specification with
+  shell-style wildcards to a regular expression.
+  (Contributed by Barney Gale in :gh:`72904`.)
+
  io
  --
  
diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py

index d5e296f7748c1c650be01090c0461dfff5f4e31c..73acb1fe8d41060508d594ad5b61f2d58e448556 100644 (file)
--- a/Lib/fnmatch.py
+++ b/Lib/fnmatch.py
@@ -78,6 +78,11 @@ def translate(pat):
      """
  
      STAR = object()
+    parts = _translate(pat, STAR, '.')
+    return _join_translated_parts(parts, STAR)
+
+
+def _translate(pat, STAR, QUESTION_MARK):
      res = []
      add = res.append
      i, n = 0, len(pat)
@@ -89,7 +94,7 @@ def translate(pat):
              if (not res) or res[-1] is not STAR:
                  add(STAR)
          elif c == '?':
-            add('.')
+            add(QUESTION_MARK)
          elif c == '[':
              j = i
              if j < n and pat[j] == '!':
@@ -146,9 +151,11 @@ def translate(pat):
          else:
              add(re.escape(c))
      assert i == n
+    return res
+
  
+def _join_translated_parts(inp, STAR):
      # Deal with STARs.
-    inp = res
      res = []
      add = res.append
      i, n = 0, len(inp)
diff --git a/Lib/glob.py b/Lib/glob.py

index a7256422d520fb68ee22f77e1b39d2b0314a2cd0..4a335a10766cf4067d7a8c398ac18604acaa4f5a 100644 (file)
--- a/Lib/glob.py
+++ b/Lib/glob.py
@@ -249,3 +249,63 @@ def escape(pathname):
  
  
  _dir_open_flags = os.O_RDONLY | getattr(os, 'O_DIRECTORY', 0)
+
+
+def translate(pat, *, recursive=False, include_hidden=False, seps=None):
+    """Translate a pathname with shell wildcards to a regular expression.
+
+    If `recursive` is true, the pattern segment '**' will match any number of
+    path segments; if '**' appears outside its own segment, ValueError will be
+    raised.
+
+    If `include_hidden` is true, wildcards can match path segments beginning
+    with a dot ('.').
+
+    If a sequence of separator characters is given to `seps`, they will be
+    used to split the pattern into segments and match path separators. If not
+    given, os.path.sep and os.path.altsep (where available) are used.
+    """
+    if not seps:
+        if os.path.altsep:
+            seps = (os.path.sep, os.path.altsep)
+        else:
+            seps = os.path.sep
+    escaped_seps = ''.join(map(re.escape, seps))
+    any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
+    not_sep = f'[^{escaped_seps}]'
+    if include_hidden:
+        one_last_segment = f'{not_sep}+'
+        one_segment = f'{one_last_segment}{any_sep}'
+        any_segments = f'(?:.+{any_sep})?'
+        any_last_segments = '.*'
+    else:
+        one_last_segment = f'[^{escaped_seps}.]{not_sep}*'
+        one_segment = f'{one_last_segment}{any_sep}'
+        any_segments = f'(?:{one_segment})*'
+        any_last_segments = f'{any_segments}(?:{one_last_segment})?'
+
+    results = []
+    parts = re.split(any_sep, pat)
+    last_part_idx = len(parts) - 1
+    for idx, part in enumerate(parts):
+        if part == '*':
+            results.append(one_segment if idx < last_part_idx else one_last_segment)
+            continue
+        if recursive:
+            if part == '**':
+                if idx < last_part_idx:
+                    if parts[idx + 1] != '**':
+                        results.append(any_segments)
+                else:
+                    results.append(any_last_segments)
+                continue
+            elif '**' in part:
+                raise ValueError("Invalid pattern: '**' can only be an entire path component")
+        if part:
+            if not include_hidden and part[0] in '*?':
+                results.append(r'(?!\.)')
+            results.extend(fnmatch._translate(part, f'{not_sep}*', not_sep))
+        if idx < last_part_idx:
+            results.append(any_sep)
+    res = ''.join(results)
+    return fr'(?s:{res})\Z'
diff --git a/Lib/pathlib.py b/Lib/pathlib.py

index 47a043c5e6b1b95c9f12fc2e99a0a348dc9ae07f..c06ea5c9bf1bd221a4f689225b2347d85b310ae1 100644 (file)
--- a/Lib/pathlib.py
+++ b/Lib/pathlib.py
@@ -6,8 +6,8 @@ operating systems.
  """
  
  import contextlib
-import fnmatch
  import functools
+import glob
  import io
  import ntpath
  import os
@@ -76,78 +76,16 @@ def _is_case_sensitive(pathmod):
  #
  
  
-# fnmatch.translate() returns a regular expression that includes a prefix and
-# a suffix, which enable matching newlines and ensure the end of the string is
-# matched, respectively. These features are undesirable for our implementation
-# of PurePatch.match(), which represents path separators as newlines and joins
-# pattern segments together. As a workaround, we define a slice object that
-# can remove the prefix and suffix from any translate() result. See the
-# _compile_pattern_lines() function for more details.
-_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_')
-_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX))
-_SWAP_SEP_AND_NEWLINE = {
-    '/': str.maketrans({'/': '\n', '\n': '/'}),
-    '\\': str.maketrans({'\\': '\n', '\n': '\\'}),
-}
-
-
  @functools.lru_cache(maxsize=256)
-def _compile_pattern(pat, case_sensitive):
+def _compile_pattern(pat, sep, case_sensitive):
      """Compile given glob pattern to a re.Pattern object (observing case
-    sensitivity), or None if the pattern should match everything."""
-    if pat == '*':
-        return None
+    sensitivity)."""
      flags = re.NOFLAG if case_sensitive else re.IGNORECASE
-    return re.compile(fnmatch.translate(pat), flags).match
-
-
-@functools.lru_cache()
-def _compile_pattern_lines(pattern_lines, case_sensitive):
-    """Compile the given pattern lines to an `re.Pattern` object.
-
-    The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with
-    its path separators and newlines swapped (e.g. '**\n*.py`). By using
-    newlines to separate path components, and not setting `re.DOTALL`, we
-    ensure that the `*` wildcard cannot match path separators.
-
-    The returned `re.Pattern` object may have its `match()` method called to
-    match a complete pattern, or `search()` to match from the right. The
-    argument supplied to these methods must also have its path separators and
-    newlines swapped.
-    """
-
-    # Match the start of the path, or just after a path separator
-    parts = ['^']
-    for part in pattern_lines.splitlines(keepends=True):
-        if part == '*\n':
-            part = r'.+\n'
-        elif part == '*':
-            part = r'.+'
-        elif part == '**\n':
-            # '**/' component: we use '(?s:.)' rather than '.' so that path
-            # separators (i.e. newlines) are matched. The trailing '^' ensures
-            # we terminate after a path separator (i.e. on a new line).
-            part = r'(?s:.)*^'
-        elif part == '**':
-            # '**' component.
-            part = r'(?s:.)*'
-        elif '**' in part:
-            raise ValueError("Invalid pattern: '**' can only be an entire path component")
-        else:
-            # Any other component: pass to fnmatch.translate(). We slice off
-            # the common prefix and suffix added by translate() to ensure that
-            # re.DOTALL is not set, and the end of the string not matched,
-            # respectively. With DOTALL not set, '*' wildcards will not match
-            # path separators, because the '.' characters in the pattern will
-            # not match newlines.
-            part = fnmatch.translate(part)[_FNMATCH_SLICE]
-        parts.append(part)
-    # Match the end of the path, always.
-    parts.append(r'\Z')
-    flags = re.MULTILINE
-    if not case_sensitive:
-        flags |= re.IGNORECASE
-    return re.compile(''.join(parts), flags=flags)
+    regex = glob.translate(pat, recursive=True, include_hidden=True, seps=sep)
+    # The string representation of an empty path is a single dot ('.'). Empty
+    # paths shouldn't match wildcards, so a possessive group consumes it first.
+    regex = r'(\.\Z)?+' + regex
+    return re.compile(regex, flags).match
  
  
  def _select_children(parent_paths, dir_only, follow_symlinks, match):
@@ -171,7 +109,7 @@ def _select_children(parent_paths, dir_only, follow_symlinks, match):
                      except OSError:
                          continue
                  name = entry.name
-                if match is None or match(name):
+                if match(name):
                      yield parent_path._make_child_relpath(name)
  
  
@@ -297,10 +235,6 @@ class PurePath:
          # to implement comparison methods like `__lt__()`.
          '_parts_normcase_cached',
  
-        # The `_lines_cached` slot stores the string path with path separators
-        # and newlines swapped. This is used to implement `match()`.
-        '_lines_cached',
-
          # The `_hash` slot stores the hash of the case-normalized string
          # path. It's set when `__hash__()` is called for the first time.
          '_hash',
@@ -475,20 +409,6 @@ class PurePath:
              self._parts_normcase_cached = self._str_normcase.split(self.pathmod.sep)
              return self._parts_normcase_cached
  
-    @property
-    def _lines(self):
-        # Path with separators and newlines swapped, for pattern matching.
-        try:
-            return self._lines_cached
-        except AttributeError:
-            path_str = str(self)
-            if path_str == '.':
-                self._lines_cached = ''
-            else:
-                trans = _SWAP_SEP_AND_NEWLINE[self.pathmod.sep]
-                self._lines_cached = path_str.translate(trans)
-            return self._lines_cached
-
      def __eq__(self, other):
          if not isinstance(other, PurePath):
              return NotImplemented
@@ -763,13 +683,16 @@ class PurePath:
              path_pattern = self.with_segments(path_pattern)
          if case_sensitive is None:
              case_sensitive = _is_case_sensitive(self.pathmod)
-        pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive)
+        sep = path_pattern.pathmod.sep
+        pattern_str = str(path_pattern)
          if path_pattern.drive or path_pattern.root:
-            return pattern.match(self._lines) is not None
+            pass
          elif path_pattern._tail:
-            return pattern.search(self._lines) is not None
+            pattern_str = f'**{sep}{pattern_str}'
          else:
              raise ValueError("empty pattern")
+        match = _compile_pattern(pattern_str, sep, case_sensitive)
+        return match(str(self)) is not None
  
  
  # Subclassing os.PathLike makes isinstance() checks slower,
@@ -1069,26 +992,19 @@ class _PathBase(PurePath):
          return contextlib.nullcontext(self.iterdir())
  
      def _make_child_relpath(self, name):
-        sep = self.pathmod.sep
-        lines_name = name.replace('\n', sep)
-        lines_str = self._lines
          path_str = str(self)
          tail = self._tail
          if tail:
-            path_str = f'{path_str}{sep}{name}'
-            lines_str = f'{lines_str}\n{lines_name}'
+            path_str = f'{path_str}{self.pathmod.sep}{name}'
          elif path_str != '.':
              path_str = f'{path_str}{name}'
-            lines_str = f'{lines_str}{lines_name}'
          else:
              path_str = name
-            lines_str = lines_name
          path = self.with_segments(path_str)
          path._str = path_str
          path._drv = self.drive
          path._root = self.root
          path._tail_cached = tail + [name]
-        path._lines_cached = lines_str
          return path
  
      def glob(self, pattern, *, case_sensitive=None, follow_symlinks=None):
@@ -1139,6 +1055,7 @@ class _PathBase(PurePath):
          # do not perform any filesystem access, which can be much faster!
          filter_paths = follow_symlinks is not None and '..' not in pattern_parts
          deduplicate_paths = False
+        sep = self.pathmod.sep
          paths = iter([self] if self.is_dir() else [])
          part_idx = 0
          while part_idx < len(pattern_parts):
@@ -1159,9 +1076,9 @@ class _PathBase(PurePath):
                      paths = _select_recursive(paths, dir_only, follow_symlinks)
  
                      # Filter out paths that don't match pattern.
-                    prefix_len = len(self._make_child_relpath('_')._lines) - 1
-                    match = _compile_pattern_lines(path_pattern._lines, case_sensitive).match
-                    paths = (path for path in paths if match(path._lines[prefix_len:]))
+                    prefix_len = len(str(self._make_child_relpath('_'))) - 1
+                    match = _compile_pattern(str(path_pattern), sep, case_sensitive)
+                    paths = (path for path in paths if match(str(path), prefix_len))
                      return paths
  
                  dir_only = part_idx < len(pattern_parts)
@@ -1174,7 +1091,7 @@ class _PathBase(PurePath):
                  raise ValueError("Invalid pattern: '**' can only be an entire path component")
              else:
                  dir_only = part_idx < len(pattern_parts)
-                match = _compile_pattern(part, case_sensitive)
+                match = _compile_pattern(part, sep, case_sensitive)
                  paths = _select_children(paths, dir_only, follow_symlinks, match)
          return paths
  
diff --git a/Lib/test/test_glob.py b/Lib/test/test_glob.py

index f4b5821f408cb485dd2349fd29a651fad4dbec24..aa5fac8eca13542c6db420f7d3dfe3afd59de937 100644 (file)
--- a/Lib/test/test_glob.py
+++ b/Lib/test/test_glob.py
@@ -1,5 +1,6 @@
  import glob
  import os
+import re
  import shutil
  import sys
  import unittest
@@ -349,6 +350,96 @@ class GlobTests(unittest.TestCase):
              for it in iters:
                  self.assertEqual(next(it), p)
  
+    def test_translate_matching(self):
+        match = re.compile(glob.translate('*')).match
+        self.assertIsNotNone(match('foo'))
+        self.assertIsNotNone(match('foo.bar'))
+        self.assertIsNone(match('.foo'))
+        match = re.compile(glob.translate('.*')).match
+        self.assertIsNotNone(match('.foo'))
+        match = re.compile(glob.translate('**', recursive=True)).match
+        self.assertIsNotNone(match('foo'))
+        self.assertIsNone(match('.foo'))
+        self.assertIsNotNone(match(os.path.join('foo', 'bar')))
+        self.assertIsNone(match(os.path.join('foo', '.bar')))
+        self.assertIsNone(match(os.path.join('.foo', 'bar')))
+        self.assertIsNone(match(os.path.join('.foo', '.bar')))
+        match = re.compile(glob.translate('**/*', recursive=True)).match
+        self.assertIsNotNone(match(os.path.join('foo', 'bar')))
+        self.assertIsNone(match(os.path.join('foo', '.bar')))
+        self.assertIsNone(match(os.path.join('.foo', 'bar')))
+        self.assertIsNone(match(os.path.join('.foo', '.bar')))
+        match = re.compile(glob.translate('*/**', recursive=True)).match
+        self.assertIsNotNone(match(os.path.join('foo', 'bar')))
+        self.assertIsNone(match(os.path.join('foo', '.bar')))
+        self.assertIsNone(match(os.path.join('.foo', 'bar')))
+        self.assertIsNone(match(os.path.join('.foo', '.bar')))
+        match = re.compile(glob.translate('**/.bar', recursive=True)).match
+        self.assertIsNotNone(match(os.path.join('foo', '.bar')))
+        self.assertIsNone(match(os.path.join('.foo', '.bar')))
+        match = re.compile(glob.translate('**/*.*', recursive=True)).match
+        self.assertIsNone(match(os.path.join('foo', 'bar')))
+        self.assertIsNone(match(os.path.join('foo', '.bar')))
+        self.assertIsNotNone(match(os.path.join('foo', 'bar.txt')))
+        self.assertIsNone(match(os.path.join('foo', '.bar.txt')))
+
+    def test_translate(self):
+        def fn(pat):
+            return glob.translate(pat, seps='/')
+        self.assertEqual(fn('foo'), r'(?s:foo)\Z')
+        self.assertEqual(fn('foo/bar'), r'(?s:foo/bar)\Z')
+        self.assertEqual(fn('*'), r'(?s:[^/.][^/]*)\Z')
+        self.assertEqual(fn('?'), r'(?s:(?!\.)[^/])\Z')
+        self.assertEqual(fn('a*'), r'(?s:a[^/]*)\Z')
+        self.assertEqual(fn('*a'), r'(?s:(?!\.)[^/]*a)\Z')
+        self.assertEqual(fn('.*'), r'(?s:\.[^/]*)\Z')
+        self.assertEqual(fn('?aa'), r'(?s:(?!\.)[^/]aa)\Z')
+        self.assertEqual(fn('aa?'), r'(?s:aa[^/])\Z')
+        self.assertEqual(fn('aa[ab]'), r'(?s:aa[ab])\Z')
+        self.assertEqual(fn('**'), r'(?s:(?!\.)[^/]*)\Z')
+        self.assertEqual(fn('***'), r'(?s:(?!\.)[^/]*)\Z')
+        self.assertEqual(fn('a**'), r'(?s:a[^/]*)\Z')
+        self.assertEqual(fn('**b'), r'(?s:(?!\.)[^/]*b)\Z')
+        self.assertEqual(fn('/**/*/*.*/**'),
+                         r'(?s:/(?!\.)[^/]*/[^/.][^/]*/(?!\.)[^/]*\.[^/]*/(?!\.)[^/]*)\Z')
+
+    def test_translate_include_hidden(self):
+        def fn(pat):
+            return glob.translate(pat, include_hidden=True, seps='/')
+        self.assertEqual(fn('foo'), r'(?s:foo)\Z')
+        self.assertEqual(fn('foo/bar'), r'(?s:foo/bar)\Z')
+        self.assertEqual(fn('*'), r'(?s:[^/]+)\Z')
+        self.assertEqual(fn('?'), r'(?s:[^/])\Z')
+        self.assertEqual(fn('a*'), r'(?s:a[^/]*)\Z')
+        self.assertEqual(fn('*a'), r'(?s:[^/]*a)\Z')
+        self.assertEqual(fn('.*'), r'(?s:\.[^/]*)\Z')
+        self.assertEqual(fn('?aa'), r'(?s:[^/]aa)\Z')
+        self.assertEqual(fn('aa?'), r'(?s:aa[^/])\Z')
+        self.assertEqual(fn('aa[ab]'), r'(?s:aa[ab])\Z')
+        self.assertEqual(fn('**'), r'(?s:[^/]*)\Z')
+        self.assertEqual(fn('***'), r'(?s:[^/]*)\Z')
+        self.assertEqual(fn('a**'), r'(?s:a[^/]*)\Z')
+        self.assertEqual(fn('**b'), r'(?s:[^/]*b)\Z')
+        self.assertEqual(fn('/**/*/*.*/**'), r'(?s:/[^/]*/[^/]+/[^/]*\.[^/]*/[^/]*)\Z')
+
+    def test_translate_recursive(self):
+        def fn(pat):
+            return glob.translate(pat, recursive=True, include_hidden=True, seps='/')
+        self.assertEqual(fn('*'), r'(?s:[^/]+)\Z')
+        self.assertEqual(fn('?'), r'(?s:[^/])\Z')
+        self.assertEqual(fn('**'), r'(?s:.*)\Z')
+        self.assertEqual(fn('**/**'), r'(?s:.*)\Z')
+        self.assertRaises(ValueError, fn, '***')
+        self.assertRaises(ValueError, fn, 'a**')
+        self.assertRaises(ValueError, fn, '**b')
+        self.assertEqual(fn('/**/*/*.*/**'), r'(?s:/(?:.+/)?[^/]+/[^/]*\.[^/]*/.*)\Z')
+
+    def test_translate_seps(self):
+        def fn(pat):
+            return glob.translate(pat, recursive=True, include_hidden=True, seps=['/', '\\'])
+        self.assertEqual(fn('foo/bar\\baz'), r'(?s:foo[/\\]bar[/\\]baz)\Z')
+        self.assertEqual(fn('**/*'), r'(?s:(?:.+[/\\])?[^/\\]+)\Z')
+
  
  @skip_unless_symlink
  class SymlinkLoopGlobTests(unittest.TestCase):
diff --git a/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst b/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst

new file mode 100644 (file)

index 0000000..edc8ab0
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst
@@ -0,0 +1,2 @@
+Add :func:`glob.translate`. This function converts a pathname with shell-style
+wildcards to a regular expression.
author	Barney Gale <barney.gale@gmail.com>
	Mon, 13 Nov 2023 17:15:56 +0000 (17:15 +0000)
committer	GitHub <noreply@github.com>
	Mon, 13 Nov 2023 17:15:56 +0000 (17:15 +0000)
Doc/library/glob.rst		patch \| blob \| blame \| history
Doc/whatsnew/3.13.rst		patch \| blob \| blame \| history
Lib/fnmatch.py		patch \| blob \| blame \| history
Lib/glob.py		patch \| blob \| blame \| history
Lib/pathlib.py		patch \| blob \| blame \| history
Lib/test/test_glob.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2023-07-13-00-24-52.gh-issue-72904.Yn5-j0.rst	[new file with mode: 0644]	patch \| blob