git.ipfire.org Git - thirdparty/Python/cpython.git/commitdiff
gh-96346: Use double caching for re._compile() (#96347)
author Serhiy Storchaka <storchaka@gmail.com>
Fri, 7 Oct 2022 19:21:42 +0000 (22:21 +0300)
committer GitHub <noreply@github.com>
Fri, 7 Oct 2022 19:21:42 +0000 (12:21 -0700)
Lib/re/__init__.py
Misc/NEWS.d/next/Library/2022-08-27-23-16-09.gh-issue-96346.jJX14I.rst [new file with mode: 0644]

diff --git a/Lib/re/__init__.py b/Lib/re/__init__.py
index d58c2117ef3e14a9daa2529d975de1f9839075b3..8d6a4ef3880f0c46cc1e2949b29b9c64203af5f3 100644
--- a/Lib/re/__init__.py
+++ b/Lib/re/__init__.py
@@ -229,6 +229,7 @@ def compile(pattern, flags=0):
 def purge():
     "Clear the regular expression caches"
     _cache.clear()
+    _cache2.clear()
     _compile_repl.cache_clear()
 
 def template(pattern, flags=0):
@@ -266,40 +267,64 @@ Match = type(_compiler.compile('', 0).match(''))
 # --------------------------------------------------------------------
 # internals
 
-_cache = {}  # ordered!
-
+# Use the fact that dict keeps the insertion order.
+# _cache2 uses the simple FIFO policy which has better latency.
+# _cache uses the LRU policy which has better hit rate.
+_cache = {}  # LRU
+_cache2 = {}  # FIFO
 _MAXCACHE = 512
+_MAXCACHE2 = 256
+assert _MAXCACHE2 < _MAXCACHE
+
 def _compile(pattern, flags):
     # internal: compile pattern
     if isinstance(flags, RegexFlag):
         flags = flags.value
     try:
-        return _cache[type(pattern), pattern, flags]
+        return _cache2[type(pattern), pattern, flags]
     except KeyError:
         pass
-    if isinstance(pattern, Pattern):
-        if flags:
-            raise ValueError(
-                "cannot process flags argument with a compiled pattern")
-        return pattern
-    if not _compiler.isstring(pattern):
-        raise TypeError("first argument must be string or compiled pattern")
-    if flags & T:
-        import warnings
-        warnings.warn("The re.TEMPLATE/re.T flag is deprecated "
-                  "as it is an undocumented flag "
-                  "without an obvious purpose. "
-                  "Don't use it.",
-                  DeprecationWarning)
-    p = _compiler.compile(pattern, flags)
-    if not (flags & DEBUG):
+
+    key = (type(pattern), pattern, flags)
+    # Item in _cache should be moved to the end if found.
+    p = _cache.pop(key, None)
+    if p is None:
+        if isinstance(pattern, Pattern):
+            if flags:
+                raise ValueError(
+                    "cannot process flags argument with a compiled pattern")
+            return pattern
+        if not _compiler.isstring(pattern):
+            raise TypeError("first argument must be string or compiled pattern")
+        if flags & T:
+            import warnings
+            warnings.warn("The re.TEMPLATE/re.T flag is deprecated "
+                    "as it is an undocumented flag "
+                    "without an obvious purpose. "
+                    "Don't use it.",
+                    DeprecationWarning)
+        p = _compiler.compile(pattern, flags)
+        if flags & DEBUG:
+            return p
         if len(_cache) >= _MAXCACHE:
-            # Drop the oldest item
+            # Drop the least recently used item.
+            # next(iter(_cache)) is known to have linear amortized time,
+            # but it is used here to avoid a dependency from using OrderedDict.
+            # For the small _MAXCACHE value it doesn't make much of a difference.
             try:
                 del _cache[next(iter(_cache))]
             except (StopIteration, RuntimeError, KeyError):
                 pass
-        _cache[type(pattern), pattern, flags] = p
+    # Append to the end.
+    _cache[key] = p
+
+    if len(_cache2) >= _MAXCACHE2:
+        # Drop the oldest item.
+        try:
+            del _cache2[next(iter(_cache2))]
+        except (StopIteration, RuntimeError, KeyError):
+            pass
+    _cache2[key] = p
     return p
 
 @functools.lru_cache(_MAXCACHE)
diff --git a/Misc/NEWS.d/next/Library/2022-08-27-23-16-09.gh-issue-96346.jJX14I.rst b/Misc/NEWS.d/next/Library/2022-08-27-23-16-09.gh-issue-96346.jJX14I.rst
new file mode 100644
index 0000000..9883348
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-08-27-23-16-09.gh-issue-96346.jJX14I.rst
@@ -0,0 +1 @@
+Use double caching for compiled RE patterns.