Merge commit from fork

author shamoon <4887959+shamoon@users.noreply.github.com>

Fri, 12 Dec 2025 17:28:47 +0000 (09:28 -0800)

committer GitHub <noreply@github.com>

Fri, 12 Dec 2025 17:28:47 +0000 (09:28 -0800)
author shamoon <4887959+shamoon@users.noreply.github.com>
Fri, 12 Dec 2025 17:28:47 +0000 (09:28 -0800)
committer GitHub <noreply@github.com>
Fri, 12 Dec 2025 17:28:47 +0000 (09:28 -0800)
diff --git a/pyproject.toml b/pyproject.toml

index 3108aacd0226a720078dabab5c21ab9c874a428a..60dab9f478d1d6659483db54b9d704224951bbac 100644 (file)
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,6 +63,7 @@ dependencies = [
    "pyzbar~=0.1.9",
    "rapidfuzz~=3.14.0",
    "redis[hiredis]~=5.2.1",
+  "regex>=2025.9.18",
    "scikit-learn~=1.7.0",
    "setproctitle~=1.3.4",
    "tika-client~=0.10.0",
diff --git a/src/documents/matching.py b/src/documents/matching.py

index 2c8d2bf87d9a42abfb9bcf1781da593653ccde80..198ead64cd9b1f74b6459ada2b4758c8d768fbab 100644 (file)
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -20,6 +20,7 @@ from documents.models import Tag
  from documents.models import Workflow
  from documents.models import WorkflowTrigger
  from documents.permissions import get_objects_for_user_owner_aware
+from documents.regex import safe_regex_search
  
  if TYPE_CHECKING:
      from django.db.models import QuerySet
@@ -152,7 +153,7 @@ def match_storage_paths(document: Document, classifier: DocumentClassifier, user
  
  
  def matches(matching_model: MatchingModel, document: Document):
-    search_kwargs = {}
+    search_flags = 0
  
      document_content = document.content
  
@@ -161,14 +162,18 @@ def matches(matching_model: MatchingModel, document: Document):
          return False
  
      if matching_model.is_insensitive:
-        search_kwargs = {"flags": re.IGNORECASE}
+        search_flags = re.IGNORECASE
  
      if matching_model.matching_algorithm == MatchingModel.MATCH_NONE:
          return False
  
      elif matching_model.matching_algorithm == MatchingModel.MATCH_ALL:
          for word in _split_match(matching_model):
-            search_result = re.search(rf"\b{word}\b", document_content, **search_kwargs)
+            search_result = re.search(
+                rf"\b{word}\b",
+                document_content,
+                flags=search_flags,
+            )
              if not search_result:
                  return False
          log_reason(
@@ -180,7 +185,7 @@ def matches(matching_model: MatchingModel, document: Document):
  
      elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
          for word in _split_match(matching_model):
-            if re.search(rf"\b{word}\b", document_content, **search_kwargs):
+            if re.search(rf"\b{word}\b", document_content, flags=search_flags):
                  log_reason(matching_model, document, f"it contains this word: {word}")
                  return True
          return False
@@ -190,7 +195,7 @@ def matches(matching_model: MatchingModel, document: Document):
              re.search(
                  rf"\b{re.escape(matching_model.match)}\b",
                  document_content,
-                **search_kwargs,
+                flags=search_flags,
              ),
          )
          if result:
@@ -202,16 +207,11 @@ def matches(matching_model: MatchingModel, document: Document):
          return result
  
      elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
-        try:
-            match = re.search(
-                re.compile(matching_model.match, **search_kwargs),
-                document_content,
-            )
-        except re.error:
-            logger.error(
-                f"Error while processing regular expression {matching_model.match}",
-            )
-            return False
+        match = safe_regex_search(
+            matching_model.match,
+            document_content,
+            flags=search_flags,
+        )
          if match:
              log_reason(
                  matching_model,
diff --git a/src/documents/regex.py b/src/documents/regex.py

new file mode 100644 (file)

index 0000000..35acc5a
--- /dev/null
+++ b/src/documents/regex.py
@@ -0,0 +1,50 @@
+from __future__ import annotations
+
+import logging
+import textwrap
+
+import regex
+from django.conf import settings
+
+logger = logging.getLogger("paperless.regex")
+
+REGEX_TIMEOUT_SECONDS: float = getattr(settings, "MATCH_REGEX_TIMEOUT_SECONDS", 0.1)
+
+
+def validate_regex_pattern(pattern: str) -> None:
+    """
+    Validate user provided regex for basic compile errors.
+    Raises ValueError on validation failure.
+    """
+
+    try:
+        regex.compile(pattern)
+    except regex.error as exc:
+        raise ValueError(exc.msg) from exc
+
+
+def safe_regex_search(pattern: str, text: str, *, flags: int = 0):
+    """
+    Run a regex search with a timeout. Returns a match object or None.
+    Validation errors and timeouts are logged and treated as no match.
+    """
+
+    try:
+        validate_regex_pattern(pattern)
+        compiled = regex.compile(pattern, flags=flags)
+    except (regex.error, ValueError) as exc:
+        logger.error(
+            "Error while processing regular expression %s: %s",
+            textwrap.shorten(pattern, width=80, placeholder="…"),
+            exc,
+        )
+        return None
+
+    try:
+        return compiled.search(text, timeout=REGEX_TIMEOUT_SECONDS)
+    except TimeoutError:
+        logger.warning(
+            "Regular expression matching timed out for pattern %s",
+            textwrap.shorten(pattern, width=80, placeholder="…"),
+        )
+        return None
diff --git a/src/documents/serialisers.py b/src/documents/serialisers.py

index 6265d291c27b5569d4c4bbdba3bff5b4366764d1..f4518c04fc2e3f032e4af1bd4b6324ad36b82511 100644 (file)
--- a/src/documents/serialisers.py
+++ b/src/documents/serialisers.py
@@ -71,6 +71,7 @@ from documents.parsers import is_mime_type_supported
  from documents.permissions import get_document_count_filter_for_user
  from documents.permissions import get_groups_with_only_permission
  from documents.permissions import set_permissions_for_object
+from documents.regex import validate_regex_pattern
  from documents.templating.filepath import validate_filepath_template_and_render
  from documents.templating.utils import convert_format_str_to_template_format
  from documents.validators import uri_validator
@@ -141,10 +142,10 @@ class MatchingModelSerializer(serializers.ModelSerializer):
              and self.initial_data["matching_algorithm"] == MatchingModel.MATCH_REGEX
          ):
              try:
-                re.compile(match)
-            except re.error as e:
+                validate_regex_pattern(match)
+            except ValueError as e:
                  raise serializers.ValidationError(
-                    _("Invalid regular expression: %(error)s") % {"error": str(e.msg)},
+                    _("Invalid regular expression: %(error)s") % {"error": str(e)},
                  )
          return match
  
diff --git a/src/documents/tests/test_matchables.py b/src/documents/tests/test_matchables.py

index 180cf77ed3a59b8dee4062744e3ef4eb5e514474..8b2a7a463364505e8dac2906bd0d4f7cea86f98d 100644 (file)
--- a/src/documents/tests/test_matchables.py
+++ b/src/documents/tests/test_matchables.py
@@ -206,6 +206,22 @@ class TestMatching(_TestMatchingBase):
      def test_tach_invalid_regex(self):
          self._test_matching("[", "MATCH_REGEX", [], ["Don't match this"])
  
+    def test_match_regex_timeout_returns_false(self):
+        tag = Tag.objects.create(
+            name="slow",
+            match=r"(a+)+$",
+            matching_algorithm=Tag.MATCH_REGEX,
+        )
+        document = Document(content=("a" * 5000) + "X")
+
+        with self.assertLogs("paperless.regex", level="WARNING") as cm:
+            self.assertFalse(matching.matches(tag, document))
+
+        self.assertTrue(
+            any("timed out" in message for message in cm.output),
+            f"Expected timeout log, got {cm.output}",
+        )
+
      def test_match_fuzzy(self):
          self._test_matching(
              "Springfield, Miss.",
diff --git a/uv.lock b/uv.lock

index ff0bb6b5b95292825a73dcdedd64105989f97abb..69d1f50bb0e371188c3b45760dfac986a2585687 100644 (file)
--- a/uv.lock
+++ b/uv.lock
@@ -2163,6 +2163,7 @@ dependencies = [
      { name = "pyzbar", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
      { name = "rapidfuzz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
      { name = "redis", extra = ["hiredis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+    { name = "regex", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
      { name = "scikit-learn", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
      { name = "setproctitle", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
      { name = "tika-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -2306,6 +2307,7 @@ requires-dist = [
      { name = "pyzbar", specifier = "~=0.1.9" },
      { name = "rapidfuzz", specifier = "~=3.14.0" },
      { name = "redis", extras = ["hiredis"], specifier = "~=5.2.1" },
+    { name = "regex", specifier = ">=2025.9.18" },
      { name = "scikit-learn", specifier = "~=1.7.0" },
      { name = "setproctitle", specifier = "~=1.3.4" },
      { name = "tika-client", specifier = "~=0.10.0" },
author	shamoon <4887959+shamoon@users.noreply.github.com>
	Fri, 12 Dec 2025 17:28:47 +0000 (09:28 -0800)
committer	GitHub <noreply@github.com>
	Fri, 12 Dec 2025 17:28:47 +0000 (09:28 -0800)
pyproject.toml		patch | blob | blame | history
src/documents/matching.py		patch | blob | blame | history
src/documents/regex.py	[new file with mode: 0644]	patch | blob
src/documents/serialisers.py		patch | blob | blame | history
src/documents/tests/test_matchables.py		patch | blob | blame | history
uv.lock		patch | blob | blame | history