"pyzbar~=0.1.9",
"rapidfuzz~=3.14.0",
"redis[hiredis]~=5.2.1",
+ "regex>=2025.9.18",
"scikit-learn~=1.7.0",
"setproctitle~=1.3.4",
"tika-client~=0.10.0",
from documents.models import Workflow
from documents.models import WorkflowTrigger
from documents.permissions import get_objects_for_user_owner_aware
+from documents.regex import safe_regex_search
if TYPE_CHECKING:
from django.db.models import QuerySet
def matches(matching_model: MatchingModel, document: Document):
- search_kwargs = {}
+ search_flags = 0
document_content = document.content
return False
if matching_model.is_insensitive:
- search_kwargs = {"flags": re.IGNORECASE}
+ search_flags = re.IGNORECASE
if matching_model.matching_algorithm == MatchingModel.MATCH_NONE:
return False
elif matching_model.matching_algorithm == MatchingModel.MATCH_ALL:
for word in _split_match(matching_model):
- search_result = re.search(rf"\b{word}\b", document_content, **search_kwargs)
+ search_result = re.search(
+ rf"\b{word}\b",
+ document_content,
+ flags=search_flags,
+ )
if not search_result:
return False
log_reason(
elif matching_model.matching_algorithm == MatchingModel.MATCH_ANY:
for word in _split_match(matching_model):
- if re.search(rf"\b{word}\b", document_content, **search_kwargs):
+ if re.search(rf"\b{word}\b", document_content, flags=search_flags):
log_reason(matching_model, document, f"it contains this word: {word}")
return True
return False
re.search(
rf"\b{re.escape(matching_model.match)}\b",
document_content,
- **search_kwargs,
+ flags=search_flags,
),
)
if result:
return result
elif matching_model.matching_algorithm == MatchingModel.MATCH_REGEX:
- try:
- match = re.search(
- re.compile(matching_model.match, **search_kwargs),
- document_content,
- )
- except re.error:
- logger.error(
- f"Error while processing regular expression {matching_model.match}",
- )
- return False
+ match = safe_regex_search(
+ matching_model.match,
+ document_content,
+ flags=search_flags,
+ )
if match:
log_reason(
matching_model,
--- /dev/null
+from __future__ import annotations
+
+import logging
+import textwrap
+
+import regex
+from django.conf import settings
+
logger = logging.getLogger("paperless.regex")

# Timeout, in seconds, handed to each regex search run by this module. It is
# read once at import time from the optional MATCH_REGEX_TIMEOUT_SECONDS
# Django setting and falls back to 0.1 when that setting is not defined.
REGEX_TIMEOUT_SECONDS: float = getattr(settings, "MATCH_REGEX_TIMEOUT_SECONDS", 0.1)
+
+
def validate_regex_pattern(pattern: str) -> None:
    """
    Check that a user supplied regular expression compiles.

    Nothing is returned on success. A pattern the ``regex`` module rejects
    is reported as a ValueError carrying the compiler's message, chained to
    the original ``regex.error``.
    """
    try:
        regex.compile(pattern)
    except regex.error as compile_error:
        reason = compile_error.msg
        raise ValueError(reason) from compile_error
+
+
def _shorten_pattern(pattern: str, width: int = 80) -> str:
    """Return a single-line rendering of ``pattern`` at most ``width`` chars long."""
    # Collapse whitespace so the pattern always stays on one log line.
    collapsed = " ".join(pattern.split())
    if len(collapsed) <= width:
        return collapsed
    # Slice instead of using textwrap.shorten(): shorten() only cuts at
    # whitespace, so a long pattern without spaces (the usual shape of a
    # regex) would be reduced to the bare placeholder and lost from the log.
    return collapsed[: width - 1] + "…"


def safe_regex_search(pattern: str, text: str, *, flags: int = 0):
    """
    Run a regex search with a timeout. Returns a match object or None.
    Validation errors and timeouts are logged and treated as no match.

    ``pattern`` is the user supplied expression, ``text`` is the string to
    search and ``flags`` is passed unchanged to ``regex.compile``. The search
    itself is bounded by ``REGEX_TIMEOUT_SECONDS``.
    """

    try:
        validate_regex_pattern(pattern)
        compiled = regex.compile(pattern, flags=flags)
    except (regex.error, ValueError) as exc:
        logger.error(
            "Error while processing regular expression %s: %s",
            _shorten_pattern(pattern),
            exc,
        )
        return None

    try:
        return compiled.search(text, timeout=REGEX_TIMEOUT_SECONDS)
    except TimeoutError:
        logger.warning(
            "Regular expression matching timed out for pattern %s",
            _shorten_pattern(pattern),
        )
        return None
from documents.permissions import get_document_count_filter_for_user
from documents.permissions import get_groups_with_only_permission
from documents.permissions import set_permissions_for_object
+from documents.regex import validate_regex_pattern
from documents.templating.filepath import validate_filepath_template_and_render
from documents.templating.utils import convert_format_str_to_template_format
from documents.validators import uri_validator
and self.initial_data["matching_algorithm"] == MatchingModel.MATCH_REGEX
):
try:
- re.compile(match)
- except re.error as e:
+ validate_regex_pattern(match)
+ except ValueError as e:
raise serializers.ValidationError(
- _("Invalid regular expression: %(error)s") % {"error": str(e.msg)},
+ _("Invalid regular expression: %(error)s") % {"error": str(e)},
)
return match
def test_tach_invalid_regex(self):
self._test_matching("[", "MATCH_REGEX", [], ["Don't match this"])
+ def test_match_regex_timeout_returns_false(self):
+ tag = Tag.objects.create(
+ name="slow",
+ match=r"(a+)+$",
+ matching_algorithm=Tag.MATCH_REGEX,
+ )
+ document = Document(content=("a" * 5000) + "X")
+
+ with self.assertLogs("paperless.regex", level="WARNING") as cm:
+ self.assertFalse(matching.matches(tag, document))
+
+ self.assertTrue(
+ any("timed out" in message for message in cm.output),
+ f"Expected timeout log, got {cm.output}",
+ )
+
def test_match_fuzzy(self):
self._test_matching(
"Springfield, Miss.",
{ name = "pyzbar", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "rapidfuzz", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "redis", extra = ["hiredis"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
+ { name = "regex", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "scikit-learn", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "setproctitle", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "tika-client", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pyzbar", specifier = "~=0.1.9" },
{ name = "rapidfuzz", specifier = "~=3.14.0" },
{ name = "redis", extras = ["hiredis"], specifier = "~=5.2.1" },
+ { name = "regex", specifier = ">=2025.9.18" },
{ name = "scikit-learn", specifier = "~=1.7.0" },
{ name = "setproctitle", specifier = "~=1.3.4" },
{ name = "tika-client", specifier = "~=0.10.0" },