+import functools
+import inspect
+import json
+import operator
+from contextlib import contextmanager
+from typing import Callable
+from typing import Union
+
from django.contrib.contenttypes.models import ContentType
from django.db.models import CharField
from django.db.models import Count
from django.db.models import OuterRef
from django.db.models import Q
from django.db.models.functions import Cast
+from django.utils.translation import gettext_lazy as _
from django_filters.rest_framework import BooleanFilter
from django_filters.rest_framework import Filter
from django_filters.rest_framework import FilterSet
from guardian.utils import get_group_obj_perms_model
from guardian.utils import get_user_obj_perms_model
+from rest_framework import serializers
from rest_framework_guardian.filters import ObjectPermissionsFilter
from documents.models import Correspondent
from documents.models import CustomField
+from documents.models import CustomFieldInstance
from documents.models import Document
from documents.models import DocumentType
from documents.models import Log
from documents.models import ShareLink
from documents.models import StoragePath
from documents.models import Tag
+from paperless import settings
CHAR_KWARGS = ["istartswith", "iendswith", "icontains", "iexact"]
ID_KWARGS = ["in", "exact"]
return qs
+class SelectField(serializers.IntegerField):
+ def __init__(self, custom_field: CustomField):
+ self._options = custom_field.extra_data["select_options"]
+        # Valid values are option indices, i.e., 0 .. len(options) - 1.
+        super().__init__(min_value=0, max_value=len(self._options) - 1)
+
+ def to_internal_value(self, data):
+ if not isinstance(data, int):
+ # If the supplied value is not an integer,
+ # we will try to map it to an option index.
+ try:
+ data = self._options.index(data)
+ except ValueError:
+ pass
+ return super().to_internal_value(data)
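+
+    # Illustrative usage, assuming a SELECT field whose options are ["A", "B", "C"]:
+    #     SelectField(custom_field).run_validation("B")  # -> 1 (name mapped to index)
+    #     SelectField(custom_field).run_validation(1)    # -> 1 (index accepted as-is)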
+
+
+def handle_validation_prefix(func: Callable):
+ """
+ Catch ValidationErrors raised by the wrapped function
+    and add a prefix to the exception detail to track where the exception originated,
+    similar to how nested serializers report errors.
+ """
+
+ def wrapper(*args, validation_prefix=None, **kwargs):
+ try:
+ return func(*args, **kwargs)
+ except serializers.ValidationError as e:
+ raise serializers.ValidationError({validation_prefix: e.detail})
+
+ # Update the signature to include the validation_prefix argument
+ old_sig = inspect.signature(func)
+ new_param = inspect.Parameter("validation_prefix", inspect.Parameter.KEYWORD_ONLY)
+ new_sig = old_sig.replace(parameters=[*old_sig.parameters.values(), new_param])
+
+ # Apply functools.wraps and manually set the new signature
+ functools.update_wrapper(wrapper, func)
+ wrapper.__signature__ = new_sig
+
+ return wrapper
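+
+
+# Illustrative example: a function wrapped with handle_validation_prefix gains a
+# keyword-only `validation_prefix` argument, e.g.
+#     self._parse_expr(expr, validation_prefix="1")
+# A ValidationError raised inside then surfaces as {"1": <original detail>},
+# nesting one level per decorated call up the stack.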
+
+
+class CustomFieldLookupParser:
+ EXPR_BY_CATEGORY = {
+ "basic": ["exact", "in", "isnull", "exists"],
+ "string": [
+ "iexact",
+ "contains",
+ "icontains",
+ "startswith",
+ "istartswith",
+ "endswith",
+ "iendswith",
+ "regex",
+ "iregex",
+ ],
+ "arithmetic": [
+ "gt",
+ "gte",
+ "lt",
+ "lte",
+ "range",
+ ],
+ "containment": ["contains"],
+ }
+
+    # These string lookup expressions are problematic, so they are disabled
+    # by default unless the user explicitly opts in.
+ STR_EXPR_DISABLED_BY_DEFAULT = [
+ # SQLite: is case-sensitive outside the ASCII range
+ "iexact",
+ # SQLite: behaves the same as icontains
+ "contains",
+ # SQLite: behaves the same as istartswith
+ "startswith",
+ # SQLite: behaves the same as iendswith
+ "endswith",
+ # Syntax depends on database backends, can be exploited for ReDoS
+ "regex",
+ # Syntax depends on database backends, can be exploited for ReDoS
+ "iregex",
+ ]
+
+ SUPPORTED_EXPR_CATEGORIES = {
+ CustomField.FieldDataType.STRING: ("basic", "string"),
+ CustomField.FieldDataType.URL: ("basic", "string"),
+ CustomField.FieldDataType.DATE: ("basic", "arithmetic"),
+ CustomField.FieldDataType.BOOL: ("basic",),
+ CustomField.FieldDataType.INT: ("basic", "arithmetic"),
+ CustomField.FieldDataType.FLOAT: ("basic", "arithmetic"),
+ CustomField.FieldDataType.MONETARY: ("basic", "string"),
+ CustomField.FieldDataType.DOCUMENTLINK: ("basic", "containment"),
+ CustomField.FieldDataType.SELECT: ("basic",),
+ }
+
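+    # Valid operator prefixes for DATE fields: e.g., "year__gte" compares only
+    # the year component of the date (see _validate_atom_op).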
+ DATE_COMPONENTS = [
+ "year",
+ "iso_year",
+ "month",
+ "day",
+ "week",
+ "week_day",
+ "iso_week_day",
+ "quarter",
+ ]
+
+ def __init__(
+ self,
+ validation_prefix,
+ max_query_depth=10,
+ max_atom_count=20,
+ ) -> None:
+ """
+ A helper class that parses the query string into a `django.db.models.Q` for filtering
+ documents based on custom field values.
+
+        The syntax of the query expression is illustrated by the pseudocode rules below:
+ 1. parse([`custom_field`, "exists", true]):
+ matches documents with Q(custom_fields__field=`custom_field`)
+ 2. parse([`custom_field`, "exists", false]):
+ matches documents with ~Q(custom_fields__field=`custom_field`)
+ 3. parse([`custom_field`, `op`, `value`]):
+ matches documents with
+            Q(custom_fields__field=`custom_field`, custom_fields__value_`type`__`op`=`value`)
+ 4. parse(["AND", [`q0`, `q1`, ..., `qn`]])
+ -> parse(`q0`) & parse(`q1`) & ... & parse(`qn`)
+ 5. parse(["OR", [`q0`, `q1`, ..., `qn`]])
+ -> parse(`q0`) | parse(`q1`) | ... | parse(`qn`)
+ 6. parse(["NOT", `q`])
+ -> ~parse(`q`)
+
+ Args:
+ validation_prefix: Used to generate the ValidationError message.
+ max_query_depth: Limits the maximum nesting depth of queries.
+ max_atom_count: Limits the maximum number of atoms (i.e., rule 1, 2, 3) in the query.
+
+ `max_query_depth` and `max_atom_count` can be set to guard against generating arbitrarily
+ complex SQL queries.
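+
+        Example (illustrative, assuming custom fields named "due" and "status" exist):
+            ["AND", [["due", "year__exact", 2024], ["status", "exact", "open"]]]
+            matches documents whose "due" date falls in 2024 and whose "status"
+            value is exactly "open".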
+ """
+ self._custom_fields: dict[Union[int, str], CustomField] = {}
+ self._validation_prefix = validation_prefix
+ # Dummy ModelSerializer used to convert a Django models.Field to serializers.Field.
+ self._model_serializer = serializers.ModelSerializer()
+        # Used for sanity checks
+ self._max_query_depth = max_query_depth
+ self._max_atom_count = max_atom_count
+ self._current_depth = 0
+ self._atom_count = 0
+ # The set of annotations that we need to apply to the queryset
+ self._annotations = {}
+
+ def parse(self, query: str) -> tuple[Q, dict[str, Count]]:
+ """
+ Parses the query string into a `django.db.models.Q`
+ and a set of annotations to be applied to the queryset.
+ """
+ try:
+ expr = json.loads(query)
+ except json.JSONDecodeError:
+ raise serializers.ValidationError(
+ {self._validation_prefix: [_("Value must be valid JSON.")]},
+ )
+ return (
+ self._parse_expr(expr, validation_prefix=self._validation_prefix),
+ self._annotations,
+ )
+
+ @handle_validation_prefix
+ def _parse_expr(self, expr) -> Q:
+ """
+        Applies rules 1-3 or 4-6 based on the length of `expr`.
+ """
+ with self._track_query_depth():
+ if isinstance(expr, (list, tuple)):
+ if len(expr) == 2:
+ return self._parse_logical_expr(*expr)
+ elif len(expr) == 3:
+ return self._parse_atom(*expr)
+ raise serializers.ValidationError(
+ [_("Invalid custom field lookup expression")],
+ )
+
+ @handle_validation_prefix
+ def _parse_expr_list(self, exprs) -> list[Q]:
+ """
+        Handles [`q0`, `q1`, ..., `qn`] in rules 4 & 5.
+ """
+ if not isinstance(exprs, (list, tuple)) or not exprs:
+ raise serializers.ValidationError(
+ [_("Invalid expression list. Must be nonempty.")],
+ )
+ return [
+ self._parse_expr(expr, validation_prefix=i) for i, expr in enumerate(exprs)
+ ]
+
+ def _parse_logical_expr(self, op, args) -> Q:
+ """
+        Handles rules 4, 5, and 6.
+ """
+        op_lower = str(op).lower()
+
+ if op_lower == "not":
+ return ~self._parse_expr(args, validation_prefix=1)
+
+ if op_lower == "and":
+ op_func = operator.and_
+ elif op_lower == "or":
+ op_func = operator.or_
+ else:
+ raise serializers.ValidationError(
+ {"0": [_("Invalid logical operator {op!r}").format(op=op)]},
+ )
+
+ qs = self._parse_expr_list(args, validation_prefix="1")
+ return functools.reduce(op_func, qs)
+
+ def _parse_atom(self, id_or_name, op, value) -> Q:
+ """
+        Handles rules 1, 2, and 3.
+ """
+ # Guard against queries with too many conditions.
+ self._atom_count += 1
+ if self._atom_count > self._max_atom_count:
+ raise serializers.ValidationError(
+ [
+ _(
+ "Maximum number of query conditions exceeded. You can raise "
+ "the limit by setting PAPERLESS_CUSTOM_FIELD_LOOKUP_MAX_ATOMS "
+ "in your configuration file.",
+ ),
+ ],
+ )
+
+ custom_field = self._get_custom_field(id_or_name, validation_prefix="0")
+ op = self._validate_atom_op(custom_field, op, validation_prefix="1")
+ value = self._validate_atom_value(
+ custom_field,
+ op,
+ value,
+ validation_prefix="2",
+ )
+
+ # Needed because not all DB backends support Array __contains
+ if (
+ custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK
+ and op == "contains"
+ ):
+ return self._parse_atom_doc_link_contains(custom_field, value)
+
+ value_field_name = CustomFieldInstance.get_value_field_name(
+ custom_field.data_type,
+ )
+ has_field = Q(custom_fields__field=custom_field)
+
+ # Our special exists operator.
+ if op == "exists":
+ field_filter = has_field if value else ~has_field
+ else:
+ field_filter = has_field & Q(
+ **{f"custom_fields__{value_field_name}__{op}": value},
+ )
+
+        # We need to use an annotation here because different atoms
+        # may refer to different custom field instances of the same document.
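+        # For example, ["AND", [["a", "exists", true], ["b", "exists", true]]]
+        # (hypothetical fields "a" and "b") needs two independent filtered Counts;
+        # putting both conditions into one .filter() would require a single
+        # CustomFieldInstance row to match both fields at once.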
+ annotation_name = f"_custom_field_filter_{len(self._annotations)}"
+ self._annotations[annotation_name] = Count("custom_fields", filter=field_filter)
+
+ return Q(**{f"{annotation_name}__gt": 0})
+
+ @handle_validation_prefix
+ def _get_custom_field(self, id_or_name):
+ """Get the CustomField instance by id or name."""
+ if id_or_name in self._custom_fields:
+ return self._custom_fields[id_or_name]
+
+ kwargs = (
+ {"id": id_or_name} if isinstance(id_or_name, int) else {"name": id_or_name}
+ )
+ try:
+ custom_field = CustomField.objects.get(**kwargs)
+ except CustomField.DoesNotExist:
+ raise serializers.ValidationError(
+ [_("{name!r} is not a valid custom field.").format(name=id_or_name)],
+ )
+ self._custom_fields[custom_field.id] = custom_field
+ self._custom_fields[custom_field.name] = custom_field
+ return custom_field
+
+ @staticmethod
+ def _split_op(full_op):
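+        # E.g., "year__exact" -> ("year", "exact"); "exact" -> (None, "exact").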
+ *prefix, op = str(full_op).rsplit("__", maxsplit=1)
+ prefix = prefix[0] if prefix else None
+ return prefix, op
+
+ @handle_validation_prefix
+ def _validate_atom_op(self, custom_field, raw_op):
+ """Check if the `op` is compatible with the type of the custom field."""
+ prefix, op = self._split_op(raw_op)
+
+ # Check if the operator is supported for the current data_type.
+ supported = False
+ for category in self.SUPPORTED_EXPR_CATEGORIES[custom_field.data_type]:
+ if (
+ category == "string"
+ and op in self.STR_EXPR_DISABLED_BY_DEFAULT
+ and op not in settings.CUSTOM_FIELD_LOOKUP_OPT_IN
+ ):
+ raise serializers.ValidationError(
+ [
+ _(
+ "{expr!r} is disabled by default because it does not "
+ "behave consistently across database backends, or can "
+ "cause security risks. If you understand the implications "
+ "you may enabled it by adding it to "
+ "`PAPERLESS_CUSTOM_FIELD_LOOKUP_OPT_IN`.",
+ ).format(expr=op),
+ ],
+ )
+ if op in self.EXPR_BY_CATEGORY[category]:
+ supported = True
+ break
+
+ # Check prefix
+ if prefix is not None:
+ if (
+ prefix in self.DATE_COMPONENTS
+ and custom_field.data_type == CustomField.FieldDataType.DATE
+ ):
+ pass # ok - e.g., "year__exact" for date field
+ else:
+ supported = False # anything else is invalid
+
+ if not supported:
+ raise serializers.ValidationError(
+ [
+ _("{data_type} does not support lookup expr {expr!r}.").format(
+ data_type=custom_field.data_type,
+ expr=raw_op,
+ ),
+ ],
+ )
+
+ return raw_op
+
+ def _get_serializer_field(self, custom_field, full_op):
+ """Return a serializers.Field for value validation."""
+ prefix, op = self._split_op(full_op)
+ field = None
+
+ if op in ("isnull", "exists"):
+            # `isnull` and `exists` take either True or False regardless of the data_type.
+ field = serializers.BooleanField()
+ elif (
+ custom_field.data_type == CustomField.FieldDataType.DATE
+ and prefix in self.DATE_COMPONENTS
+ ):
+ # DateField admits lookups in the form of `year__exact`, etc. These take integers.
+ field = serializers.IntegerField()
+ elif custom_field.data_type == CustomField.FieldDataType.DOCUMENTLINK:
+ # We can be more specific here and make sure the value is a list.
+ field = serializers.ListField(child=serializers.IntegerField())
+ elif custom_field.data_type == CustomField.FieldDataType.SELECT:
+            # Our custom SelectField also accepts option names in place of indices.
+ field = SelectField(custom_field)
+ elif custom_field.data_type == CustomField.FieldDataType.URL:
+ # For URL fields we don't need to be strict about validation (e.g., for istartswith).
+ field = serializers.CharField()
+ else:
+ # The general case: inferred from the corresponding field in CustomFieldInstance.
+ value_field_name = CustomFieldInstance.get_value_field_name(
+ custom_field.data_type,
+ )
+ model_field = CustomFieldInstance._meta.get_field(value_field_name)
+ field_name = model_field.deconstruct()[0]
+ field_class, field_kwargs = self._model_serializer.build_standard_field(
+ field_name,
+ model_field,
+ )
+ field = field_class(**field_kwargs)
+ field.allow_null = False
+
+ # Need to set allow_blank manually because of the inconsistency in CustomFieldInstance validation.
+ # See https://github.com/paperless-ngx/paperless-ngx/issues/7361.
+ if isinstance(field, serializers.CharField):
+ field.allow_blank = True
+
+ if op == "in":
+ # `in` takes a list of values.
+ field = serializers.ListField(child=field, allow_empty=False)
+ elif op == "range":
+ # `range` takes a list of values, i.e., [start, end].
+ field = serializers.ListField(
+ child=field,
+ min_length=2,
+ max_length=2,
+ )
+
+ return field
+
+ @handle_validation_prefix
+ def _validate_atom_value(self, custom_field, op, value):
+ """Check if `value` is valid for the custom field and `op`. Returns the validated value."""
+ serializer_field = self._get_serializer_field(custom_field, op)
+ return serializer_field.run_validation(value)
+
+ def _parse_atom_doc_link_contains(self, custom_field, value) -> Q:
+ """
+ Handles document link `contains` in a way that is supported by all DB backends.
+ """
+
+        # An empty set is a subset of any set, so this trivially matches any
+        # document whose document link field is not null.
+ if not value:
+ return Q(
+ custom_fields__field=custom_field,
+ custom_fields__value_document_ids__isnull=False,
+ )
+
+        # First we look up reverse links from the requested documents.
+ links = CustomFieldInstance.objects.filter(
+ document_id__in=value,
+ field__data_type=CustomField.FieldDataType.DOCUMENTLINK,
+ )
+
+ # Check if any of the requested IDs are missing.
+        missing_ids = set(value) - {link.document_id for link in links}
+ if missing_ids:
+ # The result should be an empty set in this case.
+ return Q(id__in=[])
+
+        # The intersection of the reverse links gives exactly the documents
+        # whose field value contains every requested ID.
+ document_ids_we_want = functools.reduce(
+ operator.and_,
+ (set(link.value_document_ids) for link in links),
+ )
+
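+        # Illustrative trace with hypothetical IDs: for value=[1, 2], if document 1
+        # is reverse-linked by documents {10, 11} and document 2 by {10}, only
+        # document 10 links to both, so the result is Q(id__in={10}).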
+ return Q(id__in=document_ids_we_want)
+
+ @contextmanager
+ def _track_query_depth(self):
+        # Guard against queries that are too deeply nested.
+ self._current_depth += 1
+ if self._current_depth > self._max_query_depth:
+ raise serializers.ValidationError(
+ [
+ _(
+ "Maximum nesting depth exceeded. You can raise the limit "
+ "by setting PAPERLESS_CUSTOM_FIELD_LOOKUP_MAX_DEPTH in "
+ "your configuration file.",
+ ),
+ ],
+ )
+ try:
+ yield
+ finally:
+ self._current_depth -= 1
+
+
+class CustomFieldLookupFilter(Filter):
+ def __init__(self, validation_prefix):
+ """
+ A filter that filters documents based on custom field name and value.
+
+ Args:
+ validation_prefix: Used to generate the ValidationError message.
+ """
+ super().__init__()
+ self._validation_prefix = validation_prefix
+
+ def filter(self, qs, value):
+ if not value:
+ return qs
+
+ parser = CustomFieldLookupParser(
+ self._validation_prefix,
+ max_query_depth=settings.CUSTOM_FIELD_LOOKUP_MAX_DEPTH,
+ max_atom_count=settings.CUSTOM_FIELD_LOOKUP_MAX_ATOMS,
+ )
+ q, annotations = parser.parse(value)
+
+ return qs.annotate(**annotations).filter(q)
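+
+
+# Illustrative request, assuming a custom field named "due" exists (the query
+# string would be URL-encoded in practice):
+#     GET /api/documents/?custom_field_lookup=["due", "year__exact", 2024]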
+
+
class DocumentFilterSet(FilterSet):
is_tagged = BooleanFilter(
label="Is tagged",
exclude=True,
)
+ custom_field_lookup = CustomFieldLookupFilter("custom_field_lookup")
+
shared_by__id = SharedByUser()
class Meta:
--- /dev/null
+import json
+import re
+from datetime import date
+from typing import Callable
+from unittest.mock import Mock
+from urllib.parse import quote
+
+import pytest
+from django.contrib.auth.models import User
+from rest_framework.test import APITestCase
+
+from documents.models import CustomField
+from documents.models import Document
+from documents.serialisers import DocumentSerializer
+from documents.tests.utils import DirectoriesMixin
+from paperless import settings
+
+
+class DocumentWrapper:
+ """
+ Allows Pythonic access to the custom fields associated with the wrapped document.
+ """
+
+ def __init__(self, document: Document) -> None:
+ self._document = document
+
+ def __contains__(self, custom_field: str) -> bool:
+ return self._document.custom_fields.filter(field__name=custom_field).exists()
+
+ def __getitem__(self, custom_field: str):
+ return self._document.custom_fields.get(field__name=custom_field).value
+
+
+def string_expr_opted_in(op):
+ return op in settings.CUSTOM_FIELD_LOOKUP_OPT_IN
+
+
+class TestDocumentSearchApi(DirectoriesMixin, APITestCase):
+ def setUp(self):
+ super().setUp()
+
+ self.user = User.objects.create_superuser(username="temp_admin")
+ self.client.force_authenticate(user=self.user)
+
+ # Create one custom field per type. The fields are called f"{type}_field".
+ self.custom_fields = {}
+ for data_type in CustomField.FieldDataType.values:
+ name = data_type + "_field"
+ self.custom_fields[name] = CustomField.objects.create(
+ name=name,
+ data_type=data_type,
+ )
+
+ # Add some options to the select_field
+ select = self.custom_fields["select_field"]
+ select.extra_data = {"select_options": ["A", "B", "C"]}
+ select.save()
+
+ # Now we will create some test documents
+ self.documents = []
+
+ # CustomField.FieldDataType.STRING
+ self._create_document(string_field=None)
+ self._create_document(string_field="")
+ self._create_document(string_field="paperless")
+ self._create_document(string_field="Paperless")
+ self._create_document(string_field="PAPERLESS")
+ self._create_document(string_field="pointless")
+ self._create_document(string_field="pointy")
+
+ # CustomField.FieldDataType.URL
+ self._create_document(url_field=None)
+ self._create_document(url_field="")
+ self._create_document(url_field="https://docs.paperless-ngx.com/")
+ self._create_document(url_field="https://www.django-rest-framework.org/")
+ self._create_document(url_field="http://example.com/")
+
+ # A document to check if the filter correctly associates field names with values.
+ # E.g., ["url_field", "exact", "https://docs.paperless-ngx.com/"] should not
+ # yield this document.
+ self._create_document(
+ string_field="https://docs.paperless-ngx.com/",
+ url_field="http://example.com/",
+ )
+
+ # CustomField.FieldDataType.DATE
+ self._create_document(date_field=None)
+ self._create_document(date_field=date(2023, 8, 22))
+ self._create_document(date_field=date(2024, 8, 22))
+ self._create_document(date_field=date(2024, 11, 15))
+
+ # CustomField.FieldDataType.BOOL
+ self._create_document(boolean_field=None)
+ self._create_document(boolean_field=True)
+ self._create_document(boolean_field=False)
+
+ # CustomField.FieldDataType.INT
+ self._create_document(integer_field=None)
+ self._create_document(integer_field=-1)
+ self._create_document(integer_field=0)
+ self._create_document(integer_field=1)
+
+ # CustomField.FieldDataType.FLOAT
+ self._create_document(float_field=None)
+ self._create_document(float_field=-1e9)
+ self._create_document(float_field=0.05)
+ self._create_document(float_field=270.0)
+
+ # CustomField.FieldDataType.MONETARY
+ self._create_document(monetary_field=None)
+ self._create_document(monetary_field="USD100.00")
+ self._create_document(monetary_field="USD1.00")
+ self._create_document(monetary_field="EUR50.00")
+
+ # CustomField.FieldDataType.DOCUMENTLINK
+ self._create_document(documentlink_field=None)
+ self._create_document(documentlink_field=[])
+ self._create_document(
+ documentlink_field=[
+ self.documents[0].id,
+ self.documents[1].id,
+ self.documents[2].id,
+ ],
+ )
+ self._create_document(
+ documentlink_field=[self.documents[4].id, self.documents[5].id],
+ )
+
+ # CustomField.FieldDataType.SELECT
+ self._create_document(select_field=None)
+ self._create_document(select_field=0)
+ self._create_document(select_field=1)
+ self._create_document(select_field=2)
+
+ def _create_document(self, **kwargs):
+ title = str(kwargs)
+ document = Document.objects.create(
+ title=title,
+ checksum=title,
+ archive_serial_number=len(self.documents) + 1,
+ )
+ data = {
+ "custom_fields": [
+ {"field": self.custom_fields[name].id, "value": value}
+ for name, value in kwargs.items()
+ ],
+ }
+ serializer = DocumentSerializer(
+ document,
+ data=data,
+ partial=True,
+ context={"request": Mock()},
+ )
+ serializer.is_valid(raise_exception=True)
+ serializer.save()
+ self.documents.append(document)
+ return document
+
+ def _assert_query_match_predicate(
+ self,
+ query: list,
+ reference_predicate: Callable[[DocumentWrapper], bool],
+ match_nothing_ok=False,
+ ):
+ """
+ Checks the results of the query against a callable reference predicate.
+ """
+ reference_document_ids = [
+ document.id
+ for document in self.documents
+ if reference_predicate(DocumentWrapper(document))
+ ]
+        # First, sanity-check our test cases.
+ if not match_nothing_ok:
+ self.assertTrue(
+ reference_document_ids,
+ msg="Bad test case - should match at least one document.",
+ )
+ self.assertNotEqual(
+ len(reference_document_ids),
+ len(self.documents),
+ msg="Bad test case - should not match all documents.",
+ )
+
+ # Now make the API call.
+ query_string = quote(json.dumps(query), safe="")
+ response = self.client.get(
+ "/api/documents/?"
+ + "&".join(
+ (
+ f"custom_field_lookup={query_string}",
+ "ordering=archive_serial_number",
+ "page=1",
+ f"page_size={len(self.documents)}",
+ "truncate_content=true",
+ ),
+ ),
+ )
+ self.assertEqual(response.status_code, 200, msg=str(response.json()))
+ response_document_ids = [
+ document["id"] for document in response.json()["results"]
+ ]
+ self.assertEqual(reference_document_ids, response_document_ids)
+
+ def _assert_validation_error(self, query: str, path: list, keyword: str):
+ """
+ Asserts that the query raises a validation error.
+ Checks the message to make sure it points to the right place.
+ """
+ query_string = quote(query, safe="")
+ response = self.client.get(
+ "/api/documents/?"
+ + "&".join(
+ (
+ f"custom_field_lookup={query_string}",
+ "ordering=archive_serial_number",
+ "page=1",
+ f"page_size={len(self.documents)}",
+ "truncate_content=true",
+ ),
+ ),
+ )
+ self.assertEqual(response.status_code, 400)
+
+ exception_path = []
+ detail = response.json()
+ while not isinstance(detail, list):
+ path_item, detail = next(iter(detail.items()))
+ exception_path.append(path_item)
+
+ self.assertEqual(path, exception_path)
+ self.assertIn(keyword, " ".join(detail))
+
+ # ==========================================================#
+ # Sanity checks #
+ # ==========================================================#
+ def test_name_value_association(self):
+ """
+ GIVEN:
+ - A document with `{"string_field": "https://docs.paperless-ngx.com/",
+ "url_field": "http://example.com/"}`
+ WHEN:
+ - Filtering by `["url_field", "exact", "https://docs.paperless-ngx.com/"]`
+ THEN:
+ - That document should not get matched.
+ """
+ self._assert_query_match_predicate(
+ ["url_field", "exact", "https://docs.paperless-ngx.com/"],
+ lambda document: "url_field" in document
+ and document["url_field"] == "https://docs.paperless-ngx.com/",
+ )
+
+ def test_filter_by_multiple_fields(self):
+ """
+ GIVEN:
+ - A document with `{"string_field": "https://docs.paperless-ngx.com/",
+ "url_field": "http://example.com/"}`
+ WHEN:
+ - Filtering by `['AND', [["string_field", "exists", True], ["url_field", "exists", True]]]`
+ THEN:
+ - That document should get matched.
+ """
+ self._assert_query_match_predicate(
+ ["AND", [["string_field", "exists", True], ["url_field", "exists", True]]],
+ lambda document: "url_field" in document and "string_field" in document,
+ )
+
+ # ==========================================================#
+ # Basic expressions supported by all custom field types #
+ # ==========================================================#
+ def test_exact(self):
+ self._assert_query_match_predicate(
+ ["string_field", "exact", "paperless"],
+ lambda document: "string_field" in document
+ and document["string_field"] == "paperless",
+ )
+
+ def test_in(self):
+ self._assert_query_match_predicate(
+ ["string_field", "in", ["paperless", "Paperless"]],
+ lambda document: "string_field" in document
+ and document["string_field"] in ("paperless", "Paperless"),
+ )
+
+ def test_isnull(self):
+ self._assert_query_match_predicate(
+ ["string_field", "isnull", True],
+ lambda document: "string_field" in document
+ and document["string_field"] is None,
+ )
+
+ def test_exists(self):
+ self._assert_query_match_predicate(
+ ["string_field", "exists", True],
+ lambda document: "string_field" in document,
+ )
+
+ def test_select(self):
+        # For select fields, you can specify either the index or the name
+        # of the option; they behave identically.
+ self._assert_query_match_predicate(
+ ["select_field", "exact", 1],
+ lambda document: "select_field" in document
+ and document["select_field"] == 1,
+ )
+ # This is the same as:
+ self._assert_query_match_predicate(
+ ["select_field", "exact", "B"],
+ lambda document: "select_field" in document
+ and document["select_field"] == 1,
+ )
+
+ # ==========================================================#
+ # Expressions for string, URL, and monetary fields #
+ # ==========================================================#
+ @pytest.mark.skipif(
+ not string_expr_opted_in("iexact"),
+ reason="iexact expr is disabled.",
+ )
+ def test_iexact(self):
+ self._assert_query_match_predicate(
+ ["string_field", "iexact", "paperless"],
+ lambda document: "string_field" in document
+ and document["string_field"] is not None
+ and document["string_field"].lower() == "paperless",
+ )
+
+ @pytest.mark.skipif(
+ not string_expr_opted_in("contains"),
+ reason="contains expr is disabled.",
+ )
+ def test_contains(self):
+ # WARNING: SQLite treats "contains" as "icontains"!
+ # You should avoid "contains" unless you know what you are doing!
+ self._assert_query_match_predicate(
+ ["string_field", "contains", "aper"],
+ lambda document: "string_field" in document
+ and document["string_field"] is not None
+ and "aper" in document["string_field"],
+ )
+
+ def test_icontains(self):
+ self._assert_query_match_predicate(
+ ["string_field", "icontains", "aper"],
+ lambda document: "string_field" in document
+ and document["string_field"] is not None
+ and "aper" in document["string_field"].lower(),
+ )
+
+ @pytest.mark.skipif(
+ not string_expr_opted_in("startswith"),
+ reason="startswith expr is disabled.",
+ )
+ def test_startswith(self):
+ # WARNING: SQLite treats "startswith" as "istartswith"!
+ # You should avoid "startswith" unless you know what you are doing!
+ self._assert_query_match_predicate(
+ ["string_field", "startswith", "paper"],
+ lambda document: "string_field" in document
+ and document["string_field"] is not None
+ and document["string_field"].startswith("paper"),
+ )
+
+ def test_istartswith(self):
+ self._assert_query_match_predicate(
+ ["string_field", "istartswith", "paper"],
+ lambda document: "string_field" in document
+ and document["string_field"] is not None
+ and document["string_field"].lower().startswith("paper"),
+ )
+
+ @pytest.mark.skipif(
+ not string_expr_opted_in("endswith"),
+ reason="endswith expr is disabled.",
+ )
+ def test_endswith(self):
+ # WARNING: SQLite treats "endswith" as "iendswith"!
+ # You should avoid "endswith" unless you know what you are doing!
+ self._assert_query_match_predicate(
+ ["string_field", "iendswith", "less"],
+ lambda document: "string_field" in document
+ and document["string_field"] is not None
+ and document["string_field"].lower().endswith("less"),
+ )
+
+ def test_iendswith(self):
+ self._assert_query_match_predicate(
+ ["string_field", "iendswith", "less"],
+ lambda document: "string_field" in document
+ and document["string_field"] is not None
+ and document["string_field"].lower().endswith("less"),
+ )
+
+ @pytest.mark.skipif(
+ not string_expr_opted_in("regex"),
+ reason="regex expr is disabled.",
+ )
+ def test_regex(self):
+ # WARNING: the regex syntax is database dependent!
+ self._assert_query_match_predicate(
+ ["string_field", "regex", r"^p.+s$"],
+ lambda document: "string_field" in document
+ and document["string_field"] is not None
+ and re.match(r"^p.+s$", document["string_field"]),
+ )
+
+ @pytest.mark.skipif(
+ not string_expr_opted_in("iregex"),
+ reason="iregex expr is disabled.",
+ )
+ def test_iregex(self):
+ # WARNING: the regex syntax is database dependent!
+ self._assert_query_match_predicate(
+ ["string_field", "iregex", r"^p.+s$"],
+ lambda document: "string_field" in document
+ and document["string_field"] is not None
+ and re.match(r"^p.+s$", document["string_field"], re.IGNORECASE),
+ )
+
+ def test_url_field_istartswith(self):
+        # URL fields support all of the expressions above.
+ # Just showing one of them here.
+ self._assert_query_match_predicate(
+ ["url_field", "istartswith", "http://"],
+ lambda document: "url_field" in document
+ and document["url_field"] is not None
+ and document["url_field"].startswith("http://"),
+ )
+
+ @pytest.mark.skipif(
+ not string_expr_opted_in("iregex"),
+ reason="regex expr is disabled.",
+ )
+ def test_monetary_field_iregex(self):
+        # Monetary fields support all of the expressions above.
+ # Just showing one of them here.
+ #
+        # Unfortunately we can't do arithmetic comparisons on monetary fields,
+        # but regex can approximate some of them.
+ # E.g., USD between 100.00 and 999.99:
+ self._assert_query_match_predicate(
+ ["monetary_field", "regex", r"USD[1-9][0-9]{2}\.[0-9]{2}"],
+ lambda document: "monetary_field" in document
+ and document["monetary_field"] is not None
+ and re.match(
+ r"USD[1-9][0-9]{2}\.[0-9]{2}",
+ document["monetary_field"],
+ re.IGNORECASE,
+ ),
+ )
+
+ # ==========================================================#
+ # Arithmetic comparisons #
+ # ==========================================================#
+ def test_gt(self):
+ self._assert_query_match_predicate(
+ ["date_field", "gt", date(2024, 8, 22).isoformat()],
+ lambda document: "date_field" in document
+ and document["date_field"] is not None
+ and document["date_field"] > date(2024, 8, 22),
+ )
+
+ def test_gte(self):
+ self._assert_query_match_predicate(
+ ["date_field", "gte", date(2024, 8, 22).isoformat()],
+ lambda document: "date_field" in document
+ and document["date_field"] is not None
+ and document["date_field"] >= date(2024, 8, 22),
+ )
+
+ def test_lt(self):
+ self._assert_query_match_predicate(
+ ["integer_field", "lt", 0],
+ lambda document: "integer_field" in document
+ and document["integer_field"] is not None
+ and document["integer_field"] < 0,
+ )
+
+ def test_lte(self):
+ self._assert_query_match_predicate(
+ ["integer_field", "lte", 0],
+ lambda document: "integer_field" in document
+ and document["integer_field"] is not None
+ and document["integer_field"] <= 0,
+ )
+
+ def test_range(self):
+ self._assert_query_match_predicate(
+ ["float_field", "range", [-0.05, 0.05]],
+ lambda document: "float_field" in document
+ and document["float_field"] is not None
+ and -0.05 <= document["float_field"] <= 0.05,
+ )
+
+ def test_date_modifier(self):
+        # For date fields you can optionally prefix the operator
+        # with the date component you are comparing against.
+ self._assert_query_match_predicate(
+ ["date_field", "year__gte", 2024],
+ lambda document: "date_field" in document
+ and document["date_field"] is not None
+ and document["date_field"].year >= 2024,
+ )
+
+ # ==========================================================#
+ # Subset check (document link field only) #
+ # ==========================================================#
+ def test_document_link_contains(self):
+ # Document link field "contains" performs a subset check.
+ self._assert_query_match_predicate(
+ ["documentlink_field", "contains", [1, 2]],
+ lambda document: "documentlink_field" in document
+ and document["documentlink_field"] is not None
+ and set(document["documentlink_field"]) >= {1, 2},
+ )
+        # The order of IDs doesn't matter - this is the same as above.
+ self._assert_query_match_predicate(
+ ["documentlink_field", "contains", [2, 1]],
+ lambda document: "documentlink_field" in document
+ and document["documentlink_field"] is not None
+ and set(document["documentlink_field"]) >= {1, 2},
+ )
+
+ def test_document_link_contains_empty_set(self):
+ # An empty set is a subset of any set.
+ self._assert_query_match_predicate(
+ ["documentlink_field", "contains", []],
+ lambda document: "documentlink_field" in document
+ and document["documentlink_field"] is not None,
+ )
+
+ def test_document_link_contains_no_reverse_link(self):
+        # An edge case: a document in the value list has no document link
+        # field and thus no reverse links.
+ self._assert_query_match_predicate(
+ ["documentlink_field", "contains", [self.documents[6].id]],
+ lambda document: "documentlink_field" in document
+ and document["documentlink_field"] is not None
+ and set(document["documentlink_field"]) >= {self.documents[6].id},
+ match_nothing_ok=True,
+ )
+
+ # ==========================================================#
+ # Logical expressions #
+ # ==========================================================#
+ def test_logical_and(self):
+ self._assert_query_match_predicate(
+ [
+ "AND",
+ [["date_field", "year__exact", 2024], ["date_field", "month__lt", 9]],
+ ],
+ lambda document: "date_field" in document
+ and document["date_field"] is not None
+ and document["date_field"].year == 2024
+ and document["date_field"].month < 9,
+ )
+
+ def test_logical_or(self):
+        # This is also the recommended way to check for "empty" text, URL, and monetary fields.
+ self._assert_query_match_predicate(
+ [
+ "OR",
+ [["string_field", "exact", ""], ["string_field", "isnull", True]],
+ ],
+ lambda document: "string_field" in document
+ and not bool(document["string_field"]),
+ )
+
+ def test_logical_not(self):
+        # This means `NOT ((document has string_field) AND (string_field exact "paperless"))`,
+        # not `(document has string_field) AND (NOT (string_field exact "paperless"))`!
+ self._assert_query_match_predicate(
+ [
+ "NOT",
+ ["string_field", "exact", "paperless"],
+ ],
+ lambda document: not (
+ "string_field" in document and document["string_field"] == "paperless"
+ ),
+ )
+
+ # ==========================================================#
+ # Tests for invalid queries #
+ # ==========================================================#
+
+ def test_invalid_json(self):
+ self._assert_validation_error(
+ "not valid json",
+ ["custom_field_lookup"],
+ "must be valid JSON",
+ )
+
+ def test_invalid_expression(self):
+ self._assert_validation_error(
+ json.dumps("valid json but not valid expr"),
+ ["custom_field_lookup"],
+ "Invalid custom field lookup expression",
+ )
+
+ def test_invalid_custom_field_name(self):
+ self._assert_validation_error(
+ json.dumps(["invalid name", "iexact", "foo"]),
+ ["custom_field_lookup", "0"],
+ "is not a valid custom field",
+ )
+
+ def test_invalid_operator(self):
+ self._assert_validation_error(
+ json.dumps(["integer_field", "iexact", "foo"]),
+ ["custom_field_lookup", "1"],
+ "does not support lookup expr",
+ )
+
+ def test_invalid_value(self):
+ self._assert_validation_error(
+ json.dumps(["select_field", "exact", "not an option"]),
+ ["custom_field_lookup", "2"],
+ "integer",
+ )
+
+ def test_invalid_logical_operator(self):
+ self._assert_validation_error(
+ json.dumps(["invalid op", ["integer_field", "gt", 0]]),
+ ["custom_field_lookup", "0"],
+ "Invalid logical operator",
+ )
+
+ def test_invalid_expr_list(self):
+ self._assert_validation_error(
+ json.dumps(["AND", "not a list"]),
+ ["custom_field_lookup", "1"],
+ "Invalid expression list",
+ )
+
+ def test_invalid_operator_prefix(self):
+ self._assert_validation_error(
+ json.dumps(["integer_field", "foo__gt", 0]),
+ ["custom_field_lookup", "1"],
+ "does not support lookup expr",
+ )
+
+ @pytest.mark.skipif(
+ string_expr_opted_in("regex"),
+ reason="user opted into allowing regex expr",
+ )
+ def test_disabled_operator(self):
+ self._assert_validation_error(
+ json.dumps(["string_field", "regex", r"^p.+s$"]),
+ ["custom_field_lookup", "1"],
+ "disabled by default",
+ )
+
+ def test_query_too_deep(self):
+ query = ["string_field", "exact", "paperless"]
+ for _ in range(10):
+ query = ["NOT", query]
+ self._assert_validation_error(
+ json.dumps(query),
+ ["custom_field_lookup", *(["1"] * 10)],
+ "Maximum nesting depth exceeded",
+ )
+
+ def test_query_too_many_atoms(self):
+ atom = ["string_field", "exact", "paperless"]
+ query = ["AND", [atom for _ in range(21)]]
+ self._assert_validation_error(
+ json.dumps(query),
+ ["custom_field_lookup", "1", "20"],
+ "Maximum number of query conditions exceeded",
+ )