Enhancement: Limit excessively long content length when computing suggestions (#10656)

author Antoine Mérino <antoine.merino.dev@gmail.com>

Tue, 9 Sep 2025 20:02:16 +0000 (22:02 +0200)

committer GitHub <noreply@github.com>

Tue, 9 Sep 2025 20:02:16 +0000 (13:02 -0700)
author Antoine Mérino <antoine.merino.dev@gmail.com>
Tue, 9 Sep 2025 20:02:16 +0000 (22:02 +0200)
committer GitHub <noreply@github.com>
Tue, 9 Sep 2025 20:02:16 +0000 (13:02 -0700)
diff --git a/src/documents/matching.py b/src/documents/matching.py

index 15a8ec443cedaa79bd24e81a969c804709c7bde4..346f9d55a3e480d23aff221aa87edaa4b637d793 100644 (file)
--- a/src/documents/matching.py
+++ b/src/documents/matching.py
@@ -41,7 +41,11 @@ def log_reason(
  
  
  def match_correspondents(document: Document, classifier: DocumentClassifier, user=None):
-    pred_id = classifier.predict_correspondent(document.content) if classifier else None
+    pred_id = (
+        classifier.predict_correspondent(document.suggestion_content)
+        if classifier
+        else None
+    )
  
      if user is None and document.owner is not None:
          user = document.owner
@@ -65,8 +69,11 @@ def match_correspondents(document: Document, classifier: DocumentClassifier, use
  
  
  def match_document_types(document: Document, classifier: DocumentClassifier, user=None):
-    pred_id = classifier.predict_document_type(document.content) if classifier else None
-
+    pred_id = (
+        classifier.predict_document_type(document.suggestion_content)
+        if classifier
+        else None
+    )
      if user is None and document.owner is not None:
          user = document.owner
  
@@ -89,7 +96,9 @@ def match_document_types(document: Document, classifier: DocumentClassifier, use
  
  
  def match_tags(document: Document, classifier: DocumentClassifier, user=None):
-    predicted_tag_ids = classifier.predict_tags(document.content) if classifier else []
+    predicted_tag_ids = (
+        classifier.predict_tags(document.suggestion_content) if classifier else []
+    )
  
      if user is None and document.owner is not None:
          user = document.owner
@@ -112,7 +121,11 @@ def match_tags(document: Document, classifier: DocumentClassifier, user=None):
  
  
  def match_storage_paths(document: Document, classifier: DocumentClassifier, user=None):
-    pred_id = classifier.predict_storage_path(document.content) if classifier else None
+    pred_id = (
+        classifier.predict_storage_path(document.suggestion_content)
+        if classifier
+        else None
+    )
  
      if user is None and document.owner is not None:
          user = document.owner
diff --git a/src/documents/models.py b/src/documents/models.py

index e93f140543403579c02f7962192ad374768a5484..72e3996d5010f11afbec5c1532e433cf6ea25f8d 100644 (file)
--- a/src/documents/models.py
+++ b/src/documents/models.py
@@ -305,6 +305,28 @@ class Document(SoftDeleteModel, ModelWithOwner):
              res += f" {self.title}"
          return res
  
+    @property
+    def suggestion_content(self):
+        """
+        Return the document text used to generate suggestions.
+
+        Content that is empty or at most 1,200,000 characters long is
+        returned unchanged. Longer content is cropped to its first 800,000
+        and last 200,000 characters, joined by a single space.
+
+        This improves processing speed for large documents while keeping
+        enough context for accurate suggestions.
+        """
+        if not self.content or len(self.content) <= 1200000:
+            return self.content
+        else:
+            # Keep 800K chars from the start and 200K from the end (roughly an
+            # 80/20 split of the cropped text) to preserve opening and closing context.
+            head_len = 800000
+            tail_len = 200000
+
+            return " ".join((self.content[:head_len], self.content[-tail_len:]))
+
      @property
      def source_path(self) -> Path:
          if self.filename:
diff --git a/src/documents/tests/test_document_model.py b/src/documents/tests/test_document_model.py

index 87ebdb56125de47b76b01c5acf2e203f5c53b5e8..fc4f17e046b081f76cff90435dfb9ffe2e103c32 100644 (file)
--- a/src/documents/tests/test_document_model.py
+++ b/src/documents/tests/test_document_model.py
@@ -6,6 +6,7 @@ from unittest import mock
  
  from django.test import TestCase
  from django.test import override_settings
+from faker import Faker
  
  from documents.models import Correspondent
  from documents.models import Document
@@ -105,3 +106,27 @@ class TestDocument(TestCase):
              created=date(2020, 12, 25),
          )
          self.assertEqual(doc.get_public_filename(), "2020-12-25 test")
+
+
+def test_suggestion_content():
+    """
+    Check that suggestion_content is cropped only when the content exceeds the length limit.
+    """
+    fake_text = Faker().text(max_nb_chars=1201000)
+
+    # Content of at most 1.2M chars (the limit itself included) is returned uncropped
+    content_under_limit = fake_text[:1200000]
+    doc = Document(
+        title="test",
+        created=date(2025, 6, 1),
+        content=content_under_limit,
+    )
+    assert doc.suggestion_content == content_under_limit
+
+    # Over the limit: keep the first 800K and last 200K chars, joined by one space
+    content_over_limit = fake_text[:1200001]
+    expected_cropped_content = (
+        content_over_limit[:800000] + " " + content_over_limit[-200000:]
+    )
+    doc.content = content_over_limit
+    assert doc.suggestion_content == expected_cropped_content
author	Antoine Mérino <antoine.merino.dev@gmail.com>
	Tue, 9 Sep 2025 20:02:16 +0000 (22:02 +0200)
committer	GitHub <noreply@github.com>
	Tue, 9 Sep 2025 20:02:16 +0000 (13:02 -0700)
src/documents/matching.py		patch \| blob \| blame \| history
src/documents/models.py		patch \| blob \| blame \| history
src/documents/tests/test_document_model.py		patch \| blob \| blame \| history