def match_correspondents(document: Document, classifier: DocumentClassifier, user=None):
- pred_id = classifier.predict_correspondent(document.content) if classifier else None
+ pred_id = (
+ classifier.predict_correspondent(document.suggestion_content)
+ if classifier
+ else None
+ )
if user is None and document.owner is not None:
user = document.owner
def match_document_types(document: Document, classifier: DocumentClassifier, user=None):
- pred_id = classifier.predict_document_type(document.content) if classifier else None
-
+ pred_id = (
+ classifier.predict_document_type(document.suggestion_content)
+ if classifier
+ else None
+ )
if user is None and document.owner is not None:
user = document.owner
def match_tags(document: Document, classifier: DocumentClassifier, user=None):
- predicted_tag_ids = classifier.predict_tags(document.content) if classifier else []
+ predicted_tag_ids = (
+ classifier.predict_tags(document.suggestion_content) if classifier else []
+ )
if user is None and document.owner is not None:
user = document.owner
def match_storage_paths(document: Document, classifier: DocumentClassifier, user=None):
- pred_id = classifier.predict_storage_path(document.content) if classifier else None
+ pred_id = (
+ classifier.predict_storage_path(document.suggestion_content)
+ if classifier
+ else None
+ )
if user is None and document.owner is not None:
user = document.owner
res += f" {self.title}"
return res
+ @property
+ def suggestion_content(self):
+     """
+     Document text that is passed on when generating suggestions.
+
+     Empty (falsy) content and content of at most 1,200,000 characters
+     is returned unchanged. Longer content is reduced to its first
+     800,000 characters and its last 200,000 characters, joined by a
+     single space.
+
+     Cropping bounds the amount of text processed for very large
+     documents while keeping both the opening and the closing context.
+     """
+     text = self.content
+
+     # Nothing to crop: missing/empty content or content within the limit.
+     if not text or len(text) <= 1200000:
+         return text
+
+     # Keep an 800K-character head and a 200K-character tail so both
+     # ends of the document remain represented.
+     opening = text[:800000]
+     closing = text[-200000:]
+     return opening + " " + closing
+
@property
def source_path(self) -> Path:
if self.filename:
from django.test import TestCase
from django.test import override_settings
+from faker import Faker
from documents.models import Correspondent
from documents.models import Document
created=date(2020, 12, 25),
)
self.assertEqual(doc.get_public_filename(), "2020-12-25 test")
+
+
+def test_suggestion_content():
+    """
+    Check that the document for suggestion is cropped, only if it exceeds the length limit.
+
+    Covers three cases: empty content, content of exactly 1,200,000
+    characters (returned unchanged) and content of 1,200,001 characters
+    (cropped to an 800K head and a 200K tail joined by one space).
+    """
+    fake_text = Faker().text(max_nb_chars=1201000)
+    # The slices below only exercise the limit if enough text was generated;
+    # fail early with a clear message instead of a multi-megabyte string diff.
+    assert len(fake_text) >= 1200001, "generated text is too short for this test"
+
+    # Empty content is returned as-is
+    doc = Document(
+        title="test",
+        created=date(2025, 6, 1),
+        content="",
+    )
+    assert doc.suggestion_content == ""
+
+    # Do not crop content under 1.2M chars
+    content_under_limit = fake_text[:1200000]
+    doc.content = content_under_limit
+    assert doc.suggestion_content == content_under_limit
+
+    # If over the limit, crop to 1M char (800K from the beginning, 200K from the end)
+    content_over_limit = fake_text[:1200001]
+    expected_cropped_content = (
+        content_over_limit[:800000] + " " + content_over_limit[-200000:]
+    )
+    doc.content = content_over_limit
+    assert doc.suggestion_content == expected_cropped_content