Unify prompts, cover

author shamoon <4887959+shamoon@users.noreply.github.com>

Mon, 28 Apr 2025 20:46:22 +0000 (13:46 -0700)

committer shamoon <4887959+shamoon@users.noreply.github.com>

Wed, 2 Jul 2025 18:03:59 +0000 (11:03 -0700)
author shamoon <4887959+shamoon@users.noreply.github.com>
Mon, 28 Apr 2025 20:46:22 +0000 (13:46 -0700)
committer shamoon <4887959+shamoon@users.noreply.github.com>
Wed, 2 Jul 2025 18:03:59 +0000 (11:03 -0700)
diff --git a/src/paperless/ai/ai_classifier.py b/src/paperless/ai/ai_classifier.py

index f5822fa63cf59878062d310285a6486891e82b35..ab349c81beb159420b1804212d0f80b6cce0339b 100644 (file)
--- a/src/paperless/ai/ai_classifier.py
+++ b/src/paperless/ai/ai_classifier.py
@@ -21,6 +21,7 @@ def build_prompt_without_rag(document: Document) -> str:
      Never ask for further information, additional content or ask questions. Never include any other text.
      Suggested tags and document types must be strictly based on the content of the document.
      Do not change the field names or the JSON structure, only provide the values. Use double quotes and proper JSON syntax.
+    Each field must be a list of plain strings.
  
      The JSON object must contain the following fields:
      - title: A short, descriptive title
@@ -30,8 +31,6 @@ def build_prompt_without_rag(document: Document) -> str:
      - storage_paths: Suggested folder paths (e.g. "Medical/Insurance")
      - dates: List up to 3 relevant dates in YYYY-MM-DD format
  
-    Respond ONLY in JSON.
-    Each field must be a list of plain strings.
      The format of the JSON object is as follows:
      {{
          "title": "xxxxx",
@@ -43,7 +42,6 @@ def build_prompt_without_rag(document: Document) -> str:
      }}
      ---
  
-
      FILENAME:
      {filename}
  
@@ -56,41 +54,9 @@ def build_prompt_without_rag(document: Document) -> str:
  
  def build_prompt_with_rag(document: Document) -> str:
      context = get_context_for_document(document)
-    content = document.content or ""
-    filename = document.filename or ""
-
-    prompt = f"""
-    You are a helpful assistant that extracts structured information from documents.
-    You have access to similar documents as context to help improve suggestions.
-
-    Only output valid JSON in the format below. No additional explanations.
-
-    The JSON object must contain:
-    - title: A short, human-readable, descriptive title based on the content
-    - tags: A list of relevant topics
-    - correspondents: People or organizations involved
-    - document_types: Type or category of the document
-    - storage_paths: Suggested folder paths
-    - dates: Up to 3 relevant dates in YYYY-MM-DD
-
-    Respond ONLY in JSON.
-    Each field must be a list of plain strings.
-    The format of the JSON object is as follows:
-    {{
-        "title": "xxxxx",
-        "tags": ["xxxx", "xxxx"],
-        "correspondents": ["xxxx", "xxxx"],
-        "document_types": ["xxxx", "xxxx"],
-        "storage_paths": ["xxxx", "xxxx"],
-        "dates": ["YYYY-MM-DD", "YYYY-MM-DD", "YYYY-MM-DD"],
-    }}
-
-    Here is the document:
-    FILENAME:
-    {filename}
+    prompt = build_prompt_without_rag(document)
  
-    CONTENT:
-    {content[:4000]}
+    prompt += f"""
  
      CONTEXT FROM SIMILAR DOCUMENTS:
      {context[:4000]}
diff --git a/src/paperless/tests/test_ai_classifier.py b/src/paperless/tests/test_ai_classifier.py

index 9302d6fd222a6b52f729a8c70ad9cbc5ee71a068..a29f1a07edd189f9a0ab7c3fad0236cfda42fb1c 100644 (file)
--- a/src/paperless/tests/test_ai_classifier.py
+++ b/src/paperless/tests/test_ai_classifier.py
@@ -6,6 +6,8 @@ import pytest
  from django.test import override_settings
  
  from documents.models import Document
+from paperless.ai.ai_classifier import build_prompt_with_rag
+from paperless.ai.ai_classifier import build_prompt_without_rag
  from paperless.ai.ai_classifier import get_ai_document_classification
  from paperless.ai.ai_classifier import parse_ai_response
  
@@ -101,3 +103,15 @@ def test_use_without_rag_if_not_configured(
      mock_run_llm_query.return_value.text = json.dumps({})
      get_ai_document_classification(mock_document)
      mock_build_prompt_without_rag.assert_called_once()
+
+
+@override_settings(
+    LLM_BACKEND="ollama",
+    LLM_MODEL="some_model",
+)
+def test_prompt_with_without_rag(mock_document):
+    prompt = build_prompt_without_rag(mock_document)
+    assert "CONTEXT FROM SIMILAR DOCUMENTS:" not in prompt
+
+    prompt = build_prompt_with_rag(mock_document)
+    assert "CONTEXT FROM SIMILAR DOCUMENTS:" in prompt
author	shamoon <4887959+shamoon@users.noreply.github.com>
	Mon, 28 Apr 2025 20:46:22 +0000 (13:46 -0700)
committer	shamoon <4887959+shamoon@users.noreply.github.com>
	Wed, 2 Jul 2025 18:03:59 +0000 (11:03 -0700)
src/paperless/ai/ai_classifier.py		patch | blob | blame | history
src/paperless/tests/test_ai_classifier.py		patch | blob | blame | history