token limiting

author shamoon <4887959+shamoon@users.noreply.github.com>

Wed, 30 Apr 2025 16:44:34 +0000 (09:44 -0700)

committer shamoon <4887959+shamoon@users.noreply.github.com>

Wed, 2 Jul 2025 18:04:58 +0000 (11:04 -0700)
author shamoon <4887959+shamoon@users.noreply.github.com>
Wed, 30 Apr 2025 16:44:34 +0000 (09:44 -0700)
committer shamoon <4887959+shamoon@users.noreply.github.com>
Wed, 2 Jul 2025 18:04:58 +0000 (11:04 -0700)
diff --git a/src/paperless_ai/ai_classifier.py b/src/paperless_ai/ai_classifier.py

index 26c0f18731595afbeb2174af630bd1ebbfc39e09..fd660763c020e36d8770e11bf99a4c0bc7d434ca 100644 (file)
--- a/src/paperless_ai/ai_classifier.py
+++ b/src/paperless_ai/ai_classifier.py
@@ -9,13 +9,14 @@ from documents.permissions import get_objects_for_user_owner_aware
  from paperless.config import AIConfig
  from paperless_ai.client import AIClient
  from paperless_ai.indexing import query_similar_documents
+from paperless_ai.indexing import truncate_content
  
  logger = logging.getLogger("paperless_ai.rag_classifier")
  
  
  def build_prompt_without_rag(document: Document) -> str:
      filename = document.filename or ""
-    content = document.content or ""
+    content = truncate_content(document.content or "")
  
      prompt = f"""
      You are an assistant that extracts structured information from documents.
@@ -48,20 +49,20 @@ def build_prompt_without_rag(document: Document) -> str:
      {filename}
  
      CONTENT:
-    {content[:8000]}
+    {content}
      """
  
      return prompt
  
  
  def build_prompt_with_rag(document: Document, user: User | None = None) -> str:
-    context = get_context_for_document(document, user)
+    context = truncate_content(get_context_for_document(document, user))
      prompt = build_prompt_without_rag(document)
  
      prompt += f"""
  
      CONTEXT FROM SIMILAR DOCUMENTS:
-    {context[:4000]}
+    {context}
      """
  
      return prompt
diff --git a/src/paperless_ai/indexing.py b/src/paperless_ai/indexing.py

index 4e0f2ffdf58e3a5f0fbcfbc48ce2cc63d8f46c9b..3afa1e990f49bdb66b09b5031c753c2bc4740c15 100644 (file)
--- a/src/paperless_ai/indexing.py
+++ b/src/paperless_ai/indexing.py
@@ -10,11 +10,14 @@ from llama_index.core import Document as LlamaDocument
  from llama_index.core import StorageContext
  from llama_index.core import VectorStoreIndex
  from llama_index.core import load_index_from_storage
+from llama_index.core.indices.prompt_helper import PromptHelper
  from llama_index.core.node_parser import SimpleNodeParser
+from llama_index.core.prompts import PromptTemplate
  from llama_index.core.retrievers import VectorIndexRetriever
  from llama_index.core.schema import BaseNode
  from llama_index.core.storage.docstore import SimpleDocumentStore
  from llama_index.core.storage.index_store import SimpleIndexStore
+from llama_index.core.text_splitter import TokenTextSplitter
  from llama_index.vector_stores.faiss import FaissVectorStore
  
  from documents.models import Document
@@ -220,6 +223,23 @@ def llm_index_remove_document(document: Document):
      index.storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
  
  
+def truncate_content(content: str) -> str:
+    prompt_helper = PromptHelper(
+        context_window=8192,
+        num_output=512,
+        chunk_overlap_ratio=0.1,
+        chunk_size_limit=None,
+    )
+    splitter = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=50)
+    content_chunks = splitter.split_text(content)
+    truncated_chunks = prompt_helper.truncate(
+        prompt=PromptTemplate(template="{content}"),
+        text_chunks=content_chunks,
+        padding=5,
+    )
+    return " ".join(truncated_chunks)
+
+
  def query_similar_documents(
      document: Document,
      top_k: int = 5,
@@ -247,7 +267,9 @@ def query_similar_documents(
          doc_ids=doc_node_ids,
      )
  
-    query_text = (document.title or "") + "\n" + (document.content or "")
+    query_text = truncate_content(
+        (document.title or "") + "\n" + (document.content or ""),
+    )
      results = retriever.retrieve(query_text)
  
      document_ids = [
author	shamoon <4887959+shamoon@users.noreply.github.com>
	Wed, 30 Apr 2025 16:44:34 +0000 (09:44 -0700)
committer	shamoon <4887959+shamoon@users.noreply.github.com>
	Wed, 2 Jul 2025 18:04:58 +0000 (11:04 -0700)
src/paperless_ai/ai_classifier.py		patch | blob | blame | history
src/paperless_ai/indexing.py		patch | blob | blame | history