git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
indexing cleanup and tests
author: shamoon <4887959+shamoon@users.noreply.github.com>
Mon, 28 Apr 2025 21:39:31 +0000 (14:39 -0700)
committer: shamoon <4887959+shamoon@users.noreply.github.com>
Wed, 2 Jul 2025 18:04:00 +0000 (11:04 -0700)
src/paperless/ai/chat.py
src/paperless/ai/indexing.py
src/paperless/tests/test_ai_indexing.py

index 45d44db8cdf486dc397325f443b00c73c5e2dd1c..84f0db060af3e3f4faf50fe63bf84b9cb174a084 100644 (file)
@@ -26,7 +26,7 @@ def stream_chat_with_documents(query_str: str, documents: list[Document]):
     client = AIClient()
     index = load_or_build_index()
 
-    doc_ids = [doc.pk for doc in documents]
+    doc_ids = [str(doc.pk) for doc in documents]
 
     # Filter only the node(s) that match the document IDs
     nodes = [
index 11b8179ee71038dbe1c237e45687b35a3546c5a5..95442e55b9c8a2a5ea6f0cca5577d816b6a4a6b2 100644 (file)
@@ -52,23 +52,10 @@ def get_or_create_storage_context(*, rebuild=False):
     )
 
 
-def get_vector_store_index(storage_context, embed_model):
-    """
-    Returns a VectorStoreIndex given a storage context and embed model.
-    """
-    return VectorStoreIndex(
-        storage_context=storage_context,
-        embed_model=embed_model,
-    )
-
-
 def build_document_node(document: Document) -> list[BaseNode]:
     """
     Given a Document, returns parsed Nodes ready for indexing.
     """
-    if not document.content:
-        return []
-
     text = build_llm_index_text(document)
     metadata = {
         "document_id": str(document.id),
@@ -97,9 +84,10 @@ def load_or_build_index(storage_context: StorageContext, embed_model, nodes=None
     try:
         return load_index_from_storage(storage_context=storage_context)
     except ValueError as e:
-        logger.debug("Failed to load index from storage: %s", e)
+        logger.warning("Failed to load index from storage: %s", e)
         if not nodes:
-            return None
+            logger.info("No nodes provided for index creation.")
+            raise
         return VectorStoreIndex(
             nodes=nodes,
             storage_context=storage_context,
@@ -116,7 +104,7 @@ def remove_document_docstore_nodes(document: Document, index: VectorStoreIndex):
     existing_nodes = [
         node.node_id
         for node in index.docstore.get_nodes(all_node_ids)
-        if node.metadata.get("document_id") == document.id
+        if node.metadata.get("document_id") == str(document.id)
     ]
     for node_id in existing_nodes:
         # Delete from docstore, FAISS IndexFlatL2 are append-only
@@ -208,9 +196,6 @@ def llm_index_add_or_update_document(document: Document):
 
     index = load_or_build_index(storage_context, embed_model, nodes=new_nodes)
 
-    if index is None:
-        return
-
     remove_document_docstore_nodes(document, index)
 
     index.insert_nodes(new_nodes)
@@ -229,9 +214,6 @@ def llm_index_remove_document(document: Document):
 
     index = load_or_build_index(storage_context, embed_model)
 
-    if index is None:
-        return
-
     remove_document_docstore_nodes(document, index)
 
     storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
index 970f8293da41407034eee3258fd4c12964f31a0b..73df742b1e075ff8db59d2f4e9f7c9ac11a19bed 100644 (file)
@@ -28,7 +28,6 @@ def real_document(db):
 
 @pytest.fixture
 def mock_embed_model():
-    """Mocks the embedding model."""
     with patch("paperless.ai.indexing.get_embedding_model") as mock:
         mock.return_value = FakeEmbedding()
         yield mock
@@ -57,7 +56,7 @@ def test_build_document_node(real_document):
 
 
 @pytest.mark.django_db
-def test_rebuild_llm_index(
+def test_update_llm_index(
     temp_llm_index_dir,
     real_document,
     mock_embed_model,
@@ -72,6 +71,49 @@ def test_rebuild_llm_index(
         assert any(temp_llm_index_dir.glob("*.json"))
 
 
+def test_get_or_create_storage_context_raises_exception(
+    temp_llm_index_dir,
+    mock_embed_model,
+):
+    with pytest.raises(Exception):
+        indexing.get_or_create_storage_context(rebuild=False)
+
+
+def test_load_or_build_index_builds_when_nodes_given(
+    temp_llm_index_dir,
+    mock_embed_model,
+    real_document,
+):
+    storage_context = MagicMock()
+    with patch(
+        "paperless.ai.indexing.load_index_from_storage",
+        side_effect=ValueError("Index not found"),
+    ):
+        with patch(
+            "paperless.ai.indexing.VectorStoreIndex",
+            return_value=MagicMock(),
+        ) as mock_index_cls:
+            indexing.load_or_build_index(
+                storage_context,
+                mock_embed_model,
+                nodes=[indexing.build_document_node(real_document)],
+            )
+            mock_index_cls.assert_called_once()
+
+
+def test_load_or_build_index_raises_exception_when_no_nodes(
+    temp_llm_index_dir,
+    mock_embed_model,
+):
+    storage_context = MagicMock()
+    with patch(
+        "paperless.ai.indexing.load_index_from_storage",
+        side_effect=ValueError("Index not found"),
+    ):
+        with pytest.raises(Exception):
+            indexing.load_or_build_index(storage_context, mock_embed_model)
+
+
 @pytest.mark.django_db
 def test_add_or_update_document_updates_existing_entry(
     temp_llm_index_dir,
@@ -91,14 +133,18 @@ def test_remove_document_deletes_node_from_docstore(
     mock_embed_model,
 ):
     indexing.update_llm_index(rebuild=True)
-    indexing.llm_index_add_or_update_document(real_document)
-    indexing.llm_index_remove_document(real_document)
+    storage_context = indexing.get_or_create_storage_context()
+    index = indexing.load_or_build_index(storage_context, mock_embed_model)
+    assert len(index.docstore.docs) == 1
 
-    assert any(temp_llm_index_dir.glob("*.json"))
+    indexing.llm_index_remove_document(real_document)
+    storage_context = indexing.get_or_create_storage_context()
+    index = indexing.load_or_build_index(storage_context, mock_embed_model)
+    assert len(index.docstore.docs) == 0
 
 
 @pytest.mark.django_db
-def test_rebuild_llm_index_no_documents(
+def test_update_llm_index_no_documents(
     temp_llm_index_dir,
     mock_embed_model,
 ):