]> git.ipfire.org Git - thirdparty/paperless-ngx.git/commitdiff
Cover partial indexing
authorshamoon <4887959+shamoon@users.noreply.github.com>
Tue, 29 Apr 2025 02:03:53 +0000 (19:03 -0700)
committershamoon <4887959+shamoon@users.noreply.github.com>
Wed, 2 Jul 2025 18:04:01 +0000 (11:04 -0700)
src/paperless/ai/indexing.py
src/paperless/tests/test_ai_indexing.py

index bc275c83feae48587a4b7e467dbe1e80c69928b5..840d58f3709c684da4023bb3b467dcf270304021 100644 (file)
@@ -147,8 +147,6 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
             for node in index.docstore.get_nodes(all_node_ids)
         }
 
-        node_ids_to_remove = []
-
         for document in tqdm.tqdm(documents, disable=progress_bar_disable):
             doc_id = str(document.id)
             document_modified = document.modified.isoformat()
@@ -160,22 +158,19 @@ def update_llm_index(*, progress_bar_disable=False, rebuild=False):
                 if node_modified == document_modified:
                     continue
 
-                node_ids_to_remove.append(node.node_id)
+                # Again, delete from the docstore; FAISS IndexFlatL2 indexes are append-only
+                index.docstore.delete_document(node.node_id)
                 nodes.extend(build_document_node(document))
             else:
                 # New document, add it
                 nodes.extend(build_document_node(document))
 
-        if node_ids_to_remove or nodes:
+        if nodes:
             logger.info(
-                "Updating LLM index with %d new nodes and removing %d old nodes.",
+                "Updating %d nodes in LLM index.",
                 len(nodes),
-                len(node_ids_to_remove),
             )
-            if node_ids_to_remove:
-                index.delete_nodes(node_ids_to_remove)
-            if nodes:
-                index.insert_nodes(nodes)
+            index.insert_nodes(nodes)
         else:
             logger.info("No changes detected, skipping llm index rebuild.")
 
index c0279171a049753df5f2bb961ae73224b8b6d392..d7b83316db882758aa3d33448b399e9d5f931ebe 100644 (file)
@@ -72,6 +72,57 @@ def test_update_llm_index(
         assert any(temp_llm_index_dir.glob("*.json"))
 
 
+@pytest.mark.django_db
+def test_update_llm_index_partial_update(
+    temp_llm_index_dir,
+    real_document,
+    mock_embed_model,
+):
+    doc2 = Document.objects.create(
+        title="Test Document 2",
+        content="This is some test content 2.",
+        added=timezone.now(),
+        checksum="1234567890abcdef",
+    )
+    # Initial index
+    with patch("documents.models.Document.objects.all") as mock_all:
+        mock_queryset = MagicMock()
+        mock_queryset.exists.return_value = True
+        mock_queryset.__iter__.return_value = iter([real_document, doc2])
+        mock_all.return_value = mock_queryset
+
+        indexing.update_llm_index(rebuild=True)
+
+    # modify document
+    updated_document = real_document
+    updated_document.modified = timezone.now()  # simulate modification
+
+    # new doc
+    doc3 = Document.objects.create(
+        title="Test Document 3",
+        content="This is some test content 3.",
+        added=timezone.now(),
+        checksum="abcdef1234567890",
+    )
+
+    with patch("documents.models.Document.objects.all") as mock_all:
+        mock_queryset = MagicMock()
+        mock_queryset.exists.return_value = True
+        mock_queryset.__iter__.return_value = iter([updated_document, doc2, doc3])
+        mock_all.return_value = mock_queryset
+
+        # assert logs "Updating %d nodes in LLM index."
+        with patch("paperless.ai.indexing.logger") as mock_logger:
+            indexing.update_llm_index(rebuild=False)
+            mock_logger.info.assert_called_once_with(
+                "Updating %d nodes in LLM index.",
+                2,
+            )
+        indexing.update_llm_index(rebuild=False)
+
+    assert any(temp_llm_index_dir.glob("*.json"))
+
+
 def test_get_or_create_storage_context_raises_exception(
     temp_llm_index_dir,
     mock_embed_model,