for node in index.docstore.get_nodes(all_node_ids)
}
- node_ids_to_remove = []
-
for document in tqdm.tqdm(documents, disable=progress_bar_disable):
doc_id = str(document.id)
document_modified = document.modified.isoformat()
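+ # Skip documents whose node's stored "modified" timestamp matches the document's current one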
if node_modified == document_modified:
continue
- node_ids_to_remove.append(node.node_id)
+ # Again, delete only from the docstore; FAISS IndexFlatL2 indexes are append-only
+ index.docstore.delete_document(node.node_id)
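+ # The replacement node(s) built below are appended via insert_nodes further down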
nodes.extend(build_document_node(document))
else:
# New document, add it
nodes.extend(build_document_node(document))
- if node_ids_to_remove or nodes:
+ if nodes:
logger.info(
- "Updating LLM index with %d new nodes and removing %d old nodes.",
+ "Updating %d nodes in LLM index.",
len(nodes),
- len(node_ids_to_remove),
)
- if node_ids_to_remove:
- index.delete_nodes(node_ids_to_remove)
- if nodes:
- index.insert_nodes(nodes)
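+ # New and replacement nodes are appended to the index in a single batch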
+ index.insert_nodes(nodes)
else:
logger.info("No changes detected, skipping llm index rebuild.")
assert any(temp_llm_index_dir.glob("*.json"))
+@pytest.mark.django_db
+def test_update_llm_index_partial_update(
+ temp_llm_index_dir,
+ real_document,
+ mock_embed_model,
+):
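+ """Partial update: re-index modified documents and add new ones without a full rebuild."""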
+ doc2 = Document.objects.create(
+ title="Test Document 2",
+ content="This is some test content 2.",
+ added=timezone.now(),
+ checksum="1234567890abcdef",
+ )
+ # Initial index
+ with patch("documents.models.Document.objects.all") as mock_all:
+ mock_queryset = MagicMock()
+ mock_queryset.exists.return_value = True
+ mock_queryset.__iter__.return_value = iter([real_document, doc2])
+ mock_all.return_value = mock_queryset
+
+ indexing.update_llm_index(rebuild=True)
+
+ # modify document
+ updated_document = real_document
+ updated_document.modified = timezone.now() # simulate modification
+
+ # new doc
+ doc3 = Document.objects.create(
+ title="Test Document 3",
+ content="This is some test content 3.",
+ added=timezone.now(),
+ checksum="abcdef1234567890",
+ )
+
+ with patch("documents.models.Document.objects.all") as mock_all:
+ mock_queryset = MagicMock()
+ mock_queryset.exists.return_value = True
+ mock_queryset.__iter__.return_value = iter([updated_document, doc2, doc3])
+ mock_all.return_value = mock_queryset
+
+ # assert logs "Updating LLM index with %d new nodes and removing %d old nodes."
+ with patch("paperless.ai.indexing.logger") as mock_logger:
+ indexing.update_llm_index(rebuild=False)
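+ # Expect 2 nodes: the modified real_document plus the new doc3; doc2 is unchanged and skipped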
+ mock_logger.info.assert_called_once_with(
+ "Updating %d nodes in LLM index.",
+ 2,
+ )
+ indexing.update_llm_index(rebuild=False)
+
+ assert any(temp_llm_index_dir.glob("*.json"))
+
+
def test_get_or_create_storage_context_raises_exception(
temp_llm_index_dir,
mock_embed_model,