)
-def get_vector_store_index(storage_context, embed_model):
- """
- Returns a VectorStoreIndex given a storage context and embed model.
- """
- return VectorStoreIndex(
- storage_context=storage_context,
- embed_model=embed_model,
- )
-
-
def build_document_node(document: Document) -> list[BaseNode]:
"""
Given a Document, returns parsed Nodes ready for indexing.
"""
- if not document.content:
- return []
-
text = build_llm_index_text(document)
metadata = {
"document_id": str(document.id),
try:
return load_index_from_storage(storage_context=storage_context)
except ValueError as e:
- logger.debug("Failed to load index from storage: %s", e)
+ logger.warning("Failed to load index from storage: %s", e)
if not nodes:
- return None
+ logger.info("No nodes provided for index creation.")
+ raise
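+ # No persisted index was found: build a fresh one from the provided nodes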
return VectorStoreIndex(
nodes=nodes,
storage_context=storage_context,
existing_nodes = [
node.node_id
for node in index.docstore.get_nodes(all_node_ids)
- if node.metadata.get("document_id") == document.id
+ if node.metadata.get("document_id") == str(document.id)
]
for node_id in existing_nodes:
# Delete from the docstore only; the FAISS IndexFlatL2 vector store is append-only
index = load_or_build_index(storage_context, embed_model, nodes=new_nodes)
- if index is None:
- return
-
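+ # Remove any stale nodes for this document before inserting the refreshed ones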
remove_document_docstore_nodes(document, index)
index.insert_nodes(new_nodes)
index = load_or_build_index(storage_context, embed_model)
- if index is None:
- return
-
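+ # Drop the document's nodes from the docstore and persist the updated index to disk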
remove_document_docstore_nodes(document, index)
storage_context.persist(persist_dir=settings.LLM_INDEX_DIR)
@pytest.fixture
def mock_embed_model():
- """Mocks the embedding model."""
with patch("paperless.ai.indexing.get_embedding_model") as mock:
mock.return_value = FakeEmbedding()
yield mock
@pytest.mark.django_db
-def test_rebuild_llm_index(
+def test_update_llm_index(
temp_llm_index_dir,
real_document,
mock_embed_model,
assert any(temp_llm_index_dir.glob("*.json"))
+def test_get_or_create_storage_context_raises_exception(
+ temp_llm_index_dir,
+ mock_embed_model,
+):
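+ # With no persisted index on disk, rebuild=False cannot load an existing one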
+ with pytest.raises(Exception):
+ indexing.get_or_create_storage_context(rebuild=False)
+
+
+def test_load_or_build_index_builds_when_nodes_given(
+ temp_llm_index_dir,
+ mock_embed_model,
+ real_document,
+):
+ storage_context = MagicMock()
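+ # Simulate a missing persisted index so the index is built from the given nodes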
+ with patch(
+ "paperless.ai.indexing.load_index_from_storage",
+ side_effect=ValueError("Index not found"),
+ ):
+ with patch(
+ "paperless.ai.indexing.VectorStoreIndex",
+ return_value=MagicMock(),
+ ) as mock_index_cls:
+ indexing.load_or_build_index(
+ storage_context,
+ mock_embed_model,
+ nodes=indexing.build_document_node(real_document),
+ )
+ mock_index_cls.assert_called_once()
+
+
+def test_load_or_build_index_raises_exception_when_no_nodes(
+ temp_llm_index_dir,
+ mock_embed_model,
+):
+ storage_context = MagicMock()
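+ # No persisted index and no nodes to build from, so the ValueError should propagate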
+ with patch(
+ "paperless.ai.indexing.load_index_from_storage",
+ side_effect=ValueError("Index not found"),
+ ):
+ with pytest.raises(Exception):
+ indexing.load_or_build_index(storage_context, mock_embed_model)
+
+
@pytest.mark.django_db
def test_add_or_update_document_updates_existing_entry(
temp_llm_index_dir,
real_document,
mock_embed_model,
):
indexing.update_llm_index(rebuild=True)
- indexing.llm_index_add_or_update_document(real_document)
- indexing.llm_index_remove_document(real_document)
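+ # After the rebuild, the index should contain the document's node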
+ storage_context = indexing.get_or_create_storage_context()
+ index = indexing.load_or_build_index(storage_context, mock_embed_model)
+ assert len(index.docstore.docs) == 1
- assert any(temp_llm_index_dir.glob("*.json"))
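+ # Removing the document should leave the docstore empty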
+ indexing.llm_index_remove_document(real_document)
+ storage_context = indexing.get_or_create_storage_context()
+ index = indexing.load_or_build_index(storage_context, mock_embed_model)
+ assert len(index.docstore.docs) == 0
@pytest.mark.django_db
-def test_rebuild_llm_index_no_documents(
+def test_update_llm_index_no_documents(
temp_llm_index_dir,
mock_embed_model,
):