diff --git a/knowledgehub/embeddings/langchain_based.py b/knowledgehub/embeddings/langchain_based.py
index 98090fe..d24e4a2 100644
--- a/knowledgehub/embeddings/langchain_based.py
+++ b/knowledgehub/embeddings/langchain_based.py
@@ -73,10 +73,13 @@ class LCEmbeddingMixin:
             return self._kwargs[name]
         return getattr(self._obj, name)
 
-    def dump(self):
+    def dump(self, *args, **kwargs):
+        from theflow.utils.modules import serialize
+
+        params = {key: serialize(value) for key, value in self._kwargs.items()}
         return {
             "__type__": f"{self.__module__}.{self.__class__.__qualname__}",
-            **self._kwargs,
+            **params,
         }
 
     def specs(self, path: str):
diff --git a/knowledgehub/indices/base.py b/knowledgehub/indices/base.py
index 8843aad..938be66 100644
--- a/knowledgehub/indices/base.py
+++ b/knowledgehub/indices/base.py
@@ -82,10 +82,13 @@ class LlamaIndexDocTransformerMixin:
             return self._kwargs[name]
         return getattr(self._obj, name)
 
-    def dump(self):
+    def dump(self, *args, **kwargs):
+        from theflow.utils.modules import serialize
+
+        params = {key: serialize(value) for key, value in self._kwargs.items()}
         return {
             "__type__": f"{self.__module__}.{self.__class__.__qualname__}",
-            **self._kwargs,
+            **params,
         }
 
     def run(
diff --git a/knowledgehub/indices/vectorindex.py b/knowledgehub/indices/vectorindex.py
index 0f686d2..8eba908 100644
--- a/knowledgehub/indices/vectorindex.py
+++ b/knowledgehub/indices/vectorindex.py
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import uuid
-from pathlib import Path
 from typing import Optional, Sequence, cast
 
 from kotaemon.base import BaseComponent, Document, RetrievedDocument
@@ -68,37 +67,6 @@ class VectorIndexing(BaseIndexing):
         if self.doc_store:
             self.doc_store.add(input_)
 
-    def save(
-        self,
-        path: str | Path,
-        vectorstore_fname: str = VECTOR_STORE_FNAME,
-        docstore_fname: str = DOC_STORE_FNAME,
-    ):
-        """Save the whole state of the indexing pipeline vector store and all
-        necessary information to disk
-
-        Args:
-            path (str): path to save the state
-        """
-        if isinstance(path, str):
-            path = Path(path)
-        self.vector_store.save(path / vectorstore_fname)
-        if self.doc_store:
-            self.doc_store.save(path / docstore_fname)
-
-    def load(
-        self,
-        path: str | Path,
-        vectorstore_fname: str = VECTOR_STORE_FNAME,
-        docstore_fname: str = DOC_STORE_FNAME,
-    ):
-        """Load all information from disk to an object"""
-        if isinstance(path, str):
-            path = Path(path)
-        self.vector_store.load(path / vectorstore_fname)
-        if self.doc_store:
-            self.doc_store.load(path / docstore_fname)
-
 
 class VectorRetrieval(BaseRetrieval):
     """Retrieve list of documents from vector store"""
@@ -144,37 +112,6 @@
 
         return result
 
-    def save(
-        self,
-        path: str | Path,
-        vectorstore_fname: str = VECTOR_STORE_FNAME,
-        docstore_fname: str = DOC_STORE_FNAME,
-    ):
-        """Save the whole state of the indexing pipeline vector store and all
-        necessary information to disk
-
-        Args:
-            path (str): path to save the state
-        """
-        if isinstance(path, str):
-            path = Path(path)
-        self.vector_store.save(path / vectorstore_fname)
-        if self.doc_store:
-            self.doc_store.save(path / docstore_fname)
-
-    def load(
-        self,
-        path: str | Path,
-        vectorstore_fname: str = VECTOR_STORE_FNAME,
-        docstore_fname: str = DOC_STORE_FNAME,
-    ):
-        """Load all information from disk to an object"""
-        if isinstance(path, str):
-            path = Path(path)
-        self.vector_store.load(path / vectorstore_fname)
-        if self.doc_store:
-            self.doc_store.load(path / docstore_fname)
-
 
 class TextVectorQA(BaseComponent):
     retrieving_pipeline: BaseRetrieval
diff --git a/knowledgehub/llms/chats/langchain_based.py b/knowledgehub/llms/chats/langchain_based.py
index ccade14..1b937c8 100644
--- a/knowledgehub/llms/chats/langchain_based.py
+++ b/knowledgehub/llms/chats/langchain_based.py
@@ -101,10 +101,13 @@ class LCChatMixin:
             return self._kwargs[name]
         return getattr(self._obj, name)
 
-    def dump(self):
+    def dump(self, *args, **kwargs):
+        from theflow.utils.modules import serialize
+
+        params = {key: serialize(value) for key, value in self._kwargs.items()}
         return {
             "__type__": f"{self.__module__}.{self.__class__.__qualname__}",
-            **self._kwargs,
+            **params,
         }
 
     def specs(self, path: str):
diff --git a/knowledgehub/llms/completions/langchain_based.py b/knowledgehub/llms/completions/langchain_based.py
index a8d36ab..5dbebfe 100644
--- a/knowledgehub/llms/completions/langchain_based.py
+++ b/knowledgehub/llms/completions/langchain_based.py
@@ -78,10 +78,13 @@ class LCCompletionMixin:
             return self._kwargs[name]
         return getattr(self._obj, name)
 
-    def dump(self):
+    def dump(self, *args, **kwargs):
+        from theflow.utils.modules import serialize
+
+        params = {key: serialize(value) for key, value in self._kwargs.items()}
         return {
             "__type__": f"{self.__module__}.{self.__class__.__qualname__}",
-            **self._kwargs,
+            **params,
         }
 
     def specs(self, path: str):
diff --git a/knowledgehub/storages/__init__.py b/knowledgehub/storages/__init__.py
index 86225a9..63cc6f6 100644
--- a/knowledgehub/storages/__init__.py
+++ b/knowledgehub/storages/__init__.py
@@ -2,16 +2,24 @@ from .docstores import (
     BaseDocumentStore,
     ElasticsearchDocumentStore,
     InMemoryDocumentStore,
+    SimpleFileDocumentStore,
+)
+from .vectorstores import (
+    BaseVectorStore,
+    ChromaVectorStore,
+    InMemoryVectorStore,
+    SimpleFileVectorStore,
 )
-from .vectorstores import BaseVectorStore, ChromaVectorStore, InMemoryVectorStore
 
 __all__ = [
     # Document stores
     "BaseDocumentStore",
     "InMemoryDocumentStore",
     "ElasticsearchDocumentStore",
+    "SimpleFileDocumentStore",
     # Vector stores
     "BaseVectorStore",
     "ChromaVectorStore",
     "InMemoryVectorStore",
+    "SimpleFileVectorStore",
 ]
diff --git a/knowledgehub/storages/docstores/__init__.py b/knowledgehub/storages/docstores/__init__.py
index a592c08..8c1da98 100644
--- a/knowledgehub/storages/docstores/__init__.py
+++ b/knowledgehub/storages/docstores/__init__.py
@@ -1,5 +1,11 @@
 from .base import BaseDocumentStore
 from .elasticsearch import ElasticsearchDocumentStore
 from .in_memory import InMemoryDocumentStore
+from .simple_file import SimpleFileDocumentStore
 
-__all__ = ["BaseDocumentStore", "InMemoryDocumentStore", "ElasticsearchDocumentStore"]
+__all__ = [
+    "BaseDocumentStore",
+    "InMemoryDocumentStore",
+    "ElasticsearchDocumentStore",
+    "SimpleFileDocumentStore",
+]
diff --git a/knowledgehub/storages/docstores/base.py b/knowledgehub/storages/docstores/base.py
index 62c9314..620a9b1 100644
--- a/knowledgehub/storages/docstores/base.py
+++ b/knowledgehub/storages/docstores/base.py
@@ -1,8 +1,7 @@
 from abc import ABC, abstractmethod
-from pathlib import Path
 from typing import List, Optional, Union
 
-from ...base import Document
+from kotaemon.base import Document
 
 
 class BaseDocumentStore(ABC):
@@ -46,13 +45,3 @@ class BaseDocumentStore(ABC):
     def delete(self, ids: Union[List[str], str]):
         """Delete document by id"""
         ...
-
-    @abstractmethod
-    def save(self, path: Union[str, Path]):
-        """Save document to path"""
-        ...
-
-    @abstractmethod
-    def load(self, path: Union[str, Path]):
-        """Load document store from path"""
-        ...
diff --git a/knowledgehub/storages/docstores/elasticsearch.py b/knowledgehub/storages/docstores/elasticsearch.py
index 902e823..3d93c62 100644
--- a/knowledgehub/storages/docstores/elasticsearch.py
+++ b/knowledgehub/storages/docstores/elasticsearch.py
@@ -1,7 +1,7 @@
-from pathlib import Path
 from typing import List, Optional, Union
 
-from ...base import Document
+from kotaemon.base import Document
+
 from .base import BaseDocumentStore
 
 MAX_DOCS_TO_GET = 10**4
@@ -27,6 +27,8 @@
         self.elasticsearch_url = elasticsearch_url
         self.index_name = index_name
+        self.k1 = k1
+        self.b = b
 
         # Create an Elasticsearch client instance
         self.client = Elasticsearch(elasticsearch_url)
 
@@ -160,10 +162,10 @@ class ElasticsearchDocumentStore(BaseDocumentStore):
         self.client.delete_by_query(index=self.index_name, body=query)
         self.client.indices.refresh(index=self.index_name)
 
-    def save(self, path: Union[str, Path]):
-        """Save document to path"""
-        # not required for ElasticDocstore
-
-    def load(self, path: Union[str, Path]):
-        """Load document store from path"""
-        # not required for ElasticDocstore
+    def __persist_flow__(self):
+        return {
+            "index_name": self.index_name,
+            "elasticsearch_url": self.elasticsearch_url,
+            "k1": self.k1,
+            "b": self.b,
+        }
diff --git a/knowledgehub/storages/docstores/in_memory.py b/knowledgehub/storages/docstores/in_memory.py
index 645890e..3bf22c8 100644
--- a/knowledgehub/storages/docstores/in_memory.py
+++ b/knowledgehub/storages/docstores/in_memory.py
@@ -2,7 +2,8 @@ import json
 from pathlib import Path
 from typing import List, Optional, Union
 
-from ...base import Document
+from kotaemon.base import Document
+
 from .base import BaseDocumentStore
 
@@ -74,3 +75,6 @@ class InMemoryDocumentStore(BaseDocumentStore):
         with open(path) as f:
             store = json.load(f)
         self._store = {key: Document.from_dict(value) for key, value in store.items()}
+
+    def __persist_flow__(self):
+        return {}
diff --git a/knowledgehub/storages/docstores/simple_file.py b/knowledgehub/storages/docstores/simple_file.py
new file mode 100644
index 0000000..8967096
--- /dev/null
+++ b/knowledgehub/storages/docstores/simple_file.py
@@ -0,0 +1,44 @@
+from pathlib import Path
+from typing import List, Optional, Union
+
+from kotaemon.base import Document
+
+from .in_memory import InMemoryDocumentStore
+
+
+class SimpleFileDocumentStore(InMemoryDocumentStore):
+    """An InMemoryDocumentStore that also saves to disk whenever the corpus changes"""
+
+    def __init__(self, path: str | Path):
+        super().__init__()
+        self._path = path
+        if path is not None and Path(path).is_file():
+            self.load(path)
+
+    def add(
+        self,
+        docs: Union[Document, List[Document]],
+        ids: Optional[Union[List[str], str]] = None,
+        **kwargs,
+    ):
+        """Add documents into the document store
+
+        Args:
+            docs: list of documents to add
+            ids: specify the ids of the documents to add, or
+                use the existing doc.doc_id
+            exist_ok: if True, update documents whose doc-id already exists
+                instead of raising an error (defaults to False)
+        """
+        super().add(docs=docs, ids=ids, **kwargs)
+        self.save(self._path)
+
+    def delete(self, ids: Union[List[str], str]):
+        """Delete document by id"""
+        super().delete(ids=ids)
+        self.save(self._path)
+
+    def __persist_flow__(self):
+        from theflow.utils.modules import serialize
+
+        return {"path": serialize(self._path)}
diff --git a/knowledgehub/storages/vectorstores/__init__.py b/knowledgehub/storages/vectorstores/__init__.py
index 62e3127..1606393 100644
--- a/knowledgehub/storages/vectorstores/__init__.py
+++ b/knowledgehub/storages/vectorstores/__init__.py
@@ -1,5 +1,11 @@
 from .base import BaseVectorStore
 from .chroma import ChromaVectorStore
 from .in_memory import InMemoryVectorStore
+from .simple_file import SimpleFileVectorStore
 
-__all__ = ["BaseVectorStore", "ChromaVectorStore", "InMemoryVectorStore"]
+__all__ = [
+    "BaseVectorStore",
+    "ChromaVectorStore",
+    "InMemoryVectorStore",
+    "SimpleFileVectorStore",
+]
diff --git a/knowledgehub/storages/vectorstores/base.py b/knowledgehub/storages/vectorstores/base.py
index 2213b85..7f8f2a5 100644
--- a/knowledgehub/storages/vectorstores/base.py
+++ b/knowledgehub/storages/vectorstores/base.py
@@ -1,12 +1,14 @@
+from __future__ import annotations
+
 from abc import ABC, abstractmethod
-from typing import Any, List, Optional, Tuple, Type, Union
+from typing import Any, Optional
 
 from llama_index.schema import NodeRelationship, RelatedNodeInfo
 from llama_index.vector_stores.types import BasePydanticVectorStore
 from llama_index.vector_stores.types import VectorStore as LIVectorStore
 from llama_index.vector_stores.types import VectorStoreQuery
 
-from kotaemon.base import Document, DocumentWithEmbedding
+from kotaemon.base import DocumentWithEmbedding
 
 
 class BaseVectorStore(ABC):
@@ -17,10 +19,10 @@ class BaseVectorStore(ABC):
     @abstractmethod
     def add(
         self,
-        embeddings: List[List[float]] | List[DocumentWithEmbedding],
-        metadatas: Optional[List[dict]] = None,
-        ids: Optional[List[str]] = None,
-    ) -> List[str]:
+        embeddings: list[list[float]] | list[DocumentWithEmbedding],
+        metadatas: Optional[list[dict]] = None,
+        ids: Optional[list[str]] = None,
+    ) -> list[str]:
         """Add vector embeddings to vector stores
 
         Args:
@@ -35,16 +37,7 @@ class BaseVectorStore(ABC):
         ...
 
     @abstractmethod
-    def add_from_docs(self, docs: List[Document]):
-        """Add vector embeddings to vector stores
-
-        Args:
-            docs: List of Document objects
-        """
-        ...
-
-    @abstractmethod
-    def delete(self, ids: List[str], **kwargs):
+    def delete(self, ids: list[str], **kwargs):
         """Delete vector embeddings from vector stores
 
         Args:
@@ -56,11 +49,11 @@ class BaseVectorStore(ABC):
     @abstractmethod
     def query(
        self,
-        embedding: List[float],
+        embedding: list[float],
         top_k: int = 1,
-        ids: Optional[List[str]] = None,
+        ids: Optional[list[str]] = None,
         **kwargs,
-    ) -> Tuple[List[List[float]], List[float], List[str]]:
+    ) -> tuple[list[list[float]], list[float], list[str]]:
         """Return the top k most similar vector embeddings
 
         Args:
@@ -73,14 +66,6 @@ class BaseVectorStore(ABC):
-    @abstractmethod
-    def load(self, *args, **kwargs):
-        pass
-
-    @abstractmethod
-    def save(self, *args, **kwargs):
-        pass
-
 
 class LlamaIndexVectorStore(BaseVectorStore):
-    _li_class: Type[Union[LIVectorStore, BasePydanticVectorStore]]
+    _li_class: type[LIVectorStore | BasePydanticVectorStore]
 
     def __init__(self, *args, **kwargs):
         if self._li_class is None:
@@ -104,12 +89,12 @@
 
     def add(
         self,
-        embeddings: List[List[float]] | List[DocumentWithEmbedding],
-        metadatas: Optional[List[dict]] = None,
-        ids: Optional[List[str]] = None,
+        embeddings: list[list[float]] | list[DocumentWithEmbedding],
+        metadatas: Optional[list[dict]] = None,
+        ids: Optional[list[str]] = None,
     ):
         if isinstance(embeddings[0], list):
-            nodes = [
+            nodes: list[DocumentWithEmbedding] = [
                 DocumentWithEmbedding(embedding=embedding) for embedding in embeddings
             ]
         else:
@@ -126,20 +111,17 @@
 
         return self._client.add(nodes=nodes)
 
-    def add_from_docs(self, docs: List[Document]):
-        return self._client.add(nodes=docs)
-
-    def delete(self, ids: List[str], **kwargs):
+    def delete(self, ids: list[str], **kwargs):
         for id_ in ids:
             self._client.delete(ref_doc_id=id_, **kwargs)
 
     def query(
         self,
-        embedding: List[float],
+        embedding: list[float],
         top_k: int = 1,
-        ids: Optional[List[str]] = None,
+        ids: Optional[list[str]] = None,
         **kwargs,
-    ) -> Tuple[List[List[float]], List[float], List[str]]:
+    ) -> tuple[list[list[float]], list[float], list[str]]:
         output = self._client.query(
             query=VectorStoreQuery(
                 query_embedding=embedding,
diff --git a/knowledgehub/storages/vectorstores/chroma.py b/knowledgehub/storages/vectorstores/chroma.py
index 462a89d..431dcdd 100644
--- a/knowledgehub/storages/vectorstores/chroma.py
+++ b/knowledgehub/storages/vectorstores/chroma.py
@@ -21,6 +21,17 @@
         flat_metadata: bool = True,
         **kwargs: Any,
     ):
+        self._path = path
+        self._collection_name = collection_name
+        self._host = host
+        self._port = port
+        self._ssl = ssl
+        self._headers = headers
+        self._collection_kwargs = collection_kwargs
+        self._stores_text = stores_text
+        self._flat_metadata = flat_metadata
+        self._kwargs = kwargs
+
         try:
             import chromadb
         except ImportError:
@@ -70,8 +81,16 @@ class ChromaVectorStore(LlamaIndexVectorStore):
     def count(self) -> int:
         return self._collection.count()
 
-    def save(self, *args, **kwargs):
-        pass
-
-    def load(self, *args, **kwargs):
-        pass
+    def __persist_flow__(self):
+        return {
+            "path": self._path,
+            "collection_name": self._collection_name,
+            "host": self._host,
+            "port": self._port,
+            "ssl": self._ssl,
+            "headers": self._headers,
+            "collection_kwargs": self._collection_kwargs,
+            "stores_text": self._stores_text,
+            "flat_metadata": self._flat_metadata,
+            **self._kwargs,
+        }
diff --git a/knowledgehub/storages/vectorstores/in_memory.py b/knowledgehub/storages/vectorstores/in_memory.py
index f8f20cc..c636d9d 100644
--- a/knowledgehub/storages/vectorstores/in_memory.py
+++ b/knowledgehub/storages/vectorstores/in_memory.py
@@ -1,5 +1,4 @@
 """Simple vector store index."""
-
 from typing import Any, Optional, Type
 
 import fsspec
@@ -53,3 +52,11 @@
             fs: An abstract super-class for pythonic file-systems
         """
         self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)
+
+    def __persist_flow__(self):
+        d = self._data.to_dict()
+        d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}"
+        return {
+            "data": d,
+ # "fs": self._fs, + } diff --git a/knowledgehub/storages/vectorstores/simple_file.py b/knowledgehub/storages/vectorstores/simple_file.py new file mode 100644 index 0000000..6f14a34 --- /dev/null +++ b/knowledgehub/storages/vectorstores/simple_file.py @@ -0,0 +1,66 @@ +"""Simple file vector store index.""" +from pathlib import Path +from typing import Any, Optional, Type + +import fsspec +from llama_index.vector_stores import SimpleVectorStore as LISimpleVectorStore +from llama_index.vector_stores.simple import SimpleVectorStoreData + +from kotaemon.base import DocumentWithEmbedding + +from .base import LlamaIndexVectorStore + + +class SimpleFileVectorStore(LlamaIndexVectorStore): + """Similar to InMemoryVectorStore but is backed by file by default""" + + _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore + store_text: bool = False + + def __init__( + self, + path: str | Path, + data: Optional[SimpleVectorStoreData] = None, + fs: Optional[fsspec.AbstractFileSystem] = None, + **kwargs: Any, + ) -> None: + """Initialize params.""" + self._data = data or SimpleVectorStoreData() + self._fs = fs or fsspec.filesystem("file") + self._path = path + self._save_path = Path(path) + + super().__init__( + data=data, + fs=fs, + **kwargs, + ) + + if self._save_path.is_file(): + self._client = self._li_class.from_persist_path( + persist_path=str(self._save_path), fs=self._fs + ) + + def add( + self, + embeddings: list[list[float]] | list[DocumentWithEmbedding], + metadatas: Optional[list[dict]] = None, + ids: Optional[list[str]] = None, + ): + r = super().add(embeddings, metadatas, ids) + self._client.persist(str(self._save_path), self._fs) + return r + + def delete(self, ids: list[str], **kwargs): + r = super().delete(ids, **kwargs) + self._client.persist(str(self._save_path), self._fs) + return r + + def __persist_flow__(self): + d = self._data.to_dict() + d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}" + return { + "data": d, + "path": str(self._path), + # "fs": self._fs, + } diff --git a/tests/test_docstores.py b/tests/test_docstores.py index d8ebe51..90bae43 100644 --- a/tests/test_docstores.py +++ b/tests/test_docstores.py @@ -1,10 +1,15 @@ +import os from unittest.mock import patch import pytest from elastic_transport import ApiResponseMeta from kotaemon.base import Document -from kotaemon.storages import ElasticsearchDocumentStore, InMemoryDocumentStore +from kotaemon.storages import ( + ElasticsearchDocumentStore, + InMemoryDocumentStore, + SimpleFileDocumentStore, +) meta_success = ApiResponseMeta( status=200, @@ -207,7 +212,7 @@ _elastic_search_responses = [ ] -def test_simple_document_store_base_interfaces(tmp_path): +def test_inmemory_document_store_base_interfaces(tmp_path): """Test all interfaces of a a document store""" store = InMemoryDocumentStore() @@ -260,6 +265,64 @@ def test_simple_document_store_base_interfaces(tmp_path): store2.load(tmp_path / "store.json") assert len(store2.get_all()) == 17, "Laded document store should have 17 documents" + os.remove(tmp_path / "store.json") + + +def test_simplefile_document_store_base_interfaces(tmp_path): + """Test all interfaces of a a document store""" + + path = tmp_path / "store.json" + + store = SimpleFileDocumentStore(path=path) + docs = [ + Document(text=f"Sample text {idx}", meta={"meta_key": f"meta_value_{idx}"}) + for idx in range(10) + ] + + # Test add and get all + assert len(store.get_all()) == 0, "Document store should be empty" + store.add(docs) + assert len(store.get_all()) == 10, "Document 
+
+    # Test add with provided ids
+    store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)])
+    assert len(store.get_all()) == 20, "Document store should have 20 documents"
+
+    # Test add without exist_ok
+    with pytest.raises(ValueError):
+        store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)])
+
+    # Update ok with add exist_ok
+    store.add(docs=docs, ids=[f"doc_{idx}" for idx in range(10)], exist_ok=True)
+    assert len(store.get_all()) == 20, "Document store should have 20 documents"
+
+    # Test get with str id
+    matched = store.get(docs[0].doc_id)
+    assert len(matched) == 1, "Should return 1 document"
+    assert matched[0].text == docs[0].text, "Should return the correct document"
+
+    # Test get with list of ids
+    matched = store.get([docs[0].doc_id, docs[1].doc_id])
+    assert len(matched) == 2, "Should return 2 documents"
+    assert [doc.text for doc in matched] == [doc.text for doc in docs[:2]]
+
+    # Test delete with str id
+    store.delete(docs[0].doc_id)
+    assert len(store.get_all()) == 19, "Document store should have 19 documents"
+
+    # Test delete with list of ids
+    store.delete([docs[1].doc_id, docs[2].doc_id])
+    assert len(store.get_all()) == 17, "Document store should have 17 documents"
+
+    # Test save
+    assert path.exists(), "File should exist"
+
+    # Test load
+    store2 = SimpleFileDocumentStore(path=path)
+    assert len(store2.get_all()) == 17, "Loaded document store should have 17 documents"
+
+    os.remove(path)
+
 
 @patch(
     "elastic_transport.Transport.perform_request",
diff --git a/tests/test_vectorstore.py b/tests/test_vectorstore.py
index 1a7ac65..fc9a30c 100644
--- a/tests/test_vectorstore.py
+++ b/tests/test_vectorstore.py
@@ -1,7 +1,12 @@
 import json
+import os
 
-from kotaemon.base import Document
-from kotaemon.storages import ChromaVectorStore, InMemoryVectorStore
+from kotaemon.base import DocumentWithEmbedding
+from kotaemon.storages import (
+    ChromaVectorStore,
+    InMemoryVectorStore,
+    SimpleFileVectorStore,
+)
 
 
 class TestChromaVectorStore:
@@ -24,11 +29,11 @@ class TestChromaVectorStore:
         embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
         metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
         documents = [
-            Document(embedding=embedding, metadata=metadata)
+            DocumentWithEmbedding(embedding=embedding, metadata=metadata)
             for embedding, metadata in zip(embeddings, metadatas)
         ]
         assert db._collection.count() == 0, "Expected empty collection"
-        output = db.add_from_docs(documents)
+        output = db.add(documents)
         assert len(output) == 2, "Expected outputting 2 ids"
         assert db._collection.count() == 2, "Expected 2 added entries"
 
@@ -69,10 +74,8 @@ class TestChromaVectorStore:
         ids = ["1", "2", "3"]
         db = ChromaVectorStore(path=str(tmp_path))
         db.add(embeddings=embeddings, metadatas=metadatas, ids=ids)
-        db.save()
 
         db2 = ChromaVectorStore(path=str(tmp_path))
-        db2.load()
         assert (
             db2._collection.count() == 3
         ), "load function does not load data completely"
@@ -122,3 +125,30 @@ class TestInMemoryVectorStore:
             0.5,
             0.6,
         ], "load function does not load data completely"
+
+
+class TestSimpleFileVectorStore:
+    def test_add_delete(self, tmp_path):
+        """Test that delete func deletes correctly."""
+        embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]
+        metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}]
+        ids = ["1", "2", "3"]
+        db = SimpleFileVectorStore(path=tmp_path / "test_save_load_delete.json")
+        db.add(embeddings=embeddings, metadatas=metadatas, ids=ids)
+        db.delete(["3"])
+        f = open(tmp_path / "test_save_load_delete.json")
+        data = json.load(f)
+        assert (
+            "1" in data["text_id_to_ref_doc_id"] and "2" in data["text_id_to_ref_doc_id"]
+        ), "save function does not save data completely"
+        assert (
+            "3" not in data["text_id_to_ref_doc_id"]
+        ), "delete function does not delete data completely"
+        db2 = SimpleFileVectorStore(path=tmp_path / "test_save_load_delete.json")
+        assert db2.get("2") == [
+            0.4,
+            0.5,
+            0.6,
+        ], "load function does not load data completely"
+
+        os.remove(tmp_path / "test_save_load_delete.json")
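
Taken together, the changes above replace the explicit save()/load() round-trip removed from VectorIndexing and VectorRetrieval with path-based, write-through stores. A minimal usage sketch (illustration only, not part of the patch; the file names are arbitrary examples):

from pathlib import Path

from kotaemon.base import Document
from kotaemon.storages import SimpleFileDocumentStore, SimpleFileVectorStore

store_path = Path("docstore.json")  # example location; any writable path works

# Every mutation is written through to disk immediately...
docstore = SimpleFileDocumentStore(path=store_path)
docstore.add([Document(text="hello world")])

vectorstore = SimpleFileVectorStore(path=Path("vectorstore.json"))
vectorstore.add(embeddings=[[0.1, 0.2, 0.3]], ids=["1"])

# ...and a new instance pointed at the same file starts from the saved
# state, so there is no separate load() step any more.
docstore2 = SimpleFileDocumentStore(path=store_path)
assert len(docstore2.get_all()) == 1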
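
The dump()/__persist_flow__ pair carries the same idea at the object level: each store reduces itself to a JSON-safe dict of its constructor kwargs, and dump() additionally records a "__type__" dotted path. How theflow consumes these dicts is outside this diff; a loader in the spirit of the change could look like the sketch below (hypothetical helper, not an API added by this patch):

import importlib


def load_from_spec(spec: dict):
    """Hypothetical inverse of dump(): rebuild an object from its spec."""
    spec = dict(spec)  # work on a copy so the caller's dict is untouched
    module_name, _, class_name = spec.pop("__type__").rpartition(".")
    cls = getattr(importlib.import_module(module_name), class_name)
    # The remaining keys are the serialized constructor kwargs produced by
    # serialize(); a real loader would deserialize them before this call.
    return cls(**spec)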