diff --git a/knowledgehub/vectorstores/__init__.py b/knowledgehub/vectorstores/__init__.py index e6c8f0c..62e3127 100644 --- a/knowledgehub/vectorstores/__init__.py +++ b/knowledgehub/vectorstores/__init__.py @@ -1,4 +1,5 @@ from .base import BaseVectorStore from .chroma import ChromaVectorStore +from .in_memory import InMemoryVectorStore -__all__ = ["BaseVectorStore", "ChromaVectorStore"] +__all__ = ["BaseVectorStore", "ChromaVectorStore", "InMemoryVectorStore"] diff --git a/knowledgehub/vectorstores/base.py b/knowledgehub/vectorstores/base.py index 3778a15..ac66e6b 100644 --- a/knowledgehub/vectorstores/base.py +++ b/knowledgehub/vectorstores/base.py @@ -1,6 +1,7 @@ from abc import ABC, abstractmethod from typing import Any, List, Optional, Tuple, Type, Union +from llama_index.schema import NodeRelationship, RelatedNodeInfo from llama_index.vector_stores.types import BasePydanticVectorStore from llama_index.vector_stores.types import VectorStore as LIVectorStore from llama_index.vector_stores.types import VectorStoreQuery @@ -118,6 +119,9 @@ class LlamaIndexVectorStore(BaseVectorStore): if ids is not None: for node, id in zip(nodes, ids): node.id_ = id + node.relationships = { + NodeRelationship.SOURCE: RelatedNodeInfo(node_id=id) + } return self._client.add(nodes=nodes) # type: ignore diff --git a/knowledgehub/vectorstores/in_memory.py b/knowledgehub/vectorstores/in_memory.py new file mode 100644 index 0000000..86e82d2 --- /dev/null +++ b/knowledgehub/vectorstores/in_memory.py @@ -0,0 +1,57 @@ +"""Simple vector store index.""" + +from typing import Any, Optional, Type + +import fsspec +from llama_index.vector_stores import SimpleVectorStore as LISimpleVectorStore +from llama_index.vector_stores.simple import SimpleVectorStoreData + +from kotaemon.vectorstores.base import LlamaIndexVectorStore + + +class InMemoryVectorStore(LlamaIndexVectorStore): + _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore + store_text: bool = False + + def __init__( + self, + data: Optional[SimpleVectorStoreData] = None, + fs: Optional[fsspec.AbstractFileSystem] = None, + **kwargs: Any, + ) -> None: + """Initialize params.""" + self._data = data or SimpleVectorStoreData() + self._fs = fs or fsspec.filesystem("file") + + super().__init__( + data=data, + fs=fs, + **kwargs, + ) + + def save( + self, + save_path: str, + fs: Optional[fsspec.AbstractFileSystem] = None, + **kwargs, + ): + + """save a simpleVectorStore to a dictionary. + + Args: + save_path: Path of saving vector to disk. + fs: An abstract super-class for pythonic file-systems + """ + self._client.persist(persist_path=save_path, fs=fs) + + def load( + self, load_path: str, fs: Optional[fsspec.AbstractFileSystem] = None + ) -> "InMemoryVectorStore": + + """Create a SimpleKVStore from a load directory. + + Args: + load_path: Path of loading vector. + fs: An abstract super-class for pythonic file-systems + """ + return self._client.from_persist_path(persist_path=load_path, fs=fs) diff --git a/tests/test_vectorstore.py b/tests/test_vectorstore.py index 96e9ba1..8b3556c 100644 --- a/tests/test_vectorstore.py +++ b/tests/test_vectorstore.py @@ -1,5 +1,7 @@ +import json + from kotaemon.documents.base import Document -from kotaemon.vectorstores import ChromaVectorStore +from kotaemon.vectorstores import ChromaVectorStore, InMemoryVectorStore class TestChromaVectorStore: @@ -59,3 +61,41 @@ class TestChromaVectorStore: _, _, out_ids = db.query(embedding=[0.42, 0.52, 0.53], top_k=1) assert out_ids == ["b"] + + +class TestSimpleVectorStore: + def test_add(self): + """Test that add func adds correctly.""" + + embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]] + metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}] + ids = ["1", "2"] + db = InMemoryVectorStore() + + output = db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) + assert output == ids, "Excepted output to be the same as ids" + + def test_save_load_delete(self, tmp_path): + """Test that delete func deletes correctly.""" + embeddings = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]] + metadatas = [{"a": 1, "b": 2}, {"a": 3, "b": 4}, {"a": 5, "b": 6}] + ids = ["1", "2", "3"] + db = InMemoryVectorStore() + db.add(embeddings=embeddings, metadatas=metadatas, ids=ids) + db.delete(["3"]) + db.save(save_path=tmp_path / "test_save_load_delete.json") + f = open(tmp_path / "test_save_load_delete.json") + data = json.load(f) + assert ( + "1" and "2" in data["text_id_to_ref_doc_id"] + ), "persist function does not save data completely" + assert ( + "3" not in data["text_id_to_ref_doc_id"] + ), "delete function does not delete data completely" + db2 = InMemoryVectorStore() + output = db2.load(load_path=tmp_path / "test_save_load_delete.json") + assert output.get("2") == [ + 0.4, + 0.5, + 0.6, + ], "persist function does not load data completely"