Add file-based document store and vector store (#96)

* Modify docstore and vectorstore objects to be reconstructable
* Simplify the file docstore
* Use the simple file docstore and vector store in MVP
This commit is contained in:
Duc Nguyen (john)
2023-12-04 17:46:00 +07:00
committed by GitHub
parent 0ce3a8832f
commit 37c744b616
18 changed files with 324 additions and 149 deletions

View File

@@ -1,5 +1,11 @@
from .base import BaseVectorStore
from .chroma import ChromaVectorStore
from .in_memory import InMemoryVectorStore
from .simple_file import SimpleFileVectorStore
__all__ = ["BaseVectorStore", "ChromaVectorStore", "InMemoryVectorStore"]
# Public names re-exported by this package (one per vector-store backend,
# plus the abstract base).
__all__ = [
    "BaseVectorStore",
    "ChromaVectorStore",
    "InMemoryVectorStore",
    "SimpleFileVectorStore",
]

View File

@@ -1,12 +1,14 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, List, Optional, Tuple, Type, Union
from typing import Any, Optional
from llama_index.schema import NodeRelationship, RelatedNodeInfo
from llama_index.vector_stores.types import BasePydanticVectorStore
from llama_index.vector_stores.types import VectorStore as LIVectorStore
from llama_index.vector_stores.types import VectorStoreQuery
from kotaemon.base import Document, DocumentWithEmbedding
from kotaemon.base import DocumentWithEmbedding
class BaseVectorStore(ABC):
@@ -17,10 +19,10 @@ class BaseVectorStore(ABC):
@abstractmethod
def add(
self,
embeddings: List[List[float]] | List[DocumentWithEmbedding],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
) -> List[str]:
embeddings: list[list[float]] | list[DocumentWithEmbedding],
metadatas: Optional[list[dict]] = None,
ids: Optional[list[str]] = None,
) -> list[str]:
"""Add vector embeddings to vector stores
Args:
@@ -35,16 +37,7 @@ class BaseVectorStore(ABC):
...
@abstractmethod
def add_from_docs(self, docs: List[Document]):
"""Add vector embeddings to vector stores
Args:
docs: List of Document objects
"""
...
@abstractmethod
def delete(self, ids: List[str], **kwargs):
def delete(self, ids: list[str], **kwargs):
"""Delete vector embeddings from vector stores
Args:
@@ -56,11 +49,11 @@ class BaseVectorStore(ABC):
@abstractmethod
def query(
self,
embedding: List[float],
embedding: list[float],
top_k: int = 1,
ids: Optional[List[str]] = None,
ids: Optional[list[str]] = None,
**kwargs,
) -> Tuple[List[List[float]], List[float], List[str]]:
) -> tuple[list[list[float]], list[float], list[str]]:
"""Return the top k most similar vector embeddings
Args:
@@ -73,17 +66,9 @@ class BaseVectorStore(ABC):
"""
...
@abstractmethod
def load(self, *args, **kwargs):
pass
@abstractmethod
def save(self, *args, **kwargs):
pass
class LlamaIndexVectorStore(BaseVectorStore):
_li_class: Type[Union[LIVectorStore, BasePydanticVectorStore]]
_li_class: type[LIVectorStore | BasePydanticVectorStore]
def __init__(self, *args, **kwargs):
if self._li_class is None:
@@ -104,12 +89,12 @@ class LlamaIndexVectorStore(BaseVectorStore):
def add(
self,
embeddings: List[List[float]] | List[DocumentWithEmbedding],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
embeddings: list[list[float]] | list[DocumentWithEmbedding],
metadatas: Optional[list[dict]] = None,
ids: Optional[list[str]] = None,
):
if isinstance(embeddings[0], list):
nodes = [
nodes: list[DocumentWithEmbedding] = [
DocumentWithEmbedding(embedding=embedding) for embedding in embeddings
]
else:
@@ -126,20 +111,17 @@ class LlamaIndexVectorStore(BaseVectorStore):
return self._client.add(nodes=nodes)
def add_from_docs(self, docs: List[Document]):
return self._client.add(nodes=docs)
def delete(self, ids: List[str], **kwargs):
def delete(self, ids: list[str], **kwargs):
for id_ in ids:
self._client.delete(ref_doc_id=id_, **kwargs)
def query(
self,
embedding: List[float],
embedding: list[float],
top_k: int = 1,
ids: Optional[List[str]] = None,
ids: Optional[list[str]] = None,
**kwargs,
) -> Tuple[List[List[float]], List[float], List[str]]:
) -> tuple[list[list[float]], list[float], list[str]]:
output = self._client.query(
query=VectorStoreQuery(
query_embedding=embedding,

View File

@@ -21,6 +21,17 @@ class ChromaVectorStore(LlamaIndexVectorStore):
flat_metadata: bool = True,
**kwargs: Any,
):
self._path = path
self._collection_name = collection_name
self._host = host
self._port = port
self._ssl = ssl
self._headers = headers
self._collection_kwargs = collection_kwargs
self._stores_text = stores_text
self._flat_metadata = flat_metadata
self._kwargs = kwargs
try:
import chromadb
except ImportError:
@@ -70,8 +81,16 @@ class ChromaVectorStore(LlamaIndexVectorStore):
def count(self) -> int:
    """Return how many records the underlying Chroma collection holds."""
    total = self._collection.count()
    return total
def save(self, *args, **kwargs):
pass
def load(self, *args, **kwargs):
pass
def __persist_flow__(self):
    """Serialize the constructor arguments so this Chroma store can be rebuilt.

    Returns a plain dict of the kwargs originally given to ``__init__``.
    """
    flow = dict(
        path=self._path,
        collection_name=self._collection_name,
        host=self._host,
        port=self._port,
        ssl=self._ssl,
        headers=self._headers,
        collection_kwargs=self._collection_kwargs,
        stores_text=self._stores_text,
        flat_metadata=self._flat_metadata,
    )
    # Extra kwargs take precedence, matching `{..., **self._kwargs}` semantics.
    flow.update(self._kwargs)
    return flow

View File

@@ -1,5 +1,4 @@
"""Simple vector store index."""
from typing import Any, Optional, Type
import fsspec
@@ -53,3 +52,11 @@ class InMemoryVectorStore(LlamaIndexVectorStore):
fs: An abstract super-class for pythonic file-systems
"""
self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)
def __persist_flow__(self):
    """Serialize the in-memory vector data so the store can be reconstructed.

    The payload carries the data dict plus a ``__type__`` tag naming the
    concrete data class, so deserialization can re-instantiate it.
    """
    data = self._data
    payload = data.to_dict()
    payload["__type__"] = f"{data.__module__}.{data.__class__.__qualname__}"
    # NOTE: the filesystem object is intentionally not serialized.
    # "fs": self._fs
    return {"data": payload}

View File

@@ -0,0 +1,66 @@
"""Simple file vector store index."""
from pathlib import Path
from typing import Any, Optional, Type
import fsspec
from llama_index.vector_stores import SimpleVectorStore as LISimpleVectorStore
from llama_index.vector_stores.simple import SimpleVectorStoreData
from kotaemon.base import DocumentWithEmbedding
from .base import LlamaIndexVectorStore
class SimpleFileVectorStore(LlamaIndexVectorStore):
    """Similar to InMemoryVectorStore but is backed by file by default.

    Every mutation (``add`` / ``delete``) is persisted to ``path`` right away,
    and an existing file at ``path`` is loaded on construction.
    """

    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore
    # NOTE(review): llama_index's attribute is spelled `stores_text`;
    # `store_text` here looks like a typo with no effect — confirm intent.
    store_text: bool = False

    def __init__(
        self,
        path: str | Path,
        data: Optional[SimpleVectorStoreData] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize params.

        Args:
            path: file the vectors are persisted to / loaded from
            data: initial store data; a fresh empty one is created if None
            fs: filesystem abstraction; the local filesystem if None
        """
        self._data = data or SimpleVectorStoreData()
        self._fs = fs or fsspec.filesystem("file")
        self._path = path
        self._save_path = Path(path)

        super().__init__(
            # Pass the resolved object (not the possibly-None `data`) so that
            # self._data and the underlying client share one store; otherwise
            # __persist_flow__ would serialize an empty, never-updated object.
            data=self._data,
            fs=self._fs,
            **kwargs,
        )

        if self._save_path.is_file():
            self._client = self._li_class.from_persist_path(
                persist_path=str(self._save_path), fs=self._fs
            )
            # Re-sync self._data with the freshly loaded client so that
            # __persist_flow__ reflects the loaded vectors (llama_index's
            # SimpleVectorStore keeps its state in `_data` — confirm on upgrade).
            self._data = self._client._data

    def add(
        self,
        embeddings: list[list[float]] | list[DocumentWithEmbedding],
        metadatas: Optional[list[dict]] = None,
        ids: Optional[list[str]] = None,
    ):
        """Add embeddings to the store and persist them to file immediately."""
        r = super().add(embeddings, metadatas, ids)
        self._client.persist(str(self._save_path), self._fs)
        return r

    def delete(self, ids: list[str], **kwargs):
        """Delete embeddings by id and persist the change to file immediately."""
        r = super().delete(ids, **kwargs)
        self._client.persist(str(self._save_path), self._fs)
        return r

    def __persist_flow__(self):
        """Serialize the data (with its fully-qualified type) and the file path
        so the store can be reconstructed later."""
        d = self._data.to_dict()
        d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}"
        return {
            "data": d,
            "path": str(self._path),
            # NOTE: the filesystem object is intentionally not serialized.
            # "fs": self._fs,
        }