Add file-based document store and vector store (#96)

* Modify docstore and vectorstore objects to be reconstructable
* Simplify the file docstore
* Use the simple file docstore and vector store in MVP
This commit is contained in:
Duc Nguyen (john)
2023-12-04 17:46:00 +07:00
committed by GitHub
parent 0ce3a8832f
commit 37c744b616
18 changed files with 324 additions and 149 deletions

View File

@@ -1,5 +1,11 @@
from .base import BaseVectorStore
from .chroma import ChromaVectorStore
from .in_memory import InMemoryVectorStore
from .simple_file import SimpleFileVectorStore
__all__ = ["BaseVectorStore", "ChromaVectorStore", "InMemoryVectorStore"]
# Public names re-exported by this package (one per vector-store backend,
# plus the abstract base).
__all__ = [
    "BaseVectorStore",
    "ChromaVectorStore",
    "InMemoryVectorStore",
    "SimpleFileVectorStore",
]

View File

@@ -1,12 +1,14 @@
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, List, Optional, Tuple, Type, Union
from typing import Any, Optional
from llama_index.schema import NodeRelationship, RelatedNodeInfo
from llama_index.vector_stores.types import BasePydanticVectorStore
from llama_index.vector_stores.types import VectorStore as LIVectorStore
from llama_index.vector_stores.types import VectorStoreQuery
from kotaemon.base import Document, DocumentWithEmbedding
from kotaemon.base import DocumentWithEmbedding
class BaseVectorStore(ABC):
@@ -17,10 +19,10 @@ class BaseVectorStore(ABC):
@abstractmethod
def add(
self,
embeddings: List[List[float]] | List[DocumentWithEmbedding],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
) -> List[str]:
embeddings: list[list[float]] | list[DocumentWithEmbedding],
metadatas: Optional[list[dict]] = None,
ids: Optional[list[str]] = None,
) -> list[str]:
"""Add vector embeddings to vector stores
Args:
@@ -35,16 +37,7 @@ class BaseVectorStore(ABC):
...
@abstractmethod
def add_from_docs(self, docs: List[Document]):
"""Add vector embeddings to vector stores
Args:
docs: List of Document objects
"""
...
@abstractmethod
def delete(self, ids: List[str], **kwargs):
def delete(self, ids: list[str], **kwargs):
"""Delete vector embeddings from vector stores
Args:
@@ -56,11 +49,11 @@ class BaseVectorStore(ABC):
@abstractmethod
def query(
self,
embedding: List[float],
embedding: list[float],
top_k: int = 1,
ids: Optional[List[str]] = None,
ids: Optional[list[str]] = None,
**kwargs,
) -> Tuple[List[List[float]], List[float], List[str]]:
) -> tuple[list[list[float]], list[float], list[str]]:
"""Return the top k most similar vector embeddings
Args:
@@ -73,17 +66,9 @@ class BaseVectorStore(ABC):
"""
...
@abstractmethod
def load(self, *args, **kwargs):
pass
@abstractmethod
def save(self, *args, **kwargs):
pass
class LlamaIndexVectorStore(BaseVectorStore):
_li_class: Type[Union[LIVectorStore, BasePydanticVectorStore]]
_li_class: type[LIVectorStore | BasePydanticVectorStore]
def __init__(self, *args, **kwargs):
if self._li_class is None:
@@ -104,12 +89,12 @@ class LlamaIndexVectorStore(BaseVectorStore):
def add(
self,
embeddings: List[List[float]] | List[DocumentWithEmbedding],
metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None,
embeddings: list[list[float]] | list[DocumentWithEmbedding],
metadatas: Optional[list[dict]] = None,
ids: Optional[list[str]] = None,
):
if isinstance(embeddings[0], list):
nodes = [
nodes: list[DocumentWithEmbedding] = [
DocumentWithEmbedding(embedding=embedding) for embedding in embeddings
]
else:
@@ -126,20 +111,17 @@ class LlamaIndexVectorStore(BaseVectorStore):
return self._client.add(nodes=nodes)
def add_from_docs(self, docs: List[Document]):
return self._client.add(nodes=docs)
def delete(self, ids: List[str], **kwargs):
def delete(self, ids: list[str], **kwargs):
for id_ in ids:
self._client.delete(ref_doc_id=id_, **kwargs)
def query(
self,
embedding: List[float],
embedding: list[float],
top_k: int = 1,
ids: Optional[List[str]] = None,
ids: Optional[list[str]] = None,
**kwargs,
) -> Tuple[List[List[float]], List[float], List[str]]:
) -> tuple[list[list[float]], list[float], list[str]]:
output = self._client.query(
query=VectorStoreQuery(
query_embedding=embedding,

View File

@@ -21,6 +21,17 @@ class ChromaVectorStore(LlamaIndexVectorStore):
flat_metadata: bool = True,
**kwargs: Any,
):
self._path = path
self._collection_name = collection_name
self._host = host
self._port = port
self._ssl = ssl
self._headers = headers
self._collection_kwargs = collection_kwargs
self._stores_text = stores_text
self._flat_metadata = flat_metadata
self._kwargs = kwargs
try:
import chromadb
except ImportError:
@@ -70,8 +81,16 @@ class ChromaVectorStore(LlamaIndexVectorStore):
def count(self) -> int:
    """Return how many records the underlying Chroma collection holds."""
    total = self._collection.count()
    return total
def save(self, *args, **kwargs):
pass
def load(self, *args, **kwargs):
pass
def __persist_flow__(self):
    """Serialize the constructor arguments so this Chroma store can be rebuilt.

    Returns a plain dict of the kwargs originally given to ``__init__``.
    """
    flow = dict(
        path=self._path,
        collection_name=self._collection_name,
        host=self._host,
        port=self._port,
        ssl=self._ssl,
        headers=self._headers,
        collection_kwargs=self._collection_kwargs,
        stores_text=self._stores_text,
        flat_metadata=self._flat_metadata,
    )
    # Extra kwargs take precedence, matching `{..., **self._kwargs}` semantics.
    flow.update(self._kwargs)
    return flow

View File

@@ -1,5 +1,4 @@
"""Simple vector store index."""
from typing import Any, Optional, Type
import fsspec
@@ -53,3 +52,11 @@ class InMemoryVectorStore(LlamaIndexVectorStore):
fs: An abstract super-class for pythonic file-systems
"""
self._client = self._client.from_persist_path(persist_path=load_path, fs=fs)
def __persist_flow__(self):
    """Serialize the in-memory vector data so the store can be reconstructed.

    The payload carries the data dict plus a ``__type__`` tag naming the
    concrete data class, so deserialization can re-instantiate it.
    """
    data = self._data
    payload = data.to_dict()
    payload["__type__"] = f"{data.__module__}.{data.__class__.__qualname__}"
    # NOTE: the filesystem object is intentionally not serialized.
    # "fs": self._fs
    return {"data": payload}

View File

@@ -0,0 +1,66 @@
"""Simple file vector store index."""
from pathlib import Path
from typing import Any, Optional, Type
import fsspec
from llama_index.vector_stores import SimpleVectorStore as LISimpleVectorStore
from llama_index.vector_stores.simple import SimpleVectorStoreData
from kotaemon.base import DocumentWithEmbedding
from .base import LlamaIndexVectorStore
class SimpleFileVectorStore(LlamaIndexVectorStore):
    """Similar to InMemoryVectorStore but is backed by file by default.

    Every mutation (``add`` / ``delete``) is persisted to ``path`` right away,
    and an existing file at ``path`` is loaded on construction.
    """

    _li_class: Type[LISimpleVectorStore] = LISimpleVectorStore
    # NOTE(review): llama_index's attribute is spelled `stores_text`;
    # `store_text` here looks like a typo with no effect — confirm intent.
    store_text: bool = False

    def __init__(
        self,
        path: str | Path,
        data: Optional[SimpleVectorStoreData] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
        **kwargs: Any,
    ) -> None:
        """Initialize params.

        Args:
            path: file the vectors are persisted to / loaded from
            data: initial store data; a fresh empty one is created if None
            fs: filesystem abstraction; the local filesystem if None
        """
        self._data = data or SimpleVectorStoreData()
        self._fs = fs or fsspec.filesystem("file")
        self._path = path
        self._save_path = Path(path)

        super().__init__(
            # Pass the resolved object (not the possibly-None `data`) so that
            # self._data and the underlying client share one store; otherwise
            # __persist_flow__ would serialize an empty, never-updated object.
            data=self._data,
            fs=self._fs,
            **kwargs,
        )

        if self._save_path.is_file():
            self._client = self._li_class.from_persist_path(
                persist_path=str(self._save_path), fs=self._fs
            )
            # Re-sync self._data with the freshly loaded client so that
            # __persist_flow__ reflects the loaded vectors (llama_index's
            # SimpleVectorStore keeps its state in `_data` — confirm on upgrade).
            self._data = self._client._data

    def add(
        self,
        embeddings: list[list[float]] | list[DocumentWithEmbedding],
        metadatas: Optional[list[dict]] = None,
        ids: Optional[list[str]] = None,
    ):
        """Add embeddings to the store and persist them to file immediately."""
        r = super().add(embeddings, metadatas, ids)
        self._client.persist(str(self._save_path), self._fs)
        return r

    def delete(self, ids: list[str], **kwargs):
        """Delete embeddings by id and persist the change to file immediately."""
        r = super().delete(ids, **kwargs)
        self._client.persist(str(self._save_path), self._fs)
        return r

    def __persist_flow__(self):
        """Serialize the data (with its fully-qualified type) and the file path
        so the store can be reconstructed later."""
        d = self._data.to_dict()
        d["__type__"] = f"{self._data.__module__}.{self._data.__class__.__qualname__}"
        return {
            "data": d,
            "path": str(self._path),
            # NOTE: the filesystem object is intentionally not serialized.
            # "fs": self._fs,
        }