[AUR-392, AUR-413, AUR-414] Define base vector store, and make use of ChromaVectorStore from llama_index. Indexing and retrieving vectors with vector store (#18)

Design the base interface of the vector store and apply it to the Chroma vector store (a wrapper around llama_index's implementation). Provide pipelines to populate the vector store and to retrieve from it.
This commit is contained in:
Nguyen Trung Duc (john)
2023-09-14 14:18:20 +07:00
committed by GitHub
parent c339912312
commit 620b2b03ca
10 changed files with 486 additions and 16 deletions

View File

@@ -0,0 +1,58 @@
from typing import List
from theflow import Node, Param
from ..components import BaseComponent
from ..documents.base import Document
from ..embeddings import BaseEmbeddings
from ..vectorstores import BaseVectorStore
class IndexVectorStoreFromDocumentPipeline(BaseComponent):
    """Ingest the document, run through the embedding, and store the embedding in a
    vector store.

    This pipeline supports the following set of inputs:
        - A single text or a list of texts
        - A single document or a list of documents
    """

    # Destination store: every ``run_*`` method writes to it through ``add``.
    vector_store: Param[BaseVectorStore] = Param()
    # Node that is called with the text(s) or document(s) to embed.
    embedding: Node[BaseEmbeddings] = Node()

    # TODO: populate to document store as well when it's finished
    # TODO: refer to llama_index's storage as well

    def run_raw(self, text: str) -> None:
        """Embed one raw text and add the result to the vector store.

        Args:
            text (str): the text to embed
        """
        # The embedding output is wrapped in a list because ``add`` is also
        # fed whole batches by the ``run_batch_*`` methods.
        self.vector_store.add([self.embedding(text)])

    def run_batch_raw(self, text: List[str]) -> None:
        """Embed a list of raw texts and add the results to the vector store.

        Args:
            text (List[str]): the texts to embed
        """
        # NOTE(review): assumes the embedding node returns a collection of
        # embeddings when called with a list -- confirm against BaseEmbeddings.
        self.vector_store.add(self.embedding(text))

    def run_document(self, text: Document) -> None:
        """Embed one document and add the result to the vector store.

        Args:
            text (Document): the document to embed
        """
        self.vector_store.add([self.embedding(text)])

    def run_batch_document(self, text: List[Document]) -> None:
        """Embed a list of documents and add the results to the vector store.

        Args:
            text (List[Document]): the documents to embed
        """
        # NOTE(review): same assumption as ``run_batch_raw`` about the shape
        # of the embedding output -- confirm against BaseEmbeddings.
        self.vector_store.add(self.embedding(text))

    def is_document(self, text) -> bool:
        """Tell whether the input is a document or a list of documents.

        Only the first element of a list is inspected. An empty list is
        reported as "not a document" instead of raising ``IndexError``.

        Args:
            text: the candidate input

        Returns:
            bool: True for a ``Document`` or for a non-empty list whose first
                element is a ``Document``; False otherwise
        """
        if isinstance(text, Document):
            return True
        # Check non-emptiness before peeking at the first element.
        return (
            isinstance(text, list)
            and bool(text)
            and isinstance(text[0], Document)
        )

    def is_batch(self, text) -> bool:
        """Tell whether the input is a batch, i.e. a ``list`` instance.

        Args:
            text: the candidate input

        Returns:
            bool: True if ``text`` is a list (including an empty one)
        """
        return isinstance(text, list)

    def persist(self, path: str):
        """Save the whole state of the indexing pipeline vector store and all
        necessary information to disk

        Not implemented yet: the method currently does nothing.

        Args:
            path (str): path to save the state
        """

    def load(self, path: str):
        """Load all information from disk to an object

        Not implemented yet: the method currently does nothing.

        Args:
            path (str): path to load the state from
        """