[AUR-392, AUR-413, AUR-414] Define base vector store, and make use of ChromaVectorStore from llama_index. Indexing and retrieving vectors with vector store (#18)
Design the base interface of the vector store, and apply it to the Chroma Vector Store (wrapped around llama_index's implementation). Provide the pipelines to populate the vector store and to retrieve from it.
This commit is contained in:
committed by
GitHub
parent
c339912312
commit
620b2b03ca
58
knowledgehub/pipelines/indexing.py
Normal file
58
knowledgehub/pipelines/indexing.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from typing import List
|
||||
|
||||
from theflow import Node, Param
|
||||
|
||||
from ..components import BaseComponent
|
||||
from ..documents.base import Document
|
||||
from ..embeddings import BaseEmbeddings
|
||||
from ..vectorstores import BaseVectorStore
|
||||
|
||||
|
||||
class IndexVectorStoreFromDocumentPipeline(BaseComponent):
    """Ingest the document, run through the embedding, and store the embedding in a
    vector store.

    This pipeline supports the following set of inputs:
        - A single text or a single document
        - List of documents
        - List of texts
    """

    vector_store: Param[BaseVectorStore] = Param()
    embedding: Node[BaseEmbeddings] = Node()
    # TODO: populate to document store as well when it's finished
    # TODO: refer to llama_index's storage as well

    def run_raw(self, text: str) -> None:
        """Embed a single text and add the result to the vector store.

        Args:
            text: the text to embed
        """
        # The single embedding is wrapped in a list before being passed to
        # `vector_store.add`, mirroring the batch methods.
        self.vector_store.add([self.embedding(text)])

    def run_batch_raw(self, text: List[str]) -> None:
        """Embed a list of texts and add the results to the vector store.

        Args:
            text: the texts to embed
        """
        # NOTE(review): assumes the embedding node returns one embedding per
        # input when given a list -- confirm against BaseEmbeddings.
        self.vector_store.add(self.embedding(text))

    def run_document(self, text: Document) -> None:
        """Embed a single document and add the result to the vector store.

        Args:
            text: the document to embed (passed to the embedding node as-is)
        """
        self.vector_store.add([self.embedding(text)])

    def run_batch_document(self, text: List[Document]) -> None:
        """Embed a list of documents and add the results to the vector store.

        Args:
            text: the documents to embed (passed to the embedding node as-is)
        """
        self.vector_store.add(self.embedding(text))

    def is_document(self, text) -> bool:
        """Tell whether the input is a Document or a list starting with one.

        Only the first element of a list is inspected. An empty list is
        reported as not-a-document instead of raising ``IndexError``.

        Args:
            text: the pipeline input to classify

        Returns:
            True if `text` is a Document, or a non-empty list whose first
            element is a Document; False otherwise.
        """
        if isinstance(text, Document):
            return True
        # Guard on emptiness before peeking at the first element.
        if isinstance(text, list) and text:
            return isinstance(text[0], Document)
        return False

    def is_batch(self, text) -> bool:
        """Tell whether the input is a batch, i.e. a list.

        Args:
            text: the pipeline input to classify

        Returns:
            True if `text` is a list, False otherwise.
        """
        return isinstance(text, list)

    def persist(self, path: str):
        """Save the whole state of the indexing pipeline vector store and all
        necessary information to disk

        Not implemented yet: this method currently does nothing.

        Args:
            path (str): path to save the state
        """

    def load(self, path: str):
        """Load all information from disk to an object

        Not implemented yet: this method currently does nothing.

        Args:
            path (str): path to load the state from
        """
|
58
knowledgehub/pipelines/retrieving.py
Normal file
58
knowledgehub/pipelines/retrieving.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from typing import List
|
||||
|
||||
from theflow import Node, Param
|
||||
|
||||
from ..components import BaseComponent
|
||||
from ..documents.base import Document
|
||||
from ..embeddings import BaseEmbeddings
|
||||
from ..vectorstores import BaseVectorStore
|
||||
|
||||
|
||||
class RetrieveDocumentFromVectorStorePipeline(BaseComponent):
    """Retrieve list of documents from vector store

    The query input (a text, a document, or a list of either) is embedded and
    the embedding is used to query the vector store.
    """

    vector_store: Param[BaseVectorStore] = Param()
    embedding: Node[BaseEmbeddings] = Node()
    # TODO: populate to document store as well when it's finished
    # TODO: refer to llama_index's storage as well

    def run_raw(self, text: str) -> List[str]:
        """Embed a single query text and query the vector store with it.

        Args:
            text: the query text

        Returns:
            The element at index 2 of the vector store's query result.
        """
        emb = self.embedding(text)
        # NOTE(review): index 2 presumably selects the matched ids from the
        # query result tuple -- confirm against BaseVectorStore.query.
        return self.vector_store.query(embedding=emb)[2]

    def run_batch_raw(self, text: List[str]) -> List[List[str]]:
        """Query the vector store once per text in the batch.

        Each text is embedded and queried individually, in input order.

        Args:
            text: the query texts

        Returns:
            One result list per input text, in the same order as `text`.
        """
        return [
            self.vector_store.query(embedding=self.embedding(each_text))[2]
            for each_text in text
        ]

    def run_document(self, text: Document) -> List[str]:
        """Query the vector store using the `text` attribute of a document.

        Args:
            text: the query document

        Returns:
            Same as `run_raw` called on the document's text.
        """
        return self.run_raw(text.text)

    def run_batch_document(self, text: List[Document]) -> List[List[str]]:
        """Query the vector store once per document in the batch.

        Args:
            text: the query documents

        Returns:
            Same as `run_batch_raw` called on the documents' texts.
        """
        input_text = [each.text for each in text]
        return self.run_batch_raw(input_text)

    def is_document(self, text) -> bool:
        """Tell whether the input is a Document or a list starting with one.

        Only the first element of a list is inspected. An empty list is
        reported as not-a-document instead of raising ``IndexError``.

        Args:
            text: the pipeline input to classify

        Returns:
            True if `text` is a Document, or a non-empty list whose first
            element is a Document; False otherwise.
        """
        if isinstance(text, Document):
            return True
        # Guard on emptiness before peeking at the first element.
        if isinstance(text, list) and text:
            return isinstance(text[0], Document)
        return False

    def is_batch(self, text) -> bool:
        """Tell whether the input is a batch, i.e. a list.

        Args:
            text: the pipeline input to classify

        Returns:
            True if `text` is a list, False otherwise.
        """
        return isinstance(text, list)

    def persist(self, path: str):
        """Save the whole state of the retrieving pipeline vector store and all
        necessary information to disk

        Not implemented yet: this method currently does nothing.

        Args:
            path (str): path to save the state
        """

    def load(self, path: str):
        """Load all information from disk to an object

        Not implemented yet: this method currently does nothing.

        Args:
            path (str): path to load the state from
        """
|
Reference in New Issue
Block a user