[AUR-392, AUR-413, AUR-414] Define base vector store, and make use of ChromaVectorStore from llama_index. Indexing and retrieving vectors with vector store (#18)

Design the base vector store interface and apply it to the Chroma vector store (a wrapper around llama_index's implementation). Provide pipelines to populate the vector store and retrieve from it.
Author: Nguyen Trung Duc (john)
Date: 2023-09-14 14:18:20 +07:00
Committed by: GitHub
Parent: c339912312
Commit: 620b2b03ca
10 changed files with 486 additions and 16 deletions
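
The commit description mentions a base vector store interface, but only the two pipeline files appear in this excerpt of the diff. The pipelines rely on two operations: add, which stores a list of embedding vectors, and query, which takes a query embedding and returns a result whose element [2] is used as the retrieved output. A minimal sketch of what such an interface could look like under those assumptions follows; the method signatures are hypothetical, not the actual base class from this commit.

from abc import ABC, abstractmethod
from typing import List, Optional, Tuple


class BaseVectorStore(ABC):
    """Sketch of the assumed base vector store interface; signatures are hypothetical."""

    @abstractmethod
    def add(
        self,
        embeddings: List[List[float]],
        ids: Optional[List[str]] = None,
    ) -> List[str]:
        """Store embedding vectors and return the ids they are stored under."""

    @abstractmethod
    def query(
        self,
        embedding: List[float],
        top_k: int = 1,
    ) -> Tuple[List[List[float]], List[float], List[str]]:
        """Return (embeddings, similarity scores, ids) of the nearest stored vectors."""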


@@ -0,0 +1,58 @@
from typing import List

from theflow import Node, Param

from ..components import BaseComponent
from ..documents.base import Document
from ..embeddings import BaseEmbeddings
from ..vectorstores import BaseVectorStore


class IndexVectorStoreFromDocumentPipeline(BaseComponent):
    """Ingest the document, run through the embedding, and store the embedding in a
    vector store.

    This pipeline supports the following set of inputs:
        - List of documents
        - List of texts
    """

    vector_store: Param[BaseVectorStore] = Param()
    embedding: Node[BaseEmbeddings] = Node()
    # TODO: populate to document store as well when it's finished
    # TODO: refer to llama_index's storage as well

    def run_raw(self, text: str) -> None:
        # embed a single text and store the resulting vector
        self.vector_store.add([self.embedding(text)])

    def run_batch_raw(self, text: List[str]) -> None:
        # embed a batch of texts and store the resulting vectors
        self.vector_store.add(self.embedding(text))

    def run_document(self, text: Document) -> None:
        # embed a single document and store the resulting vector
        self.vector_store.add([self.embedding(text)])

    def run_batch_document(self, text: List[Document]) -> None:
        # embed a batch of documents and store the resulting vectors
        self.vector_store.add(self.embedding(text))

    def is_document(self, text) -> bool:
        if isinstance(text, Document):
            return True
        elif isinstance(text, list) and isinstance(text[0], Document):
            return True
        return False

    def is_batch(self, text) -> bool:
        if isinstance(text, list):
            return True
        return False

    def persist(self, path: str):
        """Save the whole state of the indexing pipeline vector store and all
        necessary information to disk

        Args:
            path (str): path to save the state
        """

    def load(self, path: str):
        """Load all information from disk to an object"""


@@ -0,0 +1,58 @@
from typing import List

from theflow import Node, Param

from ..components import BaseComponent
from ..documents.base import Document
from ..embeddings import BaseEmbeddings
from ..vectorstores import BaseVectorStore


class RetrieveDocumentFromVectorStorePipeline(BaseComponent):
    """Retrieve a list of documents from the vector store"""

    vector_store: Param[BaseVectorStore] = Param()
    embedding: Node[BaseEmbeddings] = Node()
    # TODO: populate to document store as well when it's finished
    # TODO: refer to llama_index's storage as well

    def run_raw(self, text: str) -> List[str]:
        # embed the query text and look up the nearest entries in the vector store;
        # element [2] of the query result is assumed to hold the matched ids
        emb = self.embedding(text)
        return self.vector_store.query(embedding=emb)[2]

    def run_batch_raw(self, text: List[str]) -> List[List[str]]:
        # run one query per input text and collect the results
        result = []
        for each_text in text:
            emb = self.embedding(each_text)
            result.append(self.vector_store.query(embedding=emb)[2])
        return result

    def run_document(self, text: Document) -> List[str]:
        # retrieve using the document's text content as the query
        return self.run_raw(text.text)

    def run_batch_document(self, text: List[Document]) -> List[List[str]]:
        input_text = [each.text for each in text]
        return self.run_batch_raw(input_text)

    def is_document(self, text) -> bool:
        if isinstance(text, Document):
            return True
        elif isinstance(text, list) and isinstance(text[0], Document):
            return True
        return False

    def is_batch(self, text) -> bool:
        if isinstance(text, list):
            return True
        return False

    def persist(self, path: str):
        """Save the whole state of the retrieval pipeline vector store and all
        necessary information to disk

        Args:
            path (str): path to save the state
        """

    def load(self, path: str):
        """Load all information from disk to an object"""