Update Base interface of Index/Retrieval pipeline (#36)

* add base Tool
* minor update test_tool
* update test dependency
* update test dependency
* Fix namespace conflict
* update test
* add base Agent Interface, add ReWoo Agent
* minor update
* update test
* fix typo
* remove unneeded print
* update rewoo agent
* add LLMTool
* update BaseAgent type
* add ReAct agent
* add ReAct agent
* minor update
* minor update
* minor update
* minor update
* update base reader with BaseComponent
* add splitter
* update agent and tool
* update vectorstores
* update load/save for indexing and retrieving pipeline
* update test_agent for more use-cases
* add missing dependency for test
* update test case for in memory vectorstore
* add TextSplitter to BaseComponent
* update type hint basetool

---------

Co-authored-by: trducng <trungduc1992@gmail.com>
2023-10-04 14:27:44 +07:00
parent 49ed3f6994
commit 56bc41b673
13 changed files with 302 additions and 36 deletions
--- a/knowledgehub/pipelines/indexing.py
+++ b/knowledgehub/pipelines/indexing.py
@@ -1,5 +1,6 @@
 import uuid
-from typing import List, Optional
+from pathlib import Path
+from typing import List, Union

 from theflow import Node, Param

@@ -9,6 +10,9 @@ from ..documents.base import Document
 from ..embeddings import BaseEmbeddings
 from ..vectorstores import BaseVectorStore

+VECTOR_STORE_FNAME = "vectorstore"
+DOC_STORE_FNAME = "docstore"
+

 class IndexVectorStoreFromDocumentPipeline(BaseComponent):
    """Ingest the document, run through the embedding, and store the embedding in a
@@ -20,7 +24,7 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent):
    """

    vector_store: Param[BaseVectorStore] = Param()
-    doc_store: Optional[BaseDocumentStore] = None
+    doc_store: Param[BaseDocumentStore] = Param()
    embedding: Node[BaseEmbeddings] = Node()

    # TODO: refer to llama_index's storage as well
@@ -30,7 +34,7 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent):
        self.run_batch_document([document])

    def run_batch_raw(self, text: List[str]) -> None:
-        documents = [Document(t, id_=str(uuid.uuid4())) for t in text]
+        documents = [Document(text=t, id_=str(uuid.uuid4())) for t in text]
        self.run_batch_document(documents)

    def run_document(self, text: Document) -> None:
@@ -57,13 +61,31 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent):
            return True
        return False

-    def persist(self, path: str):
+    def save(
+        self,
+        path: Union[str, Path],
+        vectorstore_fname: str = VECTOR_STORE_FNAME,
+        docstore_fname: str = DOC_STORE_FNAME,
+    ):
        """Save the whole state of the indexing pipeline vector store and all
        necessary information to disk

        Args:
            path (str): path to save the state
        """
+        if isinstance(path, str):
+            path = Path(path)
+        self.vector_store.save(path / vectorstore_fname)
+        self.doc_store.save(path / docstore_fname)

-    def load(self, path: str):
+    def load(
+        self,
+        path: Union[str, Path],
+        vectorstore_fname: str = VECTOR_STORE_FNAME,
+        docstore_fname: str = DOC_STORE_FNAME,
+    ):
        """Load all information from disk to an object"""
+        if isinstance(path, str):
+            path = Path(path)
+        self.vector_store.load(path / vectorstore_fname)
+        self.doc_store.load(path / docstore_fname)