[AUR-338, AUR-406, AUR-407] Export pipeline to config for PromptUI. Construct PromptUI dynamically based on config. (#16)

From pipeline → config → UI; a usage sketch follows the list below. Also provides an example project for PromptUI.

- Pipeline to config: `kotaemon.contribs.promptui.config.export_pipeline_to_config`. The config follows the schema specified in this document: https://cinnamon-ai.atlassian.net/wiki/spaces/ATM/pages/2748711193/Technical+Detail. Note: this implementation excludes the logs, which will be handled in AUR-408.
- Config to UI: `kotaemon.contribs.promptui.build_from_yaml`
- An example project is located at `examples/promptui/`
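A minimal sketch of the export/build flow. Only the two dotted paths above are confirmed by this commit; the argument names, the import path for `BaseComponent`, the `MyQAPipeline` class, and the Gradio-style `launch()` call are illustrative assumptions:

```python
# Sketch only: export_pipeline_to_config and build_from_yaml exist per this
# commit, but their exact signatures are assumptions here.
from kotaemon.base import BaseComponent  # assumed import path
from kotaemon.contribs.promptui import build_from_yaml
from kotaemon.contribs.promptui.config import export_pipeline_to_config


class MyQAPipeline(BaseComponent):  # hypothetical minimal pipeline
    def run_raw(self, text: str) -> str:
        return f"echo: {text}"


# Step 1 (pipeline -> config): dump the pipeline's exposed params/nodes to a
# YAML config following the schema in the linked document.
export_pipeline_to_config(MyQAPipeline, "promptui.yml")

# Step 2 (config -> UI): construct the PromptUI app from that config.
app = build_from_yaml("promptui.yml")
app.launch()  # assumes a Gradio-style launchable object
```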
Author: Nguyen Trung Duc (john)
Date: 2023-09-21 14:27:23 +07:00
Committed by: GitHub
Parent: c329c4c03f
Commit: c6dd01e820
18 changed files with 503 additions and 46 deletions

Changes to IndexVectorStoreFromDocumentPipeline:

@@ -1,8 +1,10 @@
-from typing import List
+import uuid
+from typing import List, Optional
 
 from theflow import Node, Param
 
 from ..base import BaseComponent
+from ..docstores import BaseDocumentStore
 from ..documents.base import Document
 from ..embeddings import BaseEmbeddings
 from ..vectorstores import BaseVectorStore
@@ -18,21 +20,30 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent):
     """
     vector_store: Param[BaseVectorStore] = Param()
+    doc_store: Optional[BaseDocumentStore] = None
     embedding: Node[BaseEmbeddings] = Node()
 
     # TODO: populate to document store as well when it's finished
     # TODO: refer to llama_index's storage as well
 
     def run_raw(self, text: str) -> None:
-        self.vector_store.add([self.embedding(text)])
+        document = Document(text=text, id_=str(uuid.uuid4()))
+        self.run_batch_document([document])
 
     def run_batch_raw(self, text: List[str]) -> None:
-        self.vector_store.add(self.embedding(text))
+        documents = [Document(t, id_=str(uuid.uuid4())) for t in text]
+        self.run_batch_document(documents)
 
     def run_document(self, text: Document) -> None:
-        self.vector_store.add([self.embedding(text)])
+        self.run_batch_document([text])
 
     def run_batch_document(self, text: List[Document]) -> None:
-        self.vector_store.add(self.embedding(text))
+        embeddings = self.embedding(text)
+        self.vector_store.add(
+            embeddings=embeddings,
+            ids=[t.id_ for t in text],
+        )
+        if self.doc_store:
+            self.doc_store.add(text)
 
     def is_document(self, text) -> bool:
         if isinstance(text, Document):
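The effect of this change: every `run_*` entry point now funnels into `run_batch_document`, each input gets a `Document` wrapper with a `uuid4` id, and documents are mirrored into the new optional `doc_store` alongside their embeddings. A hypothetical usage sketch; the store and embedding class names below are stand-ins, not confirmed by this diff:

```python
# Hypothetical component names (InMemoryVectorStore, InMemoryDocumentStore,
# MockEmbeddings) stand in for whatever concrete classes the repo provides.
index = IndexVectorStoreFromDocumentPipeline(
    vector_store=InMemoryVectorStore(),
    doc_store=InMemoryDocumentStore(),
    embedding=MockEmbeddings(),
)

# A raw string is wrapped in Document(text=..., id_=str(uuid.uuid4())),
# embedded, and written to the vector store keyed by that id; because
# doc_store is set, the Document itself is stored as well.
index("some text to index")  # assumes __call__ dispatches to the run_* variants
```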

Changes to RetrieveDocumentFromVectorStorePipeline:

@@ -1,47 +1,87 @@
-from typing import List
+from abc import abstractmethod
+from typing import List, Optional
 
 from theflow import Node, Param
 
 from ..base import BaseComponent
-from ..documents.base import Document
+from ..docstores import BaseDocumentStore
+from ..documents.base import Document, RetrievedDocument
 from ..embeddings import BaseEmbeddings
 from ..vectorstores import BaseVectorStore
 
 
-class RetrieveDocumentFromVectorStorePipeline(BaseComponent):
+class BaseRetrieval(BaseComponent):
+    """Define the base interface of a retrieval pipeline"""
+
+    @abstractmethod
+    def run_raw(self, text: str, top_k: int = 1) -> List[RetrievedDocument]:
+        ...
+
+    @abstractmethod
+    def run_batch_raw(
+        self, text: List[str], top_k: int = 1
+    ) -> List[List[RetrievedDocument]]:
+        ...
+
+    @abstractmethod
+    def run_document(self, text: Document, top_k: int = 1) -> List[RetrievedDocument]:
+        ...
+
+    @abstractmethod
+    def run_batch_document(
+        self, text: List[Document], top_k: int = 1
+    ) -> List[List[RetrievedDocument]]:
+        ...
+
+
+class RetrieveDocumentFromVectorStorePipeline(BaseRetrieval):
     """Retrieve list of documents from vector store"""
 
     vector_store: Param[BaseVectorStore] = Param()
+    doc_store: Optional[BaseDocumentStore] = None
     embedding: Node[BaseEmbeddings] = Node()
 
     # TODO: populate to document store as well when it's finished
     # TODO: refer to llama_index's storage as well
 
-    def run_raw(self, text: str) -> List[str]:
-        emb = self.embedding(text)
-        return self.vector_store.query(embedding=emb)[2]
+    def run_raw(self, text: str, top_k: int = 1) -> List[RetrievedDocument]:
+        return self.run_batch_raw([text], top_k=top_k)[0]
 
-    def run_batch_raw(self, text: List[str]) -> List[List[str]]:
+    def run_batch_raw(
+        self, text: List[str], top_k: int = 1
+    ) -> List[List[RetrievedDocument]]:
+        if self.doc_store is None:
+            raise ValueError(
+                "doc_store is not provided. Please provide a doc_store to "
+                "retrieve the documents"
+            )
+
         result = []
         for each_text in text:
             emb = self.embedding(each_text)
-            result.append(self.vector_store.query(embedding=emb)[2])
+            _, scores, ids = self.vector_store.query(embedding=emb, top_k=top_k)
+            docs = self.doc_store.get(ids)
+            each_result = [
+                RetrievedDocument(**doc.to_dict(), score=score)
+                for doc, score in zip(docs, scores)
+            ]
+            result.append(each_result)
         return result
 
-    def run_document(self, text: Document) -> List[str]:
-        return self.run_raw(text.text)
+    def run_document(self, text: Document, top_k: int = 1) -> List[RetrievedDocument]:
+        return self.run_raw(text.text, top_k)
 
-    def run_batch_document(self, text: List[Document]) -> List[List[str]]:
-        input_text = [each.text for each in text]
-        return self.run_batch_raw(input_text)
+    def run_batch_document(
+        self, text: List[Document], top_k: int = 1
+    ) -> List[List[RetrievedDocument]]:
+        return self.run_batch_raw(text=[t.text for t in text], top_k=top_k)
 
-    def is_document(self, text) -> bool:
+    def is_document(self, text, *args, **kwargs) -> bool:
         if isinstance(text, Document):
             return True
         elif isinstance(text, List) and isinstance(text[0], Document):
             return True
         return False
 
-    def is_batch(self, text) -> bool:
+    def is_batch(self, text, *args, **kwargs) -> bool:
         if isinstance(text, list):
             return True
         return False
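With the new `BaseRetrieval` interface, retrieval returns scored `RetrievedDocument` objects instead of bare id lists: the vector store is queried for the `top_k` nearest ids and scores, and the ids are joined back against the document store, which is why `run_batch_raw` raises when `doc_store` is missing. A sketch continuing the hypothetical setup from the indexing example above:

```python
# Continues the hypothetical `index` pipeline from the previous sketch.
retriever = RetrieveDocumentFromVectorStorePipeline(
    vector_store=index.vector_store,
    doc_store=index.doc_store,  # required: run_batch_raw raises ValueError without it
    embedding=index.embedding,
)

# run_raw delegates to run_batch_raw([text])[0]; each hit is the stored
# Document rebuilt via RetrievedDocument(**doc.to_dict(), score=score).
for doc in retriever("some query", top_k=3):
    print(doc.score, doc.text)  # assumes RetrievedDocument keeps Document fields
```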