[AUR-338, AUR-406, AUR-407] Export pipeline to config for PromptUI. Construct PromptUI dynamically based on config. (#16)

From pipeline → config → UI; a usage sketch follows the list below. Also provides an example project for PromptUI.

- Pipeline to config: `kotaemon.contribs.promptui.config.export_pipeline_to_config`. The config follows the schema specified in this document: https://cinnamon-ai.atlassian.net/wiki/spaces/ATM/pages/2748711193/Technical+Detail. Note: this implementation excludes the logs, which will be handled in AUR-408.
- Config to UI: `kotaemon.contribs.promptui.build_from_yaml`
- An example project is located at `examples/promptui/`
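A minimal sketch of the export/build flow. Only the two dotted paths above are confirmed by this commit; the argument names, the import path for `BaseComponent`, the `MyQAPipeline` class, and the Gradio-style `launch()` call are illustrative assumptions:

```python
# Sketch only: export_pipeline_to_config and build_from_yaml exist per this
# commit, but their exact signatures are assumptions here.
from kotaemon.base import BaseComponent  # assumed import path
from kotaemon.contribs.promptui import build_from_yaml
from kotaemon.contribs.promptui.config import export_pipeline_to_config


class MyQAPipeline(BaseComponent):  # hypothetical minimal pipeline
    def run_raw(self, text: str) -> str:
        return f"echo: {text}"


# Step 1 (pipeline -> config): dump the pipeline's exposed params/nodes to a
# YAML config following the schema in the linked document.
export_pipeline_to_config(MyQAPipeline, "promptui.yml")

# Step 2 (config -> UI): construct the PromptUI app from that config.
app = build_from_yaml("promptui.yml")
app.launch()  # assumes a Gradio-style launchable object
```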
Author: Nguyen Trung Duc (john)
Date: 2023-09-21 14:27:23 +07:00
Committed by: GitHub
Parent: c329c4c03f
Commit: c6dd01e820
18 changed files with 503 additions and 46 deletions

Changes to IndexVectorStoreFromDocumentPipeline:

@@ -1,8 +1,10 @@
-from typing import List
+import uuid
+from typing import List, Optional
 
 from theflow import Node, Param
 
 from ..base import BaseComponent
+from ..docstores import BaseDocumentStore
 from ..documents.base import Document
 from ..embeddings import BaseEmbeddings
 from ..vectorstores import BaseVectorStore
@@ -18,21 +20,30 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent):
     """
     vector_store: Param[BaseVectorStore] = Param()
+    doc_store: Optional[BaseDocumentStore] = None
     embedding: Node[BaseEmbeddings] = Node()
 
     # TODO: populate to document store as well when it's finished
     # TODO: refer to llama_index's storage as well
 
     def run_raw(self, text: str) -> None:
-        self.vector_store.add([self.embedding(text)])
+        document = Document(text=text, id_=str(uuid.uuid4()))
+        self.run_batch_document([document])
 
     def run_batch_raw(self, text: List[str]) -> None:
-        self.vector_store.add(self.embedding(text))
+        documents = [Document(t, id_=str(uuid.uuid4())) for t in text]
+        self.run_batch_document(documents)
 
     def run_document(self, text: Document) -> None:
-        self.vector_store.add([self.embedding(text)])
+        self.run_batch_document([text])
 
     def run_batch_document(self, text: List[Document]) -> None:
-        self.vector_store.add(self.embedding(text))
+        embeddings = self.embedding(text)
+        self.vector_store.add(
+            embeddings=embeddings,
+            ids=[t.id_ for t in text],
+        )
+        if self.doc_store:
+            self.doc_store.add(text)
 
     def is_document(self, text) -> bool:
         if isinstance(text, Document):
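The effect of this change: every `run_*` entry point now funnels into `run_batch_document`, each input gets a `Document` wrapper with a `uuid4` id, and documents are mirrored into the new optional `doc_store` alongside their embeddings. A hypothetical usage sketch; the store and embedding class names below are stand-ins, not confirmed by this diff:

```python
# Hypothetical component names (InMemoryVectorStore, InMemoryDocumentStore,
# MockEmbeddings) stand in for whatever concrete classes the repo provides.
index = IndexVectorStoreFromDocumentPipeline(
    vector_store=InMemoryVectorStore(),
    doc_store=InMemoryDocumentStore(),
    embedding=MockEmbeddings(),
)

# A raw string is wrapped in Document(text=..., id_=str(uuid.uuid4())),
# embedded, and written to the vector store keyed by that id; because
# doc_store is set, the Document itself is stored as well.
index("some text to index")  # assumes __call__ dispatches to the run_* variants
```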

Changes to RetrieveDocumentFromVectorStorePipeline:

@@ -1,47 +1,87 @@
-from typing import List
+from abc import abstractmethod
+from typing import List, Optional
 
 from theflow import Node, Param
 
 from ..base import BaseComponent
-from ..documents.base import Document
+from ..docstores import BaseDocumentStore
+from ..documents.base import Document, RetrievedDocument
 from ..embeddings import BaseEmbeddings
 from ..vectorstores import BaseVectorStore
 
 
-class RetrieveDocumentFromVectorStorePipeline(BaseComponent):
+class BaseRetrieval(BaseComponent):
+    """Define the base interface of a retrieval pipeline"""
+
+    @abstractmethod
+    def run_raw(self, text: str, top_k: int = 1) -> List[RetrievedDocument]:
+        ...
+
+    @abstractmethod
+    def run_batch_raw(
+        self, text: List[str], top_k: int = 1
+    ) -> List[List[RetrievedDocument]]:
+        ...
+
+    @abstractmethod
+    def run_document(self, text: Document, top_k: int = 1) -> List[RetrievedDocument]:
+        ...
+
+    @abstractmethod
+    def run_batch_document(
+        self, text: List[Document], top_k: int = 1
+    ) -> List[List[RetrievedDocument]]:
+        ...
+
+
+class RetrieveDocumentFromVectorStorePipeline(BaseRetrieval):
     """Retrieve list of documents from vector store"""
 
     vector_store: Param[BaseVectorStore] = Param()
+    doc_store: Optional[BaseDocumentStore] = None
     embedding: Node[BaseEmbeddings] = Node()
 
     # TODO: populate to document store as well when it's finished
     # TODO: refer to llama_index's storage as well
 
-    def run_raw(self, text: str) -> List[str]:
-        emb = self.embedding(text)
-        return self.vector_store.query(embedding=emb)[2]
+    def run_raw(self, text: str, top_k: int = 1) -> List[RetrievedDocument]:
+        return self.run_batch_raw([text], top_k=top_k)[0]
 
-    def run_batch_raw(self, text: List[str]) -> List[List[str]]:
+    def run_batch_raw(
+        self, text: List[str], top_k: int = 1
+    ) -> List[List[RetrievedDocument]]:
+        if self.doc_store is None:
+            raise ValueError(
+                "doc_store is not provided. Please provide a doc_store to "
+                "retrieve the documents"
+            )
+
         result = []
         for each_text in text:
             emb = self.embedding(each_text)
-            result.append(self.vector_store.query(embedding=emb)[2])
+            _, scores, ids = self.vector_store.query(embedding=emb, top_k=top_k)
+            docs = self.doc_store.get(ids)
+            each_result = [
+                RetrievedDocument(**doc.to_dict(), score=score)
+                for doc, score in zip(docs, scores)
+            ]
+            result.append(each_result)
         return result
 
-    def run_document(self, text: Document) -> List[str]:
-        return self.run_raw(text.text)
+    def run_document(self, text: Document, top_k: int = 1) -> List[RetrievedDocument]:
+        return self.run_raw(text.text, top_k)
 
-    def run_batch_document(self, text: List[Document]) -> List[List[str]]:
-        input_text = [each.text for each in text]
-        return self.run_batch_raw(input_text)
+    def run_batch_document(
+        self, text: List[Document], top_k: int = 1
+    ) -> List[List[RetrievedDocument]]:
+        return self.run_batch_raw(text=[t.text for t in text], top_k=top_k)
 
-    def is_document(self, text) -> bool:
+    def is_document(self, text, *args, **kwargs) -> bool:
         if isinstance(text, Document):
             return True
         elif isinstance(text, List) and isinstance(text[0], Document):
             return True
         return False
 
-    def is_batch(self, text) -> bool:
+    def is_batch(self, text, *args, **kwargs) -> bool:
         if isinstance(text, list):
             return True
         return False
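With the new `BaseRetrieval` interface, retrieval returns scored `RetrievedDocument` objects instead of bare id lists: the vector store is queried for the `top_k` nearest ids and scores, and the ids are joined back against the document store, which is why `run_batch_raw` raises when `doc_store` is missing. A sketch continuing the hypothetical setup from the indexing example above:

```python
# Continues the hypothetical `index` pipeline from the previous sketch.
retriever = RetrieveDocumentFromVectorStorePipeline(
    vector_store=index.vector_store,
    doc_store=index.doc_store,  # required: run_batch_raw raises ValueError without it
    embedding=index.embedding,
)

# run_raw delegates to run_batch_raw([text])[0]; each hit is the stored
# Document rebuilt via RetrievedDocument(**doc.to_dict(), score=score).
for doc in retriever("some query", top_k=3):
    print(doc.score, doc.text)  # assumes RetrievedDocument keeps Document fields
```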