From pipeline > config > UI. Provide an example project for promptui - Pipeline to config: `kotaemon.contribs.promptui.config.export_pipeline_to_config`. The config follows the schema specified in this document: https://cinnamon-ai.atlassian.net/wiki/spaces/ATM/pages/2748711193/Technical+Detail. Note: this implementation excludes the logs, which will be handled in AUR-408. - Config to UI: `kotaemon.contribs.promptui.build_from_yaml` - The example project is located at `examples/promptui/`
70 lines
2.1 KiB
Python
70 lines
2.1 KiB
Python
import uuid
|
|
from typing import List, Optional
|
|
|
|
from theflow import Node, Param
|
|
|
|
from ..base import BaseComponent
|
|
from ..docstores import BaseDocumentStore
|
|
from ..documents.base import Document
|
|
from ..embeddings import BaseEmbeddings
|
|
from ..vectorstores import BaseVectorStore
|
|
|
|
|
|
class IndexVectorStoreFromDocumentPipeline(BaseComponent):
    """Ingest the document, run through the embedding, and store the embedding in a
    vector store.

    This pipeline supports the following set of inputs:
        - List of documents
        - List of texts
    """

    # Destination for the computed embeddings.
    vector_store: Param[BaseVectorStore] = Param()
    # Optional store for the documents themselves; skipped when left as None.
    doc_store: Optional[BaseDocumentStore] = None
    # Component called with the list of documents to produce their embeddings.
    embedding: Node[BaseEmbeddings] = Node()

    # TODO: refer to llama_index's storage as well

    def run_raw(self, text: str) -> None:
        """Index a single raw text.

        The text is wrapped in a `Document` with a freshly generated UUID4 id
        and then indexed through `run_batch_document`.

        Args:
            text (str): the raw text to index
        """
        document = Document(text=text, id_=str(uuid.uuid4()))
        self.run_batch_document([document])

    def run_batch_raw(self, text: List[str]) -> None:
        """Index several raw texts at once.

        Each text is wrapped in a `Document` with its own UUID4 id, then the
        whole batch is indexed through `run_batch_document`.

        Args:
            text (List[str]): the raw texts to index
        """
        # Build documents with the same keyword form as `run_raw` so both entry
        # points construct `Document` identically.
        documents = [Document(text=t, id_=str(uuid.uuid4())) for t in text]
        self.run_batch_document(documents)

    def run_document(self, text: Document) -> None:
        """Index a single document.

        Args:
            text (Document): the document to index
        """
        self.run_batch_document([text])

    def run_batch_document(self, text: List[Document]) -> None:
        """Embed the documents and add them to the configured stores.

        The embeddings are added to `vector_store` keyed by each document's
        `id_`. When a `doc_store` is configured, the documents themselves are
        added to it as well.

        Args:
            text (List[Document]): the documents to index
        """
        embeddings = self.embedding(text)
        self.vector_store.add(
            embeddings=embeddings,
            ids=[t.id_ for t in text],
        )
        # Compare against None explicitly: a configured store that happens to
        # evaluate as falsy (e.g. an empty container-like store) must still
        # receive the documents.
        if self.doc_store is not None:
            self.doc_store.add(text)

    def is_document(self, text) -> bool:
        """Tell whether the input is a document or a list of documents.

        Only the first element of a list is inspected. An empty list is
        reported as not being documents.

        Args:
            text: the candidate input

        Returns:
            bool: True for a `Document` or a non-empty list whose first element
                is a `Document`, False otherwise
        """
        if isinstance(text, Document):
            return True
        # Guard against empty lists before peeking at the first element.
        if isinstance(text, list) and text and isinstance(text[0], Document):
            return True
        return False

    def is_batch(self, text) -> bool:
        """Tell whether the input is a batch, i.e. a list.

        Args:
            text: the candidate input

        Returns:
            bool: True when `text` is a list, False otherwise
        """
        return isinstance(text, list)

    def persist(self, path: str):
        """Save the whole state of the indexing pipeline vector store and all
        necessary information to disk

        Note: not implemented yet, calling this currently does nothing.

        Args:
            path (str): path to save the state
        """

    def load(self, path: str):
        """Load all information from disk to an object

        Note: not implemented yet, calling this currently does nothing.

        Args:
            path (str): path to load the state from
        """
|