kotaemon/knowledgehub/pipelines/ingest.py
Tuan Anh Nguyen Dang (Tadashi_Cin) 79cc60e6a2 [AUR-429] Add MVP pipeline with Ingestion and QA stage (#39)
* add base Tool

* minor update test_tool

* update test dependency

* update test dependency

* Fix namespace conflict

* update test

* add base Agent Interface, add ReWoo Agent

* minor update

* update test

* fix typo

* remove unneeded print

* update rewoo agent

* add LLMTool

* update BaseAgent type

* add ReAct agent

* add ReAct agent

* minor update

* minor update

* minor update

* minor update

* update base reader with BaseComponent

* add splitter

* update agent and tool

* update vectorstores

* update load/save for indexing and retrieving pipeline

* update test_agent for more use-cases

* add missing dependency for test

* update test case for in memory vectorstore

* add TextSplitter to BaseComponent

* update type hint basetool

* add insurance mvp pipeline

* update requirements

* Remove redundant plugins param

* Mock GoogleSearch

---------

Co-authored-by: trducng <trungduc1992@gmail.com>
2023-10-05 12:31:33 +07:00

150 lines
5.1 KiB
Python

import os
from pathlib import Path
from typing import List, Optional, Union
from theflow import Node, Param
from kotaemon.base import BaseComponent
from kotaemon.docstores import InMemoryDocumentStore
from kotaemon.embeddings import AzureOpenAIEmbeddings
from kotaemon.loaders import (
AutoReader,
DirectoryReader,
MathpixPDFReader,
PandasExcelReader,
)
from kotaemon.parsers.splitter import SimpleNodeParser
from kotaemon.pipelines.agents import BaseAgent
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
from kotaemon.vectorstores import InMemoryVectorStore
from .qa import AgentQAPipeline, QuestionAnsweringPipeline
from .utils import file_names_to_collection_name
class ReaderIndexingPipeline(BaseComponent):
    """
    Indexing pipeline which takes a list of files as input
    and ingests them into a vector store.

    Calling ``run`` reads the files, splits them into chunks, indexes the
    chunks and persists the result under ``storage_path``. The
    ``to_*_pipeline`` helpers build downstream pipelines that reuse this
    pipeline's vector store, document store and embedding.
    """

    # Expose variables for users to switch in prompt ui
    storage_path: Path = Path("./storage")
    reader_name: str = "normal"  # "normal" or "mathpix"
    openai_api_base: str = "https://bleh-dummy-2.openai.azure.com/"
    openai_api_key: str = os.environ.get("OPENAI_API_KEY", "")
    chunk_size: int = 1024
    chunk_overlap: int = 256
    file_name_list: List[str] = list()

    @Param.decorate()
    def vector_store(self):
        """Create the in-memory vector store used for indexing."""
        return InMemoryVectorStore()

    @Param.decorate()
    def doc_store(self):
        """Create the in-memory document store used for indexing."""
        return InMemoryDocumentStore()

    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
    def embedding(self):
        """Build the Azure OpenAI embedding from the configured endpoint/key."""
        return AzureOpenAIEmbeddings(
            model="text-embedding-ada-002",
            deployment="dummy-q2-text-embedding",
            openai_api_base=self.openai_api_base,
            openai_api_key=self.openai_api_key,
        )

    def get_reader(self, input_files: List[Union[str, Path]]):
        """Return a ``DirectoryReader`` over ``input_files``.

        ``.xlsx`` files are always read with ``PandasExcelReader``. ``.pdf``
        files use the "UnstructuredReader" ``AutoReader`` when
        ``reader_name`` is ``"normal"``; any other value selects
        ``MathpixPDFReader``.
        """
        # document parsers
        file_extractor = {
            ".xlsx": PandasExcelReader(),
        }
        if self.reader_name == "normal":
            file_extractor[".pdf"] = AutoReader("UnstructuredReader")
        else:
            file_extractor[".pdf"] = MathpixPDFReader()

        main_reader = DirectoryReader(
            input_files=input_files,
            file_extractor=file_extractor,
        )
        return main_reader

    @Node.decorate(depends_on=["doc_store", "vector_store", "embedding"])
    def indexing_vector_pipeline(self):
        """Build the indexing pipeline over this instance's stores/embedding."""
        return IndexVectorStoreFromDocumentPipeline(
            doc_store=self.doc_store,
            vector_store=self.vector_store,
            embedding=self.embedding,
        )

    @Node.decorate(depends_on=["chunk_size", "chunk_overlap"])
    def text_splitter(self):
        """Build the text splitter from ``chunk_size`` and ``chunk_overlap``."""
        # chunking using NodeParser from llama-index
        return SimpleNodeParser(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )

    def run(
        self,
        file_path_list: Union[List[Union[str, Path]], Union[str, Path]],
        force_reindex: Optional[bool] = False,
    ):
        """Index the given file(s), or load a previously persisted index.

        Args:
            file_path_list: a single path or a list of paths to ingest.
            force_reindex: when true, re-run reading/splitting/indexing even
                if a persisted index for these files already exists.

        Side effects:
            Sets ``self.file_name_list`` to the stems of the given paths and
            creates ``storage_path`` (and the per-collection sub-directory)
            on disk when missing.
        """
        # Coerce so a plain string storage_path also works; parents=True so a
        # nested storage_path does not fail the first time it is used.
        storage_root = Path(self.storage_path)
        storage_root.mkdir(parents=True, exist_ok=True)

        if not isinstance(file_path_list, list):
            file_path_list = [file_path_list]

        self.file_name_list = [Path(path).stem for path in file_path_list]
        collection_name = file_names_to_collection_name(self.file_name_list)

        file_storage_path = storage_root / collection_name

        # skip indexing if storage path exist
        if force_reindex or not file_storage_path.exists():
            file_storage_path.mkdir(parents=True, exist_ok=True)
            # reader call
            documents = self.get_reader(input_files=file_path_list)()
            nodes = self.text_splitter(documents)
            self.log_progress(".num_docs", num_docs=len(nodes))

            self.indexing_vector_pipeline(nodes)
            # persist right after indexing
            self.indexing_vector_pipeline.save(file_storage_path)
        else:
            self.indexing_vector_pipeline.load(file_storage_path)

    def to_retrieving_pipeline(self):
        """Return a retrieving pipeline sharing this pipeline's stores."""
        retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
        )
        return retrieving_pipeline

    def to_qa_pipeline(self, llm: BaseComponent, **kwargs):
        """Return a ``QuestionAnsweringPipeline`` sharing this pipeline's stores.

        Args:
            llm: the language model component handed to the QA pipeline.
            **kwargs: forwarded unchanged to ``QuestionAnsweringPipeline``.
        """
        # NOTE(review): the keyword is spelled `doc_store` to match the other
        # pipelines built in this class (it was previously `doc_score`);
        # confirm against QuestionAnsweringPipeline's declared parameters.
        qa_pipeline = QuestionAnsweringPipeline(
            storage_path=self.storage_path,
            file_name_list=self.file_name_list,
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
            llm=llm,
            **kwargs
        )
        return qa_pipeline

    def to_agent_pipeline(self, agent: BaseAgent, **kwargs):
        """Return an ``AgentQAPipeline`` sharing this pipeline's stores.

        The search tool is registered on the pipeline (``add_search_tool``)
        before it is returned.

        Args:
            agent: the agent handed to the agent QA pipeline.
            **kwargs: forwarded unchanged to ``AgentQAPipeline``.
        """
        # NOTE(review): the keyword is spelled `doc_store` to match the other
        # pipelines built in this class (it was previously `doc_score`);
        # confirm against AgentQAPipeline's declared parameters.
        agent_pipeline = AgentQAPipeline(
            storage_path=self.storage_path,
            file_name_list=self.file_name_list,
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
            agent=agent,
            **kwargs
        )
        agent_pipeline.add_search_tool()
        return agent_pipeline