import os
from pathlib import Path
from typing import List, Optional, Union

from theflow import Node, Param

from kotaemon.base import BaseComponent
from kotaemon.docstores import InMemoryDocumentStore
from kotaemon.embeddings import AzureOpenAIEmbeddings
from kotaemon.loaders import (
    AutoReader,
    DirectoryReader,
    MathpixPDFReader,
    PandasExcelReader,
)
from kotaemon.parsers.splitter import SimpleNodeParser
from kotaemon.pipelines.agents import BaseAgent
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
from kotaemon.vectorstores import InMemoryVectorStore

from .qa import AgentQAPipeline, QuestionAnsweringPipeline
from .utils import file_names_to_collection_name


class ReaderIndexingPipeline(BaseComponent):
    """
    Indexing pipeline that takes a list of files as input and ingests
    them into the vector store
    """

    # Expose variables for users to switch in prompt ui
    storage_path: Path = Path("./storage")
    reader_name: str = "normal"  # "normal" or "mathpix"
    openai_api_base: str = "https://bleh-dummy-2.openai.azure.com/"
    openai_api_key: str = os.environ.get("OPENAI_API_KEY", "")
    chunk_size: int = 1024
    chunk_overlap: int = 256
    file_name_list: List[str] = list()

    @Param.decorate()
    def vector_store(self):
        return InMemoryVectorStore()

    @Param.decorate()
    def doc_store(self):
        doc_store = InMemoryDocumentStore()
        return doc_store

    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
    def embedding(self):
        return AzureOpenAIEmbeddings(
            model="text-embedding-ada-002",
            deployment="dummy-q2-text-embedding",
            openai_api_base=self.openai_api_base,
            openai_api_key=self.openai_api_key,
        )

    def get_reader(self, input_files: List[Union[str, Path]]):
        # document parsers, chosen by file extension
        file_extractor = {
            ".xlsx": PandasExcelReader(),
        }
        if self.reader_name == "normal":
            file_extractor[".pdf"] = AutoReader("UnstructuredReader")
        else:
            file_extractor[".pdf"] = MathpixPDFReader()
        main_reader = DirectoryReader(
            input_files=input_files,
            file_extractor=file_extractor,
        )
        return main_reader

    @Node.decorate(depends_on=["doc_store", "vector_store", "embedding"])
    def indexing_vector_pipeline(self):
        return IndexVectorStoreFromDocumentPipeline(
            doc_store=self.doc_store,
            vector_store=self.vector_store,
            embedding=self.embedding,
        )

    @Node.decorate(depends_on=["chunk_size", "chunk_overlap"])
    def text_splitter(self):
        # chunking using NodeParser from llama-index
        return SimpleNodeParser(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )

    def run(
        self,
        file_path_list: Union[List[Union[str, Path]], Union[str, Path]],
        force_reindex: Optional[bool] = False,
    ):
        self.storage_path.mkdir(exist_ok=True)

        if not isinstance(file_path_list, list):
            file_path_list = [file_path_list]

        self.file_name_list = [Path(path).stem for path in file_path_list]
        collection_name = file_names_to_collection_name(self.file_name_list)
        file_storage_path = self.storage_path / collection_name

        # skip indexing if the storage path already exists
        if force_reindex or not file_storage_path.exists():
            file_storage_path.mkdir(exist_ok=True)
            # reader call
            documents = self.get_reader(input_files=file_path_list)()
            nodes = self.text_splitter(documents)
            self.log_progress(".num_docs", num_docs=len(nodes))

            self.indexing_vector_pipeline(nodes)
            # persist right after indexing
            self.indexing_vector_pipeline.save(file_storage_path)
        else:
            self.indexing_vector_pipeline.load(file_storage_path)

    def to_retrieving_pipeline(self):
        retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
        )
        return retrieving_pipeline

    def to_qa_pipeline(self, llm: BaseComponent, **kwargs):
        qa_pipeline = QuestionAnsweringPipeline(
            storage_path=self.storage_path,
            file_name_list=self.file_name_list,
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
            llm=llm,
            **kwargs
        )
        return qa_pipeline

    def to_agent_pipeline(self, agent: BaseAgent, **kwargs):
        agent_pipeline = AgentQAPipeline(
            storage_path=self.storage_path,
            file_name_list=self.file_name_list,
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
            agent=agent,
            **kwargs
        )
        agent_pipeline.add_search_tool()
        return agent_pipeline
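

# Usage sketch (illustrative only, not part of this module): the file names,
# the `llm` object, and the question below are hypothetical placeholders; any
# kotaemon LLM component would stand in for `llm`.
#
#     pipeline = ReaderIndexingPipeline(reader_name="normal")
#     pipeline.run(["report.pdf", "tables.xlsx"])   # indexes on first call,
#                                                   # loads from storage after
#     retriever = pipeline.to_retrieving_pipeline()
#     qa = pipeline.to_qa_pipeline(llm=llm)
#     answer = qa("What does the report conclude?")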