[AUR-429] Add MVP pipeline with Ingestion and QA stage (#39)
* add base Tool
* minor update test_tool
* update test dependency
* update test dependency
* Fix namespace conflict
* update test
* add base Agent Interface, add ReWoo Agent
* minor update
* update test
* fix typo
* remove unneeded print
* update rewoo agent
* add LLMTool
* update BaseAgent type
* add ReAct agent
* add ReAct agent
* minor update
* minor update
* minor update
* minor update
* update base reader with BaseComponent
* add splitter
* update agent and tool
* update vectorstores
* update load/save for indexing and retrieving pipeline
* update test_agent for more use-cases
* add missing dependency for test
* update test case for in memory vectorstore
* add TextSplitter to BaseComponent
* update type hint basetool
* add insurance mvp pipeline
* update requirements
* Remove redundant plugins param
* Mock GoogleSearch

---------

Co-authored-by: trducng <trungduc1992@gmail.com>
This commit is contained in:
parent 2638152054
commit 79cc60e6a2
@@ -22,4 +22,10 @@ class AzureOpenAIEmbeddings(LangchainEmbeddings):
     def __init__(self, **params):
         params["openai_api_type"] = "azure"
+
+        # openai.error.InvalidRequestError: Too many inputs. The max number of
+        # inputs is 16. We hope to increase the number of inputs per request
+        # soon. Please contact us through an Azure support request at:
+        # https://go.microsoft.com/fwlink/?linkid=2213926 for further questions.
+        params["chunk_size"] = 16
         super().__init__(**params)
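The chunk_size=16 cap exists because Azure rejects embedding requests with more than 16 inputs, as the comment above notes. A minimal sketch of the same batching idea in plain Python; embed_batch is a hypothetical stand-in for whatever actually calls the API:

    from typing import Iterable, List

    AZURE_MAX_INPUTS = 16  # Azure's per-request input limit for embeddings

    def batched(texts: List[str], size: int = AZURE_MAX_INPUTS) -> Iterable[List[str]]:
        """Yield successive chunks of at most `size` texts."""
        for start in range(0, len(texts), size):
            yield texts[start : start + size]

    # each chunk is small enough for one embedding request
    # for chunk in batched(all_texts):
    #     vectors.extend(embed_batch(chunk))  # hypothetical API call per chunk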
knowledgehub/pipelines/ingest.py (new file, 149 lines)
@@ -0,0 +1,149 @@
import os
from pathlib import Path
from typing import List, Optional, Union

from theflow import Node, Param

from kotaemon.base import BaseComponent
from kotaemon.docstores import InMemoryDocumentStore
from kotaemon.embeddings import AzureOpenAIEmbeddings
from kotaemon.loaders import (
    AutoReader,
    DirectoryReader,
    MathpixPDFReader,
    PandasExcelReader,
)
from kotaemon.parsers.splitter import SimpleNodeParser
from kotaemon.pipelines.agents import BaseAgent
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
from kotaemon.vectorstores import InMemoryVectorStore

from .qa import AgentQAPipeline, QuestionAnsweringPipeline
from .utils import file_names_to_collection_name


class ReaderIndexingPipeline(BaseComponent):
    """
    Indexing pipeline which takes input from a list of files
    and performs ingestion into a vector store
    """

    # Expose variables for users to switch in the prompt UI
    storage_path: Path = Path("./storage")
    reader_name: str = "normal"  # "normal" or "mathpix"
    openai_api_base: str = "https://bleh-dummy-2.openai.azure.com/"
    openai_api_key: str = os.environ.get("OPENAI_API_KEY", "")
    chunk_size: int = 1024
    chunk_overlap: int = 256
    file_name_list: List[str] = list()

    @Param.decorate()
    def vector_store(self):
        return InMemoryVectorStore()

    @Param.decorate()
    def doc_store(self):
        doc_store = InMemoryDocumentStore()
        return doc_store

    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
    def embedding(self):
        return AzureOpenAIEmbeddings(
            model="text-embedding-ada-002",
            deployment="dummy-q2-text-embedding",
            openai_api_base=self.openai_api_base,
            openai_api_key=self.openai_api_key,
        )

    def get_reader(self, input_files: List[Union[str, Path]]):
        # document parsers
        file_extractor = {
            ".xlsx": PandasExcelReader(),
        }
        if self.reader_name == "normal":
            file_extractor[".pdf"] = AutoReader("UnstructuredReader")
        else:
            file_extractor[".pdf"] = MathpixPDFReader()
        main_reader = DirectoryReader(
            input_files=input_files,
            file_extractor=file_extractor,
        )
        return main_reader

    @Node.decorate(depends_on=["doc_store", "vector_store", "embedding"])
    def indexing_vector_pipeline(self):
        return IndexVectorStoreFromDocumentPipeline(
            doc_store=self.doc_store,
            vector_store=self.vector_store,
            embedding=self.embedding,
        )

    @Node.decorate(depends_on=["chunk_size", "chunk_overlap"])
    def text_splitter(self):
        # chunking using NodeParser from llama-index
        return SimpleNodeParser(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )

    def run(
        self,
        file_path_list: Union[List[Union[str, Path]], Union[str, Path]],
        force_reindex: Optional[bool] = False,
    ):
        self.storage_path.mkdir(exist_ok=True)

        if not isinstance(file_path_list, list):
            file_path_list = [file_path_list]

        self.file_name_list = [Path(path).stem for path in file_path_list]
        collection_name = file_names_to_collection_name(self.file_name_list)

        file_storage_path = self.storage_path / collection_name

        # skip indexing if the storage path already exists
        if force_reindex or not file_storage_path.exists():
            file_storage_path.mkdir(exist_ok=True)
            # reader call
            documents = self.get_reader(input_files=file_path_list)()
            nodes = self.text_splitter(documents)
            self.log_progress(".num_docs", num_docs=len(nodes))

            self.indexing_vector_pipeline(nodes)
            # persist right after indexing
            self.indexing_vector_pipeline.save(file_storage_path)
        else:
            self.indexing_vector_pipeline.load(file_storage_path)

    def to_retrieving_pipeline(self):
        retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
        )
        return retrieving_pipeline

    def to_qa_pipeline(self, llm: BaseComponent, **kwargs):
        qa_pipeline = QuestionAnsweringPipeline(
            storage_path=self.storage_path,
            file_name_list=self.file_name_list,
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
            llm=llm,
            **kwargs
        )
        return qa_pipeline

    def to_agent_pipeline(self, agent: BaseAgent, **kwargs):
        agent_pipeline = AgentQAPipeline(
            storage_path=self.storage_path,
            file_name_list=self.file_name_list,
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
            agent=agent,
            **kwargs
        )
        agent_pipeline.add_search_tool()
        return agent_pipeline
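For orientation, a minimal usage sketch of this pipeline, assembled from the test suite further below; the file path and query are placeholders:

    from kotaemon.pipelines.ingest import ReaderIndexingPipeline

    # index a document, then derive a retrieval pipeline over the same stores
    indexing_pipeline = ReaderIndexingPipeline(openai_api_key="some-key")
    indexing_pipeline("path/to/dummy.pdf", force_reindex=True)

    retrieving_pipeline = indexing_pipeline.to_retrieving_pipeline()
    results = retrieving_pipeline("This is a query")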
knowledgehub/pipelines/qa.py (new file, 130 lines)
@@ -0,0 +1,130 @@
import os
from pathlib import Path
from typing import List

from theflow import Node, Param

from kotaemon.base import BaseComponent
from kotaemon.docstores import InMemoryDocumentStore
from kotaemon.documents.base import RetrievedDocument
from kotaemon.embeddings import AzureOpenAIEmbeddings
from kotaemon.llms.chats.openai import AzureChatOpenAI
from kotaemon.pipelines.agents import BaseAgent
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
from kotaemon.pipelines.tools import ComponentTool
from kotaemon.prompt.template import PromptTemplate
from kotaemon.vectorstores import InMemoryVectorStore

from .utils import file_names_to_collection_name


class QuestionAnsweringPipeline(BaseComponent):
    """
    Question Answering pipeline utilizing a child Retrieving pipeline
    """

    storage_path: Path = Path("./storage")
    retrieval_top_k: int = 3
    openai_api_base: str = "https://bleh-dummy-2.openai.azure.com/"
    openai_api_key: str = os.environ.get("OPENAI_API_KEY", "")
    file_name_list: List[str]
    """List of filenames, combined with storage_path to
    create the persistent path of the vector store"""
    prompt_template: PromptTemplate = PromptTemplate(
        'Answer the following question: "{question}". '
        "The context is: \n{context}\nAnswer: "
    )

    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
    def llm(self):
        return AzureChatOpenAI(
            openai_api_base=self.openai_api_base,
            openai_api_key=self.openai_api_key,
            openai_api_version="2023-03-15-preview",
            deployment_name="dummy-q2-gpt35",
            temperature=0,
            request_timeout=60,
        )

    @Param.decorate()
    def vector_store(self):
        return InMemoryVectorStore()

    @Param.decorate()
    def doc_store(self):
        doc_store = InMemoryDocumentStore()
        return doc_store

    @Node.decorate(depends_on=["openai_api_base", "openai_api_key"])
    def embedding(self):
        return AzureOpenAIEmbeddings(
            model="text-embedding-ada-002",
            deployment="dummy-q2-text-embedding",
            openai_api_base=self.openai_api_base,
            openai_api_key=self.openai_api_key,
        )

    @Node.decorate(depends_on=["doc_store", "vector_store", "embedding"])
    def retrieving_pipeline(self):
        retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline(
            vector_store=self.vector_store,
            doc_store=self.doc_store,
            embedding=self.embedding,
        )
        # load persisted state from the selected path
        collection_name = file_names_to_collection_name(self.file_name_list)
        retrieving_pipeline.load(self.storage_path / collection_name)
        return retrieving_pipeline

    def _format_doc_text(self, text: str) -> str:
        return text.replace("\n", " ")

    def _format_retrieved_context(self, documents: List[RetrievedDocument]) -> str:
        matched_texts: List[str] = [
            self._format_doc_text(doc.text) for doc in documents
        ]
        return "\n\n".join(matched_texts)

    def run(self, question: str) -> str:
        # retrieve relevant documents as context
        documents = self.retrieving_pipeline(question, top_k=int(self.retrieval_top_k))
        context = self._format_retrieved_context(documents)
        self.log_progress(".context", context=context)

        # generate the answer
        prompt = self.prompt_template.populate(
            context=context,
            question=question,
        )
        self.log_progress(".prompt", prompt=prompt)
        answer = self.llm(prompt).text
        return answer


class AgentQAPipeline(QuestionAnsweringPipeline):
    """
    QA pipeline utilizing a child Retrieving pipeline and an Agent pipeline
    """

    agent: BaseAgent

    def add_search_tool(self):
        search_tool = ComponentTool(
            name="search_doc",
            description=(
                "A vector store that searches for similar and "
                "related content "
                f"in a document: {' '.join(self.file_name_list)}. "
                "The result is a huge chunk of text related "
                "to your search but can also "
                "contain irrelevant info."
            ),
            postprocessor=self._format_retrieved_context,
            component=self.retrieving_pipeline,
        )
        if search_tool not in self.agent.plugins:
            self.agent.plugins.append(search_tool)

    def run(self, question: str) -> str:
        answer = self.agent(question).output
        return answer
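The answer-generation step is plain template filling. A standalone sketch of the same prompt assembly, using str.format as a stand-in for PromptTemplate.populate (whose internals are not shown here); question and context are invented:

    template = (
        'Answer the following question: "{question}". '
        "The context is: \n{context}\nAnswer: "
    )
    prompt = template.format(
        question="What is covered by this policy?",
        context="Section 2: fire and water damage are covered.",
    )
    # prompt now matches what QuestionAnsweringPipeline sends to the LLM,
    # assuming populate() performs simple placeholder substitution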
knowledgehub/pipelines/utils.py (new file, 17 lines)
@@ -0,0 +1,17 @@
import hashlib
from typing import List


def filename_to_hash(filename: str) -> str:
    """
    Convert filename to hash to be used as collection name for storage
    """
    result = hashlib.md5(filename.encode())
    return result.hexdigest()


def file_names_to_collection_name(file_name_list: List[str]) -> str:
    """
    Convert list of filenames to collection name
    """
    return filename_to_hash(" ".join(file_name_list))
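Because the collection name is an MD5 digest of the joined file names, the same file set always maps to the same storage directory. A self-contained check of the digest shape, mirroring filename_to_hash (file names invented for illustration):

    import hashlib

    name = hashlib.md5("policy claims".encode()).hexdigest()
    assert len(name) == 32  # an MD5 hexdigest is always 32 hex characters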
setup.py (1 line added)
@@ -57,6 +57,7 @@ setuptools.setup(
             "googlesearch-python",
             "python-dotenv",
             "pytest-mock",
+            "unstructured[pdf]",
         ],
     },
     entry_points={"console_scripts": ["kh=kotaemon.cli:main"]},
tests/conftest.py (new file, 15 lines)
@@ -0,0 +1,15 @@
import pytest


@pytest.fixture(scope="function")
def mock_google_search(monkeypatch):
    import googlesearch

    def result(*args, **kwargs):
        yield googlesearch.SearchResult(
            url="https://www.cinnamon.is/en/",
            title="Cinnamon AI",
            description="Cinnamon AI is an enterprise AI company.",
        )

    monkeypatch.setattr(googlesearch, "search", result)
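Any test that lists mock_google_search as an argument gets googlesearch.search replaced by the canned generator above, so no network call is made. A hedged sketch of a consuming test (the assertion relies on SearchResult exposing a title attribute, inferred from the constructor call above):

    def test_search_is_mocked(mock_google_search):
        import googlesearch

        hits = list(googlesearch.search("anything"))
        assert hits[0].title == "Cinnamon AI"  # canned result, not a live search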
@@ -135,7 +135,7 @@ def llm():
     "openai.api_resources.chat_completion.ChatCompletion.create",
     side_effect=_openai_chat_completion_responses_rewoo,
 )
-def test_rewoo_agent(openai_completion, llm):
+def test_rewoo_agent(openai_completion, llm, mock_google_search):
     plugins = [
         GoogleSearchTool(),
         WikipediaTool(),

@@ -153,7 +153,7 @@ def test_rewoo_agent(openai_completion, llm):
     "openai.api_resources.chat_completion.ChatCompletion.create",
     side_effect=_openai_chat_completion_responses_react,
 )
-def test_react_agent(openai_completion, llm):
+def test_react_agent(openai_completion, llm, mock_google_search):
     plugins = [
         GoogleSearchTool(),
         WikipediaTool(),

@@ -170,7 +170,7 @@ def test_react_agent(openai_completion, llm):
     "openai.api_resources.chat_completion.ChatCompletion.create",
     side_effect=_openai_chat_completion_responses_react,
 )
-def test_react_agent_langchain(openai_completion, llm):
+def test_react_agent_langchain(openai_completion, llm, mock_google_search):
     from langchain.agents import AgentType, initialize_agent

     plugins = [
tests/test_qa.py (new file, 67 lines)
@@ -0,0 +1,67 @@
import json
from pathlib import Path
from unittest.mock import patch

import pytest
from openai.api_resources.embedding import Embedding

from kotaemon.llms.chats.openai import AzureChatOpenAI
from kotaemon.pipelines.ingest import ReaderIndexingPipeline

with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
    openai_embedding = json.load(f)


_openai_chat_completion_response = {
    "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x",
    "object": "chat.completion",
    "created": 1692338378,
    "model": "gpt-35-turbo",
    "choices": [
        {
            "index": 0,
            "finish_reason": "stop",
            "message": {
                "role": "assistant",
                "content": "Hello! How can I assist you today?",
            },
        }
    ],
    "usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19},
}


@pytest.fixture(scope="function")
def mock_openai_embedding(monkeypatch):
    monkeypatch.setattr(Embedding, "create", lambda *args, **kwargs: openai_embedding)


@patch(
    "openai.api_resources.chat_completion.ChatCompletion.create",
    side_effect=lambda *args, **kwargs: _openai_chat_completion_response,
)
def test_ingest_pipeline(patch, mock_openai_embedding, tmp_path):
    indexing_pipeline = ReaderIndexingPipeline(
        storage_path=tmp_path, openai_api_key="some-key"
    )
    input_file_path = Path(__file__).parent / "resources/dummy.pdf"

    # call ingestion pipeline
    indexing_pipeline(input_file_path, force_reindex=True)
    retrieving_pipeline = indexing_pipeline.to_retrieving_pipeline()

    results = retrieving_pipeline("This is a query")
    assert len(results) == 1

    # create llm
    llm = AzureChatOpenAI(
        openai_api_base="https://test.openai.azure.com/",
        openai_api_key="some-key",
        openai_api_version="2023-03-15-preview",
        deployment_name="gpt35turbo",
        temperature=0,
        request_timeout=60,
    )
    qa_pipeline = indexing_pipeline.to_qa_pipeline(llm=llm, openai_api_key="some-key")
    response = qa_pipeline("Summarize this document.")
    assert response
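Both OpenAI endpoints are patched here, so the test runs fully offline; the answer the QA pipeline returns is just the content field of the canned chat response. A quick standalone check of that extraction:

    response = {
        "choices": [
            {"message": {"role": "assistant", "content": "Hello! How can I assist you today?"}}
        ]
    }
    answer = response["choices"][0]["message"]["content"]
    assert answer == "Hello! How can I assist you today?"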
@@ -21,7 +21,7 @@ def mock_openai_embedding(monkeypatch):
     monkeypatch.setattr(Embedding, "create", lambda *args, **kwargs: openai_embedding)


-def test_google_tool():
+def test_google_tool(mock_google_search):
     tool = GoogleSearchTool()
     assert tool.name
     assert tool.description