Refactor the `kotaemon/pipelines` module into `kotaemon/indices` and create the VectorIndex. Note: for now `qa` lives inside `kotaemon/indices`, since QA is currently the only RAG use case we have. It could also become a standalone `kotaemon/qa` module; because that move is easy to make later, I'm going with the first option for now and will revisit once we see how it is used.
import os

from kotaemon.base import BaseComponent, Document, RetrievedDocument
from kotaemon.llms import AzureChatOpenAI, BaseLLM, PromptTemplate

from .citation import CitationPipeline


class CitationQAPipeline(BaseComponent):
    """Answering question from a text corpus with citation"""

    qa_prompt_template: PromptTemplate = PromptTemplate(
        'Answer the following question: "{question}". '
        "The context is: \n{context}\nAnswer: "
    )
    llm: BaseLLM = AzureChatOpenAI.withx(
        azure_endpoint="https://bleh-dummy.openai.azure.com/",
        openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
        openai_api_version="2023-07-01-preview",
        deployment_name="dummy-q2-16k",
        temperature=0,
        request_timeout=60,
    )

    def _format_doc_text(self, text: str) -> str:
        """Format the text of each document"""
        return text.replace("\n", " ")

    def _format_retrieved_context(self, documents: list[RetrievedDocument]) -> str:
        """Format the texts between all documents"""
        matched_texts: list[str] = [
            self._format_doc_text(doc.text) for doc in documents
        ]
        return "\n\n".join(matched_texts)

    def run(
        self,
        question: str,
        documents: list[RetrievedDocument],
        use_citation: bool = False,
        **kwargs
    ) -> Document:
        # format the retrieved documents into a single context string
        context = self._format_retrieved_context(documents)
        self.log_progress(".context", context=context)

        # generate the answer
        prompt = self.qa_prompt_template.populate(
            context=context,
            question=question,
        )
        self.log_progress(".prompt", prompt=prompt)
        answer_text = self.llm(prompt).text
        if use_citation:
            # run citation pipeline
            citation_pipeline = CitationPipeline(llm=self.llm)
            citation = citation_pipeline(context=context, question=question)
        else:
            citation = None

        answer = Document(text=answer_text, metadata={"citation": citation})
        return answer
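For reference, a minimal usage sketch of the pipeline above. The import path `kotaemon.indices.qa` and the sample `RetrievedDocument` construction are assumptions based on this refactor, not confirmed exports, and it assumes `OPENAI_API_KEY` is set so the default Azure OpenAI client can be built.

# Hypothetical usage sketch; the import path below assumes the QA pipeline
# is exposed under kotaemon/indices/qa after this refactor.
from kotaemon.base import RetrievedDocument
from kotaemon.indices.qa import CitationQAPipeline  # assumed export path

# Fake "retrieved" documents; in a real flow these would come from the
# VectorIndex retriever rather than being constructed by hand.
docs = [
    RetrievedDocument(text="Kotaemon provides indexing and QA pipelines for RAG."),
    RetrievedDocument(text="CitationQAPipeline answers a question with optional citations."),
]

pipeline = CitationQAPipeline()  # uses the Azure OpenAI defaults shown in the file above
answer = pipeline.run(
    question="What does CitationQAPipeline do?",
    documents=docs,
    use_citation=True,
)

print(answer.text)                  # the generated answer
print(answer.metadata["citation"])  # citation result, or None when use_citation=False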