Refactor the index component and update the MVP insurance accordingly (#90)

Refactor the `kotaemon/pipelines` module into `kotaemon/indices` and create the VectorIndex. Note: for now I have placed `qa` inside `kotaemon/indices`, since at the moment we only have `qa` in RAG. That said, I think `qa` could also be an independent module at `kotaemon/qa`. Because this is easy to change later, I am going with the first option for now and will revisit whether it should be moved.
2023-11-30 18:35:07 +07:00
parent 8e3a1d193f
commit e34b1e4c6d
25 changed files with 396 additions and 605 deletions
--- a/knowledgehub/indices/qa/text_based.py
+++ b/knowledgehub/indices/qa/text_based.py
@@ -0,0 +1,62 @@
+import os
+
+from kotaemon.base import BaseComponent, Document, RetrievedDocument
+from kotaemon.llms import AzureChatOpenAI, BaseLLM, PromptTemplate
+
+from .citation import CitationPipeline
+
+
+class CitationQAPipeline(BaseComponent):
+    """Answer a question from retrieved documents, optionally with citation"""
+
+    # the `{question}` and `{context}` placeholders are filled in by `run`
+    qa_prompt_template: PromptTemplate = PromptTemplate(
+        'Answer the following question: "{question}". '
+        "The context is: \n{context}\nAnswer: "
+    )
+    # default LLM; the API key is read from OPENAI_API_KEY ("" when unset)
+    llm: BaseLLM = AzureChatOpenAI.withx(
+        azure_endpoint="https://bleh-dummy.openai.azure.com/",
+        openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
+        openai_api_version="2023-07-01-preview",
+        deployment_name="dummy-q2-16k",
+        temperature=0,
+        request_timeout=60,
+    )
+
+    def _format_doc_text(self, text: str) -> str:
+        """Put the text of one document on a single line"""
+        return " ".join(text.split("\n"))
+
+    def _format_retrieved_context(self, documents: list[RetrievedDocument]) -> str:
+        """Join the formatted text of all documents, one blank line apart"""
+        return "\n\n".join(self._format_doc_text(doc.text) for doc in documents)
+
+    def run(
+        self,
+        question: str,
+        documents: list[RetrievedDocument],
+        use_citation: bool = False,
+        **kwargs
+    ) -> Document:
+        """Answer `question` using the text of `documents` as the context
+
+        Args:
+            question: the question to answer
+            documents: retrieved documents whose texts form the context
+            use_citation: whether to also run `CitationPipeline`
+            **kwargs: accepted but not used by this method
+
+        Returns:
+            answer Document; `metadata["citation"]` is None without `use_citation`
+        """
+        context = self._format_retrieved_context(documents)
+        self.log_progress(".context", context=context)
+        prompt = self.qa_prompt_template.populate(context=context, question=question)
+        self.log_progress(".prompt", prompt=prompt)
+        answer_text = self.llm(prompt).text
+        citation = None
+        if use_citation:
+            cite = CitationPipeline(llm=self.llm)  # shares this pipeline's LLM
+            citation = cite(context=context, question=question)
+        return Document(text=answer_text, metadata={"citation": citation})