Refactor the index component and update the insurance MVP accordingly (#90)

Refactor the `kotaemon/pipelines` module to `kotaemon/indices`. Create the VectorIndex.

Note: for now I place `qa` inside `kotaemon/indices`, since `qa` is the only RAG-specific component we have at the moment. Alternatively, `qa` could be an independent module at `kotaemon/qa`. Since this can be changed later, I'm going with the first option for now and we can revisit it.
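For downstream code, the rename is essentially an import swap. A minimal migration sketch, using only class and module names that appear in the diffs below:

# Before the refactor:
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline

# After the refactor: the same components live in kotaemon.indices
# (also reachable as kotaemon.indices.vectorindex) under shorter names.
from kotaemon.indices import VectorIndexing, VectorRetrieval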
Duc Nguyen (john) authored 2023-11-30 18:35:07 +07:00, committed by GitHub
parent 8e3a1d193f, commit e34b1e4c6d
25 changed files with 396 additions and 605 deletions


@@ -5,8 +5,8 @@ from theflow.utils.modules import ObjectInitDeclaration as _

 from kotaemon.base import BaseComponent
 from kotaemon.embeddings import AzureOpenAIEmbeddings
+from kotaemon.indices import VectorRetrieval
 from kotaemon.llms.completions.openai import AzureOpenAI
-from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
 from kotaemon.storages import ChromaVectorStore
@@ -20,16 +20,14 @@ class Pipeline(BaseComponent):
         request_timeout=60,
     )
-    retrieving_pipeline: RetrieveDocumentFromVectorStorePipeline = (
-        RetrieveDocumentFromVectorStorePipeline.withx(
-            vector_store=_(ChromaVectorStore).withx(path=str(tempfile.mkdtemp())),
-            embedding=AzureOpenAIEmbeddings.withx(
-                model="text-embedding-ada-002",
-                deployment="embedding-deployment",
-                openai_api_base="https://test.openai.azure.com/",
-                openai_api_key="some-key",
-            ),
-        )
+    retrieving_pipeline: VectorRetrieval = VectorRetrieval.withx(
+        vector_store=_(ChromaVectorStore).withx(path=str(tempfile.mkdtemp())),
+        embedding=AzureOpenAIEmbeddings.withx(
+            model="text-embedding-ada-002",
+            deployment="embedding-deployment",
+            openai_api_base="https://test.openai.azure.com/",
+            openai_api_key="some-key",
+        ),
     )

     def run_raw(self, text: str) -> str:


@@ -4,8 +4,8 @@ from unittest.mock import patch

 import pytest
 from openai.types.chat.chat_completion import ChatCompletion

+from kotaemon.indices.qa import CitationPipeline
 from kotaemon.llms.chats.openai import AzureChatOpenAI
-from kotaemon.pipelines.citation import CitationPipeline

 function_output = '{\n "question": "What is the provided _example_ benefits?",\n "answer": [\n {\n "fact": "特約死亡保険金: 被保険者がこの特約の保険期間中に死亡したときに支払います。",\n "substring_quote": ["特約死亡保険金"]\n },\n {\n "fact": "特約特定疾病保険金: 被保険者がこの特約の保険期間中に特定の疾病(悪性新生物(がん)、急性心筋梗塞または脳卒中)により所定の状態に該当したときに支払います。",\n "substring_quote": ["特約特定疾病保険金"]\n },\n {\n "fact": "特約障害保険金: 被保険者がこの特約の保険期間中に傷害もしくは疾病により所定の身体障害の状態に該当したとき、または不慮の事故により所定の身体障害の状態に該当したときに支払います。",\n "substring_quote": ["特約障害保険金"]\n },\n {\n "fact": "特約介護保険金: 被保険者がこの特約の保険期間中に傷害または疾病により所定の要介護状態に該当したときに支払います。",\n "substring_quote": ["特約介護保険金"]\n }\n ]\n}'


@@ -2,8 +2,8 @@ from unittest.mock import patch

 from openai.types.chat.chat_completion import ChatCompletion

-from kotaemon.llms.chats.openai import AzureChatOpenAI
-from kotaemon.pipelines.cot import ManualSequentialChainOfThought, Thought
+from kotaemon.llms import AzureChatOpenAI
+from kotaemon.llms.cot import ManualSequentialChainOfThought, Thought

 _openai_chat_completion_response = [
     ChatCompletion.parse_obj(


@@ -7,8 +7,7 @@ from openai.resources.embeddings import Embeddings

 from kotaemon.base import Document
 from kotaemon.embeddings.openai import AzureOpenAIEmbeddings
-from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
-from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
+from kotaemon.indices import VectorIndexing, VectorRetrieval
 from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore

 with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
@@ -30,9 +29,7 @@ def test_indexing(mock_openai_embedding, tmp_path):
         openai_api_key="some-key",
     )

-    pipeline = IndexVectorStoreFromDocumentPipeline(
-        vector_store=db, embedding=embedding, doc_store=doc_store
-    )
+    pipeline = VectorIndexing(vector_store=db, embedding=embedding, doc_store=doc_store)
     pipeline.doc_store = cast(InMemoryDocumentStore, pipeline.doc_store)
     pipeline.vector_store = cast(ChromaVectorStore, pipeline.vector_store)
     assert pipeline.vector_store._collection.count() == 0, "Expected empty collection"
@@ -52,10 +49,10 @@ def test_retrieving(mock_openai_embedding, tmp_path):
         openai_api_key="some-key",
     )

-    index_pipeline = IndexVectorStoreFromDocumentPipeline(
+    index_pipeline = VectorIndexing(
         vector_store=db, embedding=embedding, doc_store=doc_store
     )
-    retrieval_pipeline = RetrieveDocumentFromVectorStorePipeline(
+    retrieval_pipeline = VectorRetrieval(
         vector_store=db, doc_store=doc_store, embedding=embedding
     )


@@ -1,73 +0,0 @@
-import json
-from pathlib import Path
-from unittest.mock import patch
-
-import pytest
-from openai.resources.embeddings import Embeddings
-from openai.types.chat.chat_completion import ChatCompletion
-
-from kotaemon.llms.chats.openai import AzureChatOpenAI
-from kotaemon.pipelines.ingest import ReaderIndexingPipeline
-
-with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
-    openai_embedding = json.load(f)
-
-_openai_chat_completion_response = ChatCompletion.parse_obj(
-    {
-        "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x",
-        "object": "chat.completion",
-        "created": 1692338378,
-        "model": "gpt-35-turbo",
-        "system_fingerprint": None,
-        "choices": [
-            {
-                "index": 0,
-                "finish_reason": "stop",
-                "message": {
-                    "role": "assistant",
-                    "content": "Hello! How can I assist you today?",
-                    "function_call": None,
-                    "tool_calls": None,
-                },
-            }
-        ],
-        "usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19},
-    }
-)
-
-
-@pytest.fixture(scope="function")
-def mock_openai_embedding(monkeypatch):
-    monkeypatch.setattr(Embeddings, "create", lambda *args, **kwargs: openai_embedding)
-
-
-@patch(
-    "openai.resources.chat.completions.Completions.create",
-    side_effect=lambda *args, **kwargs: _openai_chat_completion_response,
-)
-def test_ingest_pipeline(patch, mock_openai_embedding, tmp_path):
-    indexing_pipeline = ReaderIndexingPipeline(
-        storage_path=tmp_path,
-    )
-    indexing_pipeline.embedding.openai_api_key = "some-key"
-    input_file_path = Path(__file__).parent / "resources/dummy.pdf"
-
-    # call ingestion pipeline
-    indexing_pipeline(input_file_path, force_reindex=True)
-    retrieving_pipeline = indexing_pipeline.to_retrieving_pipeline()
-
-    results = retrieving_pipeline("This is a query")
-    assert len(results) == 1
-
-    # create llm
-    llm = AzureChatOpenAI(
-        openai_api_base="https://test.openai.azure.com/",
-        openai_api_key="some-key",
-        openai_api_version="2023-03-15-preview",
-        deployment_name="gpt35turbo",
-        temperature=0,
-    )
-    qa_pipeline = indexing_pipeline.to_qa_pipeline(llm=llm, openai_api_key="some-key")
-    response = qa_pipeline("Summarize this document.")
-    assert response


@@ -7,8 +7,7 @@ from openai.resources.embeddings import Embeddings
 from kotaemon.agents.tools import ComponentTool, GoogleSearchTool, WikipediaTool
 from kotaemon.base import Document
 from kotaemon.embeddings.openai import AzureOpenAIEmbeddings
-from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
-from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
+from kotaemon.indices.vectorindex import VectorIndexing, VectorRetrieval
 from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore

 with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
@@ -46,10 +45,10 @@ def test_pipeline_tool(mock_openai_embedding, tmp_path):
         openai_api_key="some-key",
     )

-    index_pipeline = IndexVectorStoreFromDocumentPipeline(
+    index_pipeline = VectorIndexing(
         vector_store=db, embedding=embedding, doc_store=doc_store
     )
-    retrieval_pipeline = RetrieveDocumentFromVectorStorePipeline(
+    retrieval_pipeline = VectorRetrieval(
         vector_store=db, doc_store=doc_store, embedding=embedding
    )
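
Taken together, the updated tests above outline the new API surface. The following is a minimal wiring sketch, assuming only the constructor keyword arguments shown in those tests; the actual indexing and retrieval calls follow kotaemon's usual component call convention and are not part of this diff:

import tempfile

from kotaemon.embeddings.openai import AzureOpenAIEmbeddings
from kotaemon.indices import VectorIndexing, VectorRetrieval
from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore

# Shared stores and embedding, mirroring the test fixtures in this commit.
db = ChromaVectorStore(path=str(tempfile.mkdtemp()))
doc_store = InMemoryDocumentStore()
embedding = AzureOpenAIEmbeddings(
    model="text-embedding-ada-002",
    deployment="embedding-deployment",
    openai_api_base="https://test.openai.azure.com/",
    openai_api_key="some-key",  # placeholder key, as in the tests
)

# Indexing and retrieval share the same vector store and doc store.
index_pipeline = VectorIndexing(vector_store=db, embedding=embedding, doc_store=doc_store)
retrieval_pipeline = VectorRetrieval(vector_store=db, doc_store=doc_store, embedding=embedding)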