Refactor the index component and update the insurance MVP accordingly (#90)

Refactor the `kotaemon/pipelines` module to `kotaemon/indices`. Create the VectorIndex.

Note: for now I place `qa` inside `kotaemon/indices`, since `qa` is the only RAG-specific component we have at the moment. Alternatively, `qa` could be an independent module at `kotaemon/qa`. Since this can be changed later, I'm going with the first option for now and we can revisit it.
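For downstream code, the rename is essentially an import swap. A minimal migration sketch, using only class and module names that appear in the diffs below:

# Before the refactor:
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline

# After the refactor: the same components live in kotaemon.indices
# (also reachable as kotaemon.indices.vectorindex) under shorter names.
from kotaemon.indices import VectorIndexing, VectorRetrieval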
Duc Nguyen (john) authored 2023-11-30 18:35:07 +07:00, committed by GitHub
parent 8e3a1d193f, commit e34b1e4c6d
25 changed files with 396 additions and 605 deletions


@@ -5,8 +5,8 @@ from theflow.utils.modules import ObjectInitDeclaration as _

 from kotaemon.base import BaseComponent
 from kotaemon.embeddings import AzureOpenAIEmbeddings
+from kotaemon.indices import VectorRetrieval
 from kotaemon.llms.completions.openai import AzureOpenAI
-from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
 from kotaemon.storages import ChromaVectorStore
@@ -20,16 +20,14 @@ class Pipeline(BaseComponent):
         request_timeout=60,
     )
-    retrieving_pipeline: RetrieveDocumentFromVectorStorePipeline = (
-        RetrieveDocumentFromVectorStorePipeline.withx(
-            vector_store=_(ChromaVectorStore).withx(path=str(tempfile.mkdtemp())),
-            embedding=AzureOpenAIEmbeddings.withx(
-                model="text-embedding-ada-002",
-                deployment="embedding-deployment",
-                openai_api_base="https://test.openai.azure.com/",
-                openai_api_key="some-key",
-            ),
-        )
+    retrieving_pipeline: VectorRetrieval = VectorRetrieval.withx(
+        vector_store=_(ChromaVectorStore).withx(path=str(tempfile.mkdtemp())),
+        embedding=AzureOpenAIEmbeddings.withx(
+            model="text-embedding-ada-002",
+            deployment="embedding-deployment",
+            openai_api_base="https://test.openai.azure.com/",
+            openai_api_key="some-key",
+        ),
     )

     def run_raw(self, text: str) -> str:


@@ -4,8 +4,8 @@ from unittest.mock import patch

 import pytest
 from openai.types.chat.chat_completion import ChatCompletion

+from kotaemon.indices.qa import CitationPipeline
 from kotaemon.llms.chats.openai import AzureChatOpenAI
-from kotaemon.pipelines.citation import CitationPipeline

 function_output = '{\n "question": "What is the provided _example_ benefits?",\n "answer": [\n {\n "fact": "特約死亡保険金: 被保険者がこの特約の保険期間中に死亡したときに支払います。",\n "substring_quote": ["特約死亡保険金"]\n },\n {\n "fact": "特約特定疾病保険金: 被保険者がこの特約の保険期間中に特定の疾病(悪性新生物(がん)、急性心筋梗塞または脳卒中)により所定の状態に該当したときに支払います。",\n "substring_quote": ["特約特定疾病保険金"]\n },\n {\n "fact": "特約障害保険金: 被保険者がこの特約の保険期間中に傷害もしくは疾病により所定の身体障害の状態に該当したとき、または不慮の事故により所定の身体障害の状態に該当したときに支払います。",\n "substring_quote": ["特約障害保険金"]\n },\n {\n "fact": "特約介護保険金: 被保険者がこの特約の保険期間中に傷害または疾病により所定の要介護状態に該当したときに支払います。",\n "substring_quote": ["特約介護保険金"]\n }\n ]\n}'


@@ -2,8 +2,8 @@ from unittest.mock import patch

 from openai.types.chat.chat_completion import ChatCompletion

-from kotaemon.llms.chats.openai import AzureChatOpenAI
-from kotaemon.pipelines.cot import ManualSequentialChainOfThought, Thought
+from kotaemon.llms import AzureChatOpenAI
+from kotaemon.llms.cot import ManualSequentialChainOfThought, Thought

 _openai_chat_completion_response = [
     ChatCompletion.parse_obj(


@@ -7,8 +7,7 @@ from openai.resources.embeddings import Embeddings

 from kotaemon.base import Document
 from kotaemon.embeddings.openai import AzureOpenAIEmbeddings
-from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
-from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
+from kotaemon.indices import VectorIndexing, VectorRetrieval
 from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore

 with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
@@ -30,9 +29,7 @@ def test_indexing(mock_openai_embedding, tmp_path):
         openai_api_key="some-key",
     )

-    pipeline = IndexVectorStoreFromDocumentPipeline(
-        vector_store=db, embedding=embedding, doc_store=doc_store
-    )
+    pipeline = VectorIndexing(vector_store=db, embedding=embedding, doc_store=doc_store)
     pipeline.doc_store = cast(InMemoryDocumentStore, pipeline.doc_store)
     pipeline.vector_store = cast(ChromaVectorStore, pipeline.vector_store)
     assert pipeline.vector_store._collection.count() == 0, "Expected empty collection"
@@ -52,10 +49,10 @@ def test_retrieving(mock_openai_embedding, tmp_path):
         openai_api_key="some-key",
     )

-    index_pipeline = IndexVectorStoreFromDocumentPipeline(
+    index_pipeline = VectorIndexing(
         vector_store=db, embedding=embedding, doc_store=doc_store
     )
-    retrieval_pipeline = RetrieveDocumentFromVectorStorePipeline(
+    retrieval_pipeline = VectorRetrieval(
         vector_store=db, doc_store=doc_store, embedding=embedding
     )


@@ -1,73 +0,0 @@
-import json
-from pathlib import Path
-from unittest.mock import patch
-
-import pytest
-from openai.resources.embeddings import Embeddings
-from openai.types.chat.chat_completion import ChatCompletion
-
-from kotaemon.llms.chats.openai import AzureChatOpenAI
-from kotaemon.pipelines.ingest import ReaderIndexingPipeline
-
-with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
-    openai_embedding = json.load(f)
-
-_openai_chat_completion_response = ChatCompletion.parse_obj(
-    {
-        "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x",
-        "object": "chat.completion",
-        "created": 1692338378,
-        "model": "gpt-35-turbo",
-        "system_fingerprint": None,
-        "choices": [
-            {
-                "index": 0,
-                "finish_reason": "stop",
-                "message": {
-                    "role": "assistant",
-                    "content": "Hello! How can I assist you today?",
-                    "function_call": None,
-                    "tool_calls": None,
-                },
-            }
-        ],
-        "usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19},
-    }
-)
-
-
-@pytest.fixture(scope="function")
-def mock_openai_embedding(monkeypatch):
-    monkeypatch.setattr(Embeddings, "create", lambda *args, **kwargs: openai_embedding)
-
-
-@patch(
-    "openai.resources.chat.completions.Completions.create",
-    side_effect=lambda *args, **kwargs: _openai_chat_completion_response,
-)
-def test_ingest_pipeline(patch, mock_openai_embedding, tmp_path):
-    indexing_pipeline = ReaderIndexingPipeline(
-        storage_path=tmp_path,
-    )
-    indexing_pipeline.embedding.openai_api_key = "some-key"
-    input_file_path = Path(__file__).parent / "resources/dummy.pdf"
-
-    # call ingestion pipeline
-    indexing_pipeline(input_file_path, force_reindex=True)
-    retrieving_pipeline = indexing_pipeline.to_retrieving_pipeline()
-
-    results = retrieving_pipeline("This is a query")
-    assert len(results) == 1
-
-    # create llm
-    llm = AzureChatOpenAI(
-        openai_api_base="https://test.openai.azure.com/",
-        openai_api_key="some-key",
-        openai_api_version="2023-03-15-preview",
-        deployment_name="gpt35turbo",
-        temperature=0,
-    )
-    qa_pipeline = indexing_pipeline.to_qa_pipeline(llm=llm, openai_api_key="some-key")
-    response = qa_pipeline("Summarize this document.")
-    assert response


@@ -7,8 +7,7 @@ from openai.resources.embeddings import Embeddings
 from kotaemon.agents.tools import ComponentTool, GoogleSearchTool, WikipediaTool
 from kotaemon.base import Document
 from kotaemon.embeddings.openai import AzureOpenAIEmbeddings
-from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
-from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
+from kotaemon.indices.vectorindex import VectorIndexing, VectorRetrieval
 from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore

 with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
@@ -46,10 +45,10 @@ def test_pipeline_tool(mock_openai_embedding, tmp_path):
         openai_api_key="some-key",
     )

-    index_pipeline = IndexVectorStoreFromDocumentPipeline(
+    index_pipeline = VectorIndexing(
         vector_store=db, embedding=embedding, doc_store=doc_store
     )
-    retrieval_pipeline = RetrieveDocumentFromVectorStorePipeline(
+    retrieval_pipeline = VectorRetrieval(
         vector_store=db, doc_store=doc_store, embedding=embedding
    )
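
Taken together, the updated tests above outline the new API surface. The following is a minimal wiring sketch, assuming only the constructor keyword arguments shown in those tests; the actual indexing and retrieval calls follow kotaemon's usual component call convention and are not part of this diff:

import tempfile

from kotaemon.embeddings.openai import AzureOpenAIEmbeddings
from kotaemon.indices import VectorIndexing, VectorRetrieval
from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore

# Shared stores and embedding, mirroring the test fixtures in this commit.
db = ChromaVectorStore(path=str(tempfile.mkdtemp()))
doc_store = InMemoryDocumentStore()
embedding = AzureOpenAIEmbeddings(
    model="text-embedding-ada-002",
    deployment="embedding-deployment",
    openai_api_base="https://test.openai.azure.com/",
    openai_api_key="some-key",  # placeholder key, as in the tests
)

# Indexing and retrieval share the same vector store and doc store.
index_pipeline = VectorIndexing(vector_store=db, embedding=embedding, doc_store=doc_store)
retrieval_pipeline = VectorRetrieval(vector_store=db, doc_store=doc_store, embedding=embedding)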