From 2186c5558f65476be4758cf15bb1d8d4685ae8be Mon Sep 17 00:00:00 2001 From: "Nguyen Trung Duc (john)" Date: Mon, 27 Nov 2023 14:25:54 +0700 Subject: [PATCH] Separate rerankers, splitters and extractors (#85) --- knowledgehub/indexing/doc_parsers.py | 58 --------------- .../{indexing => indices}/__init__.py | 0 knowledgehub/indices/base.py | 72 +++++++++++++++++++ knowledgehub/indices/extractors/__init__.py | 7 ++ .../indices/extractors/doc_parsers.py | 19 +++++ knowledgehub/indices/rankings/__init__.py | 5 ++ knowledgehub/indices/rankings/base.py | 13 ++++ knowledgehub/indices/rankings/cohere.py | 38 ++++++++++ .../reranking.py => indices/rankings/llm.py} | 66 +++-------------- knowledgehub/indices/splitters/__init__.py | 21 ++++++ knowledgehub/pipelines/indexing.py | 10 +-- knowledgehub/pipelines/ingest.py | 26 +++---- knowledgehub/pipelines/qa.py | 4 +- knowledgehub/pipelines/retrieving.py | 5 +- tests/test_reranking.py | 2 +- 15 files changed, 211 insertions(+), 135 deletions(-) delete mode 100644 knowledgehub/indexing/doc_parsers.py rename knowledgehub/{indexing => indices}/__init__.py (100%) create mode 100644 knowledgehub/indices/base.py create mode 100644 knowledgehub/indices/extractors/__init__.py create mode 100644 knowledgehub/indices/extractors/doc_parsers.py create mode 100644 knowledgehub/indices/rankings/__init__.py create mode 100644 knowledgehub/indices/rankings/base.py create mode 100644 knowledgehub/indices/rankings/cohere.py rename knowledgehub/{pipelines/reranking.py => indices/rankings/llm.py} (50%) create mode 100644 knowledgehub/indices/splitters/__init__.py diff --git a/knowledgehub/indexing/doc_parsers.py b/knowledgehub/indexing/doc_parsers.py deleted file mode 100644 index 83e2cd5..0000000 --- a/knowledgehub/indexing/doc_parsers.py +++ /dev/null @@ -1,58 +0,0 @@ -from typing import Any, Sequence, Type - -from llama_index.extractors import SummaryExtractor as LISummaryExtractor -from llama_index.extractors import TitleExtractor as LITitleExtractor -from llama_index.node_parser import ( - SentenceWindowNodeParser as LISentenceWindowNodeParser, -) -from llama_index.node_parser.interface import NodeParser -from llama_index.text_splitter import TokenTextSplitter as LITokenTextSplitter - -from ..base import BaseComponent, Document - - -class LIDocParser(BaseComponent): - _parser_class: Type[NodeParser] - - def __init__(self, *args, **kwargs): - if self._parser_class is None: - raise AttributeError( - "Require `_parser_class` to set a NodeParser class from LlamarIndex" - ) - self._parser = self._parser_class(*args, **kwargs) - super().__init__() - - def __setattr__(self, name: str, value: Any) -> None: - if name.startswith("_") or name in self._protected_keywords(): - return super().__setattr__(name, value) - - return setattr(self._parser, name, value) - - def __getattr__(self, name: str) -> Any: - return getattr(self._parser, name) - - def run( - self, - documents: Sequence[Document], - **kwargs, - ) -> Sequence[Document]: - documents = self._parser(documents, **kwargs) - # convert Document to new base class from kotaemon - converted_documents = [Document.from_dict(doc.to_dict()) for doc in documents] - return converted_documents - - -class TokenSplitter(LIDocParser): - _parser_class = LITokenTextSplitter - - -class SentenceWindowNodeParser(LIDocParser): - _parser_class = LISentenceWindowNodeParser - - -class TitleExtractor(LIDocParser): - _parser_class = LITitleExtractor - - -class SummaryExtractor(LIDocParser): - _parser_class = LISummaryExtractor diff --git a/knowledgehub/indexing/__init__.py b/knowledgehub/indices/__init__.py similarity index 100% rename from knowledgehub/indexing/__init__.py rename to knowledgehub/indices/__init__.py diff --git a/knowledgehub/indices/base.py b/knowledgehub/indices/base.py new file mode 100644 index 0000000..dfdf9aa --- /dev/null +++ b/knowledgehub/indices/base.py @@ -0,0 +1,72 @@ +from __future__ import annotations + +from abc import abstractmethod +from typing import Any, Sequence, Type + +from llama_index.node_parser.interface import NodeParser + +from ..base import BaseComponent, Document + + +class DocTransformer(BaseComponent): + """This is a base class for document transformers + + A document transformer transforms a list of documents into another list + of documents. Transforming can mean splitting a document into multiple documents, + reducing a large list of documents into a smaller list of documents, or adding + metadata to each document in a list of documents, etc. + """ + + @abstractmethod + def run( + self, + documents: Sequence[Document], + **kwargs, + ) -> Sequence[Document]: + ... + + +class LlamaIndexMixin: + """Allow automatically wrapping a Llama-index component into kotaemon component + + Example: + class TokenSplitter(LlamaIndexMixin, BaseSplitter): + def _get_li_class(self): + from llama_index.text_splitter import TokenTextSplitter + return TokenTextSplitter + + To use this mixin, please: + 1. Use this class as the 1st parent class, so that Python will prefer to use + the attributes and methods of this class whenever possible. + 2. Overwrite `_get_li_class` to return the relevant LlamaIndex component. + """ + + def _get_li_class(self) -> Type[NodeParser]: + raise NotImplementedError( + "Please return the relevant LlamaIndex class in _get_li_class" + ) + + def __init__(self, *args, **kwargs): + _li_cls = self._get_li_class() + self._obj = _li_cls(*args, **kwargs) + super().__init__() + + def __setattr__(self, name: str, value: Any) -> None: + if name.startswith("_") or name in self._protected_keywords(): + return super().__setattr__(name, value) + + return setattr(self._obj, name, value) + + def __getattr__(self, name: str) -> Any: + return getattr(self._obj, name) + + def run( + self, + documents: Sequence[Document], + **kwargs, + ) -> Sequence[Document]: + """Run Llama-index node parser and convert the output to Document from + kotaemon + """ + docs = self._obj(documents, **kwargs) # type: ignore + return [Document.from_dict(doc.to_dict()) for doc in docs] diff --git a/knowledgehub/indices/extractors/__init__.py b/knowledgehub/indices/extractors/__init__.py new file mode 100644 index 0000000..7bdec4b --- /dev/null +++ b/knowledgehub/indices/extractors/__init__.py @@ -0,0 +1,7 @@ +from .doc_parsers import BaseDocParser, SummaryExtractor, TitleExtractor + +__all__ = [ + "BaseDocParser", + "TitleExtractor", + "SummaryExtractor", +] diff --git a/knowledgehub/indices/extractors/doc_parsers.py b/knowledgehub/indices/extractors/doc_parsers.py new file mode 100644 index 0000000..867f0ac --- /dev/null +++ b/knowledgehub/indices/extractors/doc_parsers.py @@ -0,0 +1,19 @@ +from ..base import DocTransformer, LlamaIndexMixin + + +class BaseDocParser(DocTransformer): + ... + + +class TitleExtractor(LlamaIndexMixin, BaseDocParser): + def _get_li_class(self): + from llama_index.extractors import TitleExtractor + + return TitleExtractor + + +class SummaryExtractor(LlamaIndexMixin, BaseDocParser): + def _get_li_class(self): + from llama_index.extractors import SummaryExtractor + + return SummaryExtractor diff --git a/knowledgehub/indices/rankings/__init__.py b/knowledgehub/indices/rankings/__init__.py new file mode 100644 index 0000000..ccd99bb --- /dev/null +++ b/knowledgehub/indices/rankings/__init__.py @@ -0,0 +1,5 @@ +from .base import BaseReranking +from .cohere import CohereReranking +from .llm import LLMReranking + +__all__ = ["CohereReranking", "LLMReranking", "BaseReranking"] diff --git a/knowledgehub/indices/rankings/base.py b/knowledgehub/indices/rankings/base.py new file mode 100644 index 0000000..9515199 --- /dev/null +++ b/knowledgehub/indices/rankings/base.py @@ -0,0 +1,13 @@ +from __future__ import annotations + +from abc import abstractmethod + +from ...base import BaseComponent, Document + + +class BaseReranking(BaseComponent): + @abstractmethod + def run(self, documents: list[Document], query: str) -> list[Document]: + """Main method to transform list of documents + (re-ranking, filtering, etc)""" + ... diff --git a/knowledgehub/indices/rankings/cohere.py b/knowledgehub/indices/rankings/cohere.py new file mode 100644 index 0000000..fce039b --- /dev/null +++ b/knowledgehub/indices/rankings/cohere.py @@ -0,0 +1,38 @@ +from __future__ import annotations + +import os + +from ...base import Document +from .base import BaseReranking + + +class CohereReranking(BaseReranking): + model_name: str = "rerank-multilingual-v2.0" + cohere_api_key: str = os.environ.get("COHERE_API_KEY", "") + top_k: int = 1 + + def run(self, documents: list[Document], query: str) -> list[Document]: + """Use Cohere Reranker model to re-order documents + with their relevance score""" + try: + import cohere + except ImportError: + raise ImportError( + "Please install Cohere " "`pip install cohere` to use Cohere Reranking" + ) + + cohere_client = cohere.Client(self.cohere_api_key) + + # output documents + compressed_docs = [] + if len(documents) > 0: # to avoid empty api call + _docs = [d.content for d in documents] + results = cohere_client.rerank( + model=self.model_name, query=query, documents=_docs, top_n=self.top_k + ) + for r in results: + doc = documents[r.index] + doc.metadata["relevance_score"] = r.relevance_score + compressed_docs.append(doc) + + return compressed_docs diff --git a/knowledgehub/pipelines/reranking.py b/knowledgehub/indices/rankings/llm.py similarity index 50% rename from knowledgehub/pipelines/reranking.py rename to knowledgehub/indices/rankings/llm.py index 8c4c20f..f36eca5 100644 --- a/knowledgehub/pipelines/reranking.py +++ b/knowledgehub/indices/rankings/llm.py @@ -1,62 +1,18 @@ -import os -from abc import abstractmethod +from __future__ import annotations + from concurrent.futures import ThreadPoolExecutor -from typing import List, Optional, Union +from typing import Union from langchain.output_parsers.boolean import BooleanOutputParser -from ..base import BaseComponent -from ..base.schema import Document -from ..llms import PromptTemplate -from ..llms.chats.base import ChatLLM -from ..llms.completions.base import LLM +from ...base import Document +from ...llms import PromptTemplate +from ...llms.chats.base import ChatLLM +from ...llms.completions.base import LLM +from .base import BaseReranking BaseLLM = Union[ChatLLM, LLM] - -class BaseRerankingPipeline(BaseComponent): - @abstractmethod - def run(self, documents: List[Document], query: str) -> List[Document]: - """Main method to transform list of documents - (re-ranking, filtering, etc)""" - ... - - -class CohereReranking(BaseRerankingPipeline): - model_name: str = "rerank-multilingual-v2.0" - cohere_api_key: Optional[str] = None - top_k: int = 1 - - def run(self, documents: List[Document], query: str) -> List[Document]: - """Use Cohere Reranker model to re-order documents - with their relevance score""" - try: - import cohere - except ImportError: - raise ImportError( - "Please install Cohere " "`pip install cohere` to use Cohere Reranking" - ) - - cohere_api_key = ( - self.cohere_api_key if self.cohere_api_key else os.environ["COHERE_API_KEY"] - ) - cohere_client = cohere.Client(cohere_api_key) - - # output documents - compressed_docs = [] - if len(documents) > 0: # to avoid empty api call - _docs = [d.content for d in documents] - results = cohere_client.rerank( - model=self.model_name, query=query, documents=_docs, top_n=self.top_k - ) - for r in results: - doc = documents[r.index] - doc.metadata["relevance_score"] = r.relevance_score - compressed_docs.append(doc) - - return compressed_docs - - RERANK_PROMPT_TEMPLATE = """Given the following question and context, return YES if the context is relevant to the question and NO if it isn't. @@ -68,7 +24,7 @@ return YES if the context is relevant to the question and NO if it isn't. > Relevant (YES / NO):""" -class LLMReranking(BaseRerankingPipeline): +class LLMReranking(BaseReranking): llm: BaseLLM prompt_template: PromptTemplate = PromptTemplate(template=RERANK_PROMPT_TEMPLATE) top_k: int = 3 @@ -76,9 +32,9 @@ class LLMReranking(BaseRerankingPipeline): def run( self, - documents: List[Document], + documents: list[Document], query: str, - ) -> List[Document]: + ) -> list[Document]: """Filter down documents based on their relevance to the query.""" filtered_docs = [] output_parser = BooleanOutputParser() diff --git a/knowledgehub/indices/splitters/__init__.py b/knowledgehub/indices/splitters/__init__.py new file mode 100644 index 0000000..8c89324 --- /dev/null +++ b/knowledgehub/indices/splitters/__init__.py @@ -0,0 +1,21 @@ +from ..base import DocTransformer, LlamaIndexMixin + + +class BaseSplitter(DocTransformer): + """Represent base splitter class""" + + ... + + +class TokenSplitter(LlamaIndexMixin, BaseSplitter): + def _get_li_class(self): + from llama_index.text_splitter import TokenTextSplitter + + return TokenTextSplitter + + +class SentenceWindowSplitter(LlamaIndexMixin, BaseSplitter): + def _get_li_class(self): + from llama_index.node_parser import SentenceWindowNodeParser + + return SentenceWindowNodeParser diff --git a/knowledgehub/pipelines/indexing.py b/knowledgehub/pipelines/indexing.py index 52db865..5c13a27 100644 --- a/knowledgehub/pipelines/indexing.py +++ b/knowledgehub/pipelines/indexing.py @@ -2,7 +2,7 @@ from __future__ import annotations import uuid from pathlib import Path -from typing import cast +from typing import Optional, cast from ..base import BaseComponent, Document from ..embeddings import BaseEmbeddings @@ -22,7 +22,7 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent): """ vector_store: BaseVectorStore - doc_store: BaseDocumentStore + doc_store: Optional[BaseDocumentStore] = None embedding: BaseEmbeddings # TODO: refer to llama_index's storage as well @@ -64,7 +64,8 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent): if isinstance(path, str): path = Path(path) self.vector_store.save(path / vectorstore_fname) - self.doc_store.save(path / docstore_fname) + if self.doc_store: + self.doc_store.save(path / docstore_fname) def load( self, @@ -76,4 +77,5 @@ class IndexVectorStoreFromDocumentPipeline(BaseComponent): if isinstance(path, str): path = Path(path) self.vector_store.load(path / vectorstore_fname) - self.doc_store.load(path / docstore_fname) + if self.doc_store: + self.doc_store.load(path / docstore_fname) diff --git a/knowledgehub/pipelines/ingest.py b/knowledgehub/pipelines/ingest.py index da55189..aee2217 100644 --- a/knowledgehub/pipelines/ingest.py +++ b/knowledgehub/pipelines/ingest.py @@ -1,6 +1,8 @@ +from __future__ import annotations + import os from pathlib import Path -from typing import Dict, List, Optional, Sequence, Union +from typing import Optional, Sequence from llama_index.readers.base import BaseReader from theflow import Node @@ -8,8 +10,9 @@ from theflow.utils.modules import ObjectInitDeclaration as _ from kotaemon.base import BaseComponent from kotaemon.embeddings import AzureOpenAIEmbeddings -from kotaemon.indexing.doc_parsers import LIDocParser as DocParser -from kotaemon.indexing.doc_parsers import TokenSplitter +from kotaemon.indices.extractors import BaseDocParser +from kotaemon.indices.rankings import BaseReranking +from kotaemon.indices.splitters import TokenSplitter from kotaemon.loaders import ( AutoReader, DirectoryReader, @@ -19,7 +22,6 @@ from kotaemon.loaders import ( ) from kotaemon.pipelines.agents import BaseAgent from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline -from kotaemon.pipelines.reranking import BaseRerankingPipeline from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline from kotaemon.storages import ( BaseDocumentStore, @@ -45,7 +47,7 @@ class ReaderIndexingPipeline(BaseComponent): chunk_overlap: int = 256 vector_store: BaseVectorStore = _(InMemoryVectorStore) doc_store: BaseDocumentStore = _(InMemoryDocumentStore) - doc_parsers: List[DocParser] = [] + doc_parsers: list[BaseDocParser] = [] embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx( model="text-embedding-ada-002", @@ -55,9 +57,9 @@ class ReaderIndexingPipeline(BaseComponent): chunk_size=16, ) - def get_reader(self, input_files: List[Union[str, Path]]): + def get_reader(self, input_files: list[str | Path]): # document parsers - file_extractor: Dict[str, BaseReader] = { + file_extractor: dict[str, BaseReader | AutoReader] = { ".xlsx": PandasExcelReader(), } if self.reader_name == "normal": @@ -89,7 +91,7 @@ class ReaderIndexingPipeline(BaseComponent): def run( self, - file_path_list: Union[List[Union[str, Path]], Union[str, Path]], + file_path_list: list[str | Path] | str | Path, force_reindex: Optional[bool] = False, ): self.storage_path.mkdir(exist_ok=True) @@ -121,9 +123,7 @@ class ReaderIndexingPipeline(BaseComponent): else: self.indexing_vector_pipeline.load(file_storage_path) - def to_retrieving_pipeline( - self, top_k=3, rerankers: Sequence[BaseRerankingPipeline] = [] - ): + def to_retrieving_pipeline(self, top_k=3, rerankers: Sequence[BaseReranking] = []): retrieving_pipeline = RetrieveDocumentFromVectorStorePipeline( vector_store=self.vector_store, doc_store=self.doc_store, @@ -141,7 +141,7 @@ class ReaderIndexingPipeline(BaseComponent): doc_store=self.doc_store, embedding=self.embedding, llm=llm, - **kwargs + **kwargs, ) return qa_pipeline @@ -153,7 +153,7 @@ class ReaderIndexingPipeline(BaseComponent): doc_store=self.doc_store, embedding=self.embedding, agent=agent, - **kwargs + **kwargs, ) agent_pipeline.add_search_tool() return agent_pipeline diff --git a/knowledgehub/pipelines/qa.py b/knowledgehub/pipelines/qa.py index b370648..5d5154f 100644 --- a/knowledgehub/pipelines/qa.py +++ b/knowledgehub/pipelines/qa.py @@ -8,11 +8,11 @@ from theflow.utils.modules import ObjectInitDeclaration as _ from kotaemon.base import BaseComponent from kotaemon.base.schema import Document, RetrievedDocument from kotaemon.embeddings import AzureOpenAIEmbeddings +from kotaemon.indices.rankings import BaseReranking from kotaemon.llms import PromptTemplate from kotaemon.llms.chats.openai import AzureChatOpenAI from kotaemon.pipelines.agents import BaseAgent from kotaemon.pipelines.citation import CitationPipeline -from kotaemon.pipelines.reranking import BaseRerankingPipeline from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline from kotaemon.pipelines.tools import ComponentTool from kotaemon.storages import ( @@ -51,7 +51,7 @@ class QuestionAnsweringPipeline(BaseComponent): vector_store: BaseVectorStore = _(InMemoryVectorStore) doc_store: BaseDocumentStore = _(InMemoryDocumentStore) - rerankers: Sequence[BaseRerankingPipeline] = [] + rerankers: Sequence[BaseReranking] = [] embedding: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings.withx( model="text-embedding-ada-002", diff --git a/knowledgehub/pipelines/retrieving.py b/knowledgehub/pipelines/retrieving.py index cc69924..1af6854 100644 --- a/knowledgehub/pipelines/retrieving.py +++ b/knowledgehub/pipelines/retrieving.py @@ -3,11 +3,12 @@ from __future__ import annotations from pathlib import Path from typing import Optional, Sequence +from kotaemon.indices.rankings import BaseReranking + from ..base import BaseComponent from ..base.schema import Document, RetrievedDocument from ..embeddings import BaseEmbeddings from ..storages import BaseDocumentStore, BaseVectorStore -from .reranking import BaseRerankingPipeline VECTOR_STORE_FNAME = "vectorstore" DOC_STORE_FNAME = "docstore" @@ -19,7 +20,7 @@ class RetrieveDocumentFromVectorStorePipeline(BaseComponent): vector_store: BaseVectorStore doc_store: BaseDocumentStore embedding: BaseEmbeddings - rerankers: Sequence[BaseRerankingPipeline] = [] + rerankers: Sequence[BaseReranking] = [] top_k: int = 1 # TODO: refer to llama_index's storage as well diff --git a/tests/test_reranking.py b/tests/test_reranking.py index 3652b8c..b1addca 100644 --- a/tests/test_reranking.py +++ b/tests/test_reranking.py @@ -4,8 +4,8 @@ import pytest from openai.types.chat.chat_completion import ChatCompletion from kotaemon.base import Document +from kotaemon.indices.rankings import LLMReranking from kotaemon.llms.chats.openai import AzureChatOpenAI -from kotaemon.pipelines.reranking import LLMReranking _openai_chat_completion_responses = [ ChatCompletion.parse_obj(