Separate rerankers, splitters and extractors (#85)

2023-11-27 14:25:54 +07:00
parent 0dede9c82d
commit 2186c5558f
15 changed files with 211 additions and 135 deletions
--- a/knowledgehub/indices/base.py
+++ b/knowledgehub/indices/base.py
@@ -0,0 +1,72 @@
+from __future__ import annotations
+
+from abc import abstractmethod
+from typing import Any, Sequence, Type
+
+from llama_index.node_parser.interface import NodeParser
+
+from ..base import BaseComponent, Document
+
+
+class DocTransformer(BaseComponent):
+    """Base class for components that map documents to documents.
+
+    A transformer receives a sequence of documents and returns another sequence
+    of documents. The output need not line up one-to-one with the input: a
+    transformer may split one document into several, reduce a large list to a
+    smaller one, or attach metadata to each document.
+    """
+
+    @abstractmethod
+    def run(self, documents: Sequence[Document], **kwargs) -> Sequence[Document]:
+        """Transform `documents` and return the resulting documents.
+
+        Args:
+            documents: the documents to transform.
+            **kwargs: extra options defined by the concrete transformer.
+
+        Returns:
+            The transformed documents.
+        """
+        ...
+
+
+class LlamaIndexMixin:
+    """Allow automatically wrapping a Llama-index component into kotaemon component
+
+    The mixin builds the Llama-index object in `__init__`, keeps it in
+    `self._obj`, and forwards attribute reads and public attribute writes to it.
+
+    Example:
+        class TokenSplitter(LlamaIndexMixin, BaseSplitter):
+            def _get_li_class(self):
+                from llama_index.text_splitter import TokenTextSplitter
+                return TokenTextSplitter
+
+    To use this mixin, please:
+        1. Use this class as the 1st parent class, so that Python will prefer to use
+        the attributes and methods of this class whenever possible.
+        2. Overwrite `_get_li_class` to return the relevant LlamaIndex component.
+    """
+
+    def _get_li_class(self) -> Type[NodeParser]:
+        """Return the Llama-index class to wrap.
+
+        Raises:
+            NotImplementedError: always, until a subclass overrides this method.
+        """
+        raise NotImplementedError(
+            "Please return the relevant LlamaIndex class in _get_li_class"
+        )
+
+    def __init__(self, *args, **kwargs):
+        """Instantiate the wrapped Llama-index object.
+
+        All positional and keyword arguments are passed to the class returned by
+        `_get_li_class`; the next `__init__` in the MRO is called with no arguments.
+        """
+        _li_cls = self._get_li_class()
+        self._obj = _li_cls(*args, **kwargs)
+        super().__init__()
+
+    def __setattr__(self, name: str, value: Any) -> None:
+        """Set private/protected names on this object, everything else on `_obj`.
+
+        `_protected_keywords` is not defined by this mixin; it is expected to be
+        supplied by a sibling base class (presumably `BaseComponent` -- confirm).
+        """
+        if name.startswith("_") or name in self._protected_keywords():
+            return super().__setattr__(name, value)
+
+        return setattr(self._obj, name, value)
+
+    def __getattr__(self, name: str) -> Any:
+        """Look up on the wrapped object any attribute not found on this object.
+
+        Raises:
+            AttributeError: if the wrapped object has not been created yet, or if
+                it does not have the attribute either.
+        """
+        # `__getattr__` only runs after normal lookup has failed. If `_obj` itself
+        # is the missing name (instance created through `__new__` without
+        # `__init__`, as `copy` and `pickle` do), reading `self._obj` below would
+        # re-enter this method forever and end in RecursionError.
+        if name == "_obj":
+            raise AttributeError(
+                f"{type(self).__name__!r} object has no attribute {name!r}"
+            )
+        return getattr(self._obj, name)
+
+    def run(
+        self,
+        documents: Sequence[Document],
+        **kwargs,
+    ) -> Sequence[Document]:
+        """Run Llama-index node parser and convert the output to Document from
+        kotaemon
+
+        Args:
+            documents: the documents passed to the wrapped object.
+            **kwargs: forwarded unchanged to the wrapped object's call.
+
+        Returns:
+            A list with one `Document` per item returned by the wrapped object,
+            rebuilt through `to_dict()` / `Document.from_dict()`.
+        """
+        docs = self._obj(documents, **kwargs)  # type: ignore
+        return [Document.from_dict(doc.to_dict()) for doc in docs]