Enforce all IO objects to be subclassed from Document (#88)

* enforce Document as IO * Separate rerankers, splitters and extractors (#85) * partially refactor importing * add text to embedding outputs --------- Co-authored-by: Nguyen Trung Duc (john) <trungduc1992@gmail.com>
2023-11-27 16:35:09 +07:00
parent 2186c5558f
commit 8e0779a22d
13 changed files with 108 additions and 59 deletions
--- a/knowledgehub/embeddings/base.py
+++ b/knowledgehub/embeddings/base.py
@@ -6,14 +6,14 @@ from typing import Type
 from langchain.schema.embeddings import Embeddings as LCEmbeddings
 from theflow import Param

-from ..base import BaseComponent, Document
+from kotaemon.base import BaseComponent, Document, DocumentWithEmbedding


 class BaseEmbeddings(BaseComponent):
    @abstractmethod
    def run(
        self, text: str | list[str] | Document | list[Document]
-    ) -> list[list[float]]:
+    ) -> list[DocumentWithEmbedding]:
        ...


@@ -43,7 +43,7 @@ class LangchainEmbeddings(BaseEmbeddings):
    def agent(self):
        return self._lc_class(**self._kwargs)

-    def run(self, text) -> list[list[float]]:
+    def run(self, text):
        input_: list[str] = []
        if not isinstance(text, list):
            text = [text]
@@ -58,4 +58,9 @@ class LangchainEmbeddings(BaseEmbeddings):
                    f"Invalid input type {type(item)}, should be str or Document"
                )

-        return self.agent.embed_documents(input_)
+        embeddings = self.agent.embed_documents(input_)
+
+        return [
+            DocumentWithEmbedding(text=each_text, embedding=each_embedding)
+            for each_text, each_embedding in zip(input_, embeddings)
+        ]