Enforce all IO objects to be subclassed from Document (#88)

* enforce Document as IO

* Separate rerankers, splitters and extractors (#85)

* partially refactor importing

* add text to embedding outputs

---------

Co-authored-by: Nguyen Trung Duc (john) <trungduc1992@gmail.com>
This commit is contained in:
ian_Cin
2023-11-27 16:35:09 +07:00
committed by GitHub
parent 2186c5558f
commit 8e0779a22d
13 changed files with 108 additions and 59 deletions

View File

@@ -6,14 +6,14 @@ from typing import Type
from langchain.schema.embeddings import Embeddings as LCEmbeddings
from theflow import Param
from ..base import BaseComponent, Document
from kotaemon.base import BaseComponent, Document, DocumentWithEmbedding
class BaseEmbeddings(BaseComponent):
@abstractmethod
def run(
self, text: str | list[str] | Document | list[Document]
) -> list[list[float]]:
) -> list[DocumentWithEmbedding]:
...
@@ -43,7 +43,7 @@ class LangchainEmbeddings(BaseEmbeddings):
def agent(self):
return self._lc_class(**self._kwargs)
def run(self, text) -> list[list[float]]:
def run(self, text):
input_: list[str] = []
if not isinstance(text, list):
text = [text]
@@ -58,4 +58,9 @@ class LangchainEmbeddings(BaseEmbeddings):
f"Invalid input type {type(item)}, should be str or Document"
)
return self.agent.embed_documents(input_)
embeddings = self.agent.embed_documents(input_)
return [
DocumentWithEmbedding(text=each_text, embedding=each_embedding)
for each_text, each_embedding in zip(input_, embeddings)
]