Provide type hints for pass-through Langchain and Llama-index objects (#95)

This commit is contained in:
Duc Nguyen (john)
2023-12-04 10:59:13 +07:00
committed by GitHub
parent e34b1e4c6d
commit 0ce3a8832f
34 changed files with 641 additions and 310 deletions

View File

@@ -1,4 +1,15 @@
from .base import BaseEmbeddings
from .openai import AzureOpenAIEmbeddings, OpenAIEmbeddings
from .langchain_based import (
AzureOpenAIEmbeddings,
CohereEmbdeddings,
HuggingFaceEmbeddings,
OpenAIEmbeddings,
)
__all__ = ["BaseEmbeddings", "OpenAIEmbeddings", "AzureOpenAIEmbeddings"]
__all__ = [
"BaseEmbeddings",
"OpenAIEmbeddings",
"AzureOpenAIEmbeddings",
"CohereEmbdeddings",
"HuggingFaceEmbeddings",
]

View File

@@ -1,10 +1,6 @@
from __future__ import annotations
from abc import abstractmethod
from typing import Type
from langchain.schema.embeddings import Embeddings as LCEmbeddings
from theflow import Param
from kotaemon.base import BaseComponent, Document, DocumentWithEmbedding
@@ -15,52 +11,3 @@ class BaseEmbeddings(BaseComponent):
self, text: str | list[str] | Document | list[Document]
) -> list[DocumentWithEmbedding]:
...
class LangchainEmbeddings(BaseEmbeddings):
    """Base class for embeddings backed by a Langchain ``Embeddings`` object.

    Subclasses set ``_lc_class`` to the Langchain embeddings class to wrap.
    Constructor keyword arguments that are recognized fields of that class are
    captured in ``self._kwargs`` and used to lazily build the underlying
    Langchain object (exposed as ``self.agent``); all other keyword arguments
    are forwarded to the parent component.
    """

    _lc_class: Type[LCEmbeddings]

    def __init__(self, **params):
        # NOTE(review): an unset class-level annotation raises AttributeError on
        # access rather than being None — presumably theflow supplies a default;
        # confirm before relying on this guard.
        if self._lc_class is None:
            # Fixed copy-paste wording: this is the embeddings base class, not
            # the LLM one.
            raise AttributeError(
                "Should set _lc_class attribute to the embeddings class from "
                "Langchain if using embeddings from Langchain"
            )

        self._kwargs: dict = {}
        # Split params: fields known to the Langchain class go to _kwargs,
        # everything else goes to the parent constructor.
        for param in list(params.keys()):
            if param in self._lc_class.__fields__:  # type: ignore
                self._kwargs[param] = params.pop(param)
        super().__init__(**params)

    def __setattr__(self, name, value):
        # Attributes that the Langchain class understands are stored in
        # _kwargs (so `agent` is rebuilt with them); others behave normally.
        if name in self._lc_class.__fields__:
            self._kwargs[name] = value
        else:
            super().__setattr__(name, value)

    @Param.auto(cache=False)
    def agent(self):
        # Fresh (uncached) Langchain object built from the captured kwargs.
        return self._lc_class(**self._kwargs)

    def run(self, text):
        """Embed `text` (a str/Document or a list of them).

        Returns one DocumentWithEmbedding per input item, in input order.
        Raises ValueError for items that are neither str nor Document.
        """
        input_: list[str] = []
        if not isinstance(text, list):
            text = [text]

        for item in text:
            if isinstance(item, str):
                input_.append(item)
            elif isinstance(item, Document):
                input_.append(item.text)
            else:
                raise ValueError(
                    f"Invalid input type {type(item)}, should be str or Document"
                )

        embeddings = self.agent.embed_documents(input_)

        return [
            DocumentWithEmbedding(text=each_text, embedding=each_embedding)
            for each_text, each_embedding in zip(input_, embeddings)
        ]

View File

@@ -1,12 +0,0 @@
from langchain.embeddings import CohereEmbeddings as LCCohereEmbeddings
from kotaemon.embeddings.base import LangchainEmbeddings
class CohereEmbdeddings(LangchainEmbeddings):
    """Embeddings from Cohere, provided through Langchain.

    Thin subclass that only pins ``_lc_class`` to Langchain's
    ``CohereEmbeddings`` implementation; all behavior comes from
    ``LangchainEmbeddings``.
    """

    _lc_class = LCCohereEmbeddings

View File

@@ -1,12 +0,0 @@
from langchain.embeddings import HuggingFaceBgeEmbeddings as LCHuggingFaceEmbeddings
from kotaemon.embeddings.base import LangchainEmbeddings
class HuggingFaceEmbeddings(LangchainEmbeddings):
    """Embeddings from HuggingFace models, provided through Langchain.

    Thin subclass that only pins ``_lc_class`` to Langchain's
    ``HuggingFaceBgeEmbeddings`` implementation; all behavior comes from
    ``LangchainEmbeddings``.
    """

    _lc_class = LCHuggingFaceEmbeddings

View File

@@ -0,0 +1,194 @@
from typing import Optional
from kotaemon.base import Document, DocumentWithEmbedding
from .base import BaseEmbeddings
class LCEmbeddingMixin:
    """Mixin that adapts a Langchain ``Embeddings`` object to kotaemon.

    Subclasses implement ``_get_lc_class`` to return the Langchain class to
    wrap. Constructor keyword arguments are stored in ``self._kwargs`` and
    used to instantiate the wrapped object (``self._obj``); assigning to an
    attribute that is a recognized field of the Langchain class rebuilds the
    wrapped object with the new value.
    """

    def _get_lc_class(self):
        """Return the Langchain class to wrap. Must be overridden."""
        # Fixed doubled word ("in in") in the original message.
        raise NotImplementedError(
            "Please return the relevant Langchain class in _get_lc_class"
        )

    def __init__(self, **params):
        self._lc_class = self._get_lc_class()
        self._obj = self._lc_class(**params)
        self._kwargs: dict = params

        super().__init__()

    def run(self, text):
        """Embed `text` (a str/Document or a list of them).

        Returns one DocumentWithEmbedding per input item, in input order.
        Raises ValueError for items that are neither str nor Document.
        """
        input_: list[str] = []
        if not isinstance(text, list):
            text = [text]

        for item in text:
            if isinstance(item, str):
                input_.append(item)
            elif isinstance(item, Document):
                input_.append(item.text)
            else:
                raise ValueError(
                    f"Invalid input type {type(item)}, should be str or Document"
                )

        embeddings = self._obj.embed_documents(input_)

        return [
            DocumentWithEmbedding(text=each_text, embedding=each_embedding)
            for each_text, each_embedding in zip(input_, embeddings)
        ]

    def __repr__(self):
        # Full repr of every captured constructor kwarg.
        kwargs = []
        for key, value_obj in self._kwargs.items():
            value = repr(value_obj)
            kwargs.append(f"{key}={value}")
        kwargs_repr = ", ".join(kwargs)
        return f"{self.__class__.__name__}({kwargs_repr})"

    def __str__(self):
        # Like __repr__, but long values are truncated for readability.
        kwargs = []
        for key, value_obj in self._kwargs.items():
            value = str(value_obj)
            if len(value) > 20:
                value = f"{value[:15]}..."
            kwargs.append(f"{key}={value}")
        kwargs_repr = ", ".join(kwargs)
        return f"{self.__class__.__name__}({kwargs_repr})"

    def __setattr__(self, name, value):
        # _lc_class must be settable normally: the checks below depend on it.
        if name == "_lc_class":
            return super().__setattr__(name, value)

        if name in self._lc_class.__fields__:
            # A Langchain field: record it and rebuild the wrapped object.
            self._kwargs[name] = value
            self._obj = self._lc_class(**self._kwargs)
        else:
            super().__setattr__(name, value)

    def __getattr__(self, name):
        # Fall back to the captured kwargs, then to the wrapped object.
        # NOTE(review): if _kwargs itself is missing (e.g. __init__ skipped),
        # this recurses — confirm instances are always fully constructed.
        if name in self._kwargs:
            return self._kwargs[name]
        return getattr(self._obj, name)

    def dump(self):
        """Serialize to a dict: fully-qualified type name plus the kwargs."""
        return {
            "__type__": f"{self.__module__}.{self.__class__.__qualname__}",
            **self._kwargs,
        }

    def specs(self, path: str):
        """Describe the param at `path` for theflow; raise if unknown."""
        path = path.strip(".")
        if "." in path:
            raise ValueError("path should not contain '.'")

        if path in self._lc_class.__fields__:
            return {
                "__type__": "theflow.base.ParamAttr",
                "refresh_on_set": True,
                "strict_type": True,
            }

        raise ValueError(f"Invalid param {path}")
class OpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):
    """Wrapper around Langchain's OpenAI embedding, focusing on key parameters.

    Named parameters cover the common OpenAI settings; any extra keyword
    arguments are forwarded verbatim to the underlying Langchain object.
    """

    def __init__(
        self,
        model: str = "text-embedding-ada-002",
        openai_api_version: Optional[str] = None,
        openai_api_base: Optional[str] = None,
        openai_api_type: Optional[str] = None,
        openai_api_key: Optional[str] = None,
        request_timeout: Optional[float] = None,
        **params,
    ):
        super().__init__(
            model=model,
            openai_api_version=openai_api_version,
            openai_api_base=openai_api_base,
            openai_api_type=openai_api_type,
            openai_api_key=openai_api_key,
            request_timeout=request_timeout,
            **params,
        )

    def _get_lc_class(self):
        import langchain.embeddings

        # BUGFIX: was `langchain.emebddings` (misspelled), which raised
        # AttributeError whenever this class was instantiated.
        return langchain.embeddings.OpenAIEmbeddings
class AzureOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):
    """Wrapper around Langchain's AzureOpenAI embedding, focusing on key parameters"""

    def __init__(
        self,
        azure_endpoint: Optional[str] = None,
        deployment: Optional[str] = None,
        openai_api_key: Optional[str] = None,
        openai_api_version: Optional[str] = None,
        request_timeout: Optional[float] = None,
        **params,
    ):
        # Gather the surfaced Azure settings, then hand them (plus any
        # pass-through extras) to the Langchain-backed constructor.
        named = dict(
            azure_endpoint=azure_endpoint,
            deployment=deployment,
            openai_api_version=openai_api_version,
            openai_api_key=openai_api_key,
            request_timeout=request_timeout,
        )
        super().__init__(**named, **params)

    def _get_lc_class(self):
        from langchain import embeddings

        return embeddings.AzureOpenAIEmbeddings
class CohereEmbdeddings(LCEmbeddingMixin, BaseEmbeddings):
    """Wrapper around Langchain's Cohere embedding, focusing on key parameters"""

    def __init__(
        self,
        model: str = "embed-english-v2.0",
        cohere_api_key: Optional[str] = None,
        truncate: Optional[str] = None,
        request_timeout: Optional[float] = None,
        **params,
    ):
        # Gather the surfaced Cohere settings, then hand them (plus any
        # pass-through extras) to the Langchain-backed constructor.
        named = dict(
            model=model,
            cohere_api_key=cohere_api_key,
            truncate=truncate,
            request_timeout=request_timeout,
        )
        super().__init__(**named, **params)

    def _get_lc_class(self):
        from langchain import embeddings

        return embeddings.CohereEmbeddings
class HuggingFaceEmbeddings(LCEmbeddingMixin, BaseEmbeddings):
    """Wrapper around Langchain's HuggingFace embedding, focusing on key parameters"""

    def __init__(
        self,
        model_name: str = "sentence-transformers/all-mpnet-base-v2",
        **params,
    ):
        # Everything beyond the model name is forwarded untouched to the
        # Langchain-backed constructor.
        super().__init__(model_name=model_name, **params)

    def _get_lc_class(self):
        from langchain import embeddings

        return embeddings.HuggingFaceBgeEmbeddings

View File

@@ -1,21 +0,0 @@
from langchain import embeddings as lcembeddings
from .base import LangchainEmbeddings
class OpenAIEmbeddings(LangchainEmbeddings):
    """Embeddings from OpenAI, provided through Langchain.

    Thin subclass that only pins ``_lc_class`` to Langchain's
    ``OpenAIEmbeddings`` implementation; all behavior comes from
    ``LangchainEmbeddings``.
    """

    _lc_class = lcembeddings.OpenAIEmbeddings
class AzureOpenAIEmbeddings(LangchainEmbeddings):
    """Embeddings from Azure OpenAI, provided through Langchain.

    Thin subclass that only pins ``_lc_class`` to Langchain's
    ``AzureOpenAIEmbeddings`` implementation; all behavior comes from
    ``LangchainEmbeddings``.
    """

    _lc_class = lcembeddings.AzureOpenAIEmbeddings