From 96d20860176c67fa89bcc958654e654707c22767 Mon Sep 17 00:00:00 2001
From: "Tuan Anh Nguyen Dang (Tadashi_Cin)"
Date: Mon, 9 Sep 2024 14:15:34 +0700
Subject: [PATCH] fix: add guidance parameters for LC wrapper models (#255)

* fix: add docstring to LC wrapper models

* fix: fix metadata passing with LC embedding wrapper
---
 flowsettings.py                                  |  1 +
 .../kotaemon/embeddings/langchain_based.py       | 44 ++++++++++++-------
 .../kotaemon/llms/chats/langchain_based.py       | 24 +++++++++-
 libs/ktem/ktem/embeddings/manager.py             |  1 +
 libs/ktem/ktem/index/file/index.py               |  2 +-
 5 files changed, 54 insertions(+), 18 deletions(-)

diff --git a/flowsettings.py b/flowsettings.py
index a0ffd0e..cd487ce 100644
--- a/flowsettings.py
+++ b/flowsettings.py
@@ -208,6 +208,7 @@ KH_EMBEDDINGS["cohere"] = {
         "__type__": "kotaemon.embeddings.LCCohereEmbeddings",
         "model": "embed-multilingual-v2.0",
         "cohere_api_key": "your-key",
+        "user_agent": "default",
     },
     "default": False,
 }
diff --git a/libs/kotaemon/kotaemon/embeddings/langchain_based.py b/libs/kotaemon/kotaemon/embeddings/langchain_based.py
index aa2bb04..d415b2e 100644
--- a/libs/kotaemon/kotaemon/embeddings/langchain_based.py
+++ b/libs/kotaemon/kotaemon/embeddings/langchain_based.py
@@ -1,6 +1,6 @@
 from typing import Optional
 
-from kotaemon.base import Document, DocumentWithEmbedding
+from kotaemon.base import DocumentWithEmbedding, Param
 
 from .base import BaseEmbeddings
 
@@ -19,25 +19,14 @@ class LCEmbeddingMixin:
         super().__init__()
 
     def run(self, text):
-        input_: list[str] = []
-        if not isinstance(text, list):
-            text = [text]
-
-        for item in text:
-            if isinstance(item, str):
-                input_.append(item)
-            elif isinstance(item, Document):
-                input_.append(item.text)
-            else:
-                raise ValueError(
-                    f"Invalid input type {type(item)}, should be str or Document"
-                )
+        input_docs = self.prepare_input(text)
+        input_ = [doc.text for doc in input_docs]
 
         embeddings = self._obj.embed_documents(input_)
 
         return [
-            DocumentWithEmbedding(text=each_text, embedding=each_embedding)
-            for each_text, each_embedding in zip(input_, embeddings)
+            DocumentWithEmbedding(content=doc, embedding=each_embedding)
+            for doc, each_embedding in zip(input_docs, embeddings)
         ]
 
     def __repr__(self):
@@ -162,6 +151,20 @@ class LCAzureOpenAIEmbeddings(LCEmbeddingMixin, BaseEmbeddings):
 class LCCohereEmbeddings(LCEmbeddingMixin, BaseEmbeddings):
     """Wrapper around Langchain's Cohere embedding, focusing on key parameters"""
 
+    cohere_api_key: str = Param(
+        help="API key (https://dashboard.cohere.com/api-keys)",
+        default=None,
+        required=True,
+    )
+    model: str = Param(
+        help="Model name to use (https://docs.cohere.com/docs/models)",
+        default=None,
+        required=True,
+    )
+    user_agent: str = Param(
+        help="User agent (leave default)", default="default", required=True
+    )
+
     def __init__(
         self,
         model: str = "embed-english-v2.0",
@@ -190,6 +193,15 @@ class LCCohereEmbeddings(LCEmbeddingMixin, BaseEmbeddings):
 class LCHuggingFaceEmbeddings(LCEmbeddingMixin, BaseEmbeddings):
     """Wrapper around Langchain's HuggingFace embedding, focusing on key parameters"""
 
+    model_name: str = Param(
+        help=(
+            "Model name to use (https://huggingface.co/models?"
+            "pipeline_tag=sentence-similarity&sort=trending)"
+        ),
+        default=None,
+        required=True,
+    )
+
     def __init__(
         self,
         model_name: str = "sentence-transformers/all-mpnet-base-v2",
diff --git a/libs/kotaemon/kotaemon/llms/chats/langchain_based.py b/libs/kotaemon/kotaemon/llms/chats/langchain_based.py
index 663c195..a2d3409 100644
--- a/libs/kotaemon/kotaemon/llms/chats/langchain_based.py
+++ b/libs/kotaemon/kotaemon/llms/chats/langchain_based.py
@@ -3,7 +3,7 @@ from __future__ import annotations
 import logging
 from typing import AsyncGenerator, Iterator
 
-from kotaemon.base import BaseMessage, HumanMessage, LLMInterface
+from kotaemon.base import BaseMessage, HumanMessage, LLMInterface, Param
 
 from .base import ChatLLM
 
@@ -224,6 +224,17 @@ class LCAzureChatOpenAI(LCChatMixin, ChatLLM):  # type: ignore
 
 
 class LCAnthropicChat(LCChatMixin, ChatLLM):  # type: ignore
+    api_key: str = Param(
+        help="API key (https://console.anthropic.com/settings/keys)", required=True
+    )
+    model_name: str = Param(
+        help=(
+            "Model name to use "
+            "(https://docs.anthropic.com/en/docs/about-claude/models)"
+        ),
+        required=True,
+    )
+
     def __init__(
         self,
         api_key: str | None = None,
@@ -248,6 +259,17 @@ class LCAnthropicChat(LCChatMixin, ChatLLM):  # type: ignore
 
 
 class LCGeminiChat(LCChatMixin, ChatLLM):  # type: ignore
+    api_key: str = Param(
+        help="API key (https://aistudio.google.com/app/apikey)", required=True
+    )
+    model_name: str = Param(
+        help=(
+            "Model name to use (https://cloud.google"
+            ".com/vertex-ai/generative-ai/docs/learn/models)"
+        ),
+        required=True,
+    )
+
     def __init__(
         self,
         api_key: str | None = None,
diff --git a/libs/ktem/ktem/embeddings/manager.py b/libs/ktem/ktem/embeddings/manager.py
index f1ff6a0..88cdacb 100644
--- a/libs/ktem/ktem/embeddings/manager.py
+++ b/libs/ktem/ktem/embeddings/manager.py
@@ -50,6 +50,7 @@ class EmbeddingManager:
                 }
                 if item.default:
                     self._default = item.name
+                    self._models["default"] = self._models[item.name]
 
     def load_vendors(self):
         from kotaemon.embeddings import (
diff --git a/libs/ktem/ktem/index/file/index.py b/libs/ktem/ktem/index/file/index.py
index e94a8a7..2f1bc8a 100644
--- a/libs/ktem/ktem/index/file/index.py
+++ b/libs/ktem/ktem/index/file/index.py
@@ -344,7 +344,7 @@ class FileIndex(BaseIndex):
     def get_admin_settings(cls):
         from ktem.embeddings.manager import embedding_models_manager
 
-        embedding_default = embedding_models_manager.get_default_name()
+        embedding_default = "default"
         embedding_choices = list(embedding_models_manager.options().keys())
 
         return {
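
Usage notes (sketches, not part of the patch itself). The `Param` declarations above surface each LC wrapper's key settings with help text and required flags, mirroring the `flowsettings.py` entry at the top of the diff. A minimal construction sketch, assuming the keyword arguments are forwarded to Langchain's `CohereEmbeddings` as the existing `__init__` already does, and using the placeholder key from `flowsettings.py`:

    # Sketch only: the parameters documented by the new Param fields.
    from kotaemon.embeddings import LCCohereEmbeddings

    embedding = LCCohereEmbeddings(
        model="embed-multilingual-v2.0",
        cohere_api_key="your-key",  # placeholder; create a key at dashboard.cohere.com
        user_agent="default",       # leave at "default", per the help text
    )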
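
The `LCEmbeddingMixin.run()` change is the metadata fix from the commit message: inputs are normalized through `prepare_input()`, and each result is built as `DocumentWithEmbedding(content=doc, ...)`, so the original `Document` (including its metadata) is carried through instead of being flattened to bare text. A sketch of the intended behavior, assuming the HuggingFace wrapper is importable from its module path and that `Document` accepts `text` and `metadata` as elsewhere in kotaemon (the embed call downloads the model on first use):

    # Sketch of the metadata-passing behavior after this patch.
    from kotaemon.base import Document
    from kotaemon.embeddings.langchain_based import LCHuggingFaceEmbeddings

    embedding = LCHuggingFaceEmbeddings(
        model_name="sentence-transformers/all-mpnet-base-v2"
    )
    docs = [Document(text="hello world", metadata={"source": "a.txt"})]
    results = embedding.run(docs)
    # Each result wraps its input Document as content, so the "source" metadata is
    # expected to survive alongside the embedding vector.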
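
Finally, `EmbeddingManager.load()` now also registers whichever model is flagged as default under the literal key "default", which is why `FileIndex.get_admin_settings()` can hard-code `embedding_default = "default"` instead of calling `get_default_name()`. Reduced to plain dictionaries (a stand-in for the manager's internal `_models` mapping, not the manager API itself):

    # Stand-in illustration of the alias added in EmbeddingManager.load().
    models = {"cohere": object()}             # deserialized models, keyed by name
    default_name = "cohere"                   # the entry whose item.default is True
    models["default"] = models[default_name]  # the line added by this patch

    assert models["default"] is models[default_name]  # "default" always resolves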