From ed10020ea342a74f5fa0783607e5ac7190b85227 Mon Sep 17 00:00:00 2001 From: "Duc Nguyen (john)" Date: Tue, 9 Apr 2024 15:07:59 +0700 Subject: [PATCH] Refactor embeddings and provide vanilla OpenAI-based embeddings (#11) * Prepend all Langchain-based embeddings with LC * Provide vanilla OpenAI embeddings * Add test for AzureOpenAIEmbeddings and OpenAIEmbeddings * Fix disallowed empty string * Use OpenAIEmbeddings in flowsettings --------- Co-authored-by: ian_Cin --- libs/kotaemon/kotaemon/embeddings/openai.py | 4 ++-- libs/ktem/flowsettings.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/libs/kotaemon/kotaemon/embeddings/openai.py b/libs/kotaemon/kotaemon/embeddings/openai.py index 62517fe..6f9a246 100644 --- a/libs/kotaemon/kotaemon/embeddings/openai.py +++ b/libs/kotaemon/kotaemon/embeddings/openai.py @@ -59,7 +59,7 @@ class BaseOpenAIEmbeddings(BaseEmbeddings): input_ = self.prepare_input(text) client = self.prepare_client(async_version=False) resp = self.openai_response( - client, input=[_.text for _ in input_], **kwargs + client, input=[_.text if _.text else " " for _ in input_], **kwargs ).dict() output_ = sorted(resp["data"], key=lambda x: x["index"]) return [ @@ -73,7 +73,7 @@ class BaseOpenAIEmbeddings(BaseEmbeddings): input_ = self.prepare_input(text) client = self.prepare_client(async_version=True) resp = await self.openai_response( - client, input=[_.text for _ in input_], **kwargs + client, input=[_.text if _.text else " " for _ in input_], **kwargs ).dict() output_ = sorted(resp["data"], key=lambda x: x["index"]) return [ diff --git a/libs/ktem/flowsettings.py b/libs/ktem/flowsettings.py index 0284e25..d73b9cf 100644 --- a/libs/ktem/flowsettings.py +++ b/libs/ktem/flowsettings.py @@ -59,12 +59,13 @@ if config("AZURE_OPENAI_API_KEY", default="") and config( "spec": { "__type__": "kotaemon.embeddings.LCAzureOpenAIEmbeddings", "azure_endpoint": config("AZURE_OPENAI_ENDPOINT", default=""), - "openai_api_key": config("AZURE_OPENAI_API_KEY", default=""), + "api_key": config("AZURE_OPENAI_API_KEY", default=""), "api_version": config("OPENAI_API_VERSION", default="") or "2024-02-15-preview", - "deployment": config("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT", default=""), - "request_timeout": 10, - "chunk_size": 16, + "azure_deployment": config( + "AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT", default="" + ), + "timeout": 10, }, "default": False, "accuracy": 5, @@ -96,7 +97,6 @@ if config("OPENAI_API_KEY", default=""): ) or "text-embedding-ada-002", "timeout": 10, - "chunk_size": 16, }, "default": False, }