diff --git a/.gitignore b/.gitignore
index 5487fed..22de1c3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -448,4 +448,6 @@ $RECYCLE.BIN/
 # Windows shortcuts
 *.lnk
 
+.theflow/
+
 # End of https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
diff --git a/README.md b/README.md
index 55b8e7a..a79c399 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,31 @@
-Modules:
+# kotaemon
+
+Quick and easy AI components to build Kotaemon, applicable in client
+projects.
+
+## Install
+
+```shell
+pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git
+```
+
+## Contribute
+
+### Setup
+
+```shell
+# Create a conda environment (Python 3.10 suggested)
+conda create -n kotaemon python=3.10
+conda activate kotaemon
+
+# Install everything, including dev dependencies
+pip install -e ".[dev]"
+
+# Run the tests
+pytest tests
+```
+
+### Code base structure
+
 - documents: define document
 - loaders
diff --git a/knowledgehub/__init__.py b/knowledgehub/__init__.py
index 00e571b..e1eac21 100644
--- a/knowledgehub/__init__.py
+++ b/knowledgehub/__init__.py
@@ -22,4 +22,4 @@ try:
 except ImportError:
     pass
 
-__version__ = "0.0.1"
+__version__ = "0.0.2"
diff --git a/knowledgehub/components.py b/knowledgehub/components.py
new file mode 100644
index 0000000..bccba58
--- /dev/null
+++ b/knowledgehub/components.py
@@ -0,0 +1,56 @@
+from abc import abstractmethod
+
+from theflow.base import Composable
+
+
+class BaseComponent(Composable):
+    """Base class for components
+
+    A component is a class that can be used to compose a pipeline. To use the
+    component, you should implement the following methods:
+    - run_raw: run on raw input
+    - run_batch_raw: run on a batch of raw inputs
+    - run_document: run on a document
+    - run_batch_document: run on a batch of documents
+    - is_document: check if the input is a document
+    - is_batch: check if the input is a batch
+    """
+
+    @abstractmethod
+    def run_raw(self, *args, **kwargs):
+        ...
+
+    @abstractmethod
+    def run_batch_raw(self, *args, **kwargs):
+        ...
+
+    @abstractmethod
+    def run_document(self, *args, **kwargs):
+        ...
+
+    @abstractmethod
+    def run_batch_document(self, *args, **kwargs):
+        ...
+
+    @abstractmethod
+    def is_document(self, *args, **kwargs) -> bool:
+        ...
+
+    @abstractmethod
+    def is_batch(self, *args, **kwargs) -> bool:
+        ...
+
+    def run(self, *args, **kwargs):
+        """Run the component, dispatching on the input type."""
+
+        is_document = self.is_document(*args, **kwargs)
+        is_batch = self.is_batch(*args, **kwargs)
+
+        if is_document and is_batch:
+            return self.run_batch_document(*args, **kwargs)
+        elif is_document and not is_batch:
+            return self.run_document(*args, **kwargs)
+        elif not is_document and is_batch:
+            return self.run_batch_raw(*args, **kwargs)
+        else:
+            return self.run_raw(*args, **kwargs)
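For orientation, a minimal sketch of how this dispatch plays out. The `Echo` class and its method bodies are illustrative only, not part of this change; it assumes a plain `BaseComponent` subclass is callable, as the tests further down suggest:

```python
from kotaemon.components import BaseComponent


class Echo(BaseComponent):
    """Hypothetical component that returns its input unchanged."""

    def run_raw(self, text):
        return text

    def run_batch_raw(self, texts):
        return [self.run_raw(t) for t in texts]

    def run_document(self, doc):
        return doc

    def run_batch_document(self, docs):
        return [self.run_document(d) for d in docs]

    def is_document(self, x):
        # treat anything that is not a plain string or list as a document
        return not isinstance(x, (str, list))

    def is_batch(self, x):
        return isinstance(x, list)


echo = Echo()
echo("hello")       # routed to run_raw
echo(["a", "b"])    # routed to run_batch_raw
```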
diff --git a/knowledgehub/llms/__init__.py b/knowledgehub/llms/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/knowledgehub/llms/base.py b/knowledgehub/llms/base.py
new file mode 100644
index 0000000..85ea0b9
--- /dev/null
+++ b/knowledgehub/llms/base.py
@@ -0,0 +1,24 @@
+from dataclasses import dataclass, field
+
+from ..components import BaseComponent
+
+
+@dataclass
+class LLMInterface:
+    text: list[str]
+    completion_tokens: int = -1
+    total_tokens: int = -1
+    prompt_tokens: int = -1
+    logits: list[list[float]] = field(default_factory=list)
+
+
+class PromptTemplate(BaseComponent):
+    pass
+
+
+class Extract(BaseComponent):
+    pass
+
+
+class PromptNode(BaseComponent):
+    pass
diff --git a/knowledgehub/llms/chats/__init__.py b/knowledgehub/llms/chats/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/knowledgehub/llms/chats/base.py b/knowledgehub/llms/chats/base.py
new file mode 100644
index 0000000..ea1aa26
--- /dev/null
+++ b/knowledgehub/llms/chats/base.py
@@ -0,0 +1,85 @@
+from typing import Type, TypeVar
+
+from theflow.base import Param
+from langchain.schema.language_model import BaseLanguageModel
+
+from langchain.schema.messages import (
+    BaseMessage,
+    HumanMessage,
+)
+
+from ...components import BaseComponent
+from ..base import LLMInterface
+
+
+Message = TypeVar("Message", bound=BaseMessage)
+
+
+class ChatLLM(BaseComponent):
+    ...
+
+
+class LangchainChatLLM(ChatLLM):
+    _lc_class: Type[BaseLanguageModel]
+
+    def __init__(self, **params):
+        if self._lc_class is None:
+            raise AttributeError(
+                "Should set _lc_class attribute to the LLM class from Langchain "
+                "if using LLM from Langchain"
+            )
+
+        self._kwargs: dict = {}
+        for param in list(params.keys()):
+            if param in self._lc_class.__fields__:
+                self._kwargs[param] = params.pop(param)
+        super().__init__(**params)
+
+    @Param.decorate()
+    def agent(self):
+        return self._lc_class(**self._kwargs)
+
+    def run_raw(self, text: str) -> LLMInterface:
+        message = HumanMessage(content=text)
+        return self.run_document([message])
+
+    def run_batch_raw(self, text: list[str]) -> list[LLMInterface]:
+        inputs = [[HumanMessage(content=each)] for each in text]
+        return self.run_batch_document(inputs)
+
+    def run_document(self, text: list[Message]) -> LLMInterface:
+        pred = self.agent.generate([text])
+        return LLMInterface(
+            text=[each.text for each in pred.generations[0]],
+            completion_tokens=pred.llm_output["token_usage"]["completion_tokens"],
+            total_tokens=pred.llm_output["token_usage"]["total_tokens"],
+            prompt_tokens=pred.llm_output["token_usage"]["prompt_tokens"],
+            logits=[],
+        )
+
+    def run_batch_document(self, text: list[list[Message]]) -> list[LLMInterface]:
+        outputs = []
+        for each_text in text:
+            outputs.append(self.run_document(each_text))
+        return outputs
+
+    def is_document(self, text) -> bool:
+        if isinstance(text, str):
+            return False
+        elif isinstance(text, list) and isinstance(text[0], str):
+            return False
+        return True
+
+    def is_batch(self, text) -> bool:
+        if isinstance(text, str):
+            return False
+        elif isinstance(text, list):
+            if isinstance(text[0], BaseMessage):
+                return False
+        return True
+
+    def __setattr__(self, name, value):
+        if name in self._lc_class.__fields__:
+            setattr(self.agent, name, value)
+        else:
+            super().__setattr__(name, value)
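Thanks to the `_lc_class` hook above, wrapping any Langchain chat model is a small subclass; constructor kwargs that match the Langchain class's `__fields__` are routed to the wrapped agent, everything else goes to theflow. A hedged sketch of the pattern (assuming `ChatOpenAI` is exported by the pinned Langchain version), which the next file applies to Azure:

```python
from langchain.chat_models import ChatOpenAI as ChatOpenAILC

from kotaemon.llms.chats.base import LangchainChatLLM


class ChatOpenAI(LangchainChatLLM):
    """Sketch: wrapper around Langchain's ChatOpenAI class."""

    _lc_class = ChatOpenAILC
```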
diff --git a/knowledgehub/llms/chats/openai.py b/knowledgehub/llms/chats/openai.py
new file mode 100644
index 0000000..2d0ec52
--- /dev/null
+++ b/knowledgehub/llms/chats/openai.py
@@ -0,0 +1,7 @@
+from langchain.chat_models import AzureChatOpenAI as AzureChatOpenAILC
+
+from .base import LangchainChatLLM
+
+
+class AzureChatOpenAI(LangchainChatLLM):
+    _lc_class = AzureChatOpenAILC
diff --git a/knowledgehub/llms/completions/__init__.py b/knowledgehub/llms/completions/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/knowledgehub/llms/completions/base.py b/knowledgehub/llms/completions/base.py
new file mode 100644
index 0000000..2409c91
--- /dev/null
+++ b/knowledgehub/llms/completions/base.py
@@ -0,0 +1,70 @@
+from typing import Type
+
+from theflow.base import Param
+from langchain.schema.language_model import BaseLanguageModel
+
+from ...components import BaseComponent
+from ..base import LLMInterface
+
+
+class LLM(BaseComponent):
+    pass
+
+
+class LangchainLLM(LLM):
+    _lc_class: Type[BaseLanguageModel]
+
+    def __init__(self, **params):
+        if self._lc_class is None:
+            raise AttributeError(
+                "Should set _lc_class attribute to the LLM class from Langchain "
+                "if using LLM from Langchain"
+            )
+
+        self._kwargs: dict = {}
+        for param in list(params.keys()):
+            if param in self._lc_class.__fields__:
+                self._kwargs[param] = params.pop(param)
+        super().__init__(**params)
+
+    @Param.decorate()
+    def agent(self):
+        return self._lc_class(**self._kwargs)
+
+    def run_raw(self, text: str) -> LLMInterface:
+        pred = self.agent.generate([text])
+        return LLMInterface(
+            text=[each.text for each in pred.generations[0]],
+            completion_tokens=pred.llm_output["token_usage"]["completion_tokens"],
+            total_tokens=pred.llm_output["token_usage"]["total_tokens"],
+            prompt_tokens=pred.llm_output["token_usage"]["prompt_tokens"],
+            logits=[],
+        )
+
+    def run_batch_raw(self, text: list[str]) -> list[LLMInterface]:
+        outputs = []
+        for each_text in text:
+            outputs.append(self.run_raw(each_text))
+        return outputs
+
+    def run_document(self, text: str) -> LLMInterface:
+        return self.run_raw(text)
+
+    def run_batch_document(self, text: list[str]) -> list[LLMInterface]:
+        return self.run_batch_raw(text)
+
+    def is_document(self, text) -> bool:
+        return False
+
+    def is_batch(self, text) -> bool:
+        return not isinstance(text, str)
+
+    def __setattr__(self, name, value):
+        if name in self._lc_class.__fields__:
+            setattr(self.agent, name, value)
+        else:
+            super().__setattr__(name, value)
+
+
+class LLMChat(BaseComponent):
+    pass
diff --git a/knowledgehub/llms/completions/openai.py b/knowledgehub/llms/completions/openai.py
new file mode 100644
index 0000000..a510e27
--- /dev/null
+++ b/knowledgehub/llms/completions/openai.py
@@ -0,0 +1,13 @@
+import langchain.llms as langchain_llms
+
+from .base import LangchainLLM
+
+
+class OpenAI(LangchainLLM):
+    """Wrapper around Langchain's OpenAI class"""
+    _lc_class = langchain_llms.OpenAI
+
+
+class AzureOpenAI(LangchainLLM):
+    """Wrapper around Langchain's AzureOpenAI class"""
+    _lc_class = langchain_llms.AzureOpenAI
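A hedged usage sketch for these completion wrappers, mirroring the tests further down; the endpoint, key, and deployment name are placeholders, and real Azure credentials are needed to actually execute this:

```python
from kotaemon.llms.completions.openai import AzureOpenAI

llm = AzureOpenAI(
    openai_api_base="https://<resource>.openai.azure.com/",  # placeholder
    openai_api_key="<key>",                                  # placeholder
    openai_api_version="2023-03-15-preview",
    deployment_name="<deployment>",                          # placeholder
    temperature=0,
)

single = llm("Tell me a joke")   # str input       -> LLMInterface
batch = llm(["One", "Two"])      # list[str] input -> list[LLMInterface]
print(single.text, single.total_tokens)
```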
diff --git a/setup.py b/setup.py
index c35a4f9..39ed677 100644
--- a/setup.py
+++ b/setup.py
@@ -28,17 +28,23 @@ setuptools.setup(
     url="https://github.com/Cinnamon/kotaemon/",
     packages=setuptools.find_packages(exclude=("tests", "tests.*")),
     install_requires=[
-        "farm-haystack"
+        "farm-haystack==1.19.0",
+        "langchain",
+        "theflow",
     ],
     extras_require={
         "dev": [
+            "ipython",
             "pytest",
             "pre-commit",
             "black",
             "flake8",
             "sphinx",
             "coverage",
-        ]
+
+            # optional dependency needed for tests
+            "openai",
+        ],
     },
     entry_points={"console_scripts": ["kh=kotaemon.cli:main"]},
     python_requires=">=3",
diff --git a/tests/test_llms_chat_models.py b/tests/test_llms_chat_models.py
new file mode 100644
index 0000000..aba8e8e
--- /dev/null
+++ b/tests/test_llms_chat_models.py
@@ -0,0 +1,78 @@
+from unittest.mock import patch
+
+from langchain.chat_models import AzureChatOpenAI as AzureChatOpenAILC
+from langchain.schema.messages import (
+    SystemMessage,
+    HumanMessage,
+    AIMessage,
+)
+
+from kotaemon.llms.chats.openai import AzureChatOpenAI
+from kotaemon.llms.base import LLMInterface
+
+
+_openai_chat_completion_response = {
+    "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x",
+    "object": "chat.completion",
+    "created": 1692338378,
+    "model": "gpt-35-turbo",
+    "choices": [
+        {
+            "index": 0,
+            "finish_reason": "stop",
+            "message": {
+                "role": "assistant",
+                "content": "Hello! How can I assist you today?",
+            },
+        }
+    ],
+    "usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19},
+}
+
+
+@patch(
+    "openai.api_resources.chat_completion.ChatCompletion.create",
+    side_effect=lambda *args, **kwargs: _openai_chat_completion_response,
+)
+def test_azureopenai_model(openai_completion):
+    model = AzureChatOpenAI(
+        openai_api_base="https://test.openai.azure.com/",
+        openai_api_key="some-key",
+        openai_api_version="2023-03-15-preview",
+        deployment_name="gpt35turbo",
+        temperature=0,
+        request_timeout=60,
+    )
+    assert isinstance(
+        model.agent, AzureChatOpenAILC
+    ), "Agent is not wrapped in Langchain's AzureChatOpenAI"
+
+    # test for str input - single mode
+    output = model("hello world")
+    assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface"
+    openai_completion.assert_called()
+
+    # test for list[str] input - batch mode
+    output = model(["hello world"])
+    assert isinstance(output, list), "Output for batch string is not a list"
+    assert isinstance(output[0], LLMInterface), "Output for text is not LLMInterface"
+    openai_completion.assert_called()
+
+    # test for list[message] input - single mode
+    messages = [
+        SystemMessage(content="You are a philosopher"),
+        HumanMessage(content="What is the meaning of life"),
+        AIMessage(content="42"),
+        HumanMessage(content="What is the meaning of 42"),
+    ]
+
+    output = model(messages)
+    assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface"
+    openai_completion.assert_called()
+
+    # test for list[list[message]] input - batch mode
+    output = model([messages])
+    assert isinstance(output, list), "Output for batch string is not a list"
+    assert isinstance(output[0], LLMInterface), "Output for text is not LLMInterface"
+    openai_completion.assert_called()
diff --git a/tests/test_llms_completion_models.py b/tests/test_llms_completion_models.py
new file mode 100644
index 0000000..99a354e
--- /dev/null
+++ b/tests/test_llms_completion_models.py
@@ -0,0 +1,70 @@
+from unittest.mock import patch
+
+from langchain.llms import AzureOpenAI as AzureOpenAILC, OpenAI as OpenAILC
+
+from kotaemon.llms.completions.openai import AzureOpenAI, OpenAI
+from kotaemon.llms.base import LLMInterface
+
+
+_openai_completion_response = {
+    "id": "cmpl-7qyNoIo6gRSCJR0hi8o3ZKBH4RkJ0",
+    "object": "sample text_completion",
+    "created": 1392751226,
+    "model": "gpt-35-turbo",
+    "choices": [
+        {"text": "completion", "index": 0, "finish_reason": "length", "logprobs": None}
+    ],
+    "usage": {"completion_tokens": 20, "prompt_tokens": 2, "total_tokens": 22},
+}
+
+
+@patch(
+    "openai.api_resources.completion.Completion.create",
+    side_effect=lambda *args, **kwargs: _openai_completion_response,
+)
+def test_azureopenai_model(openai_completion):
+    model = AzureOpenAI(
+        openai_api_base="https://test.openai.azure.com/",
+        openai_api_key="some-key",
+        openai_api_version="2023-03-15-preview",
+        deployment_name="gpt35turbo",
+        temperature=0,
+        request_timeout=60,
+    )
+    assert isinstance(
+        model.agent, AzureOpenAILC
+    ), "Agent is not wrapped in Langchain's AzureOpenAI"
+
+    output = model(["hello world"])
+    assert isinstance(output, list), "Output for batch is not a list"
+    assert isinstance(output[0], LLMInterface), "Output for text is not LLMInterface"
+    openai_completion.assert_called()
+
+    output = model("hello world")
+    assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface"
+
+
+@patch(
+    "openai.api_resources.completion.Completion.create",
+    side_effect=lambda *args, **kwargs: _openai_completion_response,
+)
+def test_openai_model(openai_completion):
+    model = OpenAI(
+        openai_api_base="https://test.openai.azure.com/",
+        openai_api_key="some-key",
+        openai_api_version="2023-03-15-preview",
+        deployment_name="gpt35turbo",
+        temperature=0,
+        request_timeout=60,
+    )
+    assert isinstance(
+        model.agent, OpenAILC
+    ), "Agent is not wrapped in Langchain's OpenAI"
+
+    output = model(["hello world"])
+    assert isinstance(output, list), "Output for batch is not a list"
+    assert isinstance(output[0], LLMInterface), "Output for text is not LLMInterface"
+    openai_completion.assert_called()
+
+    output = model("hello world")
+    assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface"
diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py
index 349facd..59b184f 100644
--- a/tests/test_telemetry.py
+++ b/tests/test_telemetry.py
@@ -1,3 +1,31 @@
+import os
+import sys
+
+import pytest
+
+
+@pytest.fixture
+def clean_artifacts_for_telemetry():
+    try:
+        del sys.modules["kotaemon"]
+    except KeyError:
+        pass
+
+    try:
+        del sys.modules["haystack"]
+    except KeyError:
+        pass
+
+    try:
+        del sys.modules["haystack.telemetry"]
+    except KeyError:
+        pass
+
+    if "HAYSTACK_TELEMETRY_ENABLED" in os.environ:
+        del os.environ["HAYSTACK_TELEMETRY_ENABLED"]
+
+
+@pytest.mark.usefixtures("clean_artifacts_for_telemetry")
 def test_disable_telemetry_import_haystack_first():
     """Test that telemetry is disabled when kotaemon lib is initiated after"""
     import os
@@ -10,6 +38,7 @@ def test_disable_telemetry_import_haystack_first():
     assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") == "False"
 
 
+@pytest.mark.usefixtures("clean_artifacts_for_telemetry")
 def test_disable_telemetry_import_haystack_after_kotaemon():
     """Test that telemetry is disabled when kotaemon lib is initiated before"""
     import os
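As a closing note, the behaviour these telemetry tests pin down can be checked directly in a fresh interpreter (a minimal sketch; it assumes neither package has been imported yet in the session, which is exactly what the fixture above resets):

```python
import os

import kotaemon  # noqa: F401  # importing kotaemon turns Haystack telemetry off
import haystack  # noqa: F401  # imported afterwards; telemetry stays disabled

assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") == "False"
```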