diff --git a/.github/workflows/unit-test.yaml b/.github/workflows/unit-test.yaml index d0ce722..ff1f2c8 100644 --- a/.github/workflows/unit-test.yaml +++ b/.github/workflows/unit-test.yaml @@ -60,7 +60,7 @@ jobs: # built-in tomllib run: | pip install tomli - package_version=$(python -c "import tomli; print(tomli.load(open('pyproject.toml', 'rb'))['project']['version'])") + package_version=$(python -c "import tomli; print(tomli.load(open('libs/kotaemon/pyproject.toml', 'rb'))['project']['version'])") cache_key="${{ runner.os }}-py${{ matrix.python-version }}-v${package_version}" echo "key=$cache_key" | tee -a ${{ matrix.GITHUB_OUTPUT }} @@ -99,7 +99,8 @@ jobs: path: ${{ env.pythonLocation }} key: ${{ steps.restore-dependencies.outputs.cache-primary-key }} - - name: Test with pytest + - name: Test kotaemon with pytest run: | pip show pytest + cd libs/kotaemon pytest diff --git a/.gitignore b/.gitignore index 19ecc56..1948399 100644 --- a/.gitignore +++ b/.gitignore @@ -448,6 +448,7 @@ $RECYCLE.BIN/ # PDF files *.pdf +!libs/kotaemon/tests/resources/*.pdf .theflow/ diff --git a/docs/scripts/generate_reference_docs.py b/docs/scripts/generate_reference_docs.py index 878fd51..f533cb3 100644 --- a/docs/scripts/generate_reference_docs.py +++ b/docs/scripts/generate_reference_docs.py @@ -69,7 +69,7 @@ def generate_docs_for_src_code( generate_docs_for_src_code( - code_dir=doc_dir.parent / "kotaemon", + code_dir=doc_dir.parent / "libs" / "kotaemon", target_doc_folder="reference", ignored_modules={"contribs"}, ) diff --git a/libs/kotaemon/README.md b/libs/kotaemon/README.md new file mode 100644 index 0000000..ad70b68 --- /dev/null +++ b/libs/kotaemon/README.md @@ -0,0 +1,130 @@ +# kotaemon + +Quick and easy AI components to build Kotaemon - applicable in client +project. 
+ +## Documentation + +https://docs.promptui.dm.cinnamon.is + +## Install + +```shell +pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git +``` + +## Contribute + +### Setup + +- Create conda environment (suggest 3.10) + + ```shell + conda create -n kotaemon python=3.10 + conda activate kotaemon + ``` + +- Clone the repo + + ```shell + git clone git@github.com:Cinnamon/kotaemon.git + cd kotaemon + ``` + +- Install all + + ```shell + pip install -e ".[dev]" + ``` + +- Pre-commit + + ```shell + pre-commit install + ``` + +- Test + + ```shell + pytest tests + ``` + +### Credential sharing + +This repo uses [git-secret](https://sobolevn.me/git-secret/) to share credentials, which +internally uses `gpg` to encrypt and decrypt secret files. + +This repo uses `python-dotenv` to manage credentials stored as environment variables. +Please note that the use of `python-dotenv` and credentials are for development +purposes only. Thus, it should not be used in the main source code (i.e. `kotaemon/` and `tests/`), but can be used in `examples/`. + +#### Install git-secret + +Please follow the [official guide](https://sobolevn.me/git-secret/installation) to install git-secret. + +For Windows users, see [For Windows users](#for-windows-users). + +For users who don't have sudo privilege to install packages, follow the `Manual Installation` in the [official guide](https://sobolevn.me/git-secret/installation) and set `PREFIX` to a path that you have access to. And please don't forget to add `PREFIX` to your `PATH`. + +#### Gaining access + +In order to gain access to the secret files, you must provide your gpg public key to anyone who has access and ask them to add your key to the keyring. For a quick tutorial on generating your gpg key pair, you can refer to the `Using gpg` section from the [git-secret main page](https://sobolevn.me/git-secret/). + +#### Decrypt the secret file + +The credentials are encrypted in the `.env.secret` file. 
To print the decrypted content to stdout, run + +```shell +git-secret cat [filename] +``` + +Or to get the decrypted `.env` file, run + +```shell +git-secret reveal [filename] +``` + +#### For Windows users + +git-secret is currently not available for Windows, thus the easiest way is to use it in WSL (please use the latest version of WSL2). From there you have 2 options: + +1. Using the gpg of WSL. + + This is the most straight-forward option since you would use WSL just like any other unix environment. However, the downside is that you have to make WSL your main environment, which means WSL must have write permission on your repo. To achieve this, you must either: + + - Clone and store your repo inside WSL's file system. + - Provide WSL with necessary permission on your Windows file system. This can be achieved by setting `automount` options for WSL. To do that, add this content to `/etc/wsl.conf` and then restart your sub-system. + + ```shell + [automount] + options = "metadata,umask=022,fmask=011" + ``` + + This enables all permissions for user owner. + +2. Using the gpg of Windows but with git-secret from WSL. + + For those who use Windows as the main environment, having to switch back and forth between Windows and WSL will be inconvenient. You can instead stay within your Windows environment and apply some tricks to use `git-secret` from WSL. + + - Install and set up `gpg` on Windows. + - Install `git-secret` on WSL. Now in Windows, you can invoke `git-secret` using `wsl git-secret`. + - Alternatively you can set up aliases in CMD to shorten the syntax. Please refer to [this SO answer](https://stackoverflow.com/a/65823225) for the instruction. Some recommended aliases are: + + ```bat + @echo off + + :: Commands + DOSKEY ls=dir /B $* + DOSKEY ll=dir /a $* + DOSKEY git-secret=wsl git-secret $* + DOSKEY gs=wsl git-secret $* + ``` + + Now you can invoke `git-secret` in CMD using `git-secret` or `gs`. 
+ + - For Powershell users, similar behaviours can be achieved using `Set-Alias` and `profile.ps1`. Please refer to this [SO thread](https://stackoverflow.com/questions/61081434/how-do-i-create-a-permanent-alias-file-in-powershell-core) as an example. + +### Code base structure + +- documents: define document +- loaders diff --git a/knowledgehub/__init__.py b/libs/kotaemon/kotaemon/__init__.py similarity index 100% rename from knowledgehub/__init__.py rename to libs/kotaemon/kotaemon/__init__.py diff --git a/knowledgehub/agents/__init__.py b/libs/kotaemon/kotaemon/agents/__init__.py similarity index 100% rename from knowledgehub/agents/__init__.py rename to libs/kotaemon/kotaemon/agents/__init__.py diff --git a/knowledgehub/agents/base.py b/libs/kotaemon/kotaemon/agents/base.py similarity index 100% rename from knowledgehub/agents/base.py rename to libs/kotaemon/kotaemon/agents/base.py diff --git a/knowledgehub/agents/io/__init__.py b/libs/kotaemon/kotaemon/agents/io/__init__.py similarity index 100% rename from knowledgehub/agents/io/__init__.py rename to libs/kotaemon/kotaemon/agents/io/__init__.py diff --git a/knowledgehub/agents/io/base.py b/libs/kotaemon/kotaemon/agents/io/base.py similarity index 99% rename from knowledgehub/agents/io/base.py rename to libs/kotaemon/kotaemon/agents/io/base.py index c27eed0..da6e3b9 100644 --- a/knowledgehub/agents/io/base.py +++ b/libs/kotaemon/kotaemon/agents/io/base.py @@ -5,9 +5,8 @@ from dataclasses import dataclass from enum import Enum from typing import Any, Dict, Literal, NamedTuple, Optional, Union -from pydantic import Extra - from kotaemon.base import LLMInterface +from pydantic import Extra def check_log(): diff --git a/knowledgehub/agents/langchain_based.py b/libs/kotaemon/kotaemon/agents/langchain_based.py similarity index 99% rename from knowledgehub/agents/langchain_based.py rename to libs/kotaemon/kotaemon/agents/langchain_based.py index 3083b70..7bdb7aa 100644 --- a/knowledgehub/agents/langchain_based.py +++ 
b/libs/kotaemon/kotaemon/agents/langchain_based.py @@ -1,11 +1,10 @@ from typing import List, Optional +from kotaemon.llms import LLM, ChatLLM from langchain.agents import AgentType as LCAgentType from langchain.agents import initialize_agent from langchain.agents.agent import AgentExecutor as LCAgentExecutor -from kotaemon.llms import LLM, ChatLLM - from .base import BaseAgent from .io import AgentOutput, AgentType from .tools import BaseTool diff --git a/knowledgehub/agents/react/__init__.py b/libs/kotaemon/kotaemon/agents/react/__init__.py similarity index 100% rename from knowledgehub/agents/react/__init__.py rename to libs/kotaemon/kotaemon/agents/react/__init__.py diff --git a/knowledgehub/agents/react/agent.py b/libs/kotaemon/kotaemon/agents/react/agent.py similarity index 100% rename from knowledgehub/agents/react/agent.py rename to libs/kotaemon/kotaemon/agents/react/agent.py diff --git a/knowledgehub/agents/react/prompt.py b/libs/kotaemon/kotaemon/agents/react/prompt.py similarity index 100% rename from knowledgehub/agents/react/prompt.py rename to libs/kotaemon/kotaemon/agents/react/prompt.py diff --git a/knowledgehub/agents/rewoo/__init__.py b/libs/kotaemon/kotaemon/agents/rewoo/__init__.py similarity index 100% rename from knowledgehub/agents/rewoo/__init__.py rename to libs/kotaemon/kotaemon/agents/rewoo/__init__.py diff --git a/knowledgehub/agents/rewoo/agent.py b/libs/kotaemon/kotaemon/agents/rewoo/agent.py similarity index 100% rename from knowledgehub/agents/rewoo/agent.py rename to libs/kotaemon/kotaemon/agents/rewoo/agent.py diff --git a/knowledgehub/agents/rewoo/planner.py b/libs/kotaemon/kotaemon/agents/rewoo/planner.py similarity index 100% rename from knowledgehub/agents/rewoo/planner.py rename to libs/kotaemon/kotaemon/agents/rewoo/planner.py diff --git a/knowledgehub/agents/rewoo/prompt.py b/libs/kotaemon/kotaemon/agents/rewoo/prompt.py similarity index 100% rename from knowledgehub/agents/rewoo/prompt.py rename to 
libs/kotaemon/kotaemon/agents/rewoo/prompt.py diff --git a/knowledgehub/agents/rewoo/solver.py b/libs/kotaemon/kotaemon/agents/rewoo/solver.py similarity index 100% rename from knowledgehub/agents/rewoo/solver.py rename to libs/kotaemon/kotaemon/agents/rewoo/solver.py diff --git a/knowledgehub/agents/tools/__init__.py b/libs/kotaemon/kotaemon/agents/tools/__init__.py similarity index 100% rename from knowledgehub/agents/tools/__init__.py rename to libs/kotaemon/kotaemon/agents/tools/__init__.py diff --git a/knowledgehub/agents/tools/base.py b/libs/kotaemon/kotaemon/agents/tools/base.py similarity index 99% rename from knowledgehub/agents/tools/base.py rename to libs/kotaemon/kotaemon/agents/tools/base.py index 1caf3d2..42f7533 100644 --- a/knowledgehub/agents/tools/base.py +++ b/libs/kotaemon/kotaemon/agents/tools/base.py @@ -1,10 +1,9 @@ from typing import Any, Callable, Dict, Optional, Tuple, Type, Union +from kotaemon.base import BaseComponent from langchain.agents import Tool as LCTool from pydantic import BaseModel -from kotaemon.base import BaseComponent - class ToolException(Exception): """An optional exception that tool throws when execution error occurs. 
diff --git a/knowledgehub/agents/tools/google.py b/libs/kotaemon/kotaemon/agents/tools/google.py similarity index 100% rename from knowledgehub/agents/tools/google.py rename to libs/kotaemon/kotaemon/agents/tools/google.py diff --git a/knowledgehub/agents/tools/llm.py b/libs/kotaemon/kotaemon/agents/tools/llm.py similarity index 99% rename from knowledgehub/agents/tools/llm.py rename to libs/kotaemon/kotaemon/agents/tools/llm.py index 750462e..2df87cc 100644 --- a/knowledgehub/agents/tools/llm.py +++ b/libs/kotaemon/kotaemon/agents/tools/llm.py @@ -1,8 +1,7 @@ from typing import AnyStr, Optional, Type -from pydantic import BaseModel, Field - from kotaemon.llms import BaseLLM +from pydantic import BaseModel, Field from .base import BaseTool, ToolException diff --git a/knowledgehub/agents/tools/wikipedia.py b/libs/kotaemon/kotaemon/agents/tools/wikipedia.py similarity index 99% rename from knowledgehub/agents/tools/wikipedia.py rename to libs/kotaemon/kotaemon/agents/tools/wikipedia.py index 9e6a362..3046522 100644 --- a/knowledgehub/agents/tools/wikipedia.py +++ b/libs/kotaemon/kotaemon/agents/tools/wikipedia.py @@ -1,8 +1,7 @@ from typing import Any, AnyStr, Optional, Type, Union -from pydantic import BaseModel, Field - from kotaemon.base import Document +from pydantic import BaseModel, Field from .base import BaseTool diff --git a/knowledgehub/agents/utils.py b/libs/kotaemon/kotaemon/agents/utils.py similarity index 100% rename from knowledgehub/agents/utils.py rename to libs/kotaemon/kotaemon/agents/utils.py diff --git a/knowledgehub/base/__init__.py b/libs/kotaemon/kotaemon/base/__init__.py similarity index 100% rename from knowledgehub/base/__init__.py rename to libs/kotaemon/kotaemon/base/__init__.py diff --git a/knowledgehub/base/component.py b/libs/kotaemon/kotaemon/base/component.py similarity index 99% rename from knowledgehub/base/component.py rename to libs/kotaemon/kotaemon/base/component.py index 90823ae..8c24c14 100644 --- 
a/knowledgehub/base/component.py +++ b/libs/kotaemon/kotaemon/base/component.py @@ -1,9 +1,8 @@ from abc import abstractmethod from typing import Iterator -from theflow import Function, Node, Param, lazy - from kotaemon.base.schema import Document +from theflow import Function, Node, Param, lazy class BaseComponent(Function): diff --git a/knowledgehub/base/schema.py b/libs/kotaemon/kotaemon/base/schema.py similarity index 100% rename from knowledgehub/base/schema.py rename to libs/kotaemon/kotaemon/base/schema.py diff --git a/knowledgehub/chatbot/__init__.py b/libs/kotaemon/kotaemon/chatbot/__init__.py similarity index 100% rename from knowledgehub/chatbot/__init__.py rename to libs/kotaemon/kotaemon/chatbot/__init__.py diff --git a/knowledgehub/chatbot/base.py b/libs/kotaemon/kotaemon/chatbot/base.py similarity index 99% rename from knowledgehub/chatbot/base.py rename to libs/kotaemon/kotaemon/chatbot/base.py index b6a3baf..f4b8c86 100644 --- a/knowledgehub/chatbot/base.py +++ b/libs/kotaemon/kotaemon/chatbot/base.py @@ -1,10 +1,9 @@ from abc import abstractmethod from typing import List, Optional -from theflow import SessionFunction - from kotaemon.base import BaseComponent, LLMInterface from kotaemon.base.schema import AIMessage, BaseMessage, HumanMessage, SystemMessage +from theflow import SessionFunction class BaseChatBot(BaseComponent): diff --git a/knowledgehub/chatbot/simple_respondent.py b/libs/kotaemon/kotaemon/chatbot/simple_respondent.py similarity index 100% rename from knowledgehub/chatbot/simple_respondent.py rename to libs/kotaemon/kotaemon/chatbot/simple_respondent.py diff --git a/knowledgehub/cli.py b/libs/kotaemon/kotaemon/cli.py similarity index 99% rename from knowledgehub/cli.py rename to libs/kotaemon/kotaemon/cli.py index 75101ff..84a29b1 100644 --- a/knowledgehub/cli.py +++ b/libs/kotaemon/kotaemon/cli.py @@ -36,9 +36,8 @@ def export(export_path, output): """Export a pipeline to a config file""" import sys - from theflow.utils.modules 
import import_dotted_string - from kotaemon.contribs.promptui.config import export_pipeline_to_config + from theflow.utils.modules import import_dotted_string sys.path.append(os.getcwd()) cls = import_dotted_string(export_path, safe=False) diff --git a/knowledgehub/contribs/__init__.py b/libs/kotaemon/kotaemon/contribs/__init__.py similarity index 100% rename from knowledgehub/contribs/__init__.py rename to libs/kotaemon/kotaemon/contribs/__init__.py diff --git a/knowledgehub/contribs/docs.py b/libs/kotaemon/kotaemon/contribs/docs.py similarity index 100% rename from knowledgehub/contribs/docs.py rename to libs/kotaemon/kotaemon/contribs/docs.py diff --git a/knowledgehub/contribs/promptui/.gitignore b/libs/kotaemon/kotaemon/contribs/promptui/.gitignore similarity index 100% rename from knowledgehub/contribs/promptui/.gitignore rename to libs/kotaemon/kotaemon/contribs/promptui/.gitignore diff --git a/knowledgehub/contribs/promptui/__init__.py b/libs/kotaemon/kotaemon/contribs/promptui/__init__.py similarity index 100% rename from knowledgehub/contribs/promptui/__init__.py rename to libs/kotaemon/kotaemon/contribs/promptui/__init__.py diff --git a/knowledgehub/contribs/promptui/base.py b/libs/kotaemon/kotaemon/contribs/promptui/base.py similarity index 100% rename from knowledgehub/contribs/promptui/base.py rename to libs/kotaemon/kotaemon/contribs/promptui/base.py diff --git a/knowledgehub/contribs/promptui/cli.py b/libs/kotaemon/kotaemon/contribs/promptui/cli.py similarity index 100% rename from knowledgehub/contribs/promptui/cli.py rename to libs/kotaemon/kotaemon/contribs/promptui/cli.py diff --git a/knowledgehub/contribs/promptui/config.py b/libs/kotaemon/kotaemon/contribs/promptui/config.py similarity index 99% rename from knowledgehub/contribs/promptui/config.py rename to libs/kotaemon/kotaemon/contribs/promptui/config.py index e9098b1..e660168 100644 --- a/knowledgehub/contribs/promptui/config.py +++ b/libs/kotaemon/kotaemon/contribs/promptui/config.py @@ 
-4,7 +4,6 @@ from pathlib import Path from typing import Any, Dict, Optional, Type, Union import yaml - from kotaemon.base import BaseComponent from kotaemon.chatbot import BaseChatBot diff --git a/knowledgehub/contribs/promptui/export.py b/libs/kotaemon/kotaemon/contribs/promptui/export.py similarity index 99% rename from knowledgehub/contribs/promptui/export.py rename to libs/kotaemon/kotaemon/contribs/promptui/export.py index 8b174ea..ce152f1 100644 --- a/knowledgehub/contribs/promptui/export.py +++ b/libs/kotaemon/kotaemon/contribs/promptui/export.py @@ -6,11 +6,10 @@ from typing import Any, Dict, List, Type, Union import pandas as pd import yaml +from kotaemon.base import BaseComponent from theflow.storage import storage from theflow.utils.modules import import_dotted_string -from kotaemon.base import BaseComponent - from .logs import ResultLog diff --git a/knowledgehub/contribs/promptui/logs.py b/libs/kotaemon/kotaemon/contribs/promptui/logs.py similarity index 100% rename from knowledgehub/contribs/promptui/logs.py rename to libs/kotaemon/kotaemon/contribs/promptui/logs.py diff --git a/knowledgehub/contribs/promptui/themes.py b/libs/kotaemon/kotaemon/contribs/promptui/themes.py similarity index 100% rename from knowledgehub/contribs/promptui/themes.py rename to libs/kotaemon/kotaemon/contribs/promptui/themes.py diff --git a/knowledgehub/contribs/promptui/tunnel.py b/libs/kotaemon/kotaemon/contribs/promptui/tunnel.py similarity index 100% rename from knowledgehub/contribs/promptui/tunnel.py rename to libs/kotaemon/kotaemon/contribs/promptui/tunnel.py diff --git a/knowledgehub/contribs/promptui/ui/__init__.py b/libs/kotaemon/kotaemon/contribs/promptui/ui/__init__.py similarity index 100% rename from knowledgehub/contribs/promptui/ui/__init__.py rename to libs/kotaemon/kotaemon/contribs/promptui/ui/__init__.py diff --git a/knowledgehub/contribs/promptui/ui/blocks.py b/libs/kotaemon/kotaemon/contribs/promptui/ui/blocks.py similarity index 100% rename from 
knowledgehub/contribs/promptui/ui/blocks.py rename to libs/kotaemon/kotaemon/contribs/promptui/ui/blocks.py diff --git a/knowledgehub/contribs/promptui/ui/chat.py b/libs/kotaemon/kotaemon/contribs/promptui/ui/chat.py similarity index 99% rename from knowledgehub/contribs/promptui/ui/chat.py rename to libs/kotaemon/kotaemon/contribs/promptui/ui/chat.py index 78eecb5..6c4b743 100644 --- a/knowledgehub/contribs/promptui/ui/chat.py +++ b/libs/kotaemon/kotaemon/contribs/promptui/ui/chat.py @@ -3,12 +3,11 @@ from datetime import datetime from pathlib import Path import gradio as gr -from theflow.storage import storage - from kotaemon.chatbot import ChatConversation from kotaemon.contribs.promptui.base import get_component from kotaemon.contribs.promptui.export import export from kotaemon.contribs.promptui.ui.blocks import ChatBlock +from theflow.storage import storage from ..logs import ResultLog diff --git a/knowledgehub/contribs/promptui/ui/pipeline.py b/libs/kotaemon/kotaemon/contribs/promptui/ui/pipeline.py similarity index 99% rename from knowledgehub/contribs/promptui/ui/pipeline.py rename to libs/kotaemon/kotaemon/contribs/promptui/ui/pipeline.py index 725893d..12b1133 100644 --- a/knowledgehub/contribs/promptui/ui/pipeline.py +++ b/libs/kotaemon/kotaemon/contribs/promptui/ui/pipeline.py @@ -6,10 +6,9 @@ from typing import Any, Dict import gradio as gr import pandas as pd -from theflow.storage import storage - from kotaemon.contribs.promptui.base import get_component from kotaemon.contribs.promptui.export import export +from theflow.storage import storage from ..logs import ResultLog diff --git a/knowledgehub/embeddings/__init__.py b/libs/kotaemon/kotaemon/embeddings/__init__.py similarity index 100% rename from knowledgehub/embeddings/__init__.py rename to libs/kotaemon/kotaemon/embeddings/__init__.py diff --git a/knowledgehub/embeddings/base.py b/libs/kotaemon/kotaemon/embeddings/base.py similarity index 100% rename from knowledgehub/embeddings/base.py rename to 
libs/kotaemon/kotaemon/embeddings/base.py diff --git a/knowledgehub/embeddings/langchain_based.py b/libs/kotaemon/kotaemon/embeddings/langchain_based.py similarity index 100% rename from knowledgehub/embeddings/langchain_based.py rename to libs/kotaemon/kotaemon/embeddings/langchain_based.py diff --git a/knowledgehub/indices/__init__.py b/libs/kotaemon/kotaemon/indices/__init__.py similarity index 100% rename from knowledgehub/indices/__init__.py rename to libs/kotaemon/kotaemon/indices/__init__.py diff --git a/knowledgehub/indices/base.py b/libs/kotaemon/kotaemon/indices/base.py similarity index 99% rename from knowledgehub/indices/base.py rename to libs/kotaemon/kotaemon/indices/base.py index 938be66..0de39c7 100644 --- a/knowledgehub/indices/base.py +++ b/libs/kotaemon/kotaemon/indices/base.py @@ -3,9 +3,8 @@ from __future__ import annotations from abc import abstractmethod from typing import Any, Type -from llama_index.node_parser.interface import NodeParser - from kotaemon.base import BaseComponent, Document, RetrievedDocument +from llama_index.node_parser.interface import NodeParser class DocTransformer(BaseComponent): diff --git a/knowledgehub/indices/extractors/__init__.py b/libs/kotaemon/kotaemon/indices/extractors/__init__.py similarity index 100% rename from knowledgehub/indices/extractors/__init__.py rename to libs/kotaemon/kotaemon/indices/extractors/__init__.py diff --git a/knowledgehub/indices/extractors/doc_parsers.py b/libs/kotaemon/kotaemon/indices/extractors/doc_parsers.py similarity index 100% rename from knowledgehub/indices/extractors/doc_parsers.py rename to libs/kotaemon/kotaemon/indices/extractors/doc_parsers.py diff --git a/knowledgehub/indices/ingests/__init__.py b/libs/kotaemon/kotaemon/indices/ingests/__init__.py similarity index 100% rename from knowledgehub/indices/ingests/__init__.py rename to libs/kotaemon/kotaemon/indices/ingests/__init__.py diff --git a/knowledgehub/indices/ingests/files.py 
b/libs/kotaemon/kotaemon/indices/ingests/files.py similarity index 99% rename from knowledgehub/indices/ingests/files.py rename to libs/kotaemon/kotaemon/indices/ingests/files.py index 22e7db9..b3b4f48 100644 --- a/knowledgehub/indices/ingests/files.py +++ b/libs/kotaemon/kotaemon/indices/ingests/files.py @@ -1,7 +1,5 @@ from pathlib import Path -from llama_index.readers.base import BaseReader - from kotaemon.base import BaseComponent, Document, Param from kotaemon.indices.extractors import BaseDocParser from kotaemon.indices.splitters import BaseSplitter, TokenSplitter @@ -13,6 +11,7 @@ from kotaemon.loaders import ( PandasExcelReader, UnstructuredReader, ) +from llama_index.readers.base import BaseReader class DocumentIngestor(BaseComponent): diff --git a/knowledgehub/indices/qa/__init__.py b/libs/kotaemon/kotaemon/indices/qa/__init__.py similarity index 100% rename from knowledgehub/indices/qa/__init__.py rename to libs/kotaemon/kotaemon/indices/qa/__init__.py diff --git a/knowledgehub/indices/qa/citation.py b/libs/kotaemon/kotaemon/indices/qa/citation.py similarity index 99% rename from knowledgehub/indices/qa/citation.py rename to libs/kotaemon/kotaemon/indices/qa/citation.py index 4c1281a..bafa2d4 100644 --- a/knowledgehub/indices/qa/citation.py +++ b/libs/kotaemon/kotaemon/indices/qa/citation.py @@ -1,10 +1,9 @@ from typing import Iterator, List -from pydantic import BaseModel, Field - from kotaemon.base import BaseComponent from kotaemon.base.schema import HumanMessage, SystemMessage from kotaemon.llms import BaseLLM +from pydantic import BaseModel, Field class FactWithEvidence(BaseModel): diff --git a/knowledgehub/indices/qa/text_based.py b/libs/kotaemon/kotaemon/indices/qa/text_based.py similarity index 100% rename from knowledgehub/indices/qa/text_based.py rename to libs/kotaemon/kotaemon/indices/qa/text_based.py diff --git a/knowledgehub/indices/rankings/__init__.py b/libs/kotaemon/kotaemon/indices/rankings/__init__.py similarity index 100% rename from 
knowledgehub/indices/rankings/__init__.py rename to libs/kotaemon/kotaemon/indices/rankings/__init__.py diff --git a/knowledgehub/indices/rankings/base.py b/libs/kotaemon/kotaemon/indices/rankings/base.py similarity index 100% rename from knowledgehub/indices/rankings/base.py rename to libs/kotaemon/kotaemon/indices/rankings/base.py diff --git a/knowledgehub/indices/rankings/cohere.py b/libs/kotaemon/kotaemon/indices/rankings/cohere.py similarity index 100% rename from knowledgehub/indices/rankings/cohere.py rename to libs/kotaemon/kotaemon/indices/rankings/cohere.py diff --git a/knowledgehub/indices/rankings/llm.py b/libs/kotaemon/kotaemon/indices/rankings/llm.py similarity index 99% rename from knowledgehub/indices/rankings/llm.py rename to libs/kotaemon/kotaemon/indices/rankings/llm.py index bff81ff..7b280f1 100644 --- a/knowledgehub/indices/rankings/llm.py +++ b/libs/kotaemon/kotaemon/indices/rankings/llm.py @@ -2,10 +2,9 @@ from __future__ import annotations from concurrent.futures import ThreadPoolExecutor -from langchain.output_parsers.boolean import BooleanOutputParser - from kotaemon.base import Document from kotaemon.llms import BaseLLM, PromptTemplate +from langchain.output_parsers.boolean import BooleanOutputParser from .base import BaseReranking diff --git a/knowledgehub/indices/splitters/__init__.py b/libs/kotaemon/kotaemon/indices/splitters/__init__.py similarity index 100% rename from knowledgehub/indices/splitters/__init__.py rename to libs/kotaemon/kotaemon/indices/splitters/__init__.py diff --git a/knowledgehub/indices/vectorindex.py b/libs/kotaemon/kotaemon/indices/vectorindex.py similarity index 100% rename from knowledgehub/indices/vectorindex.py rename to libs/kotaemon/kotaemon/indices/vectorindex.py diff --git a/knowledgehub/llms/__init__.py b/libs/kotaemon/kotaemon/llms/__init__.py similarity index 100% rename from knowledgehub/llms/__init__.py rename to libs/kotaemon/kotaemon/llms/__init__.py diff --git a/knowledgehub/llms/base.py 
b/libs/kotaemon/kotaemon/llms/base.py similarity index 99% rename from knowledgehub/llms/base.py rename to libs/kotaemon/kotaemon/llms/base.py index ff315ea..56bd910 100644 --- a/knowledgehub/llms/base.py +++ b/libs/kotaemon/kotaemon/llms/base.py @@ -1,6 +1,5 @@ -from langchain_core.language_models.base import BaseLanguageModel - from kotaemon.base import BaseComponent +from langchain_core.language_models.base import BaseLanguageModel class BaseLLM(BaseComponent): diff --git a/knowledgehub/llms/branching.py b/libs/kotaemon/kotaemon/llms/branching.py similarity index 99% rename from knowledgehub/llms/branching.py rename to libs/kotaemon/kotaemon/llms/branching.py index a9cbbe8..09fe3c2 100644 --- a/knowledgehub/llms/branching.py +++ b/libs/kotaemon/kotaemon/llms/branching.py @@ -156,7 +156,6 @@ class GatedBranchingPipeline(SimpleBranchingPipeline): if __name__ == "__main__": import dotenv - from kotaemon.llms import AzureChatOpenAI, BasePromptComponent from kotaemon.parsers import RegexExtractor diff --git a/knowledgehub/llms/chats/__init__.py b/libs/kotaemon/kotaemon/llms/chats/__init__.py similarity index 100% rename from knowledgehub/llms/chats/__init__.py rename to libs/kotaemon/kotaemon/llms/chats/__init__.py diff --git a/knowledgehub/llms/chats/base.py b/libs/kotaemon/kotaemon/llms/chats/base.py similarity index 100% rename from knowledgehub/llms/chats/base.py rename to libs/kotaemon/kotaemon/llms/chats/base.py diff --git a/knowledgehub/llms/chats/langchain_based.py b/libs/kotaemon/kotaemon/llms/chats/langchain_based.py similarity index 100% rename from knowledgehub/llms/chats/langchain_based.py rename to libs/kotaemon/kotaemon/llms/chats/langchain_based.py diff --git a/knowledgehub/llms/completions/__init__.py b/libs/kotaemon/kotaemon/llms/completions/__init__.py similarity index 100% rename from knowledgehub/llms/completions/__init__.py rename to libs/kotaemon/kotaemon/llms/completions/__init__.py diff --git a/knowledgehub/llms/completions/base.py 
b/libs/kotaemon/kotaemon/llms/completions/base.py similarity index 100% rename from knowledgehub/llms/completions/base.py rename to libs/kotaemon/kotaemon/llms/completions/base.py diff --git a/knowledgehub/llms/completions/langchain_based.py b/libs/kotaemon/kotaemon/llms/completions/langchain_based.py similarity index 97% rename from knowledgehub/llms/completions/langchain_based.py rename to libs/kotaemon/kotaemon/llms/completions/langchain_based.py index 8ad5a59..11dbe7e 100644 --- a/knowledgehub/llms/completions/langchain_based.py +++ b/libs/kotaemon/kotaemon/llms/completions/langchain_based.py @@ -141,9 +141,12 @@ class OpenAI(LCCompletionMixin, LLM): ) def _get_lc_class(self): - import langchain.llms as langchain_llms + try: + from langchain_openai import OpenAI + except ImportError: + from langchain.llms import OpenAI - return langchain_llms.OpenAI + return OpenAI class AzureOpenAI(LCCompletionMixin, LLM): diff --git a/knowledgehub/llms/cot.py b/libs/kotaemon/kotaemon/llms/cot.py similarity index 99% rename from knowledgehub/llms/cot.py rename to libs/kotaemon/kotaemon/llms/cot.py index 6721dc3..bac2a22 100644 --- a/knowledgehub/llms/cot.py +++ b/libs/kotaemon/kotaemon/llms/cot.py @@ -1,9 +1,8 @@ from copy import deepcopy from typing import Callable, List -from theflow import Function, Node, Param - from kotaemon.base import BaseComponent, Document +from theflow import Function, Node, Param from .chats import AzureChatOpenAI from .completions import LLM diff --git a/knowledgehub/llms/linear.py b/libs/kotaemon/kotaemon/llms/linear.py similarity index 100% rename from knowledgehub/llms/linear.py rename to libs/kotaemon/kotaemon/llms/linear.py diff --git a/knowledgehub/llms/prompts/__init__.py b/libs/kotaemon/kotaemon/llms/prompts/__init__.py similarity index 100% rename from knowledgehub/llms/prompts/__init__.py rename to libs/kotaemon/kotaemon/llms/prompts/__init__.py diff --git a/knowledgehub/llms/prompts/base.py b/libs/kotaemon/kotaemon/llms/prompts/base.py 
similarity index 100% rename from knowledgehub/llms/prompts/base.py rename to libs/kotaemon/kotaemon/llms/prompts/base.py diff --git a/knowledgehub/llms/prompts/template.py b/libs/kotaemon/kotaemon/llms/prompts/template.py similarity index 100% rename from knowledgehub/llms/prompts/template.py rename to libs/kotaemon/kotaemon/llms/prompts/template.py diff --git a/knowledgehub/loaders/__init__.py b/libs/kotaemon/kotaemon/loaders/__init__.py similarity index 100% rename from knowledgehub/loaders/__init__.py rename to libs/kotaemon/kotaemon/loaders/__init__.py diff --git a/knowledgehub/loaders/base.py b/libs/kotaemon/kotaemon/loaders/base.py similarity index 99% rename from knowledgehub/loaders/base.py rename to libs/kotaemon/kotaemon/loaders/base.py index cb92d5b..956002f 100644 --- a/knowledgehub/loaders/base.py +++ b/libs/kotaemon/kotaemon/loaders/base.py @@ -1,11 +1,10 @@ from pathlib import Path from typing import Any, List, Type, Union +from kotaemon.base import BaseComponent, Document from llama_index import SimpleDirectoryReader, download_loader from llama_index.readers.base import BaseReader -from kotaemon.base import BaseComponent, Document - class AutoReader(BaseComponent): """General auto reader for a variety of files. (based on llama-hub)""" diff --git a/knowledgehub/loaders/excel_loader.py b/libs/kotaemon/kotaemon/loaders/excel_loader.py similarity index 99% rename from knowledgehub/loaders/excel_loader.py rename to libs/kotaemon/kotaemon/loaders/excel_loader.py index 347719b..72fd0b3 100644 --- a/knowledgehub/loaders/excel_loader.py +++ b/libs/kotaemon/kotaemon/loaders/excel_loader.py @@ -6,9 +6,8 @@ Pandas parser for .xlsx files. 
from pathlib import Path from typing import Any, List, Optional, Union -from llama_index.readers.base import BaseReader - from kotaemon.base import Document +from llama_index.readers.base import BaseReader class PandasExcelReader(BaseReader): diff --git a/knowledgehub/loaders/mathpix_loader.py b/libs/kotaemon/kotaemon/loaders/mathpix_loader.py similarity index 99% rename from knowledgehub/loaders/mathpix_loader.py rename to libs/kotaemon/kotaemon/loaders/mathpix_loader.py index 1fefe33..4bbf769 100644 --- a/knowledgehub/loaders/mathpix_loader.py +++ b/libs/kotaemon/kotaemon/loaders/mathpix_loader.py @@ -5,11 +5,10 @@ from pathlib import Path from typing import Any, Dict, List import requests +from kotaemon.base import Document from langchain.utils import get_from_dict_or_env from llama_index.readers.base import BaseReader -from kotaemon.base import Document - from .utils.table import parse_markdown_text_to_tables, strip_special_chars_markdown diff --git a/knowledgehub/loaders/ocr_loader.py b/libs/kotaemon/kotaemon/loaders/ocr_loader.py similarity index 99% rename from knowledgehub/loaders/ocr_loader.py rename to libs/kotaemon/kotaemon/loaders/ocr_loader.py index f9e6fe9..13bd907 100644 --- a/knowledgehub/loaders/ocr_loader.py +++ b/libs/kotaemon/kotaemon/loaders/ocr_loader.py @@ -3,9 +3,8 @@ from typing import List from uuid import uuid4 import requests -from llama_index.readers.base import BaseReader - from kotaemon.base import Document +from llama_index.readers.base import BaseReader from .utils.pdf_ocr import parse_ocr_output, read_pdf_unstructured from .utils.table import strip_special_chars_markdown diff --git a/knowledgehub/loaders/unstructured_loader.py b/libs/kotaemon/kotaemon/loaders/unstructured_loader.py similarity index 99% rename from knowledgehub/loaders/unstructured_loader.py rename to libs/kotaemon/kotaemon/loaders/unstructured_loader.py index c386bc5..1c3e526 100644 --- a/knowledgehub/loaders/unstructured_loader.py +++ 
b/libs/kotaemon/kotaemon/loaders/unstructured_loader.py @@ -12,9 +12,8 @@ pip install xlrd from pathlib import Path from typing import Any, Dict, List, Optional -from llama_index.readers.base import BaseReader - from kotaemon.base import Document +from llama_index.readers.base import BaseReader class UnstructuredReader(BaseReader): diff --git a/knowledgehub/loaders/utils/__init__.py b/libs/kotaemon/kotaemon/loaders/utils/__init__.py similarity index 100% rename from knowledgehub/loaders/utils/__init__.py rename to libs/kotaemon/kotaemon/loaders/utils/__init__.py diff --git a/knowledgehub/loaders/utils/box.py b/libs/kotaemon/kotaemon/loaders/utils/box.py similarity index 100% rename from knowledgehub/loaders/utils/box.py rename to libs/kotaemon/kotaemon/loaders/utils/box.py diff --git a/knowledgehub/loaders/utils/pdf_ocr.py b/libs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py similarity index 100% rename from knowledgehub/loaders/utils/pdf_ocr.py rename to libs/kotaemon/kotaemon/loaders/utils/pdf_ocr.py diff --git a/knowledgehub/loaders/utils/table.py b/libs/kotaemon/kotaemon/loaders/utils/table.py similarity index 100% rename from knowledgehub/loaders/utils/table.py rename to libs/kotaemon/kotaemon/loaders/utils/table.py diff --git a/knowledgehub/parsers/__init__.py b/libs/kotaemon/kotaemon/parsers/__init__.py similarity index 100% rename from knowledgehub/parsers/__init__.py rename to libs/kotaemon/kotaemon/parsers/__init__.py diff --git a/knowledgehub/parsers/regex_extractor.py b/libs/kotaemon/kotaemon/parsers/regex_extractor.py similarity index 100% rename from knowledgehub/parsers/regex_extractor.py rename to libs/kotaemon/kotaemon/parsers/regex_extractor.py diff --git a/knowledgehub/storages/__init__.py b/libs/kotaemon/kotaemon/storages/__init__.py similarity index 100% rename from knowledgehub/storages/__init__.py rename to libs/kotaemon/kotaemon/storages/__init__.py diff --git a/knowledgehub/storages/docstores/__init__.py 
b/libs/kotaemon/kotaemon/storages/docstores/__init__.py similarity index 100% rename from knowledgehub/storages/docstores/__init__.py rename to libs/kotaemon/kotaemon/storages/docstores/__init__.py diff --git a/knowledgehub/storages/docstores/base.py b/libs/kotaemon/kotaemon/storages/docstores/base.py similarity index 100% rename from knowledgehub/storages/docstores/base.py rename to libs/kotaemon/kotaemon/storages/docstores/base.py diff --git a/knowledgehub/storages/docstores/elasticsearch.py b/libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py similarity index 100% rename from knowledgehub/storages/docstores/elasticsearch.py rename to libs/kotaemon/kotaemon/storages/docstores/elasticsearch.py diff --git a/knowledgehub/storages/docstores/in_memory.py b/libs/kotaemon/kotaemon/storages/docstores/in_memory.py similarity index 100% rename from knowledgehub/storages/docstores/in_memory.py rename to libs/kotaemon/kotaemon/storages/docstores/in_memory.py diff --git a/knowledgehub/storages/docstores/simple_file.py b/libs/kotaemon/kotaemon/storages/docstores/simple_file.py similarity index 100% rename from knowledgehub/storages/docstores/simple_file.py rename to libs/kotaemon/kotaemon/storages/docstores/simple_file.py diff --git a/knowledgehub/storages/vectorstores/__init__.py b/libs/kotaemon/kotaemon/storages/vectorstores/__init__.py similarity index 100% rename from knowledgehub/storages/vectorstores/__init__.py rename to libs/kotaemon/kotaemon/storages/vectorstores/__init__.py diff --git a/knowledgehub/storages/vectorstores/base.py b/libs/kotaemon/kotaemon/storages/vectorstores/base.py similarity index 99% rename from knowledgehub/storages/vectorstores/base.py rename to libs/kotaemon/kotaemon/storages/vectorstores/base.py index ba4f3ec..d353886 100644 --- a/knowledgehub/storages/vectorstores/base.py +++ b/libs/kotaemon/kotaemon/storages/vectorstores/base.py @@ -3,13 +3,12 @@ from __future__ import annotations from abc import ABC, abstractmethod from typing 
import Any, Optional +from kotaemon.base import DocumentWithEmbedding from llama_index.schema import NodeRelationship, RelatedNodeInfo from llama_index.vector_stores.types import BasePydanticVectorStore from llama_index.vector_stores.types import VectorStore as LIVectorStore from llama_index.vector_stores.types import VectorStoreQuery -from kotaemon.base import DocumentWithEmbedding - class BaseVectorStore(ABC): @abstractmethod diff --git a/knowledgehub/storages/vectorstores/chroma.py b/libs/kotaemon/kotaemon/storages/vectorstores/chroma.py similarity index 100% rename from knowledgehub/storages/vectorstores/chroma.py rename to libs/kotaemon/kotaemon/storages/vectorstores/chroma.py diff --git a/knowledgehub/storages/vectorstores/in_memory.py b/libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py similarity index 100% rename from knowledgehub/storages/vectorstores/in_memory.py rename to libs/kotaemon/kotaemon/storages/vectorstores/in_memory.py diff --git a/knowledgehub/storages/vectorstores/simple_file.py b/libs/kotaemon/kotaemon/storages/vectorstores/simple_file.py similarity index 99% rename from knowledgehub/storages/vectorstores/simple_file.py rename to libs/kotaemon/kotaemon/storages/vectorstores/simple_file.py index 6f14a34..407d4fc 100644 --- a/knowledgehub/storages/vectorstores/simple_file.py +++ b/libs/kotaemon/kotaemon/storages/vectorstores/simple_file.py @@ -3,11 +3,10 @@ from pathlib import Path from typing import Any, Optional, Type import fsspec +from kotaemon.base import DocumentWithEmbedding from llama_index.vector_stores import SimpleVectorStore as LISimpleVectorStore from llama_index.vector_stores.simple import SimpleVectorStoreData -from kotaemon.base import DocumentWithEmbedding - from .base import LlamaIndexVectorStore diff --git a/libs/kotaemon/pyproject.toml b/libs/kotaemon/pyproject.toml new file mode 100644 index 0000000..f16e2e5 --- /dev/null +++ b/libs/kotaemon/pyproject.toml @@ -0,0 +1,73 @@ +# build backend and build dependencies
+[build-system] +requires = ["setuptools >= 61.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +include-package-data = false +packages.find.include = ["kotaemon*"] +packages.find.exclude = ["tests*", "env*"] + +# metadata and dependencies +[project] +name = "kotaemon" +version = "0.3.5" +requires-python = ">= 3.10" +description = "Kotaemon core library for AI development." +dependencies = [ + "langchain", + "langchain-community", + "theflow", + "llama-index>=0.9.0", + "llama-hub", + "gradio>=4.0.0", + "openpyxl", + "cookiecutter", + "click", + "pandas", + "trogon", +] +readme = "README.md" +license = { text = "MIT License" } +authors = [ + { name = "john", email = "john@cinnamon.is" }, + { name = "ian", email = "ian@cinnamon.is" }, + { name = "tadashi", email = "tadashi@cinnamon.is" }, +] +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] + +[project.optional-dependencies] +dev = [ + "ipython", + "pytest", + "pre-commit", + "black", + "flake8", + "sphinx", + "coverage", + "openai", + "langchain-openai", + "chromadb", + "wikipedia", + "duckduckgo-search", + "googlesearch-python", + "python-dotenv", + "pytest-mock", + "unstructured[pdf]", + "sentence_transformers", + "cohere", + "elasticsearch", + "pypdf", +] + +[project.scripts] +kh = "kotaemon.cli:main" + +[project.urls] +Homepage = "https://github.com/Cinnamon/kotaemon/" +Repository = "https://github.com/Cinnamon/kotaemon/" +Documentation = "https://github.com/Cinnamon/kotaemon/wiki" diff --git a/pytest.ini b/libs/kotaemon/pytest.ini similarity index 100% rename from pytest.ini rename to libs/kotaemon/pytest.ini diff --git a/tests/__init__.py b/libs/kotaemon/tests/__init__.py similarity index 100% rename from tests/__init__.py rename to libs/kotaemon/tests/__init__.py diff --git a/tests/conftest.py b/libs/kotaemon/tests/conftest.py similarity index 100% rename from tests/conftest.py rename to 
libs/kotaemon/tests/conftest.py diff --git a/tests/resources/7810d908b0ff4ce381dcab873196d133.jpg b/libs/kotaemon/tests/resources/7810d908b0ff4ce381dcab873196d133.jpg similarity index 100% rename from tests/resources/7810d908b0ff4ce381dcab873196d133.jpg rename to libs/kotaemon/tests/resources/7810d908b0ff4ce381dcab873196d133.jpg diff --git a/tests/resources/dummy.pdf b/libs/kotaemon/tests/resources/dummy.pdf similarity index 100% rename from tests/resources/dummy.pdf rename to libs/kotaemon/tests/resources/dummy.pdf diff --git a/tests/resources/dummy.xlsx b/libs/kotaemon/tests/resources/dummy.xlsx similarity index 100% rename from tests/resources/dummy.xlsx rename to libs/kotaemon/tests/resources/dummy.xlsx diff --git a/tests/resources/embedding_openai.json b/libs/kotaemon/tests/resources/embedding_openai.json similarity index 100% rename from tests/resources/embedding_openai.json rename to libs/kotaemon/tests/resources/embedding_openai.json diff --git a/tests/resources/embedding_openai_batch.json b/libs/kotaemon/tests/resources/embedding_openai_batch.json similarity index 100% rename from tests/resources/embedding_openai_batch.json rename to libs/kotaemon/tests/resources/embedding_openai_batch.json diff --git a/tests/resources/fullocr_sample_output.json b/libs/kotaemon/tests/resources/fullocr_sample_output.json similarity index 100% rename from tests/resources/fullocr_sample_output.json rename to libs/kotaemon/tests/resources/fullocr_sample_output.json diff --git a/tests/resources/policy.md b/libs/kotaemon/tests/resources/policy.md similarity index 100% rename from tests/resources/policy.md rename to libs/kotaemon/tests/resources/policy.md diff --git a/tests/resources/table.pdf b/libs/kotaemon/tests/resources/table.pdf similarity index 100% rename from tests/resources/table.pdf rename to libs/kotaemon/tests/resources/table.pdf diff --git a/tests/simple_pipeline.py b/libs/kotaemon/tests/simple_pipeline.py similarity index 89% rename from tests/simple_pipeline.py 
rename to libs/kotaemon/tests/simple_pipeline.py index e3c6cf2..9dbbb90 100644 --- a/tests/simple_pipeline.py +++ b/libs/kotaemon/tests/simple_pipeline.py @@ -10,7 +10,7 @@ from kotaemon.storages import ChromaVectorStore class Pipeline(BaseComponent): llm: AzureOpenAI = AzureOpenAI.withx( - openai_api_base="https://test.openai.azure.com/", + azure_endpoint="https://test.openai.azure.com/", openai_api_key="some-key", openai_api_version="2023-03-15-preview", deployment_name="gpt35turbo", @@ -23,7 +23,7 @@ class Pipeline(BaseComponent): embedding=AzureOpenAIEmbeddings.withx( model="text-embedding-ada-002", deployment="embedding-deployment", - openai_api_base="https://test.openai.azure.com/", + azure_endpoint="https://test.openai.azure.com/", openai_api_key="some-key", ), ) diff --git a/tests/test_agent.py b/libs/kotaemon/tests/test_agent.py similarity index 99% rename from tests/test_agent.py rename to libs/kotaemon/tests/test_agent.py index dad9a33..49740cd 100644 --- a/tests/test_agent.py +++ b/libs/kotaemon/tests/test_agent.py @@ -1,8 +1,6 @@ from unittest.mock import patch import pytest -from openai.types.chat.chat_completion import ChatCompletion - from kotaemon.agents import ( AgentType, BaseTool, @@ -14,6 +12,7 @@ from kotaemon.agents import ( WikipediaTool, ) from kotaemon.llms import AzureChatOpenAI +from openai.types.chat.chat_completion import ChatCompletion FINAL_RESPONSE_TEXT = "Final Answer: Hello Cinnamon AI!" 
REWOO_VALID_PLAN = ( diff --git a/tests/test_composite.py b/libs/kotaemon/tests/test_composite.py similarity index 99% rename from tests/test_composite.py rename to libs/kotaemon/tests/test_composite.py index ce6ef69..85cb35a 100644 --- a/tests/test_composite.py +++ b/libs/kotaemon/tests/test_composite.py @@ -1,8 +1,6 @@ from copy import deepcopy import pytest -from openai.types.chat.chat_completion import ChatCompletion - from kotaemon.llms import ( AzureChatOpenAI, BasePromptComponent, @@ -12,6 +10,7 @@ from kotaemon.llms import ( SimpleLinearPipeline, ) from kotaemon.parsers import RegexExtractor +from openai.types.chat.chat_completion import ChatCompletion _openai_chat_completion_response = ChatCompletion.parse_obj( { @@ -41,7 +40,7 @@ _openai_chat_completion_response = ChatCompletion.parse_obj( @pytest.fixture def mock_llm(): return AzureChatOpenAI( - openai_api_base="OPENAI_API_BASE", + azure_endpoint="OPENAI_API_BASE", openai_api_key="OPENAI_API_KEY", openai_api_version="OPENAI_API_VERSION", deployment_name="dummy-q2-gpt35", diff --git a/tests/test_cot.py b/libs/kotaemon/tests/test_cot.py similarity index 95% rename from tests/test_cot.py rename to libs/kotaemon/tests/test_cot.py index e697485..58833c1 100644 --- a/tests/test_cot.py +++ b/libs/kotaemon/tests/test_cot.py @@ -1,9 +1,8 @@ from unittest.mock import patch -from openai.types.chat.chat_completion import ChatCompletion - from kotaemon.llms import AzureChatOpenAI from kotaemon.llms.cot import ManualSequentialChainOfThought, Thought +from openai.types.chat.chat_completion import ChatCompletion _openai_chat_completion_response = [ ChatCompletion.parse_obj( @@ -39,7 +38,7 @@ _openai_chat_completion_response = [ ) def test_cot_plus_operator(openai_completion): llm = AzureChatOpenAI( - openai_api_base="https://dummy.openai.azure.com/", + azure_endpoint="https://dummy.openai.azure.com/", openai_api_key="dummy", openai_api_version="2023-03-15-preview", deployment_name="dummy-q2", @@ -71,7 +70,7 @@ def 
test_cot_plus_operator(openai_completion): ) def test_cot_manual(openai_completion): llm = AzureChatOpenAI( - openai_api_base="https://dummy.openai.azure.com/", + azure_endpoint="https://dummy.openai.azure.com/", openai_api_key="dummy", openai_api_version="2023-03-15-preview", deployment_name="dummy-q2", @@ -101,7 +100,7 @@ def test_cot_manual(openai_completion): ) def test_cot_with_termination_callback(openai_completion): llm = AzureChatOpenAI( - openai_api_base="https://dummy.openai.azure.com/", + azure_endpoint="https://dummy.openai.azure.com/", openai_api_key="dummy", openai_api_version="2023-03-15-preview", deployment_name="dummy-q2", diff --git a/tests/test_docstores.py b/libs/kotaemon/tests/test_docstores.py similarity index 99% rename from tests/test_docstores.py rename to libs/kotaemon/tests/test_docstores.py index 90bae43..af4834c 100644 --- a/tests/test_docstores.py +++ b/libs/kotaemon/tests/test_docstores.py @@ -3,7 +3,6 @@ from unittest.mock import patch import pytest from elastic_transport import ApiResponseMeta - from kotaemon.base import Document from kotaemon.storages import ( ElasticsearchDocumentStore, diff --git a/tests/test_documents.py b/libs/kotaemon/tests/test_documents.py similarity index 100% rename from tests/test_documents.py rename to libs/kotaemon/tests/test_documents.py diff --git a/tests/test_embedding_models.py b/libs/kotaemon/tests/test_embedding_models.py similarity index 100% rename from tests/test_embedding_models.py rename to libs/kotaemon/tests/test_embedding_models.py diff --git a/tests/test_indexing_retrieval.py b/libs/kotaemon/tests/test_indexing_retrieval.py similarity index 95% rename from tests/test_indexing_retrieval.py rename to libs/kotaemon/tests/test_indexing_retrieval.py index 17ce2fb..032cbde 100644 --- a/tests/test_indexing_retrieval.py +++ b/libs/kotaemon/tests/test_indexing_retrieval.py @@ -3,12 +3,11 @@ from pathlib import Path from typing import cast import pytest -from openai.resources.embeddings import 
Embeddings - from kotaemon.base import Document from kotaemon.embeddings import AzureOpenAIEmbeddings from kotaemon.indices import VectorIndexing, VectorRetrieval from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore +from openai.resources.embeddings import Embeddings with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f: openai_embedding = json.load(f) @@ -25,7 +24,7 @@ def test_indexing(mock_openai_embedding, tmp_path): embedding = AzureOpenAIEmbeddings( model="text-embedding-ada-002", deployment="embedding-deployment", - openai_api_base="https://test.openai.azure.com/", + azure_endpoint="https://test.openai.azure.com/", openai_api_key="some-key", ) @@ -45,7 +44,7 @@ def test_retrieving(mock_openai_embedding, tmp_path): embedding = AzureOpenAIEmbeddings( model="text-embedding-ada-002", deployment="embedding-deployment", - openai_api_base="https://test.openai.azure.com/", + azure_endpoint="https://test.openai.azure.com/", openai_api_key="some-key", ) diff --git a/tests/test_ingestor.py b/libs/kotaemon/tests/test_ingestor.py similarity index 84% rename from tests/test_ingestor.py rename to libs/kotaemon/tests/test_ingestor.py index cd1450e..33fa5a2 100644 --- a/tests/test_ingestor.py +++ b/libs/kotaemon/tests/test_ingestor.py @@ -8,7 +8,7 @@ def test_ingestor_include_src(): dirpath = Path(__file__).parent ingestor = DocumentIngestor( pdf_mode="normal", - text_splitter=TokenSplitter(chunk_size=50, chunk_overlap=10), + text_splitter=TokenSplitter(chunk_size=200, chunk_overlap=10), ) nodes = ingestor(dirpath / "resources" / "table.pdf") assert type(nodes) is list diff --git a/tests/test_llms_chat_models.py b/libs/kotaemon/tests/test_llms_chat_models.py similarity index 90% rename from tests/test_llms_chat_models.py rename to libs/kotaemon/tests/test_llms_chat_models.py index 8e0de5f..0ae0e4b 100644 --- a/tests/test_llms_chat_models.py +++ b/libs/kotaemon/tests/test_llms_chat_models.py @@ -1,8 +1,5 @@ from unittest.mock import 
patch -from langchain.chat_models import AzureChatOpenAI as AzureChatOpenAILC -from openai.types.chat.chat_completion import ChatCompletion - from kotaemon.base.schema import ( AIMessage, HumanMessage, @@ -11,6 +8,13 @@ from kotaemon.base.schema import ( ) from kotaemon.llms import AzureChatOpenAI +try: + from langchain_openai import AzureChatOpenAI as AzureChatOpenAILC +except ImportError: + from langchain.chat_models import AzureChatOpenAI as AzureChatOpenAILC + +from openai.types.chat.chat_completion import ChatCompletion + _openai_chat_completion_response = ChatCompletion.parse_obj( { "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x", @@ -42,7 +46,7 @@ _openai_chat_completion_response = ChatCompletion.parse_obj( ) def test_azureopenai_model(openai_completion): model = AzureChatOpenAI( - openai_api_base="https://test.openai.azure.com/", + azure_endpoint="https://test.openai.azure.com/", openai_api_key="some-key", openai_api_version="2023-03-15-preview", deployment_name="gpt35turbo", diff --git a/tests/test_llms_completion_models.py b/libs/kotaemon/tests/test_llms_completion_models.py similarity index 87% rename from tests/test_llms_completion_models.py rename to libs/kotaemon/tests/test_llms_completion_models.py index ea56782..ac4794f 100644 --- a/tests/test_llms_completion_models.py +++ b/libs/kotaemon/tests/test_llms_completion_models.py @@ -1,12 +1,16 @@ from unittest.mock import patch -from langchain.llms import AzureOpenAI as AzureOpenAILC -from langchain.llms import OpenAI as OpenAILC -from openai.types.completion import Completion - from kotaemon.base.schema import LLMInterface from kotaemon.llms import AzureOpenAI, OpenAI +try: + from langchain_openai import AzureOpenAI as AzureOpenAILC + from langchain_openai import OpenAI as OpenAILC +except ImportError: + from langchain.llms import AzureOpenAI as AzureOpenAILC + from langchain.llms import OpenAI as OpenAILC +from openai.types.completion import Completion + _openai_completion_response = 
Completion.parse_obj( { "id": "cmpl-7qyNoIo6gRSCJR0hi8o3ZKBH4RkJ0", @@ -33,7 +37,7 @@ _openai_completion_response = Completion.parse_obj( ) def test_azureopenai_model(openai_completion): model = AzureOpenAI( - openai_api_base="https://test.openai.azure.com/", + azure_endpoint="https://test.openai.azure.com/", openai_api_key="some-key", openai_api_version="2023-03-15-preview", deployment_name="gpt35turbo", diff --git a/tests/test_post_processing.py b/libs/kotaemon/tests/test_post_processing.py similarity index 99% rename from tests/test_post_processing.py rename to libs/kotaemon/tests/test_post_processing.py index d32808f..0b0359a 100644 --- a/tests/test_post_processing.py +++ b/libs/kotaemon/tests/test_post_processing.py @@ -1,5 +1,4 @@ import pytest - from kotaemon.base import Document from kotaemon.parsers import RegexExtractor diff --git a/tests/test_prompt.py b/libs/kotaemon/tests/test_prompt.py similarity index 99% rename from tests/test_prompt.py rename to libs/kotaemon/tests/test_prompt.py index 9cc72e0..6e9e4f9 100644 --- a/tests/test_prompt.py +++ b/libs/kotaemon/tests/test_prompt.py @@ -1,5 +1,4 @@ import pytest - from kotaemon.base import Document from kotaemon.llms import BasePromptComponent, PromptTemplate from kotaemon.parsers import RegexExtractor diff --git a/tests/test_promptui.py b/libs/kotaemon/tests/test_promptui.py similarity index 97% rename from tests/test_promptui.py rename to libs/kotaemon/tests/test_promptui.py index 9963087..9d9b8e6 100644 --- a/tests/test_promptui.py +++ b/libs/kotaemon/tests/test_promptui.py @@ -17,7 +17,7 @@ class TestPromptConfig: assert "params" in config, "params should be in config" assert "llm.deployment_name" in config["params"] - assert "llm.openai_api_base" in config["params"] + assert "llm.azure_endpoint" in config["params"] assert "llm.openai_api_key" in config["params"] assert "llm.openai_api_version" in config["params"] assert "llm.request_timeout" in config["params"] diff --git a/tests/test_reader.py 
b/libs/kotaemon/tests/test_reader.py similarity index 99% rename from tests/test_reader.py rename to libs/kotaemon/tests/test_reader.py index 4231e18..f8f6ef3 100644 --- a/tests/test_reader.py +++ b/libs/kotaemon/tests/test_reader.py @@ -1,10 +1,9 @@ from pathlib import Path -from langchain.schema import Document as LangchainDocument -from llama_index.node_parser import SimpleNodeParser - from kotaemon.base import Document from kotaemon.loaders import AutoReader, UnstructuredReader +from langchain.schema import Document as LangchainDocument +from llama_index.node_parser import SimpleNodeParser def test_pdf_reader(): diff --git a/tests/test_reranking.py b/libs/kotaemon/tests/test_reranking.py similarity index 99% rename from tests/test_reranking.py rename to libs/kotaemon/tests/test_reranking.py index d4f7be8..953afdb 100644 --- a/tests/test_reranking.py +++ b/libs/kotaemon/tests/test_reranking.py @@ -1,11 +1,10 @@ from unittest.mock import patch import pytest -from openai.types.chat.chat_completion import ChatCompletion - from kotaemon.base import Document from kotaemon.indices.rankings import LLMReranking from kotaemon.llms import AzureChatOpenAI +from openai.types.chat.chat_completion import ChatCompletion _openai_chat_completion_responses = [ ChatCompletion.parse_obj( diff --git a/tests/test_splitter.py b/libs/kotaemon/tests/test_splitter.py similarity index 99% rename from tests/test_splitter.py rename to libs/kotaemon/tests/test_splitter.py index 71e63ee..c87e196 100644 --- a/tests/test_splitter.py +++ b/libs/kotaemon/tests/test_splitter.py @@ -1,7 +1,6 @@ -from llama_index.schema import NodeRelationship - from kotaemon.base import Document from kotaemon.indices.splitters import TokenSplitter +from llama_index.schema import NodeRelationship source1 = Document( content="The City Hall and Raffles Place MRT stations are paired cross-platform " diff --git a/tests/test_table_reader.py b/libs/kotaemon/tests/test_table_reader.py similarity index 99% rename from 
tests/test_table_reader.py rename to libs/kotaemon/tests/test_table_reader.py index b723778..88f56fb 100644 --- a/tests/test_table_reader.py +++ b/libs/kotaemon/tests/test_table_reader.py @@ -2,7 +2,6 @@ import json from pathlib import Path import pytest - from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader input_file = Path(__file__).parent / "resources" / "table.pdf" diff --git a/tests/test_telemetry.py b/libs/kotaemon/tests/test_telemetry.py similarity index 99% rename from tests/test_telemetry.py rename to libs/kotaemon/tests/test_telemetry.py index 942b0ef..e5528ab 100644 --- a/tests/test_telemetry.py +++ b/libs/kotaemon/tests/test_telemetry.py @@ -51,7 +51,6 @@ def test_disable_telemetry_import_haystack_after_kotaemon(): import os import haystack.telemetry - import kotaemon # noqa: F401 assert haystack.telemetry.telemetry is None diff --git a/tests/test_template.py b/libs/kotaemon/tests/test_template.py similarity index 99% rename from tests/test_template.py rename to libs/kotaemon/tests/test_template.py index cf7ad46..917e494 100644 --- a/tests/test_template.py +++ b/libs/kotaemon/tests/test_template.py @@ -1,5 +1,4 @@ import pytest - from kotaemon.llms import PromptTemplate diff --git a/tests/test_tools.py b/libs/kotaemon/tests/test_tools.py similarity index 97% rename from tests/test_tools.py rename to libs/kotaemon/tests/test_tools.py index e6f940b..58e519a 100644 --- a/tests/test_tools.py +++ b/libs/kotaemon/tests/test_tools.py @@ -2,13 +2,12 @@ import json from pathlib import Path import pytest -from openai.resources.embeddings import Embeddings - from kotaemon.agents.tools import ComponentTool, GoogleSearchTool, WikipediaTool from kotaemon.base import Document from kotaemon.embeddings import AzureOpenAIEmbeddings from kotaemon.indices.vectorindex import VectorIndexing, VectorRetrieval from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore +from openai.resources.embeddings import Embeddings with 
open(Path(__file__).parent / "resources" / "embedding_openai.json") as f: openai_embedding = json.load(f) @@ -41,7 +40,7 @@ def test_pipeline_tool(mock_openai_embedding, tmp_path): embedding = AzureOpenAIEmbeddings( model="text-embedding-ada-002", deployment="embedding-deployment", - openai_api_base="https://test.openai.azure.com/", + azure_endpoint="https://test.openai.azure.com/", openai_api_key="some-key", ) diff --git a/tests/test_vectorstore.py b/libs/kotaemon/tests/test_vectorstore.py similarity index 100% rename from tests/test_vectorstore.py rename to libs/kotaemon/tests/test_vectorstore.py diff --git a/libs/ktem/.gitignore b/libs/ktem/.gitignore new file mode 100644 index 0000000..a3659c1 --- /dev/null +++ b/libs/ktem/.gitignore @@ -0,0 +1,2 @@ +14-1_抜粋-1.pdf +_example_.db diff --git a/libs/ktem/README.md b/libs/ktem/README.md new file mode 100644 index 0000000..5af3ea9 --- /dev/null +++ b/libs/ktem/README.md @@ -0,0 +1,24 @@ +# Example of MVP pipeline for _example_ + +## Prerequisite + +To run the system out-of-the-box, please supply the following environment +variables: + +``` +OPENAI_API_KEY= +OPENAI_API_BASE= +OPENAI_API_VERSION= +SERPAPI_API_KEY= +COHERE_API_KEY= +OPENAI_API_KEY_EMBEDDING= + +# optional +KH_APP_NAME= +``` + +## Run + +``` +gradio launch.py +``` diff --git a/libs/ktem/flowsettings.py b/libs/ktem/flowsettings.py new file mode 100644 index 0000000..fee8eb9 --- /dev/null +++ b/libs/ktem/flowsettings.py @@ -0,0 +1,101 @@ +from pathlib import Path + +from decouple import config +from platformdirs import user_cache_dir +from theflow.settings.default import * # noqa + +user_cache_dir = Path( + user_cache_dir(str(config("KH_APP_NAME", default="ktem")), "Cinnamon") +) +user_cache_dir.mkdir(parents=True, exist_ok=True) + + +COHERE_API_KEY = config("COHERE_API_KEY", default="") +KH_DATABASE = f"sqlite:///{user_cache_dir / 'sql.db'}" +KH_DOCSTORE = { + "__type__": "kotaemon.storages.SimpleFileDocumentStore", + "path": str(user_cache_dir / 
"docstore"), +} +KH_VECTORSTORE = { + "__type__": "kotaemon.storages.ChromaVectorStore", + "path": str(user_cache_dir / "vectorstore"), +} +KH_FILESTORAGE_PATH = str(user_cache_dir / "files") +KH_LLMS = { + "gpt4": { + "def": { + "__type__": "kotaemon.llms.AzureChatOpenAI", + "temperature": 0, + "azure_endpoint": config("OPENAI_API_BASE", default=""), + "openai_api_key": config("OPENAI_API_KEY", default=""), + "openai_api_version": config("OPENAI_API_VERSION", default=""), + "deployment_name": "dummy-q2", + "stream": True, + }, + "accuracy": 10, + "cost": 10, + "default": False, + }, + "gpt35": { + "def": { + "__type__": "kotaemon.llms.AzureChatOpenAI", + "temperature": 0, + "azure_endpoint": config("OPENAI_API_BASE", default=""), + "openai_api_key": config("OPENAI_API_KEY", default=""), + "openai_api_version": config("OPENAI_API_VERSION", default=""), + "deployment_name": "dummy-q2", + "request_timeout": 10, + "stream": False, + }, + "accuracy": 5, + "cost": 5, + "default": True, + }, +} +KH_EMBEDDINGS = { + "ada": { + "def": { + "__type__": "kotaemon.embeddings.AzureOpenAIEmbeddings", + "model": "text-embedding-ada-002", + "azure_endpoint": config("OPENAI_API_BASE", default=""), + "openai_api_key": config("OPENAI_API_KEY", default=""), + "deployment": "dummy-q2-text-embedding", + "chunk_size": 16, + }, + "accuracy": 5, + "cost": 5, + "default": True, + }, +} +KH_REASONINGS = { + "simple": "ktem.reasoning.simple.FullQAPipeline", +} + + +SETTINGS_APP = { + "lang": { + "name": "Language", + "value": "en", + "choices": [("English", "en"), ("Japanese", "ja")], + "component": "dropdown", + } +} + + +SETTINGS_REASONING = { + "use": { + "name": "Reasoning options", + "value": None, + "choices": [], + "component": "radio", + }, + "lang": { + "name": "Language", + "value": "en", + "choices": [("English", "en"), ("Japanese", "ja")], + "component": "dropdown", + }, +} + + +KH_INDEX = "ktem.indexing.file.IndexDocumentPipeline" diff --git a/libs/ktem/khapptests/__init__.py 
b/libs/ktem/khapptests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/libs/ktem/khapptests/resources/embedding_openai.json b/libs/ktem/khapptests/resources/embedding_openai.json new file mode 100644 index 0000000..7d9ba4d --- /dev/null +++ b/libs/ktem/khapptests/resources/embedding_openai.json @@ -0,0 +1,1552 @@ +{ + "object": "list", + "data": [ + { + "object": "embedding", + "index": 0, + "embedding": [ + 0.006555966101586819, + 0.003670461941510439, + -0.011642491444945335, + -0.026776473969221115, + -0.012383491732180119, + -0.0014341175556182861, + -0.013375678099691868, + 0.009356695227324963, + -0.006364436354488134, + -0.0294390507042408, + 0.023950627073645592, + 0.0029859787318855524, + -0.023234745487570763, + -0.009205983020365238, + 0.006744355894625187, + 0.0011790062999352813, + 0.02607315219938755, + -0.018437083810567856, + 0.008904559537768364, + 0.009620440192520618, + -0.01306169480085373, + -0.0011358336778357625, + 0.007253008428961039, + 0.00875384733080864, + -0.012710033915936947, + 0.0037206991109997034, + 0.005419347435235977, + -0.017243949696421623, + 0.036246202886104584, + -0.0266759991645813, + 0.012647237628698349, + -0.008552898652851582, + -0.00762350857257843, + -0.012546762824058533, + 0.007083457428961992, + -0.014078999869525433, + 0.0048761568032205105, + -0.013689660467207432, + 0.018211016431450844, + -0.014367864467203617, + 0.008307991549372673, + 0.006022194866091013, + 0.005457025486975908, + -0.006320478860288858, + -0.03609549254179001, + 0.012986338697373867, + 0.00017592862423043698, + -0.016427593305706978, + -0.004734864458441734, + 0.0257717277854681, + 0.01944182999432087, + -0.00211938563734293, + -0.0147195253521204, + 0.01332544069737196, + -0.017670966684818268, + 0.0013956546317785978, + -0.04024006798863411, + 0.024754423648118973, + 0.032051388174295425, + -0.018713390454649925, + 0.011265711858868599, + 0.012370931915938854, + -0.013576626777648926, + 0.0040974789299070835, + 
-0.002153923735022545, + 0.009915584698319435, + 0.006637601647526026, + 0.011353626847267151, + -0.019919084385037422, + 0.030142372474074364, + 0.01966789737343788, + -0.008213796652853489, + -0.004213652573525906, + -0.009162025526165962, + 0.012446288019418716, + 0.008979915641248226, + -0.02098662778735161, + 0.003113141981884837, + 0.0018838982796296477, + -0.0008289152756333351, + 0.02722861059010029, + -0.03355850651860237, + -0.004408321809023619, + 0.017859356477856636, + 0.019893966615200043, + -0.0014137086691334844, + -0.0021602034103125334, + 0.017959831282496452, + -0.0114603815600276, + -0.020069796591997147, + 0.007604669313877821, + 0.001281835837289691, + 0.01132222916930914, + 0.006668999791145325, + -0.01743233948945999, + 0.0048761568032205105, + 0.0016923686489462852, + 0.02356128767132759, + -0.005796127021312714, + -0.041470881551504135, + -0.01292354241013527, + -0.005824385676532984, + -0.010700542479753494, + -0.017005322501063347, + -0.007950050756335258, + 0.011560855433344841, + -0.005485283676534891, + -0.0002590360236354172, + 0.028434304520487785, + 0.007447678130120039, + -0.026198744773864746, + 0.009306457825005054, + -0.00500488979741931, + -0.03107176162302494, + -0.0007555217016488314, + -0.008056805469095707, + 0.007479076273739338, + 0.003978165332227945, + -0.005984516814351082, + -0.02778122015297413, + 0.015272135846316814, + 0.029338575899600983, + 0.010964288376271725, + -0.023548727855086327, + 0.007014381233602762, + 0.0014137086691334844, + -0.03380969539284706, + -0.016741575673222542, + -0.005230957642197609, + -0.004794521257281303, + 0.0315992534160614, + 0.0041100382804870605, + 0.010644025169312954, + -0.0014294078573584557, + -0.035367049276828766, + 0.0246916264295578, + -0.007096016779541969, + 0.02096150815486908, + -0.017030440270900726, + -0.011906237341463566, + 0.006273381412029266, + 0.021237812936306, + 0.01757049188017845, + -0.013224965892732143, + -0.005092805251479149, + 0.01288586389273405, + 
0.004486817866563797, + -0.016339678317308426, + 0.008917118422687054, + -0.015083746053278446, + 0.008377067744731903, + 0.0185626782476902, + 0.012961219996213913, + -0.002535413019359112, + 0.0050143091939389706, + 0.016025694087147713, + 0.004618690814822912, + 0.0205470509827137, + 0.008000288158655167, + -0.013074253685772419, + -0.0007264782907441258, + -0.0004474258457776159, + 0.0033125211484730244, + -0.03084569424390793, + 0.009400652721524239, + 0.015787066891789436, + 0.02607315219938755, + 0.003689300734549761, + -0.013940847478806973, + -0.02604803442955017, + -0.011454101651906967, + 0.007340923883020878, + -0.043178949505090714, + 0.02660064399242401, + -0.01593777909874916, + 0.00514304218813777, + -0.001305384561419487, + 0.022820288315415382, + -0.012697474099695683, + -0.017269067466259003, + -0.03024284727871418, + 0.011617372743785381, + 0.008992474526166916, + 0.025897322222590446, + 0.00629850011318922, + -0.00014511904737446457, + 0.009890465997159481, + -0.006364436354488134, + 0.00384629238396883, + -0.017030440270900726, + 0.016666220501065254, + 0.03333244100213051, + 0.021011745557188988, + 0.01844964362680912, + -0.6892555952072144, + -0.007196491584181786, + 0.025319593027234077, + 0.02024562656879425, + 0.006245122756808996, + 0.009909304790198803, + 0.012176262214779854, + 0.027806337922811508, + 0.006518288049846888, + 0.0274546779692173, + -0.02107454277575016, + 0.013011457398533821, + -0.015083746053278446, + -0.010719381272792816, + -0.0002598209830466658, + -0.00823263544589281, + 0.0055763390846550465, + -0.012339534237980843, + -0.011881118640303612, + 0.015196779742836952, + -0.010141652077436447, + 0.03129782900214195, + -0.022104406729340553, + -0.014568813145160675, + 0.009928143583238125, + 0.011730406433343887, + 0.0025102945510298014, + -0.009739753790199757, + -0.006964143831282854, + 0.030192609876394272, + -0.0021272350568324327, + 0.0114603815600276, + 9.394961671205238e-05, + 0.014154355973005295, + 
0.06611227244138718, + 0.007610949221998453, + -0.005463304929435253, + 0.008088203147053719, + 0.009827669709920883, + 0.028986915946006775, + -0.019077610224485397, + -0.022267676889896393, + 0.012559321708977222, + -0.01662854291498661, + -0.002763050841167569, + 0.009205983020365238, + 0.012659796513617039, + -0.0024051100481301546, + 0.0022543983068317175, + 0.0017849936848506331, + 0.014568813145160675, + -0.004851038102060556, + 0.010857533663511276, + 0.019504627212882042, + -0.005343991331756115, + 0.0071588135324418545, + 0.02151411771774292, + 0.011962753720581532, + -0.002935741562396288, + 0.004521355964243412, + 0.0005718416068702936, + 0.017168592661619186, + -0.0016280021518468857, + 0.0038494321051985025, + -0.022443508729338646, + 0.02381247468292713, + -0.021313169971108437, + 0.01400364376604557, + -0.008929678238928318, + -0.012050669640302658, + -0.0018368008313700557, + -0.007994008250534534, + -0.018047746270895004, + -0.014744644053280354, + 0.013965966179966927, + 0.028107762336730957, + 0.01681693270802498, + -0.0033219405449926853, + -0.006160347256809473, + 0.01966789737343788, + 0.010926609858870506, + 0.013237525708973408, + -0.014066440984606743, + -0.01764584705233574, + 0.02349849045276642, + -0.019780931994318962, + -0.030067017301917076, + -0.016276881098747253, + 0.010430516675114632, + 0.011479220353066921, + 0.030192609876394272, + 0.0033250804990530014, + -0.012370931915938854, + -0.01980605162680149, + 0.02883620373904705, + 0.0005384809337556362, + -0.011548296548426151, + 0.003821173682808876, + 0.027303965762257576, + 0.0004238771216478199, + 0.014933033846318722, + -0.0028038686141371727, + 0.0003752097545657307, + 0.00652456795796752, + 0.010279805399477482, + 0.006656440440565348, + -0.00809448305517435, + 0.033156611025333405, + 0.02444044128060341, + -0.030142372474074364, + 0.006317338906228542, + 0.004094338975846767, + -0.04071732237935066, + 0.02121269516646862, + 0.008471262641251087, + -0.032001152634620667, + 
0.0013022447237744927, + 0.016013136133551598, + 0.026374576613307, + -0.014694406650960445, + 0.023159390315413475, + 0.0020424597896635532, + 0.013664542697370052, + 0.0057458896189928055, + 0.0041539957746863365, + 0.005303173791617155, + 0.0003883577883243561, + 0.004342385567724705, + -0.01690484769642353, + 0.001676669460721314, + 0.020697763189673424, + 0.005133622791618109, + 0.00046273251064121723, + -0.005651694722473621, + 0.009808829985558987, + 0.00202676048502326, + 0.009896745905280113, + -0.017884474247694016, + 0.00862825382500887, + 0.020308423787355423, + -0.007994008250534534, + 0.005127343349158764, + -0.0029388812836259604, + -0.0006303209811449051, + 0.011818322353065014, + -0.018399406224489212, + -0.012025550939142704, + 0.008678491227328777, + 0.00016974708705674857, + -0.013677101582288742, + 0.01983116939663887, + -0.007918652147054672, + -0.03245328739285469, + 0.021162457764148712, + -0.008044245652854443, + -0.0002005566784646362, + 0.0036076651886105537, + -0.029187863692641258, + -0.03569359332323074, + -0.028961796313524246, + 0.0027096737176179886, + 0.016364796087145805, + -0.0071022966876626015, + -0.005174440797418356, + -0.01400364376604557, + -0.023787355050444603, + -0.024566033855080605, + 0.011648771353065968, + 0.005579478573054075, + -0.029012033715844154, + 0.010857533663511276, + -0.0049420930445194244, + -0.025093525648117065, + -0.0024961652234196663, + -0.0030016780365258455, + 0.010694262571632862, + -0.010681703686714172, + -0.015724271535873413, + -0.003249724628403783, + -0.01731930486857891, + 0.006876228842884302, + -0.009601601399481297, + -0.009821389801800251, + 0.012176262214779854, + 0.02582196518778801, + -0.010185610502958298, + -0.0005722341011278331, + 0.02747979573905468, + -0.019793491810560226, + -0.0016405613860115409, + 0.017005322501063347, + 0.0072027710266411304, + -0.02220488153398037, + 0.0006103045307099819, + -0.007912373170256615, + -0.004559034015983343, + -0.0010808866936713457, + 
0.0038023346569389105, + 0.01679181307554245, + 0.01718115247786045, + 0.018713390454649925, + 0.004100618418306112, + 0.017859356477856636, + -0.026022914797067642, + -0.016013136133551598, + -0.028811084106564522, + -0.0023705719504505396, + -0.030343322083353996, + 0.003858851734548807, + 0.013890610076487064, + 0.011969033628702164, + -0.002329754177480936, + -0.0014223431935533881, + 0.01764584705233574, + 0.016276881098747253, + 0.02692718617618084, + 0.0021821821574121714, + -0.020923830568790436, + -0.02855989895761013, + -0.007510474417358637, + -0.002681415295228362, + 0.008326830342411995, + -0.01458137296140194, + -0.0007727907504886389, + 0.01720627024769783, + 0.032604001462459564, + 0.004417741671204567, + 0.03554287925362587, + 0.003953046631067991, + -0.0322021022439003, + -0.02505584806203842, + 0.009231101721525192, + 0.012659796513617039, + 0.009563923813402653, + -0.016842050477862358, + -0.005513542331755161, + 0.0041414364241063595, + -0.014405542053282261, + 0.02916274592280388, + 0.012163703329861164, + -0.008502661250531673, + 0.010185610502958298, + 0.028710609301924706, + -0.0177839994430542, + 0.00950112659484148, + 0.028384067118167877, + 0.014744644053280354, + -0.009444610215723515, + 0.005055127199739218, + 0.02803240716457367, + -0.0009089809027500451, + 0.0004933458403684199, + -0.014405542053282261, + -0.008433585055172443, + 0.0012111896649003029, + -0.031498778611421585, + 0.0004556679050438106, + -0.021388525143265724, + 0.019605102017521858, + 0.03569359332323074, + 0.013689660467207432, + 0.01161109283566475, + -0.0064680506475269794, + -0.005783567670732737, + 0.009205983020365238, + -0.013413355685770512, + -0.001551076304167509, + -0.013488711789250374, + 0.003160239430144429, + -0.005114783998578787, + -0.012408610433340073, + -0.003158669453114271, + 0.0335836261510849, + -0.026274101808667183, + 0.009319016709923744, + 0.0005424057017080486, + -0.0011326938401907682, + -0.008841762319207191, + -0.005205838941037655, + 
0.0030566249042749405, + -0.013036576099693775, + -0.022443508729338646, + -0.0017865635454654694, + 0.011830881237983704, + 0.007096016779541969, + -0.01605081371963024, + -0.024038542062044144, + -0.005039427895098925, + -0.0011122849537059665, + -0.0032779830507934093, + -0.008571737445890903, + 0.0041194576770067215, + 0.014292508363723755, + -0.02342313528060913, + 0.015485644340515137, + -0.002646877197548747, + 0.020722880959510803, + 0.00680087273940444, + -0.009865347295999527, + -0.00600649556145072, + 0.021036865189671516, + 0.020371221005916595, + -0.0016107329865917563, + -0.019291117787361145, + 0.013287762179970741, + -0.004006423521786928, + -0.00762350857257843, + -0.015334932133555412, + 0.0008681631297804415, + -0.008295431733131409, + -0.0007543442770838737, + 0.011523177847266197, + -0.0027818898670375347, + 0.0037332584615796804, + 0.01304913591593504, + -0.005237237084656954, + -0.00340043636970222, + 0.016691338270902634, + 0.02612338960170746, + -0.002295216079801321, + -0.008333110250532627, + -0.009683237411081791, + -0.022305356338620186, + -0.00935041531920433, + 0.06972935795783997, + -0.001001605880446732, + -0.0031319810077548027, + 0.019617659971117973, + -0.019014813005924225, + -0.012848186306655407, + -0.04051637277007103, + -0.018223576247692108, + -0.01665366068482399, + -0.0031979172490537167, + -0.004436580464243889, + 0.004458559211343527, + -0.013802695088088512, + 0.01994420401751995, + 0.013551508076488972, + 0.009080389514565468, + 0.00340043636970222, + -0.027052778750658035, + -0.011341067962348461, + 0.0048667374067008495, + -0.0151088647544384, + 0.010110254399478436, + 0.00046077012666501105, + 0.009990940801799297, + 0.0205470509827137, + 0.008307991549372673, + 0.00277247023768723, + 0.01720627024769783, + -0.007460237015038729, + -0.0038965295534580946, + 0.008483821526169777, + -0.00610383041203022, + 0.01745745725929737, + 0.008778966031968594, + 0.013312880881130695, + 0.014078999869525433, + 
0.0029043431859463453, + 0.03780356049537659, + 0.008464982733130455, + -0.005384809337556362, + 0.011868558824062347, + 0.005287474486976862, + -0.006179186515510082, + -0.006009635515511036, + 0.0142673896625638, + 0.0023344638757407665, + -0.0057710083201527596, + 0.028660371899604797, + -0.0014239131705835462, + -0.028685491532087326, + 0.023121710866689682, + 0.007742822170257568, + -0.009218541905283928, + -0.023071475327014923, + -0.01187483873218298, + 0.018072864040732384, + 0.00027355772908777, + -0.013928287662565708, + -0.010844974778592587, + -0.017796559259295464, + -0.014744644053280354, + -0.01513398252427578, + -0.014694406650960445, + -0.001071467180736363, + -0.031172236427664757, + -0.019190644845366478, + -0.02369944006204605, + 0.002376851625740528, + -0.007466516923159361, + 0.001561280689202249, + -0.0048981355503201485, + -0.03863247483968735, + -0.0125404829159379, + -0.0036516229156404734, + 0.01909017004072666, + 0.0004517431079875678, + -0.001898027490824461, + -0.006486889906227589, + 0.004044101573526859, + 0.013802695088088512, + -0.006047313567250967, + -0.029012033715844154, + 0.011271991766989231, + -0.006292220205068588, + 0.015611236914992332, + 0.0031727987807244062, + 0.00666272034868598, + 0.009111788123846054, + -0.01690484769642353, + 0.029589762911200523, + 0.008521500043570995, + 0.015071186237037182, + 0.026299219578504562, + 0.0003149642434436828, + 0.018650593236088753, + 0.004537055268883705, + 0.0050582666881382465, + 0.001783423707820475, + -0.019906524568796158, + -1.6802998288767412e-05, + -0.008307991549372673, + -0.01179320365190506, + -0.0020785678643733263, + 0.004810220096260309, + -0.0034695127978920937, + 0.01676669530570507, + 0.01690484769642353, + 0.01568659394979477, + -0.015372609719634056, + -0.0016327118501067162, + 0.034211594611406326, + -0.024088779464364052, + 0.016251763328909874, + -0.0064806099981069565, + 0.0001965337578440085, + 0.013438474386930466, + 0.021564355120062828, + 
0.03504050895571709, + -0.004694046452641487, + -0.03443766012787819, + -0.005557499825954437, + -0.025281915441155434, + 0.008490101434290409, + 0.0157996267080307, + 0.009130626916885376, + 0.0012418029364198446, + -0.0018619195325300097, + -0.022192321717739105, + -0.006907626986503601, + -0.0032748430967330933, + -0.005548080429434776, + 0.019542304798960686, + -0.006304779555648565, + -0.0023250444792211056, + -0.015309813432395458, + -0.006499449256807566, + -0.018324051052331924, + 0.005968817975372076, + -0.006851110141724348, + -0.017859356477856636, + -0.02425205148756504, + -0.0063236188143491745, + -0.006568525452166796, + -0.013890610076487064, + -0.04551498219370842, + -0.044786542654037476, + -0.016163846477866173, + 0.00394362723454833, + -0.017444897443056107, + 0.016364796087145805, + -0.003949906677007675, + -0.000322813808452338, + -0.019981881603598595, + -0.010625186376273632, + -0.006342457607388496, + -0.01692996546626091, + -0.0006311059114523232, + -0.006116389762610197, + 0.027002541348338127, + 0.027931932359933853, + 0.025369830429553986, + 0.013978525064885616, + 0.0114603815600276, + 0.006474330555647612, + -0.008559177629649639, + -0.01662854291498661, + 0.004559034015983343, + -0.012094627134501934, + -0.014204593375325203, + 0.012710033915936947, + 0.020999185740947723, + 0.028459424152970314, + -0.007026940584182739, + 0.009595322422683239, + 0.0048447586596012115, + -0.008188677951693535, + -0.020873593166470528, + 0.0010173050686717033, + -0.013928287662565708, + -0.014405542053282261, + -0.020974067971110344, + -0.0010471334680914879, + 0.002153923735022545, + 0.010958008468151093, + 0.011234313249588013, + -0.015548440627753735, + 0.011812042444944382, + 0.008596856147050858, + 0.02079823799431324, + 0.003050345228984952, + 0.026299219578504562, + -0.01969301700592041, + 0.02742955833673477, + 0.007265567779541016, + 0.020597288385033607, + -0.013162169605493546, + 0.007711423560976982, + -0.006794593296945095, + 
0.016917407512664795, + 0.009262500330805779, + 0.021451322361826897, + 0.02770586498081684, + -0.012270457111299038, + -0.010593787766993046, + 0.005884042475372553, + 0.026173627004027367, + 0.0031162817031145096, + 0.007510474417358637, + 0.004194813314825296, + -0.007862135767936707, + 0.0027096737176179886, + -0.020195389166474342, + -0.004998610354959965, + -0.013501270674169064, + 0.004712885711342096, + -0.012201380915939808, + -0.01872594840824604, + 0.018625473603606224, + -0.011881118640303612, + -0.029087388888001442, + -0.015787066891789436, + -0.0003167304093949497, + 0.048001728951931, + 0.005959398113191128, + 0.012100907042622566, + 0.01939159259200096, + 0.0028556757606565952, + -0.005940559320151806, + 0.02068520337343216, + -0.018675711005926132, + -0.005522961728274822, + 0.039813049137592316, + 0.0015455815009772778, + -0.0063236188143491745, + -0.02416413463652134, + 0.004684627056121826, + 0.004078639671206474, + -0.015071186237037182, + -0.024051101878285408, + 0.008050525560975075, + 0.028760846704244614, + -0.002221429953351617, + -0.014619050547480583, + 0.006113249808549881, + -0.033131491392850876, + 0.02471674606204033, + 0.003482071915641427, + -0.021363407373428345, + -0.015058627352118492, + 0.012440008111298084, + -0.023071475327014923, + 0.0088919997215271, + -0.0305191520601511, + 0.02720349095761776, + 0.03413623571395874, + -0.005836945027112961, + -0.006668999791145325, + 0.003796054981648922, + 0.00045880774268880486, + -0.0013477721950039268, + -0.007686304859817028, + 0.02833382971584797, + -0.007058338727802038, + 0.0036641822662204504, + 0.01891433820128441, + -0.0021272350568324327, + -0.02079823799431324, + -0.005208978895097971, + -0.012320694513618946, + 0.022581661120057106, + -0.017331864684820175, + 0.005482144188135862, + -0.011899957433342934, + 0.011328508146107197, + -0.003271703375503421, + 0.012075788341462612, + -0.013790135271847248, + -0.00652456795796752, + -0.006210584659129381, + 
-0.0016672499477863312, + 0.02665088139474392, + 2.0819775272684637e-06, + -0.001816391944885254, + -0.01485767774283886, + -0.017984949052333832, + -0.024503236636519432, + -0.04712257534265518, + -0.004483677912503481, + 0.0015801197150722146, + -0.029589762911200523, + -0.004659508354961872, + -0.005466444883495569, + 0.009551363997161388, + 0.02565869502723217, + 0.016779253259301186, + -0.012358373031020164, + 0.012553042732179165, + 0.023247305303812027, + -0.0181733388453722, + -0.0057301907800138, + 0.018688270822167397, + 0.015975456684827805, + -0.02584708482027054, + 0.016251763328909874, + -0.0035699873697012663, + -0.019178085029125214, + 0.004577872809022665, + -0.014104118570685387, + -0.004160275217145681, + 0.011585974134504795, + 0.013388236984610558, + 0.00026119465474039316, + -0.016113610938191414, + 0.006348737049847841, + 0.02936369553208351, + 0.016867170110344887, + 0.009036432020366192, + -0.007322084624320269, + -0.003296822076663375, + 0.037050001323223114, + -0.014217152260243893, + 0.0041539957746863365, + -0.007334643974900246, + -0.012697474099695683, + 0.019655339419841766, + -0.007052059285342693, + -0.007667466066777706, + -0.0007700434071011841, + -0.03576894849538803, + -0.003962466027587652, + -0.0019985020626336336, + 0.008138440549373627, + 0.014342745766043663, + -0.015837304294109344, + -0.03210162743926048, + -0.0056108771823346615, + -0.016703898087143898, + 0.00034538135514594615, + 0.009984660893678665, + -0.01095172856003046, + 0.001340707647614181, + 0.013840372674167156, + 0.016716457903385162, + 0.00038973146001808345, + 0.005940559320151806, + -0.011535737663507462, + -0.01093916967511177, + -0.016314558684825897, + -0.02427716925740242, + -0.012634677812457085, + -0.0012151143746450543, + 0.041244812309741974, + 0.006957864388823509, + -0.018110541626811028, + -0.0088919997215271, + -0.005127343349158764, + -0.02747979573905468, + -0.03227745741605759, + -0.002741072094067931, + -0.0029655699618160725, + 
-0.0056234365329146385, + 0.016741575673222542, + -0.008527779951691628, + 0.022016491740942, + 0.006263962015509605, + -0.0022748070769011974, + -0.00584008451551199, + -0.019630219787359238, + 0.011824601329863071, + 0.004703465849161148, + 0.01513398252427578, + -0.000919970334507525, + -0.02328498288989067, + 0.002163343131542206, + 0.0026296081487089396, + 0.011912517249584198, + -0.0042073726654052734, + 0.00936297420412302, + -0.009331576526165009, + -0.010487033985555172, + -0.014355304650962353, + -0.0040284027345478535, + -0.002763050841167569, + 0.007253008428961039, + 0.016364796087145805, + -0.0157996267080307, + 0.001935705542564392, + 0.0050582666881382465, + -0.003422415116801858, + 0.003422415116801858, + -0.00468148710206151, + 0.008433585055172443, + -0.015360050834715366, + 0.015837304294109344, + -0.019981881603598595, + -0.006320478860288858, + 0.003088023280724883, + -0.02336033806204796, + 0.015159101225435734, + -0.025420067831873894, + 0.00652456795796752, + 0.0294390507042408, + 0.01068798266351223, + 0.00756699126213789, + -0.0008603135356679559, + -0.0018933177925646305, + 0.0024160996545106173, + 0.005488423630595207, + -0.00935041531920433, + -0.011315949261188507, + -2.2285437808022834e-05, + -0.007372322026640177, + -0.021149897947907448, + -0.02966511808335781, + 0.0021617733873426914, + -0.025721492245793343, + -0.014330185949802399, + 0.001420773332938552, + 0.03476420417428017, + 0.015787066891789436, + -0.026550406590104103, + -0.01319984719157219, + -0.023159390315413475, + 0.013011457398533821, + 0.009406931698322296, + -0.013388236984610558, + 0.001562850666232407, + -0.00962672010064125, + -0.0042324913665652275, + 0.021916016936302185, + -0.01762072928249836, + 0.005290614441037178, + 0.018537558615207672, + 0.016176406294107437, + -0.004251330625265837, + 0.039260439574718475, + 0.2337038666009903, + -0.012791668996214867, + 0.0035228899214416742, + 0.027354203164577484, + 0.02217976190149784, + 0.013551508076488972, + 
0.010210729204118252, + 0.0017080678371712565, + 0.006982983089983463, + 0.0028289873152971268, + -0.01159853395074606, + 0.0012826207093894482, + -0.00470974575728178, + 0.006756915245205164, + 0.005450745578855276, + 0.00941949151456356, + -0.02107454277575016, + -0.019793491810560226, + -0.030921051278710365, + -0.024503236636519432, + 0.006279660854488611, + -0.03295566141605377, + -0.025256795808672905, + -0.014305067248642445, + 0.017495134845376015, + -0.0072969659231603146, + -0.008446143940091133, + 0.0011868559522554278, + 0.03363386541604996, + 0.009746033698320389, + 0.004763122648000717, + -0.011573415249586105, + 0.011228034272789955, + 0.011246873065829277, + -0.014367864467203617, + -0.019567424431443214, + 0.03604525327682495, + 0.013363118283450603, + 0.03084569424390793, + 0.003871411085128784, + -0.016553185880184174, + -0.025068406015634537, + -0.004518216010183096, + -0.017633287236094475, + 0.0027238030452281237, + 0.011096160858869553, + 0.007246728986501694, + -0.026173627004027367, + 0.018123101443052292, + 0.033985525369644165, + 0.004656368400901556, + 0.007014381233602762, + 0.02300867810845375, + 0.044032983481884, + 0.00041406514355912805, + -0.015950338914990425, + 0.008653372526168823, + 0.016666220501065254, + -0.005259216297417879, + -0.000553002639207989, + 0.0005902881384827197, + 0.03185044229030609, + -2.5854542400338687e-05, + 0.03943626955151558, + -0.023598965257406235, + 0.007472796365618706, + -0.02519400045275688, + -0.002959290286526084, + 0.003968745935708284, + -0.0035166102461516857, + -0.01667878031730652, + -0.0014144936576485634, + 0.00603475421667099, + -0.00840846635401249, + -0.029514405876398087, + -0.009519966319203377, + 0.010499592870473862, + 0.0036798813380301, + 0.03335756063461304, + 0.0274546779692173, + -0.009388092905282974, + -0.0015463664894923568, + -0.009859067387878895, + -0.02010747417807579, + -0.038657594472169876, + -0.029037151485681534, + 0.0022512583527714014, + -0.00729068648070097, + 
-0.012107186019420624, + -0.023372897878289223, + -0.014531135559082031, + -0.010254686698317528, + 0.001805402571335435, + -0.006069292314350605, + 0.023686882108449936, + 0.026198744773864746, + -0.0034569534473121166, + 0.010612627491354942, + -0.02091127075254917, + 0.012364652007818222, + -0.005510402377694845, + -0.00246790680103004, + 7.820140308467671e-05, + 0.0090866694226861, + 0.0022292796056717634, + 0.009903025813400745, + -0.0071148560382425785, + 0.0007563066901639104, + -0.00300324778072536, + -0.01563635654747486, + -0.022280236706137657, + -0.0016735296230763197, + -0.006807152647525072, + -0.01108988095074892, + -0.00680087273940444, + 0.00976487249135971, + -0.005080245900899172, + -0.00265472661703825, + -0.0003416528052184731, + -0.008885719813406467, + -0.003959326073527336, + -0.013915728777647018, + -0.014794881455600262, + 0.002414529677480459, + -0.006169767118990421, + -0.014807440340518951, + -0.006888788193464279, + 0.00915574561804533, + 0.014531135559082031, + -0.030016779899597168, + 0.00854661874473095, + -0.0002739502233453095, + 0.021237812936306, + 0.008113321848213673, + -0.006989262532442808, + 0.007334643974900246, + 0.016992762684822083, + -0.0028666651342064142, + -0.016879728063941002, + 0.0035448686685413122, + -0.0021131059620529413, + -0.010003499686717987, + 0.0088919997215271, + -0.0073534827679395676, + -0.014920474030077457, + -0.01649039052426815, + 0.03112199902534485, + -0.027555152773857117, + -0.010970567353069782, + 0.002863525412976742, + -0.04272681474685669, + -0.018185898661613464, + -0.013903168961405754, + -0.0035385889932513237, + 0.027931932359933853, + -0.020735440775752068, + -0.019680457189679146, + -0.02485489845275879, + -0.006210584659129381, + 0.009896745905280113, + -0.012502805329859257, + 0.009645558893680573, + 0.02996654249727726, + -0.007862135767936707, + -0.038657594472169876, + -0.013149609789252281, + -0.16045789420604706, + 0.014430660754442215, + 0.011523177847266197, + 
-0.012760271318256855, + 0.02066008374094963, + 0.015171661041676998, + 0.024641389027237892, + 0.004643809515982866, + -0.009067830629646778, + 0.018575238063931465, + 0.009859067387878895, + -0.01288586389273405, + -0.014945592731237411, + -0.014305067248642445, + 0.005290614441037178, + -0.01612616889178753, + -0.008182398043572903, + 0.012352093122899532, + 0.029740475118160248, + 0.028685491532087326, + 0.035115864127874374, + -0.007334643974900246, + -0.0009168304968625307, + -0.00477882195264101, + 0.013714779168367386, + -0.007811898365616798, + 0.0007111715967766941, + 0.03393528610467911, + -0.010053737089037895, + -0.007020661141723394, + -0.016000576317310333, + -0.02888644114136696, + 0.02582196518778801, + 0.004559034015983343, + 0.021966254338622093, + -0.006430373061448336, + 0.012446288019418716, + -0.042601220309734344, + -0.01635223627090454, + 0.024930253624916077, + 0.032855186611413956, + 0.018625473603606224, + 0.019228322431445122, + -0.013526389375329018, + -0.015410288237035275, + -0.005224677734076977, + 0.020609848201274872, + 0.015083746053278446, + 0.01621408388018608, + -0.0070897373370826244, + 0.008119601756334305, + -0.003028366481885314, + -0.0005110073834657669, + -0.00622942391782999, + 0.019981881603598595, + 0.029489288106560707, + 0.006888788193464279, + 0.02452835626900196, + 0.0024773261975497007, + -0.008458703756332397, + -0.0002564848982729018, + -0.012370931915938854, + -0.006254542153328657, + -0.004706605803221464, + 0.006050453521311283, + -0.005224677734076977, + 0.0014686556532979012, + 0.009689517319202423, + -0.00024942029267549515, + 0.018349168822169304, + -0.008910838514566422, + -0.023084033280611038, + -0.01039911899715662, + -0.02554566040635109, + 0.006825991440564394, + 0.008389626629650593, + -0.03378457576036453, + 0.03109688125550747, + -0.0114603815600276, + -0.014832559041678905, + -0.008295431733131409, + 0.021313169971108437, + -0.0024836058728396893, + 0.007070898078382015, + 
-0.023121710866689682, + 0.00038973146001808345, + 0.0032465846743434668, + 0.012653516605496407, + -0.03134806826710701, + -0.027002541348338127, + 0.007453957572579384, + -0.017984949052333832, + -0.014166914857923985, + -0.018022626638412476, + 0.00543504673987627, + 0.015598678030073643, + 0.012534203007817268, + 0.01623920351266861, + -0.01571171171963215, + -0.019893966615200043, + -0.002260677982121706, + 0.017143474891781807, + -0.016013136133551598, + 0.011190355755388737, + 0.03049403429031372, + -0.005469584837555885, + 0.017532814294099808, + 0.007981449365615845, + 0.022405831143260002, + -0.03169972822070122, + -0.03217698261141777, + 0.006631322205066681, + 0.014694406650960445, + 0.026148507371544838, + 0.01623920351266861, + 0.026449931785464287, + -0.013262644410133362, + -0.01734442450106144, + 0.0048321993090212345, + -0.013099372386932373, + 0.052045829594135284, + 0.006223144009709358, + -0.04420881345868111, + -0.0010887362295761704, + 0.0028792244847863913, + 0.0002376459160586819, + -0.08409722149372101, + -0.03438742458820343, + 0.004712885711342096, + 0.014405542053282261, + -0.001269276486709714, + 0.03835617005825043, + 0.004216792527586222, + -0.0017881334060803056, + 0.0019875126890838146, + 0.005535521078854799, + 0.008923398330807686, + -0.03049403429031372, + -0.01787191443145275, + -0.010568669065833092, + 0.04071732237935066, + -0.02720349095761776, + 0.012779110111296177, + 0.0010651875054463744, + -0.033960405737161636, + 0.011604813858866692, + -0.008678491227328777, + 0.007868414744734764, + 0.0028478263411670923, + -0.001828951295465231, + -0.02222999930381775, + 0.012201380915939808, + -0.022971000522375107, + 0.03127271309494972, + 0.0016358516877517104, + -0.004926394205540419, + -0.016754135489463806, + -0.024063661694526672, + -0.014166914857923985, + -0.008307991549372673, + -0.004449139814823866, + 0.013626864179968834, + -0.04099362716078758, + 0.03102152608335018, + 0.013413355685770512, + -0.04106898233294487, + 
0.027253728359937668, + 0.010593787766993046, + -0.006461771205067635, + -0.05581362545490265, + -0.012659796513617039, + 0.017168592661619186, + -0.0019137266790494323, + 0.03777844086289406, + 0.01870083063840866, + -0.009048991836607456, + -0.03222722187638283, + -0.03139830380678177, + -0.021916016936302185, + 0.010832414962351322, + 0.01048075407743454, + -0.0017535953084006906, + 0.008056805469095707, + 0.028911558911204338, + -0.009921864606440067, + 0.011962753720581532, + 0.013375678099691868, + 0.006147787906229496, + -0.02038377895951271, + 0.01704300008714199, + -0.012559321708977222, + 0.018575238063931465, + -0.012697474099695683, + 0.0232221856713295, + 0.0201577115803957, + 0.00969579629600048, + -0.01563635654747486, + 0.028409186750650406, + 0.006245122756808996, + 0.010053737089037895, + -0.03792915120720863, + -0.005469584837555885, + -0.015837304294109344, + -0.020258186385035515, + 0.006606203503906727, + -0.005811826325953007, + -0.03310637176036835, + -0.020735440775752068, + 0.009218541905283928, + -0.008182398043572903, + 0.026022914797067642, + 0.01360174547880888, + -0.01054983027279377, + 0.0057710083201527596, + -0.015347491018474102, + -0.03544240817427635, + -0.03054427169263363, + 0.01035516057163477, + 0.006989262532442808, + -0.011510618962347507, + 0.007529313676059246, + 0.014757202938199043, + -0.01180576253682375, + -0.0010895212180912495, + 0.01607593148946762, + 0.010807296261191368, + -0.02485489845275879, + -0.011824601329863071, + -0.0650070533156395, + 0.032352812588214874, + -0.016113610938191414, + -0.019353915005922318, + 0.0009600031771697104, + -0.022016491740942, + 0.006687839049845934, + -0.01512142363935709, + 0.002095836913213134, + -0.00881036464124918, + -0.018675711005926132, + -0.0048635974526405334, + -0.002117815660312772, + 0.012615839019417763, + -0.02798216976225376, + 0.0032685634214431047, + 0.020182831212878227, + 0.014192033559083939, + 0.005880902521312237, + -0.002144504338502884, + 
0.00037756460369564593, + -0.03245328739285469, + -0.016502948477864265, + 0.001036928966641426, + -0.023774797096848488, + 0.011177796870470047, + -0.0314234234392643, + 0.0005577123956754804, + -0.002411389723420143, + -0.00874128844588995, + 0.0064366525039076805, + -0.042576100677251816, + 0.0064021144062280655, + 0.022682135924696922, + 0.0008108612382784486, + -0.010122813284397125, + 0.006794593296945095, + 0.015083746053278446, + 0.007366042118519545, + 0.020333541557192802, + -0.030895931646227837, + -0.033181726932525635, + 0.015611236914992332, + -0.010832414962351322, + 0.0036013855133205652, + 0.004634389653801918, + -0.005340851843357086, + -0.0033941566944122314, + 0.030418677255511284, + 0.009808829985558987, + -0.003227745648473501, + 0.029263220727443695, + -0.028409186750650406, + -0.013036576099693775, + -0.012841906398534775, + -0.008351949043571949, + -0.017143474891781807, + -0.0003757984668482095, + -0.008333110250532627, + -0.04493725299835205, + 0.02248118631541729, + 0.00018220828496851027, + 0.02966511808335781, + -0.007868414744734764, + 0.005246656946837902, + 0.010813576169312, + -0.0205470509827137, + -0.002692404668778181, + -0.014619050547480583, + -0.02234303392469883, + -0.006920186337083578, + -0.00593741936609149, + 0.00517130084335804, + 0.005312593188136816, + 0.019630219787359238, + 0.020647525787353516, + -0.003180648200213909, + 0.0170932374894619, + -0.040616847574710846, + 0.037728201597929, + 0.020258186385035515, + 0.021112220361828804, + -0.030318202450871468, + -0.0004332966054789722, + 0.034563254565000534, + 0.006088131107389927, + -0.006254542153328657, + -0.01731930486857891, + -0.009205983020365238, + 0.00310058263130486, + -0.03607037290930748, + -0.0266759991645813, + 0.006939025595784187, + 0.024289729073643684, + -0.0038368727546185255, + 0.00473800441250205, + 0.017520254477858543, + 0.00783073715865612, + 0.017658406868577003, + 0.023372897878289223, + 0.0023721419274806976, + 0.009438330307602882, + 
-0.016000576317310333, + -0.004552754107862711, + -0.008785245940089226, + 0.005924860015511513, + -0.006289080251008272, + -0.037878915667533875, + -0.007328364532440901, + 0.012898423708975315, + 0.011812042444944382, + 0.029790712520480156, + -0.006361296400427818, + 0.015046067535877228, + -0.02499305084347725, + -0.00584008451551199, + 0.02609827183187008, + -0.0014945593429729342, + -0.028434304520487785, + 0.04242539033293724, + -0.008973635733127594, + 0.0035794067662209272, + 0.02770586498081684, + -0.015071186237037182, + 0.008433585055172443, + 0.002970279660075903, + -0.007793059106916189, + -0.0035228899214416742, + 0.01905249059200287, + -0.0050017498433589935, + 0.011717847548425198, + -0.010041178204119205, + -0.016452711075544357, + -0.015410288237035275, + 0.0009136906592175364, + 0.01372733898460865, + -0.008351949043571949, + 0.024339966475963593, + -0.0161889661103487, + 0.05631599947810173, + -0.012722592800855637, + -0.0041634151712059975, + 0.02287052571773529, + 0.014292508363723755, + 0.020735440775752068, + 0.024427881464362144, + 0.018072864040732384, + -0.008559177629649639, + -0.02024562656879425, + -0.0038494321051985025, + 0.006756915245205164, + -0.0008289152756333351, + -0.02259422093629837, + -0.011127559468150139, + 0.004289008677005768, + -0.01591266132891178, + 0.004398902412503958, + -0.019630219787359238, + 0.01292354241013527, + 0.018324051052331924, + 0.008942237123847008, + 0.0243902038782835, + 0.013212407007813454, + -0.018072864040732384, + -0.019931644201278687, + 0.00423877127468586, + 0.0012331684119999409, + -0.018349168822169304, + -0.016151288524270058, + 0.017243949696421623, + 0.0007649412145838141, + -0.0012857605470344424, + -0.01260955911129713, + 0.006656440440565348, + 0.002959290286526084, + -0.007108576130121946, + -0.009513686411082745, + -0.0036422032862901688, + 0.010562390089035034, + 0.02289564348757267, + 0.0006240413058549166, + -0.018097983673214912, + -0.03295566141605377, + 0.006813432089984417, 
+ 0.002750491490587592, + -0.02936369553208351, + -0.005384809337556362, + -0.02317194826900959 + ] + } + ], + "model": "ada", + "usage": { + "prompt_tokens": 3, + "total_tokens": 3 + } +} diff --git a/libs/ktem/khapptests/test_qa.py b/libs/ktem/khapptests/test_qa.py new file mode 100644 index 0000000..645d9e8 --- /dev/null +++ b/libs/ktem/khapptests/test_qa.py @@ -0,0 +1,72 @@ +import json +from pathlib import Path +from unittest.mock import patch + +import pytest +from index import ReaderIndexingPipeline +from kotaemon.llms import AzureChatOpenAI +from openai.resources.embeddings import Embeddings +from openai.types.chat.chat_completion import ChatCompletion + +with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f: + openai_embedding = json.load(f) + + +_openai_chat_completion_response = ChatCompletion.parse_obj( + { + "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x", + "object": "chat.completion", + "created": 1692338378, + "model": "gpt-35-turbo", + "system_fingerprint": None, + "choices": [ + { + "index": 0, + "finish_reason": "stop", + "message": { + "role": "assistant", + "content": "Hello! 
@patch(
    "openai.resources.chat.completions.Completions.create",
    side_effect=lambda *args, **kwargs: _openai_chat_completion_response,
)
def test_ingest_pipeline(mock_chat_create, mock_openai_embedding, tmp_path):
    """Run the ingest -> retrieve -> QA flow end to end against mocked OpenAI calls.

    Args:
        mock_chat_create: mock injected positionally by the ``@patch`` decorator
            in place of the chat completion endpoint. It is deliberately not
            called ``patch`` so it does not shadow ``unittest.mock.patch``.
        mock_openai_embedding: fixture that replaces the embedding endpoint with
            the canned response loaded from ``resources/embedding_openai.json``.
        tmp_path: pytest temporary directory used as the pipeline storage path.
    """
    indexing_pipeline = ReaderIndexingPipeline(
        storage_path=tmp_path,
    )
    indexing_pipeline.indexing_vector_pipeline.embedding.openai_api_key = "some-key"
    input_file_path = Path(__file__).parent / "resources" / "dummy.pdf"

    # call ingestion pipeline
    indexing_pipeline(input_file_path, force_reindex=True)
    retrieving_pipeline = indexing_pipeline.to_retrieving_pipeline()

    results = retrieving_pipeline("This is a query")
    assert len(results) == 1

    # create llm; the endpoint and credentials are placeholders because the
    # completion call itself is mocked
    llm = AzureChatOpenAI(
        openai_api_base="https://test.openai.azure.com/",
        openai_api_key="some-key",
        openai_api_version="2023-03-15-preview",
        deployment_name="gpt35turbo",
        temperature=0,
    )
    qa_pipeline = indexing_pipeline.to_qa_pipeline(llm=llm, openai_api_key="some-key")
    response = qa_pipeline("Summarize this document.")
    assert response
class BaseApp:
    """The main app of Kotaemon

    The main application contains app-level information:
        - setting state
        - user id

    Also contains registering methods for:
        - reasoning pipelines
        - indexing & retrieval pipelines

    App life-cycle:
        - Render
        - Declare public events
        - Subscribe public events
        - Register events

    Subclasses implement `ui()` to render their pages; `make()` drives the
    life-cycle above and returns the gradio Blocks object.
    """

    def __init__(self) -> None:
        """Load the bundled assets, build the default settings and app states"""
        # static assets live in the `assets` directory next to this module
        dir_assets = Path(__file__).parent / "assets"
        with (dir_assets / "css" / "main.css").open() as fi:
            self._css = fi.read()
        with (dir_assets / "js" / "main.js").open() as fi:
            self._js = fi.read()

        # NOTE(review): no `index` group is passed here although
        # `default_settings.index` is used below; presumably `SettingGroup`
        # supplies a default one -- confirm against `ktem.settings`
        self.default_settings = SettingGroup(
            application=BaseSettingGroup(settings=settings.SETTINGS_APP),
            reasoning=SettingReasoningGroup(settings=settings.SETTINGS_REASONING),
        )

        # `_events` maps a declared public event name to its subscribed
        # definitions; `_callbacks` is only initialised in this class
        self._callbacks: dict[str, list] = {}
        self._events: dict[str, list] = {}

        # the register_* calls populate `default_settings`, so they must run
        # before the groups are finalized and flattened into the gradio state
        self.register_indices()
        self.register_reasonings()
        self.register_extensions()

        self.default_settings.reasoning.finalize()
        self.default_settings.index.finalize()

        self.settings_state = gr.State(self.default_settings.flatten())
        self.user_id = gr.State(None)

    def register_indices(self) -> None:
        """Register the index components from app settings

        Imports the class named by `settings.KH_INDEX`, instantiates it and
        stores each of its user settings as a `SettingItem` in the index
        setting group.
        """
        index = import_dotted_string(settings.KH_INDEX, safe=False)
        user_settings = index().get_user_settings()
        for key, value in user_settings.items():
            self.default_settings.index.settings[key] = SettingItem(**value)

    def register_reasonings(self) -> None:
        """Register the reasoning components from app settings

        Does nothing when `settings.KH_REASONINGS` is missing or None.
        Otherwise each dotted path is imported, the class is stored in the
        shared `reasonings` registry, and its user settings become an option
        group of the reasoning settings.
        """
        if getattr(settings, "KH_REASONINGS", None) is None:
            return

        for name, value in settings.KH_REASONINGS.items():
            reasoning_cls = import_dotted_string(value, safe=False)
            reasonings[name] = reasoning_cls
            options = reasoning_cls().get_user_settings()
            self.default_settings.reasoning.options[name] = BaseSettingGroup(
                settings=options
            )

    def register_extensions(self) -> None:
        """Register installed extensions

        Extensions are discovered through the "ktem" setuptools entry point
        group and declare themselves via the `ktem_declare_extensions` hook.
        """
        self.exman = pluggy.PluginManager("ktem")
        self.exman.add_hookspecs(extension_protocol)
        self.exman.load_setuptools_entrypoints("ktem")

        # retrieve and register extension declarations
        extension_declarations = self.exman.hook.ktem_declare_extensions()
        for extension_declaration in extension_declarations:
            # TODO: not implemented yet -- if the extension is already in the
            # database with the same version: skip

            # TODO: not implemented yet -- otherwise,
            # remove the old information from the database if it exists
            # store the information into the database

            functionality = extension_declaration["functionality"]

            # update the reasoning information; the option key is namespaced
            # as "<extension id>/<reasoning id>"
            if "reasoning" in functionality:
                for rid, rdec in functionality["reasoning"].items():
                    unique_rid = f"{extension_declaration['id']}/{rid}"
                    self.default_settings.reasoning.options[
                        unique_rid
                    ] = BaseSettingGroup(
                        settings=rdec["settings"],
                    )

    def declare_event(self, name: str) -> None:
        """Declare a public gradio event for other components to subscribe to

        Args:
            name: The name of the event

        Raises:
            HookAlreadyDeclared: if an event with this name is already declared
        """
        if name in self._events:
            raise HookAlreadyDeclared(f"Hook {name} is already declared")
        self._events[name] = []

    def subscribe_event(self, name: str, definition: dict) -> None:
        """Subscribe a definition to a declared public event

        Args:
            name: The name of the declared event
            definition: The definition to append to the event's subscribers

        Raises:
            HookNotDeclared: if no event with this name has been declared
        """
        if name not in self._events:
            raise HookNotDeclared(f"Hook {name} is not declared")
        self._events[name].append(definition)

    def get_event(self, name: str) -> list[dict]:
        """Get the definitions subscribed to a declared public event

        Args:
            name: The name of the declared event

        Returns:
            the list of subscribed definitions (the stored list, not a copy)

        Raises:
            HookNotDeclared: if no event with this name has been declared
        """
        if name not in self._events:
            raise HookNotDeclared(f"Hook {name} is not declared")

        return self._events[name]

    def ui(self):
        """Render the UI of the app; must be implemented by subclasses"""
        raise NotImplementedError

    def make(self):
        """Build the gradio app

        Renders the states and the UI, then runs the three event phases
        (declare, subscribe, register) over every `BasePage` stored as an
        attribute of the app. Each phase completes for all pages before the
        next one starts.

        Returns:
            the gradio Blocks object
        """
        with gr.Blocks(css=self._css) as demo:
            self.app = demo
            self.settings_state.render()
            self.user_id.render()

            self.ui()

            for value in self.__dict__.values():
                if isinstance(value, BasePage):
                    value.declare_public_events()

            for value in self.__dict__.values():
                if isinstance(value, BasePage):
                    value.subscribe_public_events()

            for value in self.__dict__.values():
                if isinstance(value, BasePage):
                    value.register_events()

            # run the bundled main.js in the browser on page load
            demo.load(lambda: None, None, None, js=f"() => {{{self._js}}}")

        return demo
// Flatten the spacing around the Gradio tab container and tag its first child
// with the "header-bar" class (styled in main.css).
//
// This script is injected for every app built on BaseApp, whatever its ui()
// renders, so it must not throw when no element with id "chat-tab" exists.
const chat_tab = document.getElementById("chat-tab");

if (chat_tab !== null) {
  const main_parent = chat_tab.parentNode;

  // presumably the first child is the tab navigation row -- verify against
  // the Gradio version in use
  main_parent.childNodes[0].classList.add("header-bar");
  main_parent.style = "padding: 0; margin: 0";
  main_parent.parentNode.style = "gap: 0";
  main_parent.parentNode.parentNode.style = "padding: 0";
}
+ +- **Global Presence:** With a global perspective, Cinnamon AI operates on an international scale, collaborating with businesses and enterprises around the world to elevate their data processing capabilities. + +## Why Choose Cinnamon AI + +- **Innovation:** Our commitment to innovation is evident in our continual pursuit of technological excellence. We stay ahead of the curve to deliver solutions that meet the evolving needs of the digital landscape. + +- **Reliability:** Clients trust Cinnamon AI for reliable, accurate, and scalable AI solutions. Our track record speaks to our dedication to quality and customer satisfaction. + +- **Collaboration:** We believe in the power of collaboration. By working closely with our clients, we tailor our solutions to their specific requirements, fostering long-term partnerships built on mutual success. + +Explore the future of data processing with Cinnamon AI – where intelligence meets innovation. diff --git a/libs/ktem/ktem/assets/md/about_kotaemon.md b/libs/ktem/ktem/assets/md/about_kotaemon.md new file mode 100644 index 0000000..811a411 --- /dev/null +++ b/libs/ktem/ktem/assets/md/about_kotaemon.md @@ -0,0 +1,19 @@ +# About Kotaemon + +Welcome to the future of language technology – Cinnamon AI proudly presents our latest innovation, **Kotaemon**. At Cinnamon AI, we believe in pushing the boundaries of what's possible with natural language processing, and Kotaemon embodies the pinnacle of our endeavors. Designed to empower businesses and developers alike, Kotaemon is not just a product; it's a manifestation of our commitment to enhancing human-machine interaction. + +## Key Features + +- **Cognitive Understanding:** Kotaemon boasts advanced cognitive understanding capabilities, allowing it to interpret and respond to natural language queries with unprecedented accuracy. Whether you're building chatbots, virtual assistants, or language-driven applications, Kotaemon ensures a nuanced and contextually rich user experience. 
+ +- **Versatility:** From analyzing vast textual datasets to generating coherent and contextually relevant responses, Kotaemon adapts seamlessly to diverse use cases. Whether you're in customer support, content creation, or data analysis, Kotaemon is your versatile companion in navigating the linguistic landscape. + +- **Scalability:** Built with scalability in mind, Kotaemon is designed to meet the evolving needs of your business. As your language-related tasks grow in complexity, Kotaemon scales with you, providing a robust foundation for future innovation and expansion. + +- **Ethical AI:** Cinnamon AI is committed to responsible and ethical AI development. Kotaemon reflects our dedication to fairness, transparency, and unbiased language processing, ensuring that your applications uphold the highest ethical standards. + +## Why Kotaemon? + +Kotaemon is not just a tool; it's a catalyst for unlocking the true potential of natural language understanding. Whether you're a developer aiming to enhance user experiences or a business leader seeking to leverage language technology, Kotaemon is your partner in navigating the intricacies of human communication. + +Join us on this transformative journey with Kotaemon – where language meets innovation, and understanding becomes seamless. Cinnamon AI: Redefining the future of natural language processing. 
class ModelPool:
    """Represent a pool of named models

    The pool is seeded from a configuration mapping `{name: spec}`. Each spec
    must provide `"def"` (the serialized definition handed to `deserialize`)
    and may provide `"default"` (bool), `"accuracy"` and `"cost"` (comparable
    numbers used for ranking). Models can also be added or replaced afterwards
    with `pool[name] = model`; such models take part in name selection
    (`settings`, `get_random*`, `get_default*`) but not in the accuracy/cost
    rankings, because they carry no ranking metadata.

    Args:
        category: human-friendly label of the pool (used as the UI label)
        conf: mapping from model name to its spec
    """

    def __init__(self, category: str, conf: dict):
        self._category = category
        self._conf = conf

        self._models: dict[str, "BaseComponent"] = {}
        self._default: list[str] = []

        for name, model in conf.items():
            self._models[name] = deserialize(model["def"], safe=False)
            if model.get("default", False):
                self._default.append(name)

        # Both rankings are ascending. A model without metadata is never
        # preferred: it gets the lowest accuracy (-inf) and highest cost (+inf).
        self._accuracy: list[str] = sorted(
            conf, key=lambda x: conf[x].get("accuracy", float("-inf"))
        )
        self._cost: list[str] = sorted(
            conf, key=lambda x: conf[x].get("cost", float("inf"))
        )

    def __getitem__(self, key: str) -> "BaseComponent":
        return self._models[key]

    def __setitem__(self, key: str, value: "BaseComponent"):
        self._models[key] = value

    def settings(self) -> dict:
        """Present model pools option for gradio

        Returns:
            dict: the label, the available model names and the default choice

        Raises:
            ValueError: if the pool holds no model
        """
        return {
            "label": self._category,
            "choices": list(self._models.keys()),
            "value": self.get_default_name(),
        }

    def options(self) -> dict:
        """Present a list of models

        Returns:
            dict: mapping from model name to model (the pool's own mapping)
        """
        return self._models

    def get_random_name(self) -> str:
        """Get the name of random model

        The name is drawn from the models currently registered, so models
        added through `pool[name] = model` are candidates as well.

        Returns:
            str: random model name in the pool

        Raises:
            ValueError: if the pool holds no model
        """
        import random

        if not self._models:
            raise ValueError("No models in pool")

        return random.choice(list(self._models))

    def get_default_name(self) -> str:
        """Get the name of default model

        In case there is no default model, choose random model from pool. In
        case there are multiple default models, choose random from them.

        Returns:
            str: model name

        Raises:
            ValueError: if the pool holds no model
        """
        if not self._models:
            raise ValueError("No models in pool")

        if self._default:
            import random

            return random.choice(self._default)

        return self.get_random_name()

    def get_random(self) -> "BaseComponent":
        """Get random model"""
        return self._models[self.get_random_name()]

    def get_default(self) -> "BaseComponent":
        """Get default model

        In case there is no default model, choose random model from pool. In
        case there are multiple default models, choose random from them.

        Returns:
            BaseComponent: model
        """
        return self._models[self.get_default_name()]

    def get_highest_accuracy_name(self) -> str:
        """Get the name of model with highest accuracy

        Only models from the initial configuration are ranked.

        Returns:
            str: model name

        Raises:
            ValueError: if the configuration holds no model
        """
        if not self._conf:
            raise ValueError("No models in pool")
        return self._accuracy[-1]

    def get_highest_accuracy(self) -> "BaseComponent":
        """Get model with highest accuracy

        Returns:
            BaseComponent: model

        Raises:
            ValueError: if the configuration holds no model
        """
        return self._models[self.get_highest_accuracy_name()]

    def get_lowest_cost_name(self) -> str:
        """Get the name of model with lowest cost

        Only models from the initial configuration are ranked.

        Returns:
            str: model name

        Raises:
            ValueError: if the configuration holds no model
        """
        if not self._conf:
            raise ValueError("No models in pool")
        return self._cost[0]

    def get_lowest_cost(self) -> "BaseComponent":
        """Get model with lowest cost

        Returns:
            BaseComponent: model

        Raises:
            ValueError: if the configuration holds no model
        """
        return self._models[self.get_lowest_cost_name()]
class Conversation(SQLModel, table=True):
    """Conversation record

    Attributes:
        id: id of the conversation, a random uuid4 hex string by default
        name: name of the conversation, defaults to the UTC creation time
            formatted as "YYYY-MM-DD HH:MM:SS"
        user: id of the owning user, defaults to 0
        data_source: JSON column holding the messages and the current files
        date_created: creation time (naive UTC datetime)
        date_updated: last update time (naive UTC datetime)
    """

    # allow the table to be re-declared on an already populated metadata
    __table_args__ = {"extend_existing": True}

    id: str = Field(
        default_factory=lambda: uuid.uuid4().hex, primary_key=True, index=True
    )
    name: str = Field(
        default_factory=lambda: datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
    )
    user: int = Field(default=0)  # For now we only have one user

    # contains messages + current files
    data_source: dict = Field(default={}, sa_column=Column(JSON))

    date_created: datetime.datetime = Field(default_factory=datetime.datetime.utcnow)
    # NOTE(review): only a creation-time default is declared here; nothing in
    # this model refreshes the value on update, so writers presumably have to
    # set it themselves -- confirm against the code that saves conversations
    date_updated: datetime.datetime = Field(default_factory=datetime.datetime.utcnow)
__table_args__ = {"extend_existing": True} + + id: Optional[int] = Field(default=None, primary_key=True) + issues: dict = Field(default={}, sa_column=Column(JSON)) + chat: Optional[dict] = Field(default=None, sa_column=Column(JSON)) + settings: Optional[dict] = Field(default=None, sa_column=Column(JSON)) + user: Optional[int] = Field(default=None) + + +SQLModel.metadata.create_all(engine) diff --git a/libs/ktem/ktem/exceptions.py b/libs/ktem/ktem/exceptions.py new file mode 100644 index 0000000..622af93 --- /dev/null +++ b/libs/ktem/ktem/exceptions.py @@ -0,0 +1,10 @@ +class KHException(Exception): + pass + + +class HookNotDeclared(KHException): + pass + + +class HookAlreadyDeclared(KHException): + pass diff --git a/libs/ktem/ktem/extension_protocol.py b/libs/ktem/ktem/extension_protocol.py new file mode 100644 index 0000000..e52abba --- /dev/null +++ b/libs/ktem/ktem/extension_protocol.py @@ -0,0 +1,39 @@ +import pluggy + +hookspec = pluggy.HookspecMarker("ktem") +hookimpl = pluggy.HookimplMarker("ktem") + + +@hookspec +def ktem_declare_extensions() -> dict: # type: ignore + """Called before the run() function is executed. + + This hook is called without any arguments, and should return a dictionary. + The dictionary has the following structure: + + ``` + { + "id": str, # cannot contain . or / + "name": str, # human-friendly name of the plugin + "version": str, + "support_host": str, + "functionality": { + "reasoning": { + id: { # cannot contain . 
class BaseIndex(BaseComponent):
    """Common interface of the index components

    Concrete indices override `get_user_settings` to expose their options
    and `get_pipeline` to build themselves from the user's settings.
    """

    def get_user_settings(self) -> dict:
        """Describe the settings this index exposes to the user

        The base index exposes nothing.

        Returns:
            dict: user settings in the dictionary format of
                `ktem.settings.SettingItem`
        """
        no_settings: dict = {}
        return no_settings

    @classmethod
    def get_pipeline(cls, setting: dict) -> "BaseIndex":
        """Build the index from the settings; concrete indices must override

        Args:
            setting: the settings to build the index from

        Raises:
            NotImplementedError: always, in this base class
        """
        raise NotImplementedError
"normal", + "choices": [ + ("PDF text parser", "normal"), + ("Mathpix", "mathpix"), + ("Advanced ocr", "ocr"), + ], + "component": "dropdown", + }, + "separate_embedding": { + "name": "Use separate embedding", + "value": False, + "choices": [("Yes", True), ("No", False)], + "component": "dropdown", + }, + "num_retrieval": { + "name": "Number of documents to retrieve", + "value": 3, + "component": "number", + }, + "retrieval_mode": { + "name": "Retrieval mode", + "value": "vector", + "choices": ["vector", "text", "hybrid"], + "component": "dropdown", + }, + "prioritize_table": { + "name": "Prioritize table", + "value": True, + "choices": [True, False], + "component": "checkbox", + }, + "mmr": { + "name": "Use MMR", + "value": True, + "choices": [True, False], + "component": "checkbox", + }, + "use_reranking": { + "name": "Use reranking", + "value": True, + "choices": [True, False], + "component": "checkbox", + }, +} + + +class IndexDocumentPipeline(BaseIndex): + """Store the documents and index the content into vector store and doc store + + Args: + indexing_vector_pipeline: pipeline to index the documents + file_ingestor: ingestor to ingest the documents + """ + + indexing_vector_pipeline: VectorIndexing = VectorIndexing.withx( + doc_store=get_docstore(), + vector_store=get_vectorstore(), + embedding=embeddings.get_default(), + ) + file_ingestor: DocumentIngestor = DocumentIngestor.withx() + + def run( + self, + file_paths: str | Path | list[str | Path], + reindex: bool = False, + **kwargs, # type: ignore + ): + """Index the list of documents + + This function will extract the files, persist the files to storage, + index the files. 
+ + Args: + file_paths: list of file paths to index + reindex: whether to force reindexing the files if they exist + + Returns: + list of split nodes + """ + if not isinstance(file_paths, list): + file_paths = [file_paths] + + to_index: list[str] = [] + file_to_hash: dict[str, str] = {} + errors = [] + + for file_path in file_paths: + abs_path = str(Path(file_path).resolve()) + with open(abs_path, "rb") as fi: + file_hash = sha256(fi.read()).hexdigest() + + file_to_hash[abs_path] = file_hash + + with Session(engine) as session: + statement = select(Source).where(Source.name == Path(abs_path).name) + item = session.exec(statement).first() + + if item and not reindex: + errors.append(Path(abs_path).name) + continue + + to_index.append(abs_path) + + if errors: + raise FileExistsError( + "Files already exist. Please rename/remove them or enable reindex.\n" + f"{errors}" + ) + + # persist the files to storage + for path in to_index: + shutil.copy(path, filestorage_path / file_to_hash[path]) + + # prepare record info + file_to_source: dict[str, Source] = {} + for file_path, file_hash in file_to_hash.items(): + source = Source(path=file_hash, name=Path(file_path).name) + file_to_source[file_path] = source + + # extract the files + nodes = self.file_ingestor(to_index) + for node in nodes: + file_path = str(node.metadata["file_path"]) + node.source = file_to_source[file_path].id + + # index the files + self.indexing_vector_pipeline(nodes) + + # persist to the index + file_ids = [] + with Session(engine) as session: + for source in file_to_source.values(): + session.add(source) + session.commit() + for source in file_to_source.values(): + file_ids.append(source.id) + + with Session(engine) as session: + for node in nodes: + index = Index( + source_id=node.source, + target_id=node.doc_id, + relation_type=SourceTargetRelation.DOCUMENT, + ) + session.add(index) + for node in nodes: + index = Index( + source_id=node.source, + target_id=node.doc_id, + 
relation_type=SourceTargetRelation.VECTOR, + ) + session.add(index) + session.commit() + + return nodes, file_ids + + def get_user_settings(self) -> dict: + return USER_SETTINGS + + @classmethod + def get_pipeline(cls, setting) -> "IndexDocumentPipeline": + """Get the pipeline based on the setting""" + obj = cls() + obj.file_ingestor.pdf_mode = setting["index.index_parser"] + return obj diff --git a/libs/ktem/ktem/main.py b/libs/ktem/ktem/main.py new file mode 100644 index 0000000..dc6abb3 --- /dev/null +++ b/libs/ktem/ktem/main.py @@ -0,0 +1,31 @@ +import gradio as gr +from ktem.app import BaseApp +from ktem.pages.chat import ChatPage +from ktem.pages.help import HelpPage +from ktem.pages.settings import SettingsPage + + +class App(BaseApp): + """The main app of Kotaemon + + The main application contains app-level information: + - setting state + - user id + + App life-cycle: + - Render + - Declare public events + - Subscribe public events + - Register events + """ + + def ui(self): + """Render the UI""" + with gr.Tab("Chat", elem_id="chat-tab"): + self.chat_page = ChatPage(self) + + with gr.Tab("Settings", elem_id="settings-tab"): + self.settings_page = SettingsPage(self) + + with gr.Tab("Help", elem_id="help-tab"): + self.help_page = HelpPage(self) diff --git a/libs/ktem/ktem/pages/__init__.py b/libs/ktem/ktem/pages/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/libs/ktem/ktem/pages/chat/__init__.py b/libs/ktem/ktem/pages/chat/__init__.py new file mode 100644 index 0000000..e148c1d --- /dev/null +++ b/libs/ktem/ktem/pages/chat/__init__.py @@ -0,0 +1,125 @@ +import gradio as gr +from ktem.app import BasePage + +from .chat_panel import ChatPanel +from .control import ConversationControl +from .data_source import DataSource +from .events import chat_fn, index_fn, is_liked, load_files, update_data_source +from .report import ReportIssue +from .upload import FileUpload + + +class ChatPage(BasePage): + def __init__(self, app): + self._app = app + 
self.on_building_ui() + + def on_building_ui(self): + with gr.Row(): + with gr.Column(scale=1): + self.chat_control = ConversationControl(self._app) + self.data_source = DataSource(self._app) + self.file_upload = FileUpload(self._app) + self.report_issue = ReportIssue(self._app) + with gr.Column(scale=6): + self.chat_panel = ChatPanel(self._app) + + def on_register_events(self): + self.chat_panel.submit_btn.click( + fn=chat_fn, + inputs=[ + self.chat_panel.text_input, + self.chat_panel.chatbot, + self.data_source.files, + self._app.settings_state, + ], + outputs=[self.chat_panel.text_input, self.chat_panel.chatbot], + ).then( + fn=update_data_source, + inputs=[ + self.chat_control.conversation_id, + self.data_source.files, + self.chat_panel.chatbot, + ], + outputs=None, + ) + + self.chat_panel.text_input.submit( + fn=chat_fn, + inputs=[ + self.chat_panel.text_input, + self.chat_panel.chatbot, + self.data_source.files, + self._app.settings_state, + ], + outputs=[self.chat_panel.text_input, self.chat_panel.chatbot], + ).then( + fn=update_data_source, + inputs=[ + self.chat_control.conversation_id, + self.data_source.files, + self.chat_panel.chatbot, + ], + outputs=None, + ) + + self.chat_panel.chatbot.like( + fn=is_liked, + inputs=[self.chat_control.conversation_id], + outputs=None, + ) + + self.chat_control.conversation.change( + self.chat_control.select_conv, + inputs=[self.chat_control.conversation], + outputs=[ + self.chat_control.conversation_id, + self.chat_control.conversation, + self.chat_control.conversation_rn, + self.data_source.files, + self.chat_panel.chatbot, + ], + show_progress="hidden", + ) + + self.report_issue.report_btn.click( + self.report_issue.report, + inputs=[ + self.report_issue.correctness, + self.report_issue.issues, + self.report_issue.more_detail, + self.chat_control.conversation_id, + self.chat_panel.chatbot, + self.data_source.files, + self._app.settings_state, + self._app.user_id, + ], + outputs=None, + ) + + 
self.data_source.files.input( + fn=update_data_source, + inputs=[ + self.chat_control.conversation_id, + self.data_source.files, + self.chat_panel.chatbot, + ], + outputs=None, + ) + + self.file_upload.upload_button.click( + fn=index_fn, + inputs=[ + self.file_upload.files, + self.file_upload.reindex, + self.data_source.files, + self._app.settings_state, + ], + outputs=[self.file_upload.file_output, self.data_source.files], + ) + + self._app.app.load( + lambda: gr.update(choices=load_files()), + inputs=None, + outputs=[self.data_source.files], + ) diff --git a/libs/ktem/ktem/pages/chat/chat_panel.py b/libs/ktem/ktem/pages/chat/chat_panel.py new file mode 100644 index 0000000..728de28 --- /dev/null +++ b/libs/ktem/ktem/pages/chat/chat_panel.py @@ -0,0 +1,21 @@ +import gradio as gr +from ktem.app import BasePage + + +class ChatPanel(BasePage): + def __init__(self, app): + self._app = app + self.on_building_ui() + + def on_building_ui(self): + self.chatbot = gr.Chatbot( + elem_id="main-chat-bot", + show_copy_button=True, + likeable=True, + show_label=False, + ) + with gr.Row(): + self.text_input = gr.Text( + placeholder="Chat input", scale=15, container=False + ) + self.submit_btn = gr.Button(value="Send", scale=1, min_width=10) diff --git a/libs/ktem/ktem/pages/chat/control.py b/libs/ktem/ktem/pages/chat/control.py new file mode 100644 index 0000000..5c11948 --- /dev/null +++ b/libs/ktem/ktem/pages/chat/control.py @@ -0,0 +1,193 @@ +import logging + +import gradio as gr +from ktem.app import BasePage +from ktem.db.models import Conversation, engine +from sqlmodel import Session, select + +logger = logging.getLogger(__name__) + + +class ConversationControl(BasePage): + """Manage conversation""" + + def __init__(self, app): + self._app = app + self.on_building_ui() + + def on_building_ui(self): + with gr.Accordion(label="Conversation control", open=True): + self.conversation_id = gr.State(value="") + self.conversation = gr.Dropdown( + label="Chat sessions", + 
choices=[], + container=False, + filterable=False, + interactive=True, + ) + + with gr.Row(): + self.conversation_new_btn = gr.Button(value="New", min_width=10) + self.conversation_del_btn = gr.Button(value="Delete", min_width=10) + + with gr.Row(): + self.conversation_rn = gr.Text( + placeholder="Conversation name", + container=False, + scale=5, + min_width=10, + interactive=True, + ) + self.conversation_rn_btn = gr.Button( + value="Rename", scale=1, min_width=10 + ) + + # current_state = gr.Text() + # show_current_state = gr.Button(value="Current") + # show_current_state.click( + # lambda a, b: "\n".join([a, b]), + # inputs=[cid, self.conversation], + # outputs=[current_state], + # ) + + def on_subscribe_public_events(self): + self._app.subscribe_event( + name="onSignIn", + definition={ + "fn": self.reload_conv, + "inputs": [self._app.user_id], + "outputs": [self.conversation], + "show_progress": "hidden", + }, + ) + + self._app.subscribe_event( + name="onSignOut", + definition={ + "fn": self.reload_conv, + "inputs": [self._app.user_id], + "outputs": [self.conversation], + "show_progress": "hidden", + }, + ) + + self._app.subscribe_event( + name="onCreateUser", + definition={ + "fn": self.reload_conv, + "inputs": [self._app.user_id], + "outputs": [self.conversation], + "show_progress": "hidden", + }, + ) + + def on_register_events(self): + self.conversation_new_btn.click( + self.new_conv, + inputs=self._app.user_id, + outputs=[self.conversation_id, self.conversation], + show_progress="hidden", + ) + self.conversation_del_btn.click( + self.delete_conv, + inputs=[self.conversation_id, self._app.user_id], + outputs=[self.conversation_id, self.conversation], + show_progress="hidden", + ) + self.conversation_rn_btn.click( + self.rename_conv, + inputs=[self.conversation_id, self.conversation_rn, self._app.user_id], + outputs=[self.conversation, self.conversation], + show_progress="hidden", + ) + + def load_chat_history(self, user_id): + """Reload chat history""" + 
options = [] + with Session(engine) as session: + statement = ( + select(Conversation) + .where(Conversation.user == user_id) + .order_by(Conversation.date_created.desc()) # type: ignore + ) + results = session.exec(statement).all() + for result in results: + options.append((result.name, result.id)) + + # return gr.update(choices=options) + return options + + def reload_conv(self, user_id): + conv_list = self.load_chat_history(user_id) + if conv_list: + return gr.update(value=conv_list[0][1], choices=conv_list) + else: + return gr.update(value=None, choices=[]) + + def new_conv(self, user_id): + """Create new chat""" + if user_id is None: + gr.Warning("Please sign in first (Settings → User Settings)") + return None, gr.update() + with Session(engine) as session: + new_conv = Conversation(user=user_id) + session.add(new_conv) + session.commit() + + id_ = new_conv.id + + history = self.load_chat_history(user_id) + + return id_, gr.update(value=id_, choices=history) + + def delete_conv(self, conversation_id, user_id): + """Create new chat""" + if user_id is None: + gr.Warning("Please sign in first (Settings → User Settings)") + return None, gr.update() + with Session(engine) as session: + statement = select(Conversation).where(Conversation.id == conversation_id) + result = session.exec(statement).one() + + session.delete(result) + session.commit() + + history = self.load_chat_history(user_id) + if history: + id_ = history[0][1] + return id_, gr.update(value=id_, choices=history) + else: + return None, gr.update(value=None, choices=[]) + + def select_conv(self, conversation_id): + """Select the conversation""" + with Session(engine) as session: + statement = select(Conversation).where(Conversation.id == conversation_id) + try: + result = session.exec(statement).one() + id_ = result.id + name = result.name + files = result.data_source.get("files", []) + chats = result.data_source.get("messages", []) + except Exception as e: + logger.warning(e) + id_ = "" + name = "" + 
files = [] + chats = [] + return id_, id_, name, files, chats + + def rename_conv(self, conversation_id, new_name, user_id): + """Rename the conversation""" + if user_id is None: + gr.Warning("Please sign in first (Settings → User Settings)") + return gr.update(), "" + with Session(engine) as session: + statement = select(Conversation).where(Conversation.id == conversation_id) + result = session.exec(statement).one() + result.name = new_name + session.add(result) + session.commit() + + history = self.load_chat_history(user_id) + return gr.update(choices=history), conversation_id diff --git a/libs/ktem/ktem/pages/chat/data_source.py b/libs/ktem/ktem/pages/chat/data_source.py new file mode 100644 index 0000000..f78c7a3 --- /dev/null +++ b/libs/ktem/ktem/pages/chat/data_source.py @@ -0,0 +1,18 @@ +import gradio as gr +from ktem.app import BasePage + + +class DataSource(BasePage): + def __init__(self, app): + self._app = app + self.on_building_ui() + + def on_building_ui(self): + with gr.Accordion(label="Data source", open=True): + self.files = gr.Dropdown( + label="Files", + choices=[], + multiselect=True, + container=False, + interactive=True, + ) diff --git a/libs/ktem/ktem/pages/chat/events.py b/libs/ktem/ktem/pages/chat/events.py new file mode 100644 index 0000000..9a3346d --- /dev/null +++ b/libs/ktem/ktem/pages/chat/events.py @@ -0,0 +1,220 @@ +import os +import tempfile +from copy import deepcopy +from typing import Optional + +import gradio as gr +from ktem.components import llms, reasonings +from ktem.db.models import Conversation, Source, engine +from ktem.indexing.base import BaseIndex +from ktem.reasoning.simple import DocumentRetrievalPipeline +from sqlmodel import Session, select +from theflow.settings import settings as app_settings +from theflow.utils.modules import import_dotted_string + + +def create_pipeline(settings: dict, files: Optional[list] = None): + """Create the pipeline from settings + + Args: + settings: the settings of the app + files: 
the list of file ids that will be served as context. If None, then + consider using all files + + Returns: + the pipeline objects + """ + + reasoning_mode = settings["reasoning.use"] + reasoning_cls = reasonings[reasoning_mode] + pipeline = reasoning_cls.get_pipeline(settings, files=files) + + if settings["reasoning.use"] in ["rewoo", "react"]: + from kotaemon.agents import ReactAgent, RewooAgent + + llm = ( + llms["gpt4"] + if settings["answer_simple_llm_model"] == "gpt-4" + else llms["gpt35"] + ) + tools = [] + tools_keys = ( + "answer_rewoo_tools" + if settings["reasoning.use"] == "rewoo" + else "answer_react_tools" + ) + for tool in settings[tools_keys]: + if tool == "llm": + from kotaemon.agents import LLMTool + + tools.append(LLMTool(llm=llm)) + elif tool == "docsearch": + from kotaemon.agents import ComponentTool + + filenames = "" + if files: + with Session(engine) as session: + statement = select(Source).where( + Source.id.in_(files) # type: ignore + ) + results = session.exec(statement).all() + filenames = ( + "The file names are: " + + " ".join([result.name for result in results]) + + ". " + ) + + retrieval_pipeline = DocumentRetrievalPipeline() + retrieval_pipeline.set_run( + { + ".top_k": int(settings["retrieval_number"]), + ".mmr": settings["retrieval_mmr"], + ".doc_ids": files, + }, + temp=True, + ) + tool = ComponentTool( + name="docsearch", + description=( + "A vector store that searches for similar and " + "related content " + f"in a document. {filenames}" + "The result is a huge chunk of text related " + "to your search but can also " + "contain irrelevant info." 
+ ), + component=retrieval_pipeline, + postprocessor=lambda docs: "\n\n".join( + [doc.text.replace("\n", " ") for doc in docs] + ), + ) + tools.append(tool) + elif tool == "google": + from kotaemon.agents import GoogleSearchTool + + tools.append(GoogleSearchTool()) + elif tool == "wikipedia": + from kotaemon.agents import WikipediaTool + + tools.append(WikipediaTool()) + else: + raise NotImplementedError(f"Unknown tool: {tool}") + + if settings["reasoning.use"] == "rewoo": + pipeline = RewooAgent( + planner_llm=llm, + solver_llm=llm, + plugins=tools, + ) + pipeline.set_run({".use_citation": True}) + else: + pipeline = ReactAgent( + llm=llm, + plugins=tools, + ) + + return pipeline + + +def chat_fn(chat_input, chat_history, files, settings): + pipeline = create_pipeline(settings, files) + + text = "" + refs = [] + for response in pipeline(chat_input): + if response.metadata.get("citation", None): + citation = response.metadata["citation"] + for idx, fact_with_evidence in enumerate(citation.answer): + quotes = fact_with_evidence.substring_quote + if quotes: + refs.append( + (None, f"***Reference {idx+1}***: {' ... 
def index_fn(files, reindex: bool, selected_files, settings):
    """Upload and index the files

    Args:
        files: the list of files to be uploaded
        reindex: whether to reindex the files
        selected_files: the list of files already selected
        settings: the settings of the app

    Returns:
        A pair of gradio updates: the first exposes a text dump of the indexed
        chunks as a downloadable file, the second selects the newly indexed
        files (in addition to the already selected ones) and refreshes the
        available choices.
    """
    if not files:
        # Nothing was uploaded: `len(files)` below would raise on None.
        gr.Warning("No file to index")
        return gr.update(), gr.update()

    gr.Info(f"Start indexing {len(files)} files...")

    # get the pipeline
    indexing_cls: type[BaseIndex] = import_dotted_string(
        app_settings.KH_INDEX, safe=False
    )
    indexing_pipeline = indexing_cls.get_pipeline(settings)

    output_nodes, file_ids = indexing_pipeline(files, reindex=reindex)
    gr.Info(f"Finish indexing into {len(output_nodes)} chunks")

    # download the file
    text = "\n\n".join(each.text for each in output_nodes)
    handler, file_path = tempfile.mkstemp(suffix=".txt")
    # Write through the descriptor returned by mkstemp so it is closed even if
    # the write fails, and encode explicitly so the dump does not depend on the
    # locale encoding.
    with os.fdopen(handler, "w", encoding="utf-8") as f:
        f.write(text)

    if isinstance(selected_files, list):
        output = selected_files + file_ids
    else:
        output = file_ids

    file_list = load_files()

    return (
        gr.update(value=file_path, visible=True),
        gr.update(value=output, choices=file_list),
    )
how wrong is it, what is the " + "correct answer, etc...)", + container=False, + lines=3, + ) + gr.Markdown( + "This will send the current chat and the user settings to " + "help with investigation" + ) + self.report_btn = gr.Button("Report") + + def report( + self, + correctness: str, + issues: list[str], + more_detail: str, + conv_id: str, + chat_history: list, + files: list, + settings: dict, + user_id: Optional[int], + ): + with Session(engine) as session: + issue = IssueReport( + issues={ + "correctness": correctness, + "issues": issues, + "more_detail": more_detail, + }, + chat={ + "conv_id": conv_id, + "chat_history": chat_history, + "files": files, + }, + settings=settings, + user=user_id, + ) + session.add(issue) + session.commit() + gr.Info("Thank you for your feedback") diff --git a/libs/ktem/ktem/pages/chat/upload.py b/libs/ktem/ktem/pages/chat/upload.py new file mode 100644 index 0000000..6a603fb --- /dev/null +++ b/libs/ktem/ktem/pages/chat/upload.py @@ -0,0 +1,43 @@ +import gradio as gr +from ktem.app import BasePage + + +class FileUpload(BasePage): + def __init__(self, app): + self._app = app + self.on_building_ui() + + def on_building_ui(self): + with gr.Accordion(label="File upload", open=False): + gr.Markdown( + "Supported file types: image, pdf, txt, csv, xlsx, docx.", + ) + self.files = gr.File( + file_types=["image", ".pdf", ".txt", ".csv", ".xlsx", ".docx"], + file_count="multiple", + container=False, + height=50, + ) + with gr.Accordion("Advanced indexing options", open=False): + with gr.Row(): + with gr.Column(): + self.reindex = gr.Checkbox( + value=False, label="Force reindex file", container=False + ) + with gr.Column(): + self.parser = gr.Dropdown( + choices=[ + ("PDF text parser", "normal"), + ("lib-table", "table"), + ("lib-table + OCR", "ocr"), + ("MathPix", "mathpix"), + ], + value="normal", + label="Use advance PDF parser (table+layout preserving)", + container=True, + ) + + self.upload_button = gr.Button("Upload and Index") + 
class HelpPage:
    """Help tab: renders the changelog and the "about" markdown documents.

    The documents are read from the `assets/md` directory that sits next to the
    `pages` package.
    """

    def __init__(self, app):
        """Keep a reference to the app and render the help accordions."""
        self._app = app
        self.dir_md = Path(__file__).parent.parent / "assets" / "md"

        with gr.Accordion("Changelogs"):
            gr.Markdown(self.get_changelogs())

        with gr.Accordion("About Kotaemon (temporary)"):
            gr.Markdown(self._read_md("about_kotaemon.md"))

        with gr.Accordion("About Cinnamon AI (temporary)", open=False):
            gr.Markdown(self._read_md("about_cinnamon.md"))

    def _read_md(self, filename):
        """Return the content of a markdown asset, decoded as UTF-8."""
        # Explicit encoding: the default depends on the locale, which garbles
        # or rejects non-ASCII markdown on platforms whose locale is not UTF-8.
        return (self.dir_md / filename).read_text(encoding="utf-8")

    def get_changelogs(self):
        """Return the content of `changelogs.md`."""
        return self._read_md("changelogs.md")
component {setting_item.component}, allowed are: " + f"{list(gr_cls_single_value.keys()) + list(gr_cls_choices.keys())}.\n" + f"Setting item: {setting_item}" + ) + + +class SettingsPage(BasePage): + """Responsible for allowing the users to customize the application + + **IMPORTANT**: the name and id of the UI setting components should match the + name of the setting in the `app.default_settings` + """ + + public_events = ["onSignIn", "onSignOut", "onCreateUser"] + + def __init__(self, app): + """Initiate the page and render the UI""" + self._app = app + + self._settings_state = app.settings_state + self._user_id = app.user_id + self._default_settings = app.default_settings + self._settings_dict = self._default_settings.flatten() + self._settings_keys = list(self._settings_dict.keys()) + + self._components = {} + self._reasoning_mode = {} + + self.on_building_ui() + + def on_building_ui(self): + self.setting_save_btn = gr.Button("Save settings") + with gr.Tab("User settings"): + self.user_tab() + with gr.Tab("General application settings"): + self.app_tab() + with gr.Tab("Index settings"): + self.index_tab() + with gr.Tab("Reasoning settings"): + self.reasoning_tab() + + def on_subscribe_public_events(self): + pass + + def on_register_events(self): + self.setting_save_btn.click( + self.save_setting, + inputs=[self._user_id] + self.components(), + outputs=self._settings_state, + ) + self.password_change_btn.click( + self.change_password, + inputs=[ + self._user_id, + self.password_change, + self.password_change_confirm, + ], + outputs=None, + show_progress="hidden", + ) + self._components["reasoning.use"].change( + self.change_reasoning_mode, + inputs=[self._components["reasoning.use"]], + outputs=list(self._reasoning_mode.values()), + show_progress="hidden", + ) + + onSignInClick = self.signin.click( + self.sign_in, + inputs=[self.username, self.password], + outputs=[self._user_id, self.username, self.password] + + self.signed_in_state() + + [self.user_out_state], + 
show_progress="hidden", + ).then( + self.load_setting, + inputs=self._user_id, + outputs=[self._settings_state] + self.components(), + show_progress="hidden", + ) + for event in self._app.get_event("onSignIn"): + onSignInClick = onSignInClick.then(**event) + + onSignInSubmit = self.password.submit( + self.sign_in, + inputs=[self.username, self.password], + outputs=[self._user_id, self.username, self.password] + + self.signed_in_state() + + [self.user_out_state], + show_progress="hidden", + ).then( + self.load_setting, + inputs=self._user_id, + outputs=[self._settings_state] + self.components(), + show_progress="hidden", + ) + for event in self._app.get_event("onSignIn"): + onSignInSubmit = onSignInSubmit.then(**event) + + onCreateUserClick = self.create_btn.click( + self.create_user, + inputs=[ + self.username_new, + self.password_new, + self.password_new_confirm, + ], + outputs=[ + self._user_id, + self.username_new, + self.password_new, + self.password_new_confirm, + ] + + self.signed_in_state() + + [self.user_out_state], + show_progress="hidden", + ).then( + self.load_setting, + inputs=self._user_id, + outputs=[self._settings_state] + self.components(), + show_progress="hidden", + ) + for event in self._app.get_event("onCreateUser"): + onCreateUserClick = onCreateUserClick.then(**event) + + onSignOutClick = self.signout.click( + self.sign_out, + inputs=None, + outputs=[self._user_id] + self.signed_in_state() + [self.user_out_state], + show_progress="hidden", + ).then( + self.load_setting, + inputs=self._user_id, + outputs=[self._settings_state] + self.components(), + show_progress="hidden", + ) + for event in self._app.get_event("onSignOut"): + onSignOutClick = onSignOutClick.then(**event) + + def user_tab(self): + with gr.Row() as self.user_out_state: + with gr.Column(): + gr.Markdown("Sign in") + self.username = gr.Textbox(label="Username", interactive=True) + self.password = gr.Textbox( + label="Password", type="password", interactive=True + ) + self.signin = 
    def signed_in_state(self):
        """Return the components shown only while a user is signed in.

        `sign_in`, `create_user` and `sign_out` emit one `gr.update` per item of
        this list, in this order. On success the first update also carries the
        "Current user: ..." text, so `current_name` must stay at index 0.
        """
        return [
            self.current_name,  # always the first one
            self.signout,
            self.password_change,
            self.password_change_confirm,
            self.password_change_btn,
        ]
output.append(gr.update(visible=True, value=f"Current user: {username}")) + output += [ + gr.update(visible=True) for _ in range(len(self.signed_in_state()) - 1) + ] + output.append(gr.update(visible=False)) + + return output + + def create_user(self, username, password, password_confirm): + user_id, usn, pwd, pwdc = None, username, password, password_confirm + if password != password_confirm: + gr.Warning("Password does not match") + else: + with Session(engine) as session: + statement = select(User).where( + User.username == username, + ) + result = session.exec(statement).all() + if result: + gr.Warning(f'Username "{username}" already exists') + else: + hashed_password = hashlib.sha256(password.encode()).hexdigest() + user = User(username=username, password=hashed_password) + session.add(user) + session.commit() + user_id = user.id + usn, pwd, pwdc = "", "", "" + print(user_id) + + output: list = [user_id, usn, pwd, pwdc] + if user_id is not None: + output.append(gr.update(visible=True, value=f"Current user: {username}")) + output += [ + gr.update(visible=True) for _ in range(len(self.signed_in_state()) - 1) + ] + output.append(gr.update(visible=False)) + else: + output += [ + gr.update(visible=False) for _ in range(len(self.signed_in_state())) + ] + output.append(gr.update(visible=True)) + + return output + + def sign_out(self): + output = [None] + output += [gr.update(visible=False) for _ in range(len(self.signed_in_state()))] + output.append(gr.update(visible=True)) + return output + + def change_password(self, user_id, password, password_confirm): + if password != password_confirm: + gr.Warning("Password does not match") + return + + with Session(engine) as session: + statement = select(User).where(User.id == user_id) + result = session.exec(statement).all() + if result: + user = result[0] + hashed_password = hashlib.sha256(password.encode()).hexdigest() + user.password = hashed_password + session.add(user) + session.commit() + gr.Info("Password changed") + 
else: + gr.Warning("User not found") + + def app_tab(self): + for n, si in self._default_settings.application.settings.items(): + obj = render_setting_item(si, si.value) + self._components[f"application.{n}"] = obj + + def index_tab(self): + for n, si in self._default_settings.index.settings.items(): + obj = render_setting_item(si, si.value) + self._components[f"index.{n}"] = obj + + def reasoning_tab(self): + with gr.Group(): + for n, si in self._default_settings.reasoning.settings.items(): + if n == "use": + continue + obj = render_setting_item(si, si.value) + self._components[f"reasoning.{n}"] = obj + + gr.Markdown("### Reasoning-specific settings") + self._components["reasoning.use"] = render_setting_item( + self._default_settings.reasoning.settings["use"], + self._default_settings.reasoning.settings["use"].value, + ) + + for idx, (pn, sig) in enumerate( + self._default_settings.reasoning.options.items() + ): + with gr.Group( + visible=idx == 0, + elem_id=pn, + ) as self._reasoning_mode[pn]: + gr.Markdown("**Name**: Description") + for n, si in sig.settings.items(): + obj = render_setting_item(si, si.value) + self._components[f"reasoning.options.{pn}.{n}"] = obj + + def change_reasoning_mode(self, value): + output = [] + for each in self._reasoning_mode.values(): + if value == each.elem_id: + output.append(gr.update(visible=True)) + else: + output.append(gr.update(visible=False)) + return output + + def load_setting(self, user_id=None): + settings = self._settings_dict + with Session(engine) as session: + statement = select(Settings).where(Settings.user == user_id) + result = session.exec(statement).all() + if result: + settings = result[0].setting + + output = [settings] + output += tuple(settings[name] for name in self.component_names()) + return output + + def save_setting(self, user_id: int, *args): + """Save the setting to disk and persist the setting to session state + + Args: + user_id: the user id + args: all the values from the settings + """ + setting 
    def component_names(self):
        """Get the names of the setting components

        The names are the keys of the flattened default settings, in the same
        order as the components returned by `components()`.
        """
        return self._settings_keys
import ( + FilterCondition, + FilterOperator, + MetadataFilter, + MetadataFilters, +) +from llama_index.vector_stores.types import VectorStoreQueryMode +from sqlmodel import Session, select +from theflow.settings import settings + +logger = logging.getLogger(__name__) + + +class DocumentRetrievalPipeline(BaseComponent): + """Retrieve relevant document + + Args: + vector_retrieval: the retrieval pipeline that return the relevant documents + given a text query + reranker: the reranking pipeline that re-rank and filter the retrieved + documents + get_extra_table: if True, for each retrieved document, the pipeline will look + for surrounding tables (e.g. within the page) + """ + + vector_retrieval: VectorRetrieval = VectorRetrieval.withx( + doc_store=get_docstore(), + vector_store=get_vectorstore(), + embedding=embeddings.get_default(), + ) + reranker: BaseReranking = CohereReranking.withx( + cohere_api_key=getattr(settings, "COHERE_API_KEY", "") + ) >> LLMReranking.withx(llm=llms.get_lowest_cost()) + get_extra_table: bool = False + + def run( + self, + text: str, + top_k: int = 5, + mmr: bool = False, + doc_ids: Optional[list[str]] = None, + ) -> list[RetrievedDocument]: + """Retrieve document excerpts similar to the text + + Args: + text: the text to retrieve similar documents + top_k: number of documents to retrieve + mmr: whether to use mmr to re-rank the documents + doc_ids: list of document ids to constraint the retrieval + """ + kwargs = {} + if doc_ids: + with Session(engine) as session: + stmt = select(Index).where( + Index.relation_type == SourceTargetRelation.VECTOR, + Index.source_id.in_(doc_ids), # type: ignore + ) + results = session.exec(stmt) + vs_ids = [r.target_id for r in results.all()] + + kwargs["filters"] = MetadataFilters( + filters=[ + MetadataFilter( + key="doc_id", + value=vs_id, + operator=FilterOperator.EQ, + ) + for vs_id in vs_ids + ], + condition=FilterCondition.OR, + ) + + if mmr: + # TODO: double check that llama-index MMR works 
correctly + kwargs["mode"] = VectorStoreQueryMode.MMR + kwargs["mmr_threshold"] = 0.5 + + # rerank + docs = self.vector_retrieval(text=text, top_k=top_k, **kwargs) + if self.get_from_path("reranker"): + docs = self.reranker(docs, query=text) + + if not self.get_extra_table: + return docs + + # retrieve extra nodes relate to table + table_pages = defaultdict(list) + retrieved_id = set([doc.doc_id for doc in docs]) + for doc in docs: + if "page_label" not in doc.metadata: + continue + if "file_name" not in doc.metadata: + warnings.warn( + "file_name not in metadata while page_label is in metadata: " + f"{doc.metadata}" + ) + table_pages[doc.metadata["file_name"]].append(doc.metadata["page_label"]) + + queries = [ + {"$and": [{"file_name": {"$eq": fn}}, {"page_label": {"$in": pls}}]} + for fn, pls in table_pages.items() + ] + if queries: + extra_docs = self.vector_retrieval( + text="", + top_k=50, + where={"$or": queries}, + ) + for doc in extra_docs: + if doc.doc_id not in retrieved_id: + docs.append(doc) + + return docs + + +class PrepareEvidencePipeline(BaseComponent): + """Prepare the evidence text from the list of retrieved documents + + This step usually happens after `DocumentRetrievalPipeline`. + + Args: + trim_func: a callback function or a BaseComponent, that splits a large + chunk of text into smaller ones. The first one will be retained. 
+ """ + + trim_func: TokenSplitter = TokenSplitter.withx( + chunk_size=7600, + chunk_overlap=0, + separator=" ", + tokenizer=partial( + tiktoken.encoding_for_model("gpt-3.5-turbo").encode, + allowed_special=set(), + disallowed_special="all", + ), + ) + + def run(self, docs: list[RetrievedDocument]) -> Document: + evidence = "" + table_found = 0 + evidence_mode = 0 + + for _id, retrieved_item in enumerate(docs): + retrieved_content = "" + page = retrieved_item.metadata.get("page_label", None) + source = filename = retrieved_item.metadata.get("file_name", "-") + if page: + source += f" (Page {page})" + if retrieved_item.metadata.get("type", "") == "table": + evidence_mode = 1 # table + if table_found < 5: + retrieved_content = retrieved_item.metadata.get("table_origin", "") + if retrieved_content not in evidence: + table_found += 1 + evidence += ( + f"
Table from {source}\n" + + retrieved_content + + "\n
" + ) + elif retrieved_item.metadata.get("type", "") == "chatbot": + evidence_mode = 2 # chatbot + retrieved_content = retrieved_item.metadata["window"] + evidence += ( + f"
Chatbot scenario from {filename} (Row {page})\n" + + retrieved_content + + "\n
" + ) + else: + if "window" in retrieved_item.metadata: + retrieved_content = retrieved_item.metadata["window"] + else: + retrieved_content = retrieved_item.text + retrieved_content = retrieved_content.replace("\n", " ") + if retrieved_content not in evidence: + evidence += ( + f"
Content from {source}: " + + retrieved_content + + " \n
" + ) + + print("Retrieved #{}: {}".format(_id, retrieved_content)) + print(retrieved_item.metadata) + print("Score", retrieved_item.metadata.get("relevance_score", None)) + + # trim context by trim_len + print("len (original)", len(evidence)) + if evidence: + texts = self.trim_func([Document(text=evidence)]) + evidence = texts[0].text + print("len (trimmed)", len(evidence)) + + print(f"PrepareEvidence with input {input}\nOutput: {evidence}\n") + + return Document(content=(evidence_mode, evidence)) + + +DEFAULT_QA_TEXT_PROMPT = ( + "Use the following pieces of context to answer the question at the end. " + "If you don't know the answer, just say that you don't know, don't try to " + "make up an answer. Keep the answer as concise as possible. Give answer in " + "{lang}. {system}\n\n" + "{context}\n" + "Question: {question}\n" + "Helpful Answer:" +) + +DEFAULT_QA_TABLE_PROMPT = ( + "List all rows (row number) from the table context that related to the question, " + "then provide detail answer with clear explanation and citations. " + "If you don't know the answer, just say that you don't know, " + "don't try to make up an answer. Give answer in {lang}. {system}\n\n" + "Context:\n" + "{context}\n" + "Question: {question}\n" + "Helpful Answer:" +) + +DEFAULT_QA_CHATBOT_PROMPT = ( + "Pick the most suitable chatbot scenarios to answer the question at the end, " + "output the provided answer text. If you don't know the answer, " + "just say that you don't know. Keep the answer as concise as possible. " + "Give answer in {lang}. 
{system}\n\n" + "Context:\n" + "{context}\n" + "Question: {question}\n" + "Answer:" +) + + +class AnswerWithContextPipeline(BaseComponent): + """Answer the question based on the evidence + + Args: + llm: the language model to generate the answer + citation_pipeline: generates citation from the evidence + qa_template: the prompt template for LLM to generate answer (refer to + evidence_mode) + qa_table_template: the prompt template for LLM to generate answer for table + (refer to evidence_mode) + qa_chatbot_template: the prompt template for LLM to generate answer for + pre-made scenarios (refer to evidence_mode) + lang: the language of the answer. Currently support English and Japanese + """ + + llm: ChatLLM = Node(default_callback=lambda _: llms.get_highest_accuracy()) + citation_pipeline: CitationPipeline = Node( + default_callback=lambda _: CitationPipeline(llm=llms.get_lowest_cost()) + ) + + qa_template: str = DEFAULT_QA_TEXT_PROMPT + qa_table_template: str = DEFAULT_QA_TABLE_PROMPT + qa_chatbot_template: str = DEFAULT_QA_CHATBOT_PROMPT + + system_prompt: str = "" + lang: str = "English" # support English and Japanese + + def run( + self, question: str, evidence: str, evidence_mode: int = 0 + ) -> Document | Iterator[Document]: + """Answer the question based on the evidence + + In addition to the question and the evidence, this method also take into + account evidence_mode. The evidence_mode tells which kind of evidence is. + The kind of evidence affects: + 1. How the evidence is represented. + 2. The prompt to generate the answer. + + By default, the evidence_mode is 0, which means the evidence is plain text with + no particular semantic representation. The evidence_mode can be: + 1. "table": There will be HTML markup telling that there is a table + within the evidence. + 2. "chatbot": There will be HTML markup telling that there is a chatbot. + This chatbot is a scenario, extracted from an Excel file, where each + row corresponds to an interaction. 
+ + Args: + question: the original question posed by user + evidence: the text that contain relevant information to answer the question + (determined by retrieval pipeline) + evidence_mode: the mode of evidence, 0 for text, 1 for table, 2 for chatbot + """ + if evidence_mode == 0: + prompt_template = PromptTemplate(self.qa_template) + elif evidence_mode == 1: + prompt_template = PromptTemplate(self.qa_table_template) + else: + prompt_template = PromptTemplate(self.qa_chatbot_template) + + prompt = prompt_template.populate( + context=evidence, + question=question, + lang=self.lang, + system=self.system_prompt, + ) + + messages = [ + SystemMessage(content="You are a helpful assistant"), + HumanMessage(content=prompt), + ] + # output = self.llm(messages).text + yield from self.llm(messages) + + citation = self.citation_pipeline(context=evidence, question=question) + answer = Document(text="", metadata={"citation": citation}) + yield answer + + +class FullQAPipeline(BaseComponent): + """Question answering pipeline. Handle from question to answer""" + + class Config: + allow_extra = True + params_publish = True + + retrieval_pipeline: DocumentRetrievalPipeline = DocumentRetrievalPipeline.withx() + evidence_pipeline: PrepareEvidencePipeline = PrepareEvidencePipeline.withx() + answering_pipeline: AnswerWithContextPipeline = AnswerWithContextPipeline.withx() + + def run(self, question: str, **kwargs) -> Iterator[Document]: + docs = self.retrieval_pipeline(text=question) + evidence_mode, evidence = self.evidence_pipeline(docs).content + answer = self.answering_pipeline( + question=question, evidence=evidence, evidence_mode=evidence_mode + ) + yield from answer # should be a generator + + @classmethod + def get_pipeline(cls, settings, **kwargs): + """Get the reasoning pipeline + + Need a base pipeline implementation. Currently the drawback is that we want to + treat the retrievers as tools. 
Hence, the reasoning pipelie should just take + the already initiated tools (retrievers), and do not need to set such logic + here. + """ + pipeline = FullQAPipeline(get_extra_table=settings["index.prioritize_table"]) + if not settings["index.use_reranking"]: + pipeline.retrieval_pipeline.reranker = None # type: ignore + + pipeline.answering_pipeline.llm = llms.get_highest_accuracy() + kwargs = { + ".retrieval_pipeline.top_k": int(settings["index.num_retrieval"]), + ".retrieval_pipeline.mmr": settings["index.mmr"], + ".retrieval_pipeline.doc_ids": kwargs.get("files", None), + } + pipeline.set_run(kwargs, temp=True) + + return pipeline + + @classmethod + def get_user_settings(cls) -> dict: + from ktem.components import llms + + try: + citation_llm = llms.get_lowest_cost_name() + citation_llm_choices = list(llms.options().keys()) + main_llm = llms.get_highest_accuracy_name() + main_llm_choices = list(llms.options().keys()) + except Exception as e: + logger.error(e) + citation_llm = None + citation_llm_choices = [] + main_llm = None + main_llm_choices = [] + + return { + "highlight_citation": { + "name": "Highlight Citation", + "value": True, + "component": "checkbox", + }, + "system_prompt": { + "name": "System Prompt", + "value": "This is a question answering system", + }, + "citation_llm": { + "name": "LLM for citation", + "value": citation_llm, + "component": "dropdown", + "choices": citation_llm_choices, + }, + "main_llm": { + "name": "LLM for main generation", + "value": main_llm, + "component": "dropdown", + "choices": main_llm_choices, + }, + } diff --git a/libs/ktem/ktem/settings.py b/libs/ktem/ktem/settings.py new file mode 100644 index 0000000..99dc9e7 --- /dev/null +++ b/libs/ktem/ktem/settings.py @@ -0,0 +1,156 @@ +from typing import Any + +from pydantic import BaseModel, Field + + +class SettingItem(BaseModel): + """Represent a setting item + + Args: + name: the name of the setting item + value: the default value of the setting item + choices: the list 
of choices of the setting item, if any + metadata: the metadata of the setting item + component: the expected UI component to render the setting + """ + + name: str + value: Any + choices: list = Field(default_factory=list) + metadata: dict = Field(default_factory=dict) + component: str = "text" + + +class BaseSettingGroup(BaseModel): + settings: dict[str, "SettingItem"] = Field(default_factory=dict) + options: dict[str, "BaseSettingGroup"] = Field(default_factory=dict) + + def _get_options(self) -> dict: + return {} + + def finalize(self): + """Finalize the setting group""" + + def flatten(self) -> dict: + """Render the setting group into value""" + output = {} + for key, value in self.settings.items(): + output[key] = value.value + + output.update({f"options.{k}": v for k, v in self._get_options().items()}) + + return output + + def get_setting_item(self, path: str) -> SettingItem: + """Get the item based on dot notation""" + path = path.strip(".") + if "." not in path: + return self.settings[path] + + key, sub_path = path.split(".", 1) + if key != "options": + raise ValueError(f"Invalid key {path}. 
Should starts with `options.*`") + + option_id, sub_path = sub_path.split(".", 1) + option = self.options[option_id] + return option.get_setting_item(sub_path) + + +class SettingReasoningGroup(BaseSettingGroup): + def _get_options(self) -> dict: + output = {} + for ex_name, ex_setting in self.options.items(): + for key, value in ex_setting.flatten().items(): + output[f"{ex_name}.{key}"] = value + + return output + + def finalize(self): + """Finalize the setting""" + options = list(self.options.keys()) + if options: + self.settings["use"].choices = [(x, x) for x in options] + self.settings["use"].value = options[0] + + +class SettingIndexOption(BaseSettingGroup): + """Temporarily keep it here to see if we need this setting template + for the index component + """ + + indexing: BaseSettingGroup + retrieval: BaseSettingGroup + + def flatten(self) -> dict: + """Render the setting group into value""" + output = {} + for key, value in self.indexing.flatten(): + output[f"indexing.{key}"] = value + + for key, value in self.retrieval.flatten(): + output[f"retrieval.{key}"] = value + + return output + + def get_setting_item(self, path: str) -> SettingItem: + """Get the item based on dot notation""" + path = path.strip(".") + + key, sub_path = path.split(".", 1) + if key not in ["indexing", "retrieval"]: + raise ValueError( + f"Invalid key {path}. 
Should starts with `indexing.*` or `retrieval.*`" + ) + + value = getattr(self, key) + return value.get_setting_item(sub_path) + + +class SettingIndexGroup(BaseSettingGroup): + def _get_options(self) -> dict: + output = {} + for name, setting in self.options.items(): + for key, value in setting.flatten().items(): + output[f"{name}.{key}"] = value + + return output + + def finalize(self): + """Finalize the setting""" + options = list(self.options.keys()) + if options: + self.settings["use"].choices = [(x, x) for x in options] + self.settings["use"].value = options + + +class SettingGroup(BaseModel): + application: BaseSettingGroup = Field(default_factory=BaseSettingGroup) + index: SettingIndexGroup = Field(default_factory=SettingIndexGroup) + reasoning: SettingReasoningGroup = Field(default_factory=SettingReasoningGroup) + + def flatten(self) -> dict: + """Render the setting group into value""" + output = {} + for key, value in self.application.flatten().items(): + output[f"application.{key}"] = value + + for key, value in self.index.flatten().items(): + output[f"index.{key}"] = value + + for key, value in self.reasoning.flatten().items(): + output[f"reasoning.{key}"] = value + + return output + + def get_setting_item(self, path: str) -> SettingItem: + """Get the item based on dot notation""" + path = path.strip(".") + + key, sub_path = path.split(".", 1) + if key not in ["application", "index", "reasoning"]: + raise ValueError( + f"Invalid key {path}. 
Should starts with `indexing.*` or `retrieval.*`" + ) + + value = getattr(self, key) + return value.get_setting_item(sub_path) diff --git a/libs/ktem/launch.py b/libs/ktem/launch.py new file mode 100644 index 0000000..66dfc0b --- /dev/null +++ b/libs/ktem/launch.py @@ -0,0 +1,5 @@ +from ktem.main import App + +app = App() +demo = app.make() +demo.queue().launch() diff --git a/libs/ktem/pyproject.toml b/libs/ktem/pyproject.toml new file mode 100644 index 0000000..64ce8e1 --- /dev/null +++ b/libs/ktem/pyproject.toml @@ -0,0 +1,39 @@ +[build-system] +requires = ["setuptools >= 61.0"] +build-backend = "setuptools.build_meta" + +[tool.setuptools] +include-package-data = false +packages.find.include = ["ktem*"] +packages.find.exclude = ["tests*", "env*"] + +[project] +name = "ktem" +version = "0.0.1" +requires-python = ">= 3.10" +description = "RAG-based Question and Answering Application" +dependencies = [ + "chromadb", + "click", + "cohere", + "platformdirs", + "pluggy", + "python-decouple", + "python-dotenv", + "sqlalchemy", + "sqlmodel", + "tiktoken", + "unstructured[pdf]", +] +readme = "README.md" +license = { text = "MIT License" } +authors = [ + { name = "john", email = "john@cinnamon.is" }, + { name = "ian", email = "ian@cinnamon.is" }, + { name = "tadashi", email = "tadashi@cinnamon.is" }, +] +classifiers = [ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", +] diff --git a/libs/ktem/requirements.txt b/libs/ktem/requirements.txt new file mode 100644 index 0000000..67fd014 --- /dev/null +++ b/libs/ktem/requirements.txt @@ -0,0 +1 @@ +platformdirs diff --git a/libs/ktem/scripts/mock.py b/libs/ktem/scripts/mock.py new file mode 100644 index 0000000..63ffae0 --- /dev/null +++ b/libs/ktem/scripts/mock.py @@ -0,0 +1,29 @@ +import time + +from ktem.db.models import Conversation, Source, engine +from sqlmodel import Session + + +def add_conversation(): + """Add conversation to the manager.""" + 
with Session(engine) as session: + c1 = Conversation(name="Conversation 1") + c2 = Conversation() + session.add(c1) + time.sleep(1) + session.add(c2) + time.sleep(1) + session.commit() + + +def add_files(): + with Session(engine) as session: + s1 = Source(name="Source 1", path="Path 1") + s2 = Source(name="Source 2", path="Path 2") + session.add(s1) + session.add(s2) + session.commit() + + +# add_conversation() +add_files() diff --git a/mkdocs.yml b/mkdocs.yml index 14cd154..80048f1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -59,7 +59,7 @@ plugins: - "!^_" members_order: source separate_signature: true - paths: [kotaemon] + paths: [libs/kotaemon] - git-revision-date-localized: enable_creation_date: true type: timeago diff --git a/pyproject.toml b/pyproject.toml index 1a05069..6500548 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,77 +1,3 @@ -# build backand and build dependencies -[build-system] -requires = ["setuptools >= 61.0"] -build-backend = "setuptools.build_meta" - -[tool.setuptools] -include-package-data = false -packages.find.include = ["kotaemon*"] -packages.find.exclude = ["tests*", "env*"] - -# metadata and dependencies -[project] -name = "kotaemon" -version = "0.3.5" -requires-python = ">= 3.10" -description = "Kotaemon core library for AI development." 
-dependencies = [ - "langchain", - "langchain-community", - "theflow", - "llama-index>=0.9.0", - "llama-hub", - "gradio>=4.0.0", - "openpyxl", - "cookiecutter", - "click", - "pandas", - "trogon", -] -readme = "README.md" -license = { text = "MIT License" } -authors = [ - { name = "john", email = "john@cinnamon.is" }, - { name = "ian", email = "ian@cinnamon.is" }, - { name = "tadashi", email = "tadashi@cinnamon.is" }, -] -classifiers = [ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", -] - -[project.optional-dependencies] -dev = [ - "ipython", - "pytest", - "pre-commit", - "black", - "flake8", - "sphinx", - "coverage", - "openai", - "langchain-openai", - "chromadb", - "wikipedia", - "duckduckgo-search", - "googlesearch-python", - "python-dotenv", - "pytest-mock", - "unstructured[pdf]", - "sentence_transformers", - "cohere", - "elasticsearch", - "pypdf", -] - -[project.scripts] -kh = "kotaemon.cli:main" - -[project.urls] -Homepage = "https://github.com/Cinnamon/kotaemon/" -Repository = "https://github.com/Cinnamon/kotaemon/" -Documentation = "https://github.com/Cinnamon/kotaemon/wiki" - [tool.codespell] skip = "*.js,*.css,*.map" # `llm` abbreviation for large language models diff --git a/tests/test_citation.py b/tests/test_citation.py deleted file mode 100644 index 4378f59..0000000 --- a/tests/test_citation.py +++ /dev/null @@ -1,62 +0,0 @@ -# flake8: noqa -from unittest.mock import patch - -import pytest -from openai.types.chat.chat_completion import ChatCompletion - -from kotaemon.indices.qa import CitationPipeline -from kotaemon.llms import AzureChatOpenAI - -function_output = '{\n "question": "What is the provided _example_ benefits?",\n "answer": [\n {\n "fact": "特約死亡保険金: 被保険者がこの特約の保険期間中に死亡したときに支払います。",\n "substring_quote": ["特約死亡保険金"]\n },\n {\n "fact": "特約特定疾病保険金: 被保険者がこの特約の保険期間中に特定の疾病(悪性新生物(がん)、急性心筋梗塞または脳卒中)により所定の状態に該当したときに支払います。",\n "substring_quote": ["特約特定疾病保険金"]\n },\n {\n 
"fact": "特約障害保険金: 被保険者がこの特約の保険期間中に傷害もしくは疾病により所定の身体障害の状態に該当したとき、または不慮の事故により所定の身体障害の状態に該当したときに支払います。",\n "substring_quote": ["特約障害保険金"]\n },\n {\n "fact": "特約介護保険金: 被保険者がこの特約の保険期間中に傷害または疾病により所定の要介護状態に該当したときに支払います。",\n "substring_quote": ["特約介護保険金"]\n }\n ]\n}' - -_openai_chat_completion_response = [ - ChatCompletion.parse_obj( - { - "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x", - "object": "chat.completion", - "created": 1692338378, - "model": "gpt-35-turbo", - "system_fingerprint": None, - "choices": [ - { - "index": 0, - "finish_reason": "function_call", - "message": { - "role": "assistant", - "content": None, - "function_call": { - "arguments": function_output, - "name": "QuestionAnswer", - }, - "tool_calls": None, - }, - "logprobs": None, - } - ], - "usage": {"completion_tokens": 9, "prompt_tokens": 10, "total_tokens": 19}, - } - ) -] - - -@pytest.fixture -def llm(): - return AzureChatOpenAI( - azure_endpoint="https://dummy.openai.azure.com/", - openai_api_key="dummy", - openai_api_version="2023-03-15-preview", - temperature=0, - ) - - -@patch( - "openai.resources.chat.completions.Completions.create", - side_effect=_openai_chat_completion_response, -) -def test_citation(openai_completion, llm): - question = "test query" - context = "document context" - - citation = CitationPipeline(llm=llm) - result = citation(context, question) - assert len(result.answer) == 4