diff --git a/.github/workflows/style-check.yaml b/.github/workflows/style-check.yaml
new file mode 100644
index 0000000..2047461
--- /dev/null
+++ b/.github/workflows/style-check.yaml
@@ -0,0 +1,20 @@
+name: style-check
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clone the repo
+        uses: actions/checkout@v3
+      - name: Setup python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.10"
+      - name: run pre-commit
+        uses: pre-commit/action@v3.0.0
diff --git a/.github/workflows/unit-test.yaml b/.github/workflows/unit-test.yaml
new file mode 100644
index 0000000..5bd66c9
--- /dev/null
+++ b/.github/workflows/unit-test.yaml
@@ -0,0 +1,32 @@
+name: unit-test
+
+on:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
+    name: unit testing with python ${{ matrix.python-version }}
+    steps:
+      - name: Clone the repo
+        uses: actions/checkout@v3
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: x64
+      - name: Display Python version
+        run: python -c "import sys; print(sys.version)"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e .[dev]
+      - name: Test with pytest
+        run: |
+          pytest
diff --git a/.gitignore b/.gitignore
index 22de1c3..28d4435 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,7 +51,6 @@ flycheck_*.el
 # network security
 /network-security.data
 
-
 ### Linux ###
 
 # temporary files which can be created if a process still has a handle open of a deleted file
@@ -75,7 +74,6 @@ flycheck_*.el
 # Icon must end with two \r
 Icon
 
-
 # Thumbnails
 ._*
@@ -386,7 +384,7 @@ pyrightconfig.json
 ### Vim ###
 # Swap
 [._]*.s[a-v][a-z]
-!*.svg  # comment out if you don't need vector files
+!*.svg # comment out if you don't need vector files
 [._]*.sw[a-p]
 [._]s[a-rt-v][a-z]
 [._]ss[a-gi-z]
@@ -451,3 +449,10 @@ $RECYCLE.BIN/
 .theflow/
 
 # End of https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
+
+logs/
+.gitsecret/keys/random_seed
+!*.secret
+credentials.txt
+
+S.gpg-agent*
diff --git a/.gitsecret/keys/pubring.kbx b/.gitsecret/keys/pubring.kbx
new file mode 100644
index 0000000..86a2b56
Binary files /dev/null and b/.gitsecret/keys/pubring.kbx differ
diff --git a/.gitsecret/keys/trustdb.gpg b/.gitsecret/keys/trustdb.gpg
new file mode 100644
index 0000000..e599a3b
Binary files /dev/null and b/.gitsecret/keys/trustdb.gpg differ
diff --git a/.gitsecret/paths/mapping.cfg b/.gitsecret/paths/mapping.cfg
new file mode 100644
index 0000000..576d1d7
--- /dev/null
+++ b/.gitsecret/paths/mapping.cfg
@@ -0,0 +1 @@
+credentials.txt:1e17fa46dd8353b5ded588b32983ac7d800e70fd16bc5831663b9aaefc409011
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..d97508e
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,49 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+      - id: check-yaml
+      - id: check-toml
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+      - id: detect-aws-credentials
+        args: ["--allow-missing-credentials"]
+      - id: detect-private-key
+      - id: check-added-large-files
+  - repo: https://github.com/ambv/black
+    rev: 22.3.0
+    hooks:
+      - id: black
+        language_version: python3
+  - repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+        args: ["--profile", "black"]
+        language_version: python3.10
+  - repo: https://github.com/pycqa/flake8
+    rev: 4.0.1
+    hooks:
+      - id: flake8
+        args: ["--max-line-length", "88"]
+  - repo: https://github.com/myint/autoflake
+    rev: v1.4
+    hooks:
+      - id: autoflake
+        args:
+          [
+            "--in-place",
+            "--remove-unused-variables",
+            "--remove-all-unused-imports",
+            "--ignore-init-module-imports",
+            "--exclude=tests/*",
+          ]
+  - repo: https://github.com/pre-commit/mirrors-prettier
+    rev: v2.7.1
+    hooks:
+      - id: prettier
+        types_or: [markdown, yaml]
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: "v1.5.1"
+    hooks:
+      - id: mypy
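The style-check workflow above runs through `pre-commit/action`, so it executes exactly the hook set defined in `.pre-commit-config.yaml`; a failure in CI should therefore be reproducible locally. A minimal sketch of the usual local workflow, assuming `pre-commit` is installed in the active dev environment:

```shell
pre-commit install           # install the git hook so checks run on every commit
pre-commit run --all-files   # or run the full hook suite once across the repo
```
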
diff --git a/README.md b/README.md
index a79c399..19befda 100644
--- a/README.md
+++ b/README.md
@@ -13,18 +13,116 @@ pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git
 
 ### Setup
 
+- Create conda environment (Python 3.10 suggested)
+
+  ```shell
+  conda create -n kotaemon python=3.10
+  conda activate kotaemon
+  ```
+
+- Clone the repo
+
+  ```shell
+  git clone git@github.com:Cinnamon/kotaemon.git
+  cd kotaemon
+  ```
+
+- Install all
+
+  ```shell
+  pip install -e ".[dev]"
+  ```
+
+- Pre-commit
+
+  ```shell
+  pre-commit install
+  ```
+
+- Test
+
+  ```shell
+  pytest tests
+  ```
+
+### Credential sharing
+
+This repo uses [git-secret](https://sobolevn.me/git-secret/) to share credentials; it internally uses `gpg` to encrypt and decrypt secret files.
+
+#### Install git-secret
+
+Please follow the [official guide](https://sobolevn.me/git-secret/installation) to install git-secret.
+
+#### Gaining access
+
+To gain access to the secret files, provide your gpg public key to someone who already has access and ask them to add your key to the keyring. For a quick tutorial on generating your gpg key pair, see the `Using gpg` section on the [git-secret main page](https://sobolevn.me/git-secret/).
+
+#### Decrypt the secret file
+
+The credentials are encrypted in the `credentials.txt.secret` file. To print the decrypted content to stdout, run
+
 ```shell
-# Create conda environment (suggest 3.10)
-conda create -n kotaemon python=3.10
-conda activate kotaemon
-
-# Install all
-pip install -e ".[dev]"
-
-# Test
-pytest tests
+git-secret cat [filename]
 ```
 
+Or to get the decrypted `credentials.txt` file, run
+
+```shell
+git-secret reveal [filename]
+```
+
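+If you edit `credentials.txt`, re-encrypt it before committing. A typical flow, assuming your key is already in the keyring:
+
+```shell
+git-secret hide
+git add credentials.txt.secret
+```
+
+To grant a new member access, someone with access imports the member's public key, registers it, and re-encrypts (the `.asc` path and email below are placeholders):
+
+```shell
+gpg --import teammate-public-key.asc
+git-secret tell teammate@example.com
+git-secret hide
+```
+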
+#### For Windows users
+
+git-secret is currently not available for Windows, so the easiest way is to use it from WSL (please use the latest version of WSL2). From there you have two options:
+
+1. Using the gpg of WSL.
+
+   This is the most straightforward option, since you use WSL just like any other Unix environment. However, the downside is that you have to make WSL your main environment, which means WSL must have write permission on your repo. To achieve this, you must either:
+
+   - Clone and store your repo inside WSL's file system.
+   - Provide WSL with the necessary permissions on your Windows file system. This can be achieved by setting `automount` options for WSL: add the following to `/etc/wsl.conf`, then restart your subsystem.
+
+     ```shell
+     [automount]
+     options = "metadata,umask=022,fmask=011"
+     ```
+
+     This gives the owning user full permissions.
+
+2. Using the gpg of Windows but with git-secret from WSL.
+
+   For those who use Windows as the main environment, having to switch back and forth between Windows and WSL is inconvenient. You can instead stay in your Windows environment and apply a few tricks to use `git-secret` from WSL.
+
+   - Install and set up `gpg` on Windows.
+   - Install `git-secret` on WSL.
+   - Make WSL use the `gpg` executable from Windows, by aliasing `gpg` to the Windows `gpg.exe`. Add this content to your startup script:
+
+     ```shell
+     # Create ~/bin if it doesn't exist
+     [ ! -d "$HOME/bin" ] && mkdir "$HOME/bin"
+
+     # Link the Windows executable
+     ln -snf "$(which gpg.exe)" "$HOME/bin/gpg"
+
+     # Prepend $HOME/bin to PATH if it isn't there already
+     if [[ ":$PATH:" != *":$HOME/bin:"* ]]; then
+       export PATH="$HOME/bin:$PATH"
+     fi
+     ```
+
+   - Now in Windows, you can invoke `git-secret` with `wsl git-secret`.
+   - Alternatively, you can set up aliases in CMD to shorten the syntax; see [this SO answer](https://stackoverflow.com/a/65823225) for instructions. Some recommended aliases are:
+
+     ```bat
+     @echo off
+
+     :: Commands
+     DOSKEY ls=dir /B $*
+     DOSKEY ll=dir /a $*
+     DOSKEY git-secret=wsl git-secret $*
+     DOSKEY gs=wsl git-secret $*
+     ```
+
 ### Code base structure
 
 - documents: define document
diff --git a/credentials.txt.secret b/credentials.txt.secret
new file mode 100644
index 0000000..c686370
Binary files /dev/null and b/credentials.txt.secret differ
diff --git a/knowledgehub/llms/base.py b/knowledgehub/llms/base.py
index 85ea0b9..db09bd9 100644
--- a/knowledgehub/llms/base.py
+++ b/knowledgehub/llms/base.py
@@ -1,15 +1,16 @@
 from dataclasses import dataclass, field
+from typing import List
 
 from ..components import BaseComponent
 
 
 @dataclass
 class LLMInterface:
-    text: list[str]
+    text: List[str]
     completion_tokens: int = -1
     total_tokens: int = -1
     prompt_tokens: int = -1
-    logits: list[list[float]] = field(default_factory=list)
+    logits: List[List[float]] = field(default_factory=list)
 
 
 class PromptTemplate(BaseComponent):
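The `typing.List` rewrites above are what keep this module importable on the Python 3.8 runner added in the unit-test matrix: evaluating `list[str]` in a class-body annotation raises `TypeError` on 3.8, while `typing.List[str]` works on every tested version. A standalone sketch of just the dataclass (the `PromptTemplate`/`BaseComponent` parts are omitted here):

```python
from dataclasses import dataclass, field
from typing import List


@dataclass
class LLMInterface:
    """Mirrors the dataclass in the diff, minus the surrounding module."""

    text: List[str]
    completion_tokens: int = -1
    total_tokens: int = -1
    prompt_tokens: int = -1
    logits: List[List[float]] = field(default_factory=list)


# On Python 3.8, the same class written with `text: list[str]` fails at
# import time with "TypeError: 'type' object is not subscriptable".
print(LLMInterface(text=["hello"]))
```
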
diff --git a/knowledgehub/llms/chats/base.py b/knowledgehub/llms/chats/base.py
index ea1aa26..d181d94 100644
--- a/knowledgehub/llms/chats/base.py
+++ b/knowledgehub/llms/chats/base.py
@@ -1,17 +1,12 @@
-from typing import Type, TypeVar
+from typing import List, Type, TypeVar
 
-from theflow.base import Param
 from langchain.schema.language_model import BaseLanguageModel
-
-from langchain.schema.messages import (
-    BaseMessage,
-    HumanMessage,
-)
+from langchain.schema.messages import BaseMessage, HumanMessage
+from theflow.base import Param
 
 from ...components import BaseComponent
 from ..base import LLMInterface
 
-
 Message = TypeVar("Message", bound=BaseMessage)
 
 
@@ -43,11 +38,11 @@ class LangchainChatLLM(ChatLLM):
         message = HumanMessage(content=text)
         return self.run_document([message])
 
-    def run_batch_raw(self, text: list[str]) -> list[LLMInterface]:
+    def run_batch_raw(self, text: List[str]) -> List[LLMInterface]:
         inputs = [[HumanMessage(content=each)] for each in text]
         return self.run_batch_document(inputs)
 
-    def run_document(self, text: list[Message]) -> LLMInterface:
+    def run_document(self, text: List[Message]) -> LLMInterface:
         pred = self.agent.generate([text])
         return LLMInterface(
             text=[each.text for each in pred.generations[0]],
@@ -57,7 +52,7 @@ class LangchainChatLLM(ChatLLM):
             logits=[],
         )
 
-    def run_batch_document(self, text: list[list[Message]]) -> list[LLMInterface]:
+    def run_batch_document(self, text: List[List[Message]]) -> List[LLMInterface]:
         outputs = []
         for each_text in text:
             outputs.append(self.run_document(each_text))
@@ -66,14 +61,14 @@ class LangchainChatLLM(ChatLLM):
     def is_document(self, text) -> bool:
         if isinstance(text, str):
             return False
         elif isinstance(text, list) and isinstance(text[0], str):
             return False
         return True
 
     def is_batch(self, text) -> bool:
         if isinstance(text, str):
             return False
         elif isinstance(text, list):
             if isinstance(text[0], BaseMessage):
                 return False
         return True
diff --git a/knowledgehub/llms/completions/base.py b/knowledgehub/llms/completions/base.py
index 2409c91..145979e 100644
--- a/knowledgehub/llms/completions/base.py
+++ b/knowledgehub/llms/completions/base.py
@@ -1,7 +1,7 @@
-from typing import Type
+from typing import List, Type
 
-from theflow.base import Param
 from langchain.schema.language_model import BaseLanguageModel
+from theflow.base import Param
 
 from ...components import BaseComponent
 from ..base import LLMInterface
@@ -41,7 +41,7 @@ class LangchainLLM(LLM):
             logits=[],
         )
 
-    def run_batch_raw(self, text: list[str]) -> list[LLMInterface]:
+    def run_batch_raw(self, text: List[str]) -> List[LLMInterface]:
         outputs = []
         for each_text in text:
             outputs.append(self.run_raw(each_text))
@@ -50,7 +50,7 @@ class LangchainLLM(LLM):
     def run_document(self, text: str) -> LLMInterface:
         return self.run_raw(text)
 
-    def run_batch_document(self, text: list[str]) -> list[LLMInterface]:
+    def run_batch_document(self, text: List[str]) -> List[LLMInterface]:
         return self.run_batch_raw(text)
 
     def is_document(self, text) -> bool:
diff --git a/knowledgehub/llms/completions/openai.py b/knowledgehub/llms/completions/openai.py
index a510e27..93a25ee 100644
--- a/knowledgehub/llms/completions/openai.py
+++ b/knowledgehub/llms/completions/openai.py
@@ -5,9 +5,11 @@ from .base import LangchainLLM
 
 class OpenAI(LangchainLLM):
     """Wrapper around Langchain's OpenAI class"""
+
     _lc_class = langchain_llms.OpenAI
 
 
 class AzureOpenAI(LangchainLLM):
     """Wrapper around Langchain's AzureOpenAI class"""
+
     _lc_class = langchain_llms.AzureOpenAI
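Note that the `is_document`/`is_batch` pair in `chats/base.py` stays on the builtin `list` in `isinstance` checks (only annotations need `typing.List`), and together the two predicates encode a four-way routing on input shape: `str` → `run_raw`, `List[str]` → `run_batch_raw`, `List[Message]` → `run_document`, `List[List[Message]]` → `run_batch_document`. A self-contained sketch of that decision table (the `Message` stand-in and the flag-to-method routing are assumptions for illustration; the diff shows the predicates and the run methods, not the dispatcher):

```python
from typing import Any


class Message:
    """Stand-in for langchain's BaseMessage, just for this sketch."""

    def __init__(self, content: str):
        self.content = content


def is_document(text: Any) -> bool:
    if isinstance(text, str):
        return False
    elif isinstance(text, list) and isinstance(text[0], str):
        return False
    return True


def is_batch(text: Any) -> bool:
    if isinstance(text, str):
        return False
    elif isinstance(text, list):
        if isinstance(text[0], Message):
            return False
    return True


# (is_document, is_batch) for each input shape:
print(is_document("hi"), is_batch("hi"))                # False False -> run_raw
print(is_document(["a", "b"]), is_batch(["a", "b"]))    # False True  -> run_batch_raw
print(is_document([Message("a")]),
      is_batch([Message("a")]))                         # True  False -> run_document
print(is_document([[Message("a")]]),
      is_batch([[Message("a")]]))                       # True  True  -> run_batch_document
```
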
diff --git a/knowledgehub/loaders/base.py b/knowledgehub/loaders/base.py
index eaae292..f21e2ec 100644
--- a/knowledgehub/loaders/base.py
+++ b/knowledgehub/loaders/base.py
@@ -1,13 +1,10 @@
 class DocumentLoader:
     """Document loader"""
-    pass
 
 
 class TextManipulator:
     """Text manipulation"""
-    pass
 
 
 class DocumentManipulator:
     """Document manipulation"""
-    pass
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..1127b02
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,9 @@
+[pytest]
+minversion = 7.4.0
+testpaths = tests
+addopts = -ra -q
+log_cli = true
+log_level = DEBUG
+log_format = %(asctime)s %(levelname)s %(message)s
+log_date_format = %Y-%m-%d %H:%M:%S
+log_file = logs/pytest-logs.txt
diff --git a/setup.py b/setup.py
index 39ed677..94839d0 100644
--- a/setup.py
+++ b/setup.py
@@ -41,9 +41,8 @@ setuptools.setup(
             "flake8",
             "sphinx",
             "coverage",
-            # optional dependency needed for test
-            "openai"
+            "openai",
         ],
     },
     entry_points={"console_scripts": ["kh=kotaemon.cli:main"]},
diff --git a/tests/test_llms_chat_models.py b/tests/test_llms_chat_models.py
index aba8e8e..392d54e 100644
--- a/tests/test_llms_chat_models.py
+++ b/tests/test_llms_chat_models.py
@@ -1,15 +1,10 @@
 from unittest.mock import patch
 
 from langchain.chat_models import AzureChatOpenAI as AzureChatOpenAILC
-from langchain.schema.messages import (
-    SystemMessage,
-    HumanMessage,
-    AIMessage,
-)
+from langchain.schema.messages import AIMessage, HumanMessage, SystemMessage
 
-from kotaemon.llms.chats.openai import AzureChatOpenAI
 from kotaemon.llms.base import LLMInterface
-
+from kotaemon.llms.chats.openai import AzureChatOpenAI
 
 _openai_chat_completion_response = {
     "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x",
@@ -49,7 +44,9 @@ def test_azureopenai_model(openai_completion):
     # test for str input - stream mode
     output = model("hello world")
-    assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface"
+    assert isinstance(
+        output, LLMInterface
+    ), "Output for single text is not LLMInterface"
     openai_completion.assert_called()
 
     # test for list[str] input - batch mode
@@ -67,7 +64,9 @@ def test_azureopenai_model(openai_completion):
     ]
 
     output = model(messages)
-    assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface"
+    assert isinstance(
+        output, LLMInterface
+    ), "Output for single text is not LLMInterface"
     openai_completion.assert_called()
 
     # test for list[list[message]] input - batch mode
@@ -75,4 +74,3 @@
     assert isinstance(output, list), "Output for batch string is not a list"
     assert isinstance(output[0], LLMInterface), "Output for text is not LLMInterface"
     openai_completion.assert_called()
-
diff --git a/tests/test_llms_completion_models.py b/tests/test_llms_completion_models.py
index 99a354e..495e5b4 100644
--- a/tests/test_llms_completion_models.py
+++ b/tests/test_llms_completion_models.py
@@ -1,10 +1,10 @@
 from unittest.mock import patch
 
-from langchain.llms import AzureOpenAI as AzureOpenAILC, OpenAI as OpenAILC
+from langchain.llms import AzureOpenAI as AzureOpenAILC
+from langchain.llms import OpenAI as OpenAILC
 
-from kotaemon.llms.completions.openai import AzureOpenAI, OpenAI
 from kotaemon.llms.base import LLMInterface
-
+from kotaemon.llms.completions.openai import AzureOpenAI, OpenAI
 
 _openai_completion_response = {
     "id": "cmpl-7qyNoIo6gRSCJR0hi8o3ZKBH4RkJ0",
@@ -41,7 +41,9 @@ def test_azureopenai_model(openai_completion):
     openai_completion.assert_called()
 
     output = model("hello world")
-    assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface"
+    assert isinstance(
+        output, LLMInterface
+    ), "Output for single text is not LLMInterface"
 
 
 @patch(
@@ -67,4 +69,6 @@ def test_openai_model(openai_completion):
     openai_completion.assert_called()
 
     output = model("hello world")
-    assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface"
+    assert isinstance(
+        output, LLMInterface
+    ), "Output for single text is not LLMInterface"
diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py
index 59b184f..3b1e96c 100644
--- a/tests/test_telemetry.py
+++ b/tests/test_telemetry.py
@@ -29,11 +29,14 @@ def clean_artifacts_for_telemetry():
 def test_disable_telemetry_import_haystack_first():
     """Test that telemetry is disabled when kotaemon lib is initiated after"""
     import os
+
     import haystack.telemetry
+
     assert haystack.telemetry.telemetry is not None
     assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") != "False"
 
-    import kotaemon # noqa: F401
+    import kotaemon  # noqa: F401
+
     assert haystack.telemetry.telemetry is None
     assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") == "False"
 
@@ -43,8 +46,9 @@ def test_disable_telemetry_import_haystack_after_kotaemon():
     """Test that telemetry is disabled when kotaemon lib is initiated before"""
     import os
 
-    import kotaemon # noqa: F401
     import haystack.telemetry
+
+    import kotaemon  # noqa: F401
+
     assert haystack.telemetry.telemetry is None
     assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") == "False"
-
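The telemetry tests pin down a side effect of `import kotaemon`: whichever import order is used, `HAYSTACK_TELEMETRY_ENABLED` must end up `"False"` and `haystack.telemetry.telemetry` must be `None`. The diff does not include kotaemon's side of this, but a minimal sketch of how a package `__init__` could satisfy both assertions (an assumption for illustration, not the repo's actual code):

```python
# Hypothetical package __init__.py -- not taken from the kotaemon source.
import os
import sys

# Covers the "haystack imported later" case: haystack reads this env var
# when its telemetry module is first imported.
os.environ["HAYSTACK_TELEMETRY_ENABLED"] = "False"

# Covers the "haystack imported first" case: neutralize the telemetry
# object that was already created.
if "haystack.telemetry" in sys.modules:
    sys.modules["haystack.telemetry"].telemetry = None
```
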