From 5241edbc4696386af3c3bacfff648a7ee80caed7 Mon Sep 17 00:00:00 2001 From: ian_Cin Date: Wed, 30 Aug 2023 07:22:01 +0700 Subject: [PATCH] [AUR-361] Setup pre-commit, pytest, GitHub actions, ssh-secret (#3) Co-authored-by: trducng --- .github/workflows/style-check.yaml | 20 ++++ .github/workflows/unit-test.yaml | 32 +++++++ .gitignore | 11 ++- .gitsecret/keys/pubring.kbx | Bin 0 -> 1966 bytes .gitsecret/keys/trustdb.gpg | Bin 0 -> 1200 bytes .gitsecret/paths/mapping.cfg | 1 + .pre-commit-config.yaml | 49 ++++++++++ README.md | 116 ++++++++++++++++++++++-- credentials.txt.secret | Bin 0 -> 486 bytes knowledgehub/llms/base.py | 5 +- knowledgehub/llms/chats/base.py | 21 ++--- knowledgehub/llms/completions/base.py | 8 +- knowledgehub/llms/completions/openai.py | 2 + knowledgehub/loaders/base.py | 3 - pytest.ini | 9 ++ setup.py | 3 +- tests/test_llms_chat_models.py | 18 ++-- tests/test_llms_completion_models.py | 14 ++- tests/test_telemetry.py | 10 +- 19 files changed, 268 insertions(+), 54 deletions(-) create mode 100644 .github/workflows/style-check.yaml create mode 100644 .github/workflows/unit-test.yaml create mode 100644 .gitsecret/keys/pubring.kbx create mode 100644 .gitsecret/keys/trustdb.gpg create mode 100644 .gitsecret/paths/mapping.cfg create mode 100644 .pre-commit-config.yaml create mode 100644 credentials.txt.secret create mode 100644 pytest.ini diff --git a/.github/workflows/style-check.yaml b/.github/workflows/style-check.yaml new file mode 100644 index 0000000..2047461 --- /dev/null +++ b/.github/workflows/style-check.yaml @@ -0,0 +1,20 @@ +name: style-check + +on: + pull_request: + branches: [main] + push: + branches: [main] + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - name: Clone the repo + uses: actions/checkout@v3 + - name: Setup python + uses: actions/setup-python@v4 + with: + python-version: "3.10" + - name: run pre-commit + uses: pre-commit/action@v3.0.0 diff --git a/.github/workflows/unit-test.yaml b/.github/workflows/unit-test.yaml new file mode 100644 index 0000000..5bd66c9 --- /dev/null +++ b/.github/workflows/unit-test.yaml @@ -0,0 +1,32 @@ +name: unit-test + +on: + pull_request: + branches: [main] + push: + branches: [main] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11"] + name: unit testing with python ${{ matrix.python-version }} + steps: + - name: Clone the repo + uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + architecture: x64 + - name: Display Python version + run: python -c "import sys; print(sys.version)" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e .[dev] + - name: Test with pytest + run: | + pytest diff --git a/.gitignore b/.gitignore index 22de1c3..28d4435 100644 --- a/.gitignore +++ b/.gitignore @@ -51,7 +51,6 @@ flycheck_*.el # network security /network-security.data - ### Linux ### # temporary files which can be created if a process still has a handle open of a deleted file @@ -75,7 +74,6 @@ flycheck_*.el # Icon must end with two \r Icon - # Thumbnails ._* @@ -386,7 +384,7 @@ pyrightconfig.json ### Vim ### # Swap [._]*.s[a-v][a-z] -!*.svg # comment out if you don't need vector files +!*.svg # comment out if you don't need vector files [._]*.sw[a-p] [._]s[a-rt-v][a-z] [._]ss[a-gi-z] @@ -451,3 +449,10 @@ $RECYCLE.BIN/ .theflow/ # End of 
https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm + +logs/ +.gitsecret/keys/random_seed +!*.secret +credentials.txt + +S.gpg-agent* diff --git a/.gitsecret/keys/pubring.kbx b/.gitsecret/keys/pubring.kbx new file mode 100644 index 0000000000000000000000000000000000000000..86a2b56937cf8dbed6bdace86a2b411bfa39d47e GIT binary patch literal 1966 zcmaizdocpD~zW$E60j4Q1Mi6ywQd+=qybn57y^VX$N-Gno*{%$T*5OK9DB zl3UqvXNZ-ob;*jNT&8j>N{n2hOzU~5XV3GT=j=KAKIi-WzQ3RE=a26>pZ5y@01OBM zkdw`+0AO2`#aF32YsY_)L^1?^b};||{{%pQD)pYEQBC{7);>0a$9L6#-juB(v)zH& z>AIRs-E=bmNn;J8V8r|e@#wzc+U>I0&fB&?KpFrb^=+s0`3?cF&*%Sb+dsIzQ_2RD zV3cr+3Q!v04F%X?e~DVN`_!Xn@&fmU`*{57h!q!`QjvBgS^BZVEsRad^wHs}sr#|c zg#FxrYP>@9uPtq!!>dOU7~j_){IE5s>@XLg2Et1Z{Y0i1t38Z$*H&t#EGMN|S=_~X zd0_+hNv7Y0(@L1Q4_K)XHnhgL=8I{Cmix2J((Q_ElGumXdSzptk)(l%sHZE5v50}h zU@xv^FmOKE-_2_}#`GlWq6|^pX(S^2w=_ag6N*Dm6Zay@sg!h1<6plK%_hPuh0E4g zs{>4RyEWP?P<-sU?6r?@s`R0}nv;61H*-S@Ps0PLseC2TpyAS^s~S1l^3cswTk_^myZ%&FYE>6Kme@Xo?c*R;FmlA^-9NlXc!MB#w^H( zX5|+|qxn$J)AWKaSp26C`ZCr8a2TclA^;1tChV`WBM$A%7*Y`vmlEp;1jLb2k`fSn zH$qAh3Bk~i7zn9@kYX%=kdpl~Ze3Jq;8ymc{d9pP{%KeK6f%6t&^gdvyTp4v2tUA! zpyq41?`pT>#u)1etNcpvm5C&wXlH9Vf{c?ReY z%2O;N>{;mIG}bow%1Z{(-(E&D*9%_*-}QebjHH~^YT$V5MA2uwTTa>XmvmNk4XKgR zs)|^qw4WbH>!ldX2;;&ZivQAS-`OS!iplo#i%2@@x1mu+zxjV@!?fbqmNU zy)2JEtd-a9NX~UI&csa=z!If$9X_qP3H;WkB^vjyPFXr3BhR+a@Z;zI#S(YFDFU4M z!s-ZlyXWL<{|vfZIZi}bDH~I+ExXP}X?60eP$FHz6!`$EjPDjr_jz^O(bN$4w%It_ zfSmwb?kUPDYb-Goup%-#$#c66S%d4Ft6<^SRd1jO@GEA@2`dfV%uQ=n*R%F zvsA~pFk(E(ahkiKKj&%zvd@Poi0lp>?~W@iu<2U;F}sz%#*Lh~K-3H7B!8O8O}Blo z^|bOye+F*9keo1M@j=n*DM#XkK0Qw`DIoWUp1`y#8D=>AdL^ULOd$?eHLF6Ne`f3^ zUQ%*DsIJh_ePkmKJUfZ-t8_eEP=<2X^m;3HBIKu6PfFHJf}K1}^eTcYoB`Yn8{hZw zPEK0AttD}aFhi|;XGBX?+p;bV1omd-d+Xx3f4JQ>2nvpO6O;P~77GTl!Y6d<-ngF)Kl(uoh{ta%@UvT?htj6{rw=^GQ&>s4Nt zaPNErkHs~0>dk(ly9tgQVLsj~5ao@H9~B5i*6Up)ullesn|t?Aqo-SVc}(|Uzej#N zQ^s8<*MC@O)F$%tB*-SZii01|co<_jnJ-ssQL9UJvdqwBLS<~Jkt0s4Q#n8R{WVCEg~$Ua?hf9`46V%3B}frUC+0~I%2A71f9Bevq^8* zbpOiTbd90bkA8_=*A*#j`M~alMd;~ i_;Aj${quK!VzoM2_gnVvFb=_gmWcEHGmT^vJnR2@c;fCWawWi(tsAuyT_C`$(br&|R$ literal 0 HcmV?d00001 diff --git a/.gitsecret/paths/mapping.cfg b/.gitsecret/paths/mapping.cfg new file mode 100644 index 0000000..576d1d7 --- /dev/null +++ b/.gitsecret/paths/mapping.cfg @@ -0,0 +1 @@ +credentials.txt:1e17fa46dd8353b5ded588b32983ac7d800e70fd16bc5831663b9aaefc409011 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..d97508e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,49 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: check-yaml + - id: check-toml + - id: end-of-file-fixer + - id: trailing-whitespace + - id: detect-aws-credentials + args: ["--allow-missing-credentials"] + - id: detect-private-key + - id: check-added-large-files + - repo: https://github.com/ambv/black + rev: 22.3.0 + hooks: + - id: black + language_version: python3 + - repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + args: ["--profile", "black"] + language_version: python3.10 + - repo: https://github.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + args: ["--max-line-length", "88"] + - repo: https://github.com/myint/autoflake + rev: v1.4 + hooks: + - id: autoflake + args: + [ + "--in-place", + "--remove-unused-variables", + "--remove-all-unused-imports", + "--ignore-init-module-imports", + "--exclude=tests/*", + ] + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v2.7.1 + hooks: + - id: 
prettier
+        types_or: [markdown, yaml]
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: "v1.5.1"
+    hooks:
+      - id: mypy
diff --git a/README.md b/README.md
index a79c399..19befda 100644
--- a/README.md
+++ b/README.md
@@ -13,18 +13,116 @@ pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git
 
 ### Setup
 
+- Create a conda environment (Python 3.10 suggested)
+
+  ```shell
+  conda create -n kotaemon python=3.10
+  conda activate kotaemon
+  ```
+
+- Clone the repo
+
+  ```shell
+  git clone git@github.com:Cinnamon/kotaemon.git
+  cd kotaemon
+  ```
+
+- Install the package with dev dependencies
+
+  ```shell
+  pip install -e ".[dev]"
+  ```
+
+- Install the pre-commit hooks
+
+  ```shell
+  pre-commit install
+  ```
+
+- Run the tests
+
+  ```shell
+  pytest tests
+  ```
+
+### Credential sharing
+
+This repo uses [git-secret](https://sobolevn.me/git-secret/) to share credentials; internally it uses `gpg` to encrypt and decrypt secret files.
+
+#### Install git-secret
+
+Please follow the [official guide](https://sobolevn.me/git-secret/installation) to install git-secret.
+
+#### Gaining access
+
+To gain access to the secret files, provide your gpg public key file to someone who already has access and ask them to add your key to the keyring. For a quick tutorial on generating your gpg key pair, refer to the `Using gpg` section of the [git-secret main page](https://sobolevn.me/git-secret/).
+
+#### Decrypt the secret file
+
+The credentials are encrypted in the `credentials.txt.secret` file. To print the decrypted content to stdout, run
+
 ```shell
-# Create conda environment (suggest 3.10)
-conda create -n kotaemon python=3.10
-conda activate kotaemon
-
-# Install all
-pip install -e ".[dev]"
-
-# Test
-pytest tests
+git-secret cat [filename]
 ```
 
+Or, to restore the decrypted `credentials.txt` file on disk, run
+
+```shell
+git-secret reveal [filename]
+```
+
+#### For Windows users
+
+git-secret is currently not available for Windows, so the easiest way is to use it from WSL (please use the latest version of WSL2). From there you have two options:
+
+1. Using WSL's gpg.
+
+   This is the most straightforward option, since you use WSL just like any other Unix environment. However, the downside is that you have to make WSL your main environment, which means WSL must have write permission on your repo. To achieve this, you must either:
+
+   - Clone and store your repo inside WSL's file system.
+   - Provide WSL with the necessary permissions on your Windows file system. This can be achieved by setting the `automount` options for WSL. To do that, add the following to `/etc/wsl.conf` and then restart your subsystem.
+
+     ```shell
+     [automount]
+     options = "metadata,umask=022,fmask=011"
+     ```
+
+     This grants the owning user full permissions.
+
+2. Using the Windows gpg with git-secret from WSL.
+
+   For those who use Windows as the main environment, having to switch back and forth between Windows and WSL is inconvenient. You can instead stay within your Windows environment and apply a few tricks to use `git-secret` from WSL.
+
+   - Install and set up `gpg` on Windows.
+   - Install `git-secret` on WSL.
+   - Make WSL use the `gpg` executable from Windows. This can be done by aliasing `gpg` to the Windows `gpg.exe` executable. Add this to your startup script:
+
+     ```shell
+     # Create ~/bin if it doesn't exist
+     [ ! -d "$HOME/bin" ] && mkdir "$HOME/bin"
-d "$HOME/bin" ] && mkdir "$HOME/bin" + + # link windows executable + ln -snf "$(which gpg.exe)" "$HOME/bin/gpg" + + # Prepend $HOME/bin to PATH + if [[ ":$PATH:" == *":$HOME/bin:"* ]]; then + export PATH="$HOME/bin:$PATH" + fi + ``` + + - Now in Windows, you can invoke `ssh-secret` using `wsl ssh-secret`. + - Alternatively you can setup alias in CMD to shorten the syntax. Please refer to [this SO answer](https://stackoverflow.com/a/65823225) for the instruction. Some recommended aliases are: + + ```bat + @echo off + + :: Commands + DOSKEY ls=dir /B $* + DOSKEY ll=dir /a $* + DOSKEY git-secret=wsl git-secret $* + DOSKEY gs=wsl git-secret $* + ``` + ### Code base structure - documents: define document diff --git a/credentials.txt.secret b/credentials.txt.secret new file mode 100644 index 0000000000000000000000000000000000000000..c686370158b38628976569ae6a8f072860610d65 GIT binary patch literal 486 zcmV<0gMB9>i0uNB-FA23;>D&*3GeMZCqi(x}8P`CmRNzm;!juW-*?#VVg_Q zO#$f1gXS&F9NYJKwM7M=;z(xB6v&78Xao8$;v-1^V=-c*+FED1U1SJK7sn;on`Awr zECPp+-ekAfB(byzNdK|b%JYrq*gzWvmLq@*M9?%4UOO~niXk`&KX1Ac1ZA5IuczZ1 zXvUopLxOeAkUPsVkZMSy4CHkXdKMl6Jb-WY1DJ=+YVH_IzEGXW+dN;-cZ)L?mi5wj z4>#RewmFK)O8lCmo*^Ay$_s-Mft?vaAffxXy6Wrj*@VXFlGHc&{6uTQ6N#Ai$S-Pv z@+``o+&;7sY>e#~bq?Dd%bV;lNI);Z)s=T2-@nQ(&(PCFi|FV|avXiS3!@Be*3U%= zL}3KVZ)a-6I3NSjcA3ljRXYYr(=#L960-qk|2@9z2vvk%D*KoGR_P;T00!`$t)rr@ z#OfIlz7o9Z;3alB_>{vEb-qiAh3G7nFwH`qSKf#(3L=PM6WdO9f+5mX0oZf1;HxTN zI4!k2P1B6aKn>ug{yzNkn3nGfTpA{=e`dAc)gD!^co#O8&#FlAiNMXl-}xlm(=mo<_A4N9|#s=keAvvDqEZpbC list[LLMInterface]: + def run_batch_raw(self, text: List[str]) -> List[LLMInterface]: inputs = [[HumanMessage(content=each)] for each in text] return self.run_batch_document(inputs) - def run_document(self, text: list[Message]) -> LLMInterface: + def run_document(self, text: List[Message]) -> LLMInterface: pred = self.agent.generate([text]) return LLMInterface( text=[each.text for each in pred.generations[0]], @@ -57,7 +52,7 @@ class LangchainChatLLM(ChatLLM): logits=[], ) - def run_batch_document(self, text: list[list[Message]]) -> list[LLMInterface]: + def run_batch_document(self, text: List[List[Message]]) -> List[LLMInterface]: outputs = [] for each_text in text: outputs.append(self.run_document(each_text)) @@ -66,14 +61,14 @@ class LangchainChatLLM(ChatLLM): def is_document(self, text) -> bool: if isinstance(text, str): return False - elif isinstance(text, list) and isinstance(text[0], str): + elif isinstance(text, List) and isinstance(text[0], str): return False return True def is_batch(self, text) -> bool: if isinstance(text, str): return False - elif isinstance(text, list): + elif isinstance(text, List): if isinstance(text[0], BaseMessage): return False return True diff --git a/knowledgehub/llms/completions/base.py b/knowledgehub/llms/completions/base.py index 2409c91..145979e 100644 --- a/knowledgehub/llms/completions/base.py +++ b/knowledgehub/llms/completions/base.py @@ -1,7 +1,7 @@ -from typing import Type +from typing import List, Type -from theflow.base import Param from langchain.schema.language_model import BaseLanguageModel +from theflow.base import Param from ...components import BaseComponent from ..base import LLMInterface @@ -41,7 +41,7 @@ class LangchainLLM(LLM): logits=[], ) - def run_batch_raw(self, text: list[str]) -> list[LLMInterface]: + def run_batch_raw(self, text: List[str]) -> List[LLMInterface]: outputs = [] for each_text in text: outputs.append(self.run_raw(each_text)) @@ -50,7 +50,7 @@ class LangchainLLM(LLM): def run_document(self, text: str) -> 
LLMInterface: return self.run_raw(text) - def run_batch_document(self, text: list[str]) -> list[LLMInterface]: + def run_batch_document(self, text: List[str]) -> List[LLMInterface]: return self.run_batch_raw(text) def is_document(self, text) -> bool: diff --git a/knowledgehub/llms/completions/openai.py b/knowledgehub/llms/completions/openai.py index a510e27..93a25ee 100644 --- a/knowledgehub/llms/completions/openai.py +++ b/knowledgehub/llms/completions/openai.py @@ -5,9 +5,11 @@ from .base import LangchainLLM class OpenAI(LangchainLLM): """Wrapper around Langchain's OpenAI class""" + _lc_class = langchain_llms.OpenAI class AzureOpenAI(LangchainLLM): """Wrapper around Langchain's AzureOpenAI class""" + _lc_class = langchain_llms.AzureOpenAI diff --git a/knowledgehub/loaders/base.py b/knowledgehub/loaders/base.py index eaae292..f21e2ec 100644 --- a/knowledgehub/loaders/base.py +++ b/knowledgehub/loaders/base.py @@ -1,13 +1,10 @@ class DocumentLoader: """Document loader""" - pass class TextManipulator: """Text manipulation""" - pass class DocumentManipulator: """Document manipulation""" - pass diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..1127b02 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,9 @@ +[pytest] +minversion = 7.4.0 +testpaths = tests +addopts = -ra -q +log_cli=true +log_level=DEBUG +log_format = %(asctime)s %(levelname)s %(message)s +log_date_format = %Y-%m-%d %H:%M:%S +log_file = logs/pytest-logs.txt diff --git a/setup.py b/setup.py index 39ed677..94839d0 100644 --- a/setup.py +++ b/setup.py @@ -41,9 +41,8 @@ setuptools.setup( "flake8", "sphinx", "coverage", - # optional dependency needed for test - "openai" + "openai", ], }, entry_points={"console_scripts": ["kh=kotaemon.cli:main"]}, diff --git a/tests/test_llms_chat_models.py b/tests/test_llms_chat_models.py index aba8e8e..392d54e 100644 --- a/tests/test_llms_chat_models.py +++ b/tests/test_llms_chat_models.py @@ -1,15 +1,10 @@ from unittest.mock import patch from langchain.chat_models import AzureChatOpenAI as AzureChatOpenAILC -from langchain.schema.messages import ( - SystemMessage, - HumanMessage, - AIMessage, -) +from langchain.schema.messages import AIMessage, HumanMessage, SystemMessage -from kotaemon.llms.chats.openai import AzureChatOpenAI from kotaemon.llms.base import LLMInterface - +from kotaemon.llms.chats.openai import AzureChatOpenAI _openai_chat_completion_response = { "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x", @@ -49,7 +44,9 @@ def test_azureopenai_model(openai_completion): # test for str input - stream mode output = model("hello world") - assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface" + assert isinstance( + output, LLMInterface + ), "Output for single text is not LLMInterface" openai_completion.assert_called() # test for list[str] input - batch mode @@ -67,7 +64,9 @@ def test_azureopenai_model(openai_completion): ] output = model(messages) - assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface" + assert isinstance( + output, LLMInterface + ), "Output for single text is not LLMInterface" openai_completion.assert_called() # test for list[list[message]] input - batch mode @@ -75,4 +74,3 @@ def test_azureopenai_model(openai_completion): assert isinstance(output, list), "Output for batch string is not a list" assert isinstance(output[0], LLMInterface), "Output for text is not LLMInterface" openai_completion.assert_called() - diff --git a/tests/test_llms_completion_models.py b/tests/test_llms_completion_models.py index 
99a354e..495e5b4 100644 --- a/tests/test_llms_completion_models.py +++ b/tests/test_llms_completion_models.py @@ -1,10 +1,10 @@ from unittest.mock import patch -from langchain.llms import AzureOpenAI as AzureOpenAILC, OpenAI as OpenAILC +from langchain.llms import AzureOpenAI as AzureOpenAILC +from langchain.llms import OpenAI as OpenAILC -from kotaemon.llms.completions.openai import AzureOpenAI, OpenAI from kotaemon.llms.base import LLMInterface - +from kotaemon.llms.completions.openai import AzureOpenAI, OpenAI _openai_completion_response = { "id": "cmpl-7qyNoIo6gRSCJR0hi8o3ZKBH4RkJ0", @@ -41,7 +41,9 @@ def test_azureopenai_model(openai_completion): openai_completion.assert_called() output = model("hello world") - assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface" + assert isinstance( + output, LLMInterface + ), "Output for single text is not LLMInterface" @patch( @@ -67,4 +69,6 @@ def test_openai_model(openai_completion): openai_completion.assert_called() output = model("hello world") - assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface" + assert isinstance( + output, LLMInterface + ), "Output for single text is not LLMInterface" diff --git a/tests/test_telemetry.py b/tests/test_telemetry.py index 59b184f..3b1e96c 100644 --- a/tests/test_telemetry.py +++ b/tests/test_telemetry.py @@ -29,11 +29,14 @@ def clean_artifacts_for_telemetry(): def test_disable_telemetry_import_haystack_first(): """Test that telemetry is disabled when kotaemon lib is initiated after""" import os + import haystack.telemetry + assert haystack.telemetry.telemetry is not None assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") != "False" - import kotaemon # noqa: F401 + import kotaemon # noqa: F401 + assert haystack.telemetry.telemetry is None assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") == "False" @@ -43,8 +46,9 @@ def test_disable_telemetry_import_haystack_after_kotaemon(): """Test that telemetry is disabled when kotaemon lib is initiated before""" import os - import kotaemon # noqa: F401 import haystack.telemetry + + import kotaemon # noqa: F401 + assert haystack.telemetry.telemetry is None assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") == "False" -