[AUR-361] Setup pre-commit, pytest, GitHub actions, ssh-secret (#3)

Co-authored-by: trducng <trungduc1992@gmail.com>
ian_Cin 2023-08-30 07:22:01 +07:00 committed by GitHub
parent c3c25db48c
commit 5241edbc46
19 changed files with 268 additions and 54 deletions

.github/workflows/style-check.yaml

@@ -0,0 +1,20 @@
name: style-check
on:
  pull_request:
    branches: [main]
  push:
    branches: [main]

jobs:
  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - name: Clone the repo
        uses: actions/checkout@v3
      - name: Setup python
        uses: actions/setup-python@v4
        with:
          python-version: "3.10"
      - name: run pre-commit
        uses: pre-commit/action@v3.0.0
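As an aside: `pre-commit/action` effectively runs pre-commit's own CLI against the whole tree, so the same check can be approximated locally. A minimal sketch, assuming Python is available and the repo root is the working directory:

```shell
# Install pre-commit and run every configured hook on all files,
# which is roughly what the style-check job above does in CI
pip install pre-commit
pre-commit run --all-files
```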

.github/workflows/unit-test.yaml

@@ -0,0 +1,32 @@
name: unit-test
on:
  pull_request:
    branches: [main]
  push:
    branches: [main]

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11"]
    name: unit testing with python ${{ matrix.python-version }}
    steps:
      - name: Clone the repo
        uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
          architecture: x64
      - name: Display Python version
        run: python -c "import sys; print(sys.version)"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e .[dev]
      - name: Test with pytest
        run: |
          pytest
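To reproduce a single cell of the matrix locally, the steps above translate to roughly the following sketch (assuming conda; the environment name is a placeholder):

```shell
# Mirror the Python 3.10 matrix entry of the unit-test workflow
conda create -n kotaemon-py310 python=3.10 -y
conda activate kotaemon-py310
python -m pip install --upgrade pip
pip install -e .[dev]
pytest
```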

.gitignore
@@ -51,7 +51,6 @@ flycheck_*.el
# network security
/network-security.data
### Linux ###
# temporary files which can be created if a process still has a handle open of a deleted file
@@ -75,7 +74,6 @@ flycheck_*.el
# Icon must end with two \r
Icon
# Thumbnails
._*
@@ -451,3 +449,10 @@ $RECYCLE.BIN/
.theflow/
# End of https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
+logs/
+.gitsecret/keys/random_seed
+!*.secret
+credentials.txt
+S.gpg-agent*

.gitsecret/keys/pubring.kbx
Binary file not shown.

.gitsecret/keys/trustdb.gpg
Binary file not shown.

.gitsecret/paths/mapping.cfg

@@ -0,0 +1 @@
credentials.txt:1e17fa46dd8353b5ded588b32983ac7d800e70fd16bc5831663b9aaefc409011
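An editor's note, not part of the commit: git-secret records each tracked path in this mapping file together with a checksum of its plaintext. The 64 hex digits are consistent with SHA-256, so, assuming that is the hash git-secret uses, a revealed file can be spot-checked with:

```shell
sha256sum credentials.txt   # the digest should match the value recorded above
```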

.pre-commit-config.yaml

@@ -0,0 +1,49 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.3.0
    hooks:
      - id: check-yaml
      - id: check-toml
      - id: end-of-file-fixer
      - id: trailing-whitespace
      - id: detect-aws-credentials
        args: ["--allow-missing-credentials"]
      - id: detect-private-key
      - id: check-added-large-files
  - repo: https://github.com/ambv/black
    rev: 22.3.0
    hooks:
      - id: black
        language_version: python3
  - repo: https://github.com/pycqa/isort
    rev: 5.12.0
    hooks:
      - id: isort
        args: ["--profile", "black"]
        language_version: python3.10
  - repo: https://github.com/pycqa/flake8
    rev: 4.0.1
    hooks:
      - id: flake8
        args: ["--max-line-length", "88"]
  - repo: https://github.com/myint/autoflake
    rev: v1.4
    hooks:
      - id: autoflake
        args:
          [
            "--in-place",
            "--remove-unused-variables",
            "--remove-all-unused-imports",
            "--ignore-init-module-imports",
            "--exclude=tests/*",
          ]
  - repo: https://github.com/pre-commit/mirrors-prettier
    rev: v2.7.1
    hooks:
      - id: prettier
        types_or: [markdown, yaml]
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: "v1.5.1"
    hooks:
      - id: mypy
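Assuming `pre-commit` itself is installed (e.g. via `pip install pre-commit`), this configuration is driven with the standard workflow, sketched below:

```shell
pre-commit install                 # register the git hook (once per clone)
pre-commit run --all-files         # run every hook against the entire tree
pre-commit run black --all-files   # or run a single hook by its id
```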

README.md
@@ -13,18 +13,116 @@ pip install kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git
### Setup
+- Create conda environment (suggest 3.10)
```shell
-# Create conda environment (suggest 3.10)
conda create -n kotaemon python=3.10
conda activate kotaemon
+```
-# Install all
+- Clone the repo
+```shell
+git clone git@github.com:Cinnamon/kotaemon.git
+cd kotaemon
+```
+- Install all
+```shell
pip install -e ".[dev]"
+```
-# Test
+- Pre-commit
+```shell
+pre-commit install
+```
+- Test
+```shell
pytest tests
```
+### Credential sharing
+This repo uses [git-secret](https://sobolevn.me/git-secret/) to share credentials; internally it uses `gpg` to encrypt and decrypt secret files.
+#### Install git-secret
+Please follow the [official guide](https://sobolevn.me/git-secret/installation) to install git-secret.
+#### Gaining access
+To gain access to the secret files, provide your gpg public key file to anyone who already has access and ask them to add your key to the keyring. For a quick tutorial on generating a gpg key pair, see the `Using gpg` section on the [git-secret main page](https://sobolevn.me/git-secret/).
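For illustration, a minimal sketch of that exchange (an editor's example, not part of this commit; the email address and file name are placeholders):

```shell
# Newcomer: generate a key pair and export the public key
gpg --gen-key
gpg --armor --export new.member@example.com > new-member.pub.gpg

# Existing member: import the key and grant access
gpg --import new-member.pub.gpg
git-secret tell new.member@example.com
# Re-encrypt so existing secrets become readable by the new key
git-secret reveal && git-secret hide
```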
+#### Decrypt the secret file
+The credentials are encrypted in the `credentials.txt.secret` file. To print the decrypted content to stdout, run
+```shell
+git-secret cat [filename]
+```
+Or, to get the decrypted `credentials.txt` file, run
+```shell
+git-secret reveal [filename]
+```
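For day-to-day edits, a hedged sketch of the full round trip (git-secret's `-d` flag deletes the plaintext after re-encrypting):

```shell
git-secret reveal            # decrypt credentials.txt.secret -> credentials.txt
"$EDITOR" credentials.txt    # edit the plaintext locally
git-secret hide -d           # re-encrypt and remove the plaintext copy
git add credentials.txt.secret && git commit -m "Update credentials"
```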
+#### For Windows users
+git-secret is currently not available for Windows, so the easiest way is to use it in WSL (please use the latest version of WSL2). From there you have two options:
+1. Using the gpg of WSL.
+This is the most straightforward option, since you would use WSL just like any other Unix environment. The downside is that you have to make WSL your main environment, which means WSL must have write permission on your repo. To achieve this, you must either:
+- Clone and store your repo inside WSL's file system.
+- Grant WSL the necessary permissions on your Windows file system. This can be achieved by setting the `automount` options for WSL. To do that, add the following to `/etc/wsl.conf` and then restart your subsystem:
+```shell
+[automount]
+options = "metadata,umask=022,fmask=011"
+```
+This grants full permissions to the owning user.
+2. Using the gpg of Windows but with git-secret from WSL.
+For those who use Windows as the main environment, switching back and forth between Windows and WSL is inconvenient. You can instead stay within your Windows environment and apply a few tricks to use `git-secret` from WSL.
+- Install and set up `gpg` on Windows.
+- Install `git-secret` on WSL.
+- Make WSL use the `gpg` executable from Windows. This can be done by aliasing `gpg` to the Windows `gpg.exe` executable. Add the following to your startup script:
+```shell
+# Create ~/bin if it doesn't exist
+[ ! -d "$HOME/bin" ] && mkdir "$HOME/bin"
+# Link the Windows executable
+ln -snf "$(which gpg.exe)" "$HOME/bin/gpg"
+# Prepend $HOME/bin to PATH if it is not already present
+if [[ ":$PATH:" != *":$HOME/bin:"* ]]; then
+  export PATH="$HOME/bin:$PATH"
+fi
+```
+- Now, from Windows, you can invoke `git-secret` via `wsl git-secret`.
+- Alternatively, you can set up aliases in CMD to shorten the syntax. Please refer to [this SO answer](https://stackoverflow.com/a/65823225) for instructions. Some recommended aliases are:
+```bat
+@echo off
+:: Commands
+DOSKEY ls=dir /B $*
+DOSKEY ll=dir /a $*
+DOSKEY git-secret=wsl git-secret $*
+DOSKEY gs=wsl git-secret $*
+```
### Code base structure
- documents: define document

credentials.txt.secret
Binary file not shown.

kotaemon/llms/base.py
@@ -1,15 +1,16 @@
from dataclasses import dataclass, field
+from typing import List

from ..components import BaseComponent


@dataclass
class LLMInterface:
-    text: list[str]
+    text: List[str]
    completion_tokens: int = -1
    total_tokens: int = -1
    prompt_tokens: int = -1
-    logits: list[list[float]] = field(default_factory=list)
+    logits: List[List[float]] = field(default_factory=list)


class PromptTemplate(BaseComponent):

kotaemon/llms/chats/base.py
@@ -1,17 +1,12 @@
-from typing import Type, TypeVar
+from typing import List, Type, TypeVar

-from theflow.base import Param
from langchain.schema.language_model import BaseLanguageModel
-from langchain.schema.messages import (
-    BaseMessage,
-    HumanMessage,
-)
+from langchain.schema.messages import BaseMessage, HumanMessage
+from theflow.base import Param

from ...components import BaseComponent
from ..base import LLMInterface

Message = TypeVar("Message", bound=BaseMessage)
@@ -43,11 +38,11 @@ class LangchainChatLLM(ChatLLM):
        message = HumanMessage(content=text)
        return self.run_document([message])

-    def run_batch_raw(self, text: list[str]) -> list[LLMInterface]:
+    def run_batch_raw(self, text: List[str]) -> List[LLMInterface]:
        inputs = [[HumanMessage(content=each)] for each in text]
        return self.run_batch_document(inputs)

-    def run_document(self, text: list[Message]) -> LLMInterface:
+    def run_document(self, text: List[Message]) -> LLMInterface:
        pred = self.agent.generate([text])
        return LLMInterface(
            text=[each.text for each in pred.generations[0]],
@@ -57,7 +52,7 @@ class LangchainChatLLM(ChatLLM):
            logits=[],
        )

-    def run_batch_document(self, text: list[list[Message]]) -> list[LLMInterface]:
+    def run_batch_document(self, text: List[List[Message]]) -> List[LLMInterface]:
        outputs = []
        for each_text in text:
            outputs.append(self.run_document(each_text))
@@ -66,14 +61,14 @@ class LangchainChatLLM(ChatLLM):
    def is_document(self, text) -> bool:
        if isinstance(text, str):
            return False
-        elif isinstance(text, list) and isinstance(text[0], str):
+        elif isinstance(text, List) and isinstance(text[0], str):
            return False
        return True

    def is_batch(self, text) -> bool:
        if isinstance(text, str):
            return False
-        elif isinstance(text, list):
+        elif isinstance(text, List):
            if isinstance(text[0], BaseMessage):
                return False
        return True

kotaemon/llms/completions/base.py
@@ -1,7 +1,7 @@
-from typing import Type
+from typing import List, Type

-from theflow.base import Param
from langchain.schema.language_model import BaseLanguageModel
+from theflow.base import Param

from ...components import BaseComponent
from ..base import LLMInterface
@@ -41,7 +41,7 @@ class LangchainLLM(LLM):
            logits=[],
        )

-    def run_batch_raw(self, text: list[str]) -> list[LLMInterface]:
+    def run_batch_raw(self, text: List[str]) -> List[LLMInterface]:
        outputs = []
        for each_text in text:
            outputs.append(self.run_raw(each_text))
@@ -50,7 +50,7 @@ class LangchainLLM(LLM):
    def run_document(self, text: str) -> LLMInterface:
        return self.run_raw(text)

-    def run_batch_document(self, text: list[str]) -> list[LLMInterface]:
+    def run_batch_document(self, text: List[str]) -> List[LLMInterface]:
        return self.run_batch_raw(text)

    def is_document(self, text) -> bool:

kotaemon/llms/completions/openai.py
@@ -5,9 +5,11 @@ from .base import LangchainLLM
class OpenAI(LangchainLLM):
    """Wrapper around Langchain's OpenAI class"""

    _lc_class = langchain_llms.OpenAI


class AzureOpenAI(LangchainLLM):
    """Wrapper around Langchain's AzureOpenAI class"""

    _lc_class = langchain_llms.AzureOpenAI

[file path not shown in source]
@@ -1,13 +1,10 @@
class DocumentLoader:
    """Document loader"""

-    pass


class TextManipulator:
    """Text manipulation"""

-    pass


class DocumentManipulator:
    """Document manipulation"""

-    pass

pytest.ini

@@ -0,0 +1,9 @@
[pytest]
minversion = 7.4.0
testpaths = tests
addopts = -ra -q
log_cli=true
log_level=DEBUG
log_format = %(asctime)s %(levelname)s %(message)s
log_date_format = %Y-%m-%d %H:%M:%S
log_file = logs/pytest-logs.txt
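With this configuration, typical invocations look like the sketch below (the `-k` expression is just an illustrative example):

```shell
pytest                      # runs tests/ quietly with a short failure summary (-ra -q)
pytest -k telemetry         # select a subset of tests by keyword
tail logs/pytest-logs.txt   # inspect the DEBUG log written during the run
```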

setup.py
@@ -41,9 +41,8 @@ setuptools.setup(
            "flake8",
            "sphinx",
            "coverage",
            # optional dependency needed for test
-            "openai"
+            "openai",
        ],
    },
    entry_points={"console_scripts": ["kh=kotaemon.cli:main"]},

[file path not shown in source]
@@ -1,15 +1,10 @@
from unittest.mock import patch

from langchain.chat_models import AzureChatOpenAI as AzureChatOpenAILC
-from langchain.schema.messages import (
-    SystemMessage,
-    HumanMessage,
-    AIMessage,
-)
+from langchain.schema.messages import AIMessage, HumanMessage, SystemMessage

-from kotaemon.llms.chats.openai import AzureChatOpenAI
from kotaemon.llms.base import LLMInterface
+from kotaemon.llms.chats.openai import AzureChatOpenAI

_openai_chat_completion_response = {
    "id": "chatcmpl-7qyuw6Q1CFCpcKsMdFkmUPUa7JP2x",
@@ -49,7 +44,9 @@ def test_azureopenai_model(openai_completion):
    # test for str input - stream mode
    output = model("hello world")
-    assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface"
+    assert isinstance(
+        output, LLMInterface
+    ), "Output for single text is not LLMInterface"
    openai_completion.assert_called()

    # test for list[str] input - batch mode
@@ -67,7 +64,9 @@ def test_azureopenai_model(openai_completion):
    ]
    output = model(messages)
-    assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface"
+    assert isinstance(
+        output, LLMInterface
+    ), "Output for single text is not LLMInterface"
    openai_completion.assert_called()

    # test for list[list[message]] input - batch mode
@@ -75,4 +74,3 @@ def test_azureopenai_model(openai_completion):
    assert isinstance(output, list), "Output for batch string is not a list"
    assert isinstance(output[0], LLMInterface), "Output for text is not LLMInterface"
    openai_completion.assert_called()

[file path not shown in source]
@@ -1,10 +1,10 @@
from unittest.mock import patch

-from langchain.llms import AzureOpenAI as AzureOpenAILC, OpenAI as OpenAILC
+from langchain.llms import AzureOpenAI as AzureOpenAILC
+from langchain.llms import OpenAI as OpenAILC

-from kotaemon.llms.completions.openai import AzureOpenAI, OpenAI
from kotaemon.llms.base import LLMInterface
+from kotaemon.llms.completions.openai import AzureOpenAI, OpenAI

_openai_completion_response = {
    "id": "cmpl-7qyNoIo6gRSCJR0hi8o3ZKBH4RkJ0",
@@ -41,7 +41,9 @@ def test_azureopenai_model(openai_completion):
    openai_completion.assert_called()
    output = model("hello world")
-    assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface"
+    assert isinstance(
+        output, LLMInterface
+    ), "Output for single text is not LLMInterface"


@patch(
@@ -67,4 +69,6 @@ def test_openai_model(openai_completion):
    openai_completion.assert_called()
    output = model("hello world")
-    assert isinstance(output, LLMInterface), "Output for single text is not LLMInterface"
+    assert isinstance(
+        output, LLMInterface
+    ), "Output for single text is not LLMInterface"

[file path not shown in source]
@@ -29,11 +29,14 @@ def clean_artifacts_for_telemetry():
def test_disable_telemetry_import_haystack_first():
    """Test that telemetry is disabled when kotaemon lib is initiated after"""
    import os

    import haystack.telemetry

    assert haystack.telemetry.telemetry is not None
    assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") != "False"

    import kotaemon  # noqa: F401

    assert haystack.telemetry.telemetry is None
    assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") == "False"
@@ -43,8 +46,9 @@ def test_disable_telemetry_import_haystack_after_kotaemon():
    """Test that telemetry is disabled when kotaemon lib is initiated before"""
    import os

-    import kotaemon  # noqa: F401
    import haystack.telemetry
+
+    import kotaemon  # noqa: F401

    assert haystack.telemetry.telemetry is None
    assert os.environ.get("HAYSTACK_TELEMETRY_ENABLED", "True") == "False"