From d3fd75297f7d78fc88b1e58f2f698ec9851c0980 Mon Sep 17 00:00:00 2001 From: kan_cin Date: Thu, 12 Sep 2024 20:25:03 +0700 Subject: [PATCH] feat: add multi-stages docker and support platform arm (#274) * feat: add multi-stages docker and support platform arm * refactor: pre-commit * fix: raise ImportError (fastembed) instead of auto install * feat: add dependencies for local llm * feat: free disk * feat: update README * feat: update README * chore: fix typo --------- Co-authored-by: cin-niko --- .github/workflows/build-push-docker.yaml | 49 ++++++++++-- Dockerfile | 80 ++++++++++++++----- README.md | 33 +++++++- .../kotaemon/kotaemon/embeddings/fastembed.py | 5 +- libs/kotaemon/pyproject.toml | 3 + scripts/download_pdfjs.sh | 2 + 6 files changed, 141 insertions(+), 31 deletions(-) diff --git a/.github/workflows/build-push-docker.yaml b/.github/workflows/build-push-docker.yaml index f055212..98f3233 100644 --- a/.github/workflows/build-push-docker.yaml +++ b/.github/workflows/build-push-docker.yaml @@ -25,8 +25,9 @@ jobs: id-token: write strategy: matrix: - platform: - - linux/amd64 + target: + - lite + - full steps: - name: Set repository and image name run: | @@ -37,15 +38,32 @@ jobs: - name: Checkout uses: actions/checkout@v4 + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + with: + image: tonistiigi/binfmt:latest + platforms: arm64,arm + - name: Set up Docker Buildx id: buildx - uses: docker/setup-buildx-action@v2 + uses: docker/setup-buildx-action@v3 - name: Set up Docker meta id: meta uses: docker/metadata-action@v5 with: images: ${{ env.FULL_IMAGE_NAME }} + tags: | + # branch + type=ref,event=branch,suffix=-${{ matrix.target }} + # semver with suffix for lite/full targets + type=semver,pattern={{version}},suffix=-${{ matrix.target }} + # latest tag with suffix for lite/full targets + type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags/') && !contains(github.ref, 'pre') }},suffix=-${{ matrix.target }} + flavor: | + # This is disabled here so we can use the raw form above + latest=false + # Suffix is not used here since there's no way to disable it above - name: Log in to the Container registry uses: docker/login-action@v3 @@ -54,15 +72,32 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + # this might remove tools that are actually needed, + # if set to "true" but frees about 6 GB + tool-cache: false + + # all of these default to true, but feel free to set to + # "false" if necessary for your workflow + android: true + dotnet: true + haskell: true + large-packages: true + docker-images: true + swap-storage: true + - name: Build docker image - uses: docker/build-push-action@v4 + uses: docker/build-push-action@v6 with: file: Dockerfile context: . push: true - platforms: ${{ matrix.platform }} - tags: ${{ steps.meta.outputs.tags }} + platforms: linux/amd64,linux/arm64 + tags: | + ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} + target: ${{ matrix.target }} cache-from: type=gha cache-to: type=gha,mode=max - load: true diff --git a/Dockerfile b/Dockerfile index c281162..14bf651 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,7 @@ -# syntax=docker/dockerfile:1.0.0-experimental -FROM python:3.10-slim as base_image - -# for additional file parsers - -# tesseract-ocr \ -# tesseract-ocr-jpn \ -# libsm6 \ -# libxext6 \ -# ffmpeg \ +# Lite version +FROM python:3.10-slim AS lite +# Common dependencies RUN apt-get update -qqy && \ apt-get install -y --no-install-recommends \ ssh \ @@ -19,28 +12,75 @@ RUN apt-get update -qqy && \ libpoppler-dev \ unzip \ curl \ - && apt-get clean \ - && apt-get autoremove \ - && rm -rf /var/lib/apt/lists/* + cargo +# Set environment variables ENV PYTHONDONTWRITEBYTECODE=1 ENV PYTHONUNBUFFERED=1 ENV PYTHONIOENCODING=UTF-8 +# Create working directory WORKDIR /app -FROM base_image as dev - +# Download pdfjs COPY scripts/download_pdfjs.sh /app/scripts/download_pdfjs.sh RUN chmod +x /app/scripts/download_pdfjs.sh - ENV PDFJS_PREBUILT_DIR="/app/libs/ktem/ktem/assets/prebuilt/pdfjs-dist" RUN bash scripts/download_pdfjs.sh $PDFJS_PREBUILT_DIR +# Copy contents COPY . /app -RUN --mount=type=ssh pip install --no-cache-dir -e "libs/kotaemon[all]" \ - && pip install --no-cache-dir -e "libs/ktem" \ - && pip install --no-cache-dir graphrag future \ - && pip install --no-cache-dir "pdfservices-sdk@git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements" + +# Install pip packages +RUN --mount=type=ssh \ + --mount=type=cache,target=/root/.cache/pip \ + pip install -e "libs/kotaemon[all]" \ + && pip install -e "libs/ktem" \ + && pip install graphrag future \ + && pip install "pdfservices-sdk@git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements" + +# Clean up +RUN apt-get autoremove \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf ~/.cache + +CMD ["python", "app.py"] + +# Full version +FROM lite AS full + +# Additional dependencies for full version +RUN apt-get update -qqy && \ + apt-get install -y --no-install-recommends \ + tesseract-ocr \ + tesseract-ocr-jpn \ + libsm6 \ + libxext6 \ + libreoffice \ + ffmpeg \ + libmagic-dev + +# Install torch and torchvision for unstructured +RUN --mount=type=ssh \ + --mount=type=cache,target=/root/.cache/pip \ + pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + +# Copy contents +COPY . /app + +# Install additional pip packages +RUN --mount=type=ssh \ + --mount=type=cache,target=/root/.cache/pip \ + pip install unstructured[all-docs] + +# Clean up +RUN apt-get autoremove \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* \ + && rm -rf ~/.cache + +# Download nltk packages as required for unstructured +RUN python -c "from unstructured.nlp.tokenize import _download_nltk_packages_if_not_present; _download_nltk_packages_if_not_present()" CMD ["python", "app.py"] diff --git a/README.md b/README.md index 12b2b4c..33909e2 100644 --- a/README.md +++ b/README.md @@ -86,17 +86,44 @@ Use the most recent release .zip to include latest features and bug-fixes. #### With Docker (recommended) -- Use this command to launch the server +We support `lite` & `full` version of Dockerfile. With `full`, the extra packages of `unstructured` will be installed as +well, it can support multiple file types (.doc, .docx, ...) but the cost is larger docker image size + +- To use the `lite` version. ``` docker run \ -e GRADIO_SERVER_NAME=0.0.0.0 \ -e GRADIO_SERVER_PORT=7860 \ -p 7860:7860 -it --rm \ -ghcr.io/cinnamon/kotaemon:latest +ghcr.io/cinnamon/kotaemon:latest-lite ``` -Navigate to `http://localhost:7860/` to access the web UI. +- To use the `full` version. + +``` +docker run \ +-e GRADIO_SERVER_NAME=0.0.0.0 \ +-e GRADIO_SERVER_PORT=7860 \ +-p 7860:7860 -it --rm \ +ghcr.io/cinnamon/kotaemon:latest-full +``` + +Currently, two platforms: `linux/amd64` and `linux/arm64` (for newer Mac) are provided & tested. User can specify the platform by passing `--platform` in the docker run command. For example: + +``` +# To run docker with platform linux/arm64 +docker run \ +-e GRADIO_SERVER_NAME=0.0.0.0 \ +-e GRADIO_SERVER_PORT=7860 \ +-p 7860:7860 -it --rm \ +--platform linux/arm64 \ +ghcr.io/cinnamon/kotaemon:latest-lite +``` + +If everything is set up fine, navigate to `http://localhost:7860/` to access the web UI. + +We use [GHCR](https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry) to store docker images, all images can be found [here.](https://github.com/Cinnamon/kotaemon/pkgs/container/kotaemon) #### Without Docker diff --git a/libs/kotaemon/kotaemon/embeddings/fastembed.py b/libs/kotaemon/kotaemon/embeddings/fastembed.py index de9816e..ecaec67 100644 --- a/libs/kotaemon/kotaemon/embeddings/fastembed.py +++ b/libs/kotaemon/kotaemon/embeddings/fastembed.py @@ -41,7 +41,10 @@ class FastEmbedEmbeddings(BaseEmbeddings): @Param.auto() def client_(self) -> "TextEmbedding": - from fastembed import TextEmbedding + try: + from fastembed import TextEmbedding + except ImportError: + raise ImportError("Please install FastEmbed: `pip install fastembed`") return TextEmbedding(model_name=self.model_name) diff --git a/libs/kotaemon/pyproject.toml b/libs/kotaemon/pyproject.toml index ee28cfd..dacba75 100644 --- a/libs/kotaemon/pyproject.toml +++ b/libs/kotaemon/pyproject.toml @@ -70,6 +70,9 @@ adv = [ "python-docx>=1.1.0,<1.2", "tabulate", "wikipedia>=1.4.0,<1.5", + "sentence-transformers", + "llama-cpp-python<0.2.8", + "fastembed", ] dev = [ "black", diff --git a/scripts/download_pdfjs.sh b/scripts/download_pdfjs.sh index 2e9f00f..8506890 100644 --- a/scripts/download_pdfjs.sh +++ b/scripts/download_pdfjs.sh @@ -1,5 +1,7 @@ #!/bin/bash +set -eo pipefail + # Check and capture input argument for PDFJS_VERSION_DIST if [ -z "$1" ]; then echo "Usage: $0 "