diff --git a/.github/workflows/build-push-docker.yaml b/.github/workflows/build-push-docker.yaml
new file mode 100644
--- /dev/null
+++ b/.github/workflows/build-push-docker.yaml
@@ -0,0 +1,69 @@
+# Builds the image from the repository's Dockerfile and pushes it to ghcr.io
+# when a vX.Y.Z tag is pushed or the workflow is dispatched manually.
+name: Build and Push Docker Image
+
+on:
+  push:
+    tags:
+      - "v[0-9]+.[0-9]+.[0-9]+"
+
+  workflow_dispatch:
+
+env:
+  REGISTRY: ghcr.io
+
+jobs:
+  build:
+    name: Build and push container
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write
+      attestations: write
+      id-token: write
+    strategy:
+      matrix:
+        platform:
+          - linux/amd64
+    steps:
+      - name: Set repository and image name
+        # Lowercase owner/repo with bash's ${VAR,,} expansion and export the
+        # full image name to the steps that follow.
+        run: |
+          echo "FULL_IMAGE_NAME=${{ env.REGISTRY }}/${IMAGE_NAME,,}" >>"${GITHUB_ENV}"
+        env:
+          IMAGE_NAME: "${{ github.repository }}"
+
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Set up Docker Buildx
+        id: buildx
+        uses: docker/setup-buildx-action@v3
+
+      - name: Set up Docker meta
+        id: meta
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.FULL_IMAGE_NAME }}
+
+      - name: Log in to the Container registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Build docker image
+        uses: docker/build-push-action@v5
+        with:
+          file: Dockerfile
+          context: .
+          # Push only: no later step uses the image on the runner, so it is
+          # not also loaded into the local Docker daemon.
+          push: true
+          platforms: ${{ matrix.platform }}
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
diff --git a/Dockerfile b/Dockerfile
index 0cd1f02..c281162 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -17,6 +17,8 @@ RUN apt-get update -qqy && \
       g++ \
       poppler-utils \
       libpoppler-dev \
+      unzip \
+      curl \
     && apt-get clean \
     && apt-get autoremove \
     && rm -rf /var/lib/apt/lists/*
@@ -27,13 +29,18 @@ ENV PYTHONIOENCODING=UTF-8
 
 WORKDIR /app
 
-
 FROM base_image as dev
 
+COPY scripts/download_pdfjs.sh /app/scripts/download_pdfjs.sh
+RUN chmod +x /app/scripts/download_pdfjs.sh
+
+ENV PDFJS_PREBUILT_DIR="/app/libs/ktem/ktem/assets/prebuilt/pdfjs-dist"
+RUN bash scripts/download_pdfjs.sh $PDFJS_PREBUILT_DIR
+
 COPY . /app
 RUN --mount=type=ssh pip install --no-cache-dir -e "libs/kotaemon[all]" \
     && pip install --no-cache-dir -e "libs/ktem" \
     && pip install --no-cache-dir graphrag future \
     && pip install --no-cache-dir "pdfservices-sdk@git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements"
 
-ENTRYPOINT ["gradio", "app.py"]
+CMD ["python", "app.py"]
diff --git a/scripts/download_pdfjs.sh b/scripts/download_pdfjs.sh
new file mode 100644
--- /dev/null
+++ b/scripts/download_pdfjs.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+#
+# Download the prebuilt PDF.js distribution and unpack it into a directory.
+#
+# Usage: download_pdfjs.sh <pdfjs_dist_dir>
+#   <pdfjs_dist_dir>  directory the archive is extracted into; if it already
+#                     exists the download is skipped.
+
+# Stop at the first failing command so a broken download or extraction fails
+# the caller (e.g. a Docker build) instead of being reported as success.
+set -euo pipefail
+
+# The target directory is the only argument and it is required.
+if [ -z "${1:-}" ]; then
+    echo "Usage: $0 <pdfjs_dist_dir>" >&2
+    exit 1
+fi
+
+pdfjs_version_dist=$1
+
+# Refuse to run from a working directory whose path contains a space.
+function check_path_for_spaces() {
+    if [[ "$PWD" == *" "* ]]; then
+        echo "The current workdir has whitespace which can lead to unintended behaviour. Please modify your path and continue later."
+        exit 1
+    fi
+}
+
+# Download the zip at $1 and extract it into directory $2.
+# Does nothing when $2 already exists. On failure the partially populated
+# directory is removed so a later run does not mistake it for a finished one.
+function download_and_unzip() {
+    local url=$1
+    local dest_dir=$2
+
+    if [ -d "$dest_dir" ]; then
+        echo "Destination directory $dest_dir already exists. Skipping download."
+        return
+    fi
+
+    mkdir -p "$dest_dir"
+
+    local zip_file="${dest_dir}/downloaded.zip"
+    echo "Downloading $url to $zip_file"
+    # -f makes curl fail on HTTP errors instead of saving the error page.
+    if ! curl -fL -o "$zip_file" "$url"; then
+        echo "Failed to download $url" >&2
+        rm -rf "$dest_dir"
+        return 1
+    fi
+
+    echo "Unzipping $zip_file to $dest_dir"
+    if ! unzip -o "$zip_file" -d "$dest_dir"; then
+        echo "Failed to unzip $zip_file" >&2
+        rm -rf "$dest_dir"
+        return 1
+    fi
+
+    rm "$zip_file"
+    echo "Download and unzip completed successfully."
+}
+
+# Main script execution
+
+pdf_js_version="4.0.379"
+pdf_js_dist_name="pdfjs-${pdf_js_version}-dist"
+pdf_js_dist_url="https://github.com/mozilla/pdf.js/releases/download/v${pdf_js_version}/${pdf_js_dist_name}.zip"
+
+check_path_for_spaces
+
+echo "Downloading and unzipping PDF.js"
+download_and_unzip "$pdf_js_dist_url" "$pdfjs_version_dist"
+
+echo "PDF.js has been set up in $pdfjs_version_dist"