From f7b6f313b59966996154ec317666c2cf5dd0d6f0 Mon Sep 17 00:00:00 2001
From: Ben Dykstra <benjamin.dykstra94@gmail.com>
Date: Sun, 29 Sep 2024 09:26:02 -0600
Subject: [PATCH] fix: update setup instructions  (#144) #none

* activate directory to gitignore

* add my custom env to gitignore, will have to change that

* add unstructured to kotaemon pyproject.toml

* add .env to gitignore

* remove .env from tracking

* make changes to the run_macos script, update readme with more detailed instructions

* remove my personal changes from gitignore

* remove line from run_macos script

* remove option for not installing miniconda for non technical users, mark docker dependency as optional

* docs: update demo URL

* gitignore changes

* merge .env.example

* revert changes to run_macos.sh

* unstructured to advanced dependencies

* add link to unstructured system dependencies

* remove api key

* fix: skip tests when unstructured pdf not installed

* chore: loosen unstructured package version in pyproject.toml

* chore: correct syntax

---------

Co-authored-by: Tadashi <tadashi@cinnamon.is>
Co-authored-by: cin-albert <albert@cinnamon.is>
---
 .env => .env.example                     | 12 +++++++-----
 .gitignore                               |  5 +++++
 .python-version                          |  1 +
 README.md                                | 16 +++++++++++++++-
 libs/kotaemon/pyproject.toml             |  9 ++++++---
 libs/kotaemon/tests/conftest.py          |  7 ++++---
 libs/kotaemon/tests/test_reader.py       |  4 ++--
 libs/kotaemon/tests/test_table_reader.py |  4 ++--
 8 files changed, 42 insertions(+), 16 deletions(-)
 rename .env => .env.example (76%)
 create mode 100644 .python-version
diff --git a/.env b/.env.example
similarity index 76%
rename from .env
rename to .env.example
index 6f42b4c..72943d4 100644
--- a/.env
+++ b/.env.example
@@ -1,8 +1,10 @@
+# This is an example .env file. Use it to create your own .env file and place it in the root of the project.
+
 # settings for OpenAI
 OPENAI_API_BASE=https://api.openai.com/v1
-OPENAI_API_KEY=openai_key
-OPENAI_CHAT_MODEL=gpt-4o
-OPENAI_EMBEDDINGS_MODEL=text-embedding-3-small
+OPENAI_API_KEY=<YOUR OPEN AI KEY HERE>
+OPENAI_CHAT_MODEL=gpt-3.5-turbo
+OPENAI_EMBEDDINGS_MODEL=text-embedding-ada-002
 
 # settings for Azure OpenAI
 AZURE_OPENAI_ENDPOINT=
@@ -12,14 +14,14 @@ AZURE_OPENAI_CHAT_DEPLOYMENT=gpt-35-turbo
 AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT=text-embedding-ada-002
 
 # settings for Cohere
-COHERE_API_KEY=
+COHERE_API_KEY=<COHERE API KEY>
 
 # settings for local models
 LOCAL_MODEL=llama3.1:8b
 LOCAL_MODEL_EMBEDDINGS=nomic-embed-text
 
 # settings for GraphRAG
-GRAPHRAG_API_KEY=openai_key
+GRAPHRAG_API_KEY=<YOUR OPEN AI KEY HERE>
 GRAPHRAG_LLM_MODEL=gpt-4o-mini
 GRAPHRAG_EMBEDDING_MODEL=text-embedding-3-small
 
diff --git a/.gitignore b/.gitignore
index ef2704a..80e557d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,11 @@
 # Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
 # Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
 
+activate*
+activate/*
+kotaemon-env*
+.env
+
 ### Emacs ###
 # -*- mode: gitignore; -*-
 *~
diff --git a/.python-version b/.python-version
new file mode 100644
index 0000000..c8cfe39
--- /dev/null
+++ b/.python-version
@@ -0,0 +1 @@
+3.10
diff --git a/README.md b/README.md
index 7044af2..e55756c 100644
--- a/README.md
+++ b/README.md
@@ -85,6 +85,15 @@ Use the most recent release `.zip` to include latest features and bug-fixes.
 
 ### For developers
 
+#### System requirements
+
+1. Python >=3.10
+2. (optional) [Docker](https://www.docker.com/)
+
+#### If you would like to process files other than .pdf, .html, .mhtml, and .xlsx documents:
+
+You will need to install the system dependencies of [unstructured](https://docs.unstructured.io/open-source/installation/full-installation#full-installation). The installations vary by operating system, so please go to the link and follow the instructions there.
+
 #### With Docker (recommended)
 
 We support `lite` & `full` version of Docker images. With `full`, the extra packages of `unstructured` will be installed as
@@ -141,9 +150,12 @@ cd kotaemon
 
 pip install -e "libs/kotaemon[all]"
 pip install -e "libs/ktem"
+
 ```
 
-- View and edit your environment variables (API keys, end-points) in `.env`.
+- Create a `.env` file in the root of this project. Use `.env.example` as a template.
+
+The `.env` file is there to serve use cases where users want to pre-configure the models before starting up the app (e.g. deploy the app on HF hub). The file will only be used to populate the db once upon the first run; it will no longer be used in subsequent runs.
 
 - (Optional) To enable in-browser PDF_JS viewer, download [PDF_JS_DIST](https://github.com/mozilla/pdf.js/releases/download/v4.0.379/pdfjs-4.0.379-dist.zip) and extract it to `libs/ktem/ktem/assets/prebuilt`
 
@@ -161,6 +173,8 @@ Default username / password are: `admin` / `admin`. You can setup additional use
 
 ![Chat tab](https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/chat-tab.png)
 
+- Check the Resources tab and LLMs and Embeddings and ensure that your `api_key` value is set correctly from your `.env` file. If it is not set, you can set it here.
+
 ## Setup local models (for local / private RAG)
 
 See [Local model setup](docs/local_model.md).
diff --git a/libs/kotaemon/pyproject.toml b/libs/kotaemon/pyproject.toml
index 59a31d0..2607c55 100644
--- a/libs/kotaemon/pyproject.toml
+++ b/libs/kotaemon/pyproject.toml
@@ -52,7 +52,7 @@ dependencies = [
     "python-dotenv>=1.0.1,<1.1",
     "tenacity>=8.2.3,<8.3",
     "theflow>=0.8.6,<0.9.0",
-    "trogon>=0.5.0,<0.6",
+    "trogon>=0.5.0,<0.6"
 ]
 readme = "README.md"
 authors = [
@@ -73,11 +73,14 @@ adv = [
     "fastembed",
     "googlesearch-python>=1.2.4,<1.3",
     "llama-cpp-python<0.2.8",
-    "sentence-transformers",
-    "wikipedia>=1.4.0,<1.5",
     "llama-index>=0.10.40,<0.11.0",
     "llama-index-vector-stores-milvus",
     "llama-index-vector-stores-qdrant",
+    "python-docx>=1.1.0,<1.2",
+    "sentence-transformers",
+    "tabulate",
+    "unstructured>=0.15.8,<0.16",
+    "wikipedia>=1.4.0,<1.5",
 ]
 dev = [
     "black",
diff --git a/libs/kotaemon/tests/conftest.py b/libs/kotaemon/tests/conftest.py
index c76114c..3f46c70 100644
--- a/libs/kotaemon/tests/conftest.py
+++ b/libs/kotaemon/tests/conftest.py
@@ -42,9 +42,10 @@ def if_sentence_fastembed_not_installed():
         return False
 
 
-def if_unstructured_not_installed():
+def if_unstructured_pdf_not_installed():
     try:
         import unstructured  # noqa: F401
+        from unstructured.partition.pdf import partition_pdf  # noqa: F401
     except ImportError:
         return True
     else:
@@ -81,8 +82,8 @@ skip_when_fastembed_not_installed = pytest.mark.skipif(
     if_sentence_fastembed_not_installed(), reason="fastembed is not installed"
 )
 
-skip_when_unstructured_not_installed = pytest.mark.skipif(
-    if_unstructured_not_installed(), reason="unstructured is not installed"
+skip_when_unstructured_pdf_not_installed = pytest.mark.skipif(
+    if_unstructured_pdf_not_installed(), reason="unstructured is not installed"
 )
 
 skip_when_cohere_not_installed = pytest.mark.skipif(
diff --git a/libs/kotaemon/tests/test_reader.py b/libs/kotaemon/tests/test_reader.py
index e9ca39f..d27c774 100644
--- a/libs/kotaemon/tests/test_reader.py
+++ b/libs/kotaemon/tests/test_reader.py
@@ -14,7 +14,7 @@ from kotaemon.loaders import (
     UnstructuredReader,
 )
 
-from .conftest import skip_when_unstructured_not_installed
+from .conftest import skip_when_unstructured_pdf_not_installed
 
 
 def test_docx_reader():
@@ -54,7 +54,7 @@ def test_pdf_reader():
     assert len(nodes) > 0
 
 
-@skip_when_unstructured_not_installed
+@skip_when_unstructured_pdf_not_installed
 def test_unstructured_pdf_reader():
     reader = UnstructuredReader()
     dirpath = Path(__file__).parent
diff --git a/libs/kotaemon/tests/test_table_reader.py b/libs/kotaemon/tests/test_table_reader.py
index ff4dd4e..e2da3cc 100644
--- a/libs/kotaemon/tests/test_table_reader.py
+++ b/libs/kotaemon/tests/test_table_reader.py
@@ -5,7 +5,7 @@ import pytest
 
 from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader
 
-from .conftest import skip_when_unstructured_not_installed
+from .conftest import skip_when_unstructured_pdf_not_installed
 
 input_file = Path(__file__).parent / "resources" / "table.pdf"
 input_file_excel = Path(__file__).parent / "resources" / "dummy.xlsx"
@@ -28,7 +28,7 @@ def mathpix_output():
     return content
 
 
-@skip_when_unstructured_not_installed
+@skip_when_unstructured_pdf_not_installed
 def test_ocr_reader(fullocr_output):
     reader = OCRReader()
     documents = reader.load_data(input_file, response_content=fullocr_output)