make default installation faster (#2)

* remove cohere as default * refactor dependencies * use llama-index pdf reader as default (pypdf) * fix some lazy docstrings * update install scripts * minor fix
2024-03-21 22:48:20 +07:00
parent a8f92b3f9e
commit d22ae88c7a
11 changed files with 30 additions and 33 deletions
--- a/.github/workflows/unit-test.yaml
+++ b/.github/workflows/unit-test.yaml
@@ -89,7 +89,7 @@ jobs:
        run: |
          python -m pip install --upgrade pip
          cd libs/kotaemon
-          pip install -U --upgrade-strategy eager -e .[dev]
+          pip install -U --upgrade-strategy eager -e .[all]
      - name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }}
        if: |
--- a/libs/kotaemon/kotaemon/indices/ingests/files.py
+++ b/libs/kotaemon/kotaemon/indices/ingests/files.py
@@ -7,7 +7,6 @@ from kotaemon.base import BaseComponent, Document, Param
 from kotaemon.indices.extractors import BaseDocParser
 from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
 from kotaemon.loaders import (
    AutoReader,
    DirectoryReader,
    MathpixPDFReader,
    OCRReader,
@@ -59,7 +58,7 @@ class DocumentIngestor(BaseComponent):
            file_extractors[ext] = cls()
        if self.pdf_mode == "normal":
-            file_extractors[".pdf"] = AutoReader("UnstructuredReader")  # type: ignore
+            pass  # use default loader of llama-index which is pypdf
        elif self.pdf_mode == "ocr":
            file_extractors[".pdf"] = OCRReader()
        else:
--- a/libs/kotaemon/kotaemon/loaders/base.py
+++ b/libs/kotaemon/kotaemon/loaders/base.py
@@ -55,7 +55,7 @@ class LIReaderMixin(BaseComponent):
    def _get_wrapped_class(self) -> Type["LIBaseReader"]:
        raise NotImplementedError(
-            "Please return the relevant Langchain class in in _get_lc_class"
+            "Please return the relevant llama-index class in in _get_wrapped_class"
        )
    def __init__(self, *args, **kwargs):
--- a/libs/kotaemon/kotaemon/loaders/docx_loader.py
+++ b/libs/kotaemon/kotaemon/loaders/docx_loader.py
@@ -33,7 +33,7 @@ class DocxReader(BaseReader):
        """Load data using Docx reader
        Args:
-            file_path (Path): Path to PDF file
+            file_path (Path): Path to .docx file
        Returns:
            List[Document]: list of documents extracted from the HTML file
--- a/libs/kotaemon/kotaemon/loaders/html_loader.py
+++ b/libs/kotaemon/kotaemon/loaders/html_loader.py
@@ -37,7 +37,7 @@ class HtmlReader(BaseReader):
        """Load data using Html reader
        Args:
-            file_path: path to pdf file
+            file_path: path to HTML file
            extra_info: extra information passed to this reader during extracting data
        Returns:
--- a/libs/kotaemon/pyproject.toml
+++ b/libs/kotaemon/pyproject.toml
@@ -17,6 +17,8 @@ description = "Kotaemon core library for AI development."
 dependencies = [
    "langchain",
    "langchain-community",
    "langchain-openai",
    "openai",
    "theflow",
    "llama-index>=0.9.0,<0.10.0",
    "llama-hub",
@@ -27,6 +29,11 @@ dependencies = [
    "pandas",
    "trogon",
    "tenacity",
    "python-dotenv", # currently used to read configs from file, should be remove in the future
    "chromadb",
    "unstructured",
    "pypdf",
    "html2text",
 ]
 readme = "README.md"
 license = { text = "MIT License" }
@@ -42,6 +49,18 @@ classifiers = [
 ]
 [project.optional-dependencies]
 adv = [
    "wikipedia",
    "duckduckgo-search",
    "googlesearch-python",
    "python-docx",
    "pytest-mock",
    "unstructured[pdf]",
    "sentence_transformers",
    "cohere",
    "elasticsearch",
    "llama-cpp-python",
 ]
 dev = [
    "ipython",
    "pytest",
@@ -50,23 +69,8 @@ dev = [
    "flake8",
    "sphinx",
    "coverage",
    "openai",
    "langchain-openai",
    "chromadb",
    "wikipedia",
    "duckduckgo-search",
    "googlesearch-python",
    "python-docx",
    "python-dotenv",
    "pytest-mock",
    "unstructured[pdf]",
    "sentence_transformers",
    "cohere",
    "elasticsearch",
    "pypdf",
    "html2text",
    "llama-cpp-python",
 ]
 all = ["kotaemon[adv,dev]"]
 [project.scripts]
 kh = "kotaemon.cli:main"
--- a/libs/ktem/ktem/index/file/pipelines.py
+++ b/libs/ktem/ktem/index/file/pipelines.py
@@ -25,7 +25,7 @@ from theflow.utils.modules import import_dotted_string
 from kotaemon.base import RetrievedDocument
 from kotaemon.indices import VectorIndexing, VectorRetrieval
 from kotaemon.indices.ingests import DocumentIngestor
-from kotaemon.indices.rankings import BaseReranking, CohereReranking, LLMReranking
+from kotaemon.indices.rankings import BaseReranking, LLMReranking
 from .base import BaseFileIndexIndexing, BaseFileIndexRetriever
@@ -67,9 +67,7 @@ class DocumentRetrievalPipeline(BaseFileIndexRetriever):
    vector_retrieval: VectorRetrieval = VectorRetrieval.withx(
        embedding=embeddings.get_default(),
    )
-    reranker: BaseReranking = CohereReranking.withx(
-        cohere_api_key=getattr(settings, "COHERE_API_KEY", "")
-    ) >> LLMReranking.withx(llm=llms.get_lowest_cost())
+    reranker: BaseReranking = LLMReranking.withx(llm=llms.get_lowest_cost())
    get_extra_table: bool = False
    def run(
--- a/libs/ktem/pyproject.toml
+++ b/libs/ktem/pyproject.toml
@@ -13,18 +13,14 @@ version = "0.2.0"
 requires-python = ">= 3.10"
 description = "RAG-based Question and Answering Application"
 dependencies = [
    "chromadb",
    "click",
    "cohere",
    "platformdirs",
    "pluggy",
    "python-decouple",
    "python-dotenv",
    "python-pptx",
    "sqlalchemy",
    "sqlmodel",
    "tiktoken",
    "unstructured[pdf]",
 ]
 readme = "README.md"
 license = { text = "MIT License" }
--- a/scripts/run_linux.sh
+++ b/scripts/run_linux.sh
@@ -92,7 +92,7 @@ function install_dependencies() {
    if pip list 2>/dev/null | grep -q "kotaemon"; then
        echo "Requirements are already installed"
    else
-        local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]"
+        local kotaemon_root="$(pwd)/libs/kotaemon"
        local ktem_root="$(pwd)/libs/ktem/"
        echo "" && echo "Install kotaemon's requirements"
--- a/scripts/run_macos.sh
+++ b/scripts/run_macos.sh
@@ -92,7 +92,7 @@ function install_dependencies() {
    if pip list 2>/dev/null | grep -q "kotaemon"; then
        echo "Requirements are already installed"
    else
-        local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]"
+        local kotaemon_root="$(pwd)/libs/kotaemon"
        local ktem_root="$(pwd)/libs/ktem/"
        echo "" && echo "Install kotaemon's requirements"
--- a/scripts/run_windows.bat
+++ b/scripts/run_windows.bat
@@ -114,7 +114,7 @@ IF %ERRORLEVEL% == 0  (
    ECHO Dependencies are already installed
 ) ELSE (
    ECHO Install kotaemon's requirements
-    CALL python -m pip install -e "%CD%\libs\kotaemon\.[dev]"
+    CALL python -m pip install -e "%CD%\libs\kotaemon"
    ECHO Install ktem's requirements
    CALL python -m pip install -e "%CD%\libs\ktem"