make default installation faster (#2)

* remove cohere as default

* refactor dependencies

* use llama-index pdf reader as default (pypdf)

* fix some lazy docstring

* update install scripts

* minor fix
This commit is contained in:
ian_Cin 2024-03-21 22:48:20 +07:00 committed by GitHub
parent a8f92b3f9e
commit d22ae88c7a
11 changed files with 30 additions and 33 deletions

View File

@ -89,7 +89,7 @@ jobs:
run: | run: |
python -m pip install --upgrade pip python -m pip install --upgrade pip
cd libs/kotaemon cd libs/kotaemon
pip install -U --upgrade-strategy eager -e .[dev] pip install -U --upgrade-strategy eager -e .[all]
- name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }} - name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }}
if: | if: |

View File

@ -7,7 +7,6 @@ from kotaemon.base import BaseComponent, Document, Param
from kotaemon.indices.extractors import BaseDocParser from kotaemon.indices.extractors import BaseDocParser
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
from kotaemon.loaders import ( from kotaemon.loaders import (
AutoReader,
DirectoryReader, DirectoryReader,
MathpixPDFReader, MathpixPDFReader,
OCRReader, OCRReader,
@ -59,7 +58,7 @@ class DocumentIngestor(BaseComponent):
file_extractors[ext] = cls() file_extractors[ext] = cls()
if self.pdf_mode == "normal": if self.pdf_mode == "normal":
file_extractors[".pdf"] = AutoReader("UnstructuredReader") # type: ignore pass # use default loader of llama-index which is pypdf
elif self.pdf_mode == "ocr": elif self.pdf_mode == "ocr":
file_extractors[".pdf"] = OCRReader() file_extractors[".pdf"] = OCRReader()
else: else:

View File

@ -55,7 +55,7 @@ class LIReaderMixin(BaseComponent):
def _get_wrapped_class(self) -> Type["LIBaseReader"]: def _get_wrapped_class(self) -> Type["LIBaseReader"]:
raise NotImplementedError( raise NotImplementedError(
"Please return the relevant Langchain class in in _get_lc_class" "Please return the relevant llama-index class in in _get_wrapped_class"
) )
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):

View File

@ -33,7 +33,7 @@ class DocxReader(BaseReader):
"""Load data using Docx reader """Load data using Docx reader
Args: Args:
file_path (Path): Path to PDF file file_path (Path): Path to .docx file
Returns: Returns:
List[Document]: list of documents extracted from the HTML file List[Document]: list of documents extracted from the HTML file

View File

@ -37,7 +37,7 @@ class HtmlReader(BaseReader):
"""Load data using Html reader """Load data using Html reader
Args: Args:
file_path: path to pdf file file_path: path to HTML file
extra_info: extra information passed to this reader during extracting data extra_info: extra information passed to this reader during extracting data
Returns: Returns:

View File

@ -17,6 +17,8 @@ description = "Kotaemon core library for AI development."
dependencies = [ dependencies = [
"langchain", "langchain",
"langchain-community", "langchain-community",
"langchain-openai",
"openai",
"theflow", "theflow",
"llama-index>=0.9.0,<0.10.0", "llama-index>=0.9.0,<0.10.0",
"llama-hub", "llama-hub",
@ -27,6 +29,11 @@ dependencies = [
"pandas", "pandas",
"trogon", "trogon",
"tenacity", "tenacity",
"python-dotenv", # currently used to read configs from file, should be removed in the future
"chromadb",
"unstructured",
"pypdf",
"html2text",
] ]
readme = "README.md" readme = "README.md"
license = { text = "MIT License" } license = { text = "MIT License" }
@ -42,6 +49,18 @@ classifiers = [
] ]
[project.optional-dependencies] [project.optional-dependencies]
adv = [
"wikipedia",
"duckduckgo-search",
"googlesearch-python",
"python-docx",
"pytest-mock",
"unstructured[pdf]",
"sentence_transformers",
"cohere",
"elasticsearch",
"llama-cpp-python",
]
dev = [ dev = [
"ipython", "ipython",
"pytest", "pytest",
@ -50,23 +69,8 @@ dev = [
"flake8", "flake8",
"sphinx", "sphinx",
"coverage", "coverage",
"openai",
"langchain-openai",
"chromadb",
"wikipedia",
"duckduckgo-search",
"googlesearch-python",
"python-docx",
"python-dotenv",
"pytest-mock",
"unstructured[pdf]",
"sentence_transformers",
"cohere",
"elasticsearch",
"pypdf",
"html2text",
"llama-cpp-python",
] ]
all = ["kotaemon[adv,dev]"]
[project.scripts] [project.scripts]
kh = "kotaemon.cli:main" kh = "kotaemon.cli:main"

View File

@ -25,7 +25,7 @@ from theflow.utils.modules import import_dotted_string
from kotaemon.base import RetrievedDocument from kotaemon.base import RetrievedDocument
from kotaemon.indices import VectorIndexing, VectorRetrieval from kotaemon.indices import VectorIndexing, VectorRetrieval
from kotaemon.indices.ingests import DocumentIngestor from kotaemon.indices.ingests import DocumentIngestor
from kotaemon.indices.rankings import BaseReranking, CohereReranking, LLMReranking from kotaemon.indices.rankings import BaseReranking, LLMReranking
from .base import BaseFileIndexIndexing, BaseFileIndexRetriever from .base import BaseFileIndexIndexing, BaseFileIndexRetriever
@ -67,9 +67,7 @@ class DocumentRetrievalPipeline(BaseFileIndexRetriever):
vector_retrieval: VectorRetrieval = VectorRetrieval.withx( vector_retrieval: VectorRetrieval = VectorRetrieval.withx(
embedding=embeddings.get_default(), embedding=embeddings.get_default(),
) )
reranker: BaseReranking = CohereReranking.withx( reranker: BaseReranking = LLMReranking.withx(llm=llms.get_lowest_cost())
cohere_api_key=getattr(settings, "COHERE_API_KEY", "")
) >> LLMReranking.withx(llm=llms.get_lowest_cost())
get_extra_table: bool = False get_extra_table: bool = False
def run( def run(

View File

@ -13,18 +13,14 @@ version = "0.2.0"
requires-python = ">= 3.10" requires-python = ">= 3.10"
description = "RAG-based Question and Answering Application" description = "RAG-based Question and Answering Application"
dependencies = [ dependencies = [
"chromadb",
"click", "click",
"cohere",
"platformdirs", "platformdirs",
"pluggy", "pluggy",
"python-decouple", "python-decouple",
"python-dotenv",
"python-pptx", "python-pptx",
"sqlalchemy", "sqlalchemy",
"sqlmodel", "sqlmodel",
"tiktoken", "tiktoken",
"unstructured[pdf]",
] ]
readme = "README.md" readme = "README.md"
license = { text = "MIT License" } license = { text = "MIT License" }

View File

@ -92,7 +92,7 @@ function install_dependencies() {
if pip list 2>/dev/null | grep -q "kotaemon"; then if pip list 2>/dev/null | grep -q "kotaemon"; then
echo "Requirements are already installed" echo "Requirements are already installed"
else else
local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]" local kotaemon_root="$(pwd)/libs/kotaemon"
local ktem_root="$(pwd)/libs/ktem/" local ktem_root="$(pwd)/libs/ktem/"
echo "" && echo "Install kotaemon's requirements" echo "" && echo "Install kotaemon's requirements"

View File

@ -92,7 +92,7 @@ function install_dependencies() {
if pip list 2>/dev/null | grep -q "kotaemon"; then if pip list 2>/dev/null | grep -q "kotaemon"; then
echo "Requirements are already installed" echo "Requirements are already installed"
else else
local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]" local kotaemon_root="$(pwd)/libs/kotaemon"
local ktem_root="$(pwd)/libs/ktem/" local ktem_root="$(pwd)/libs/ktem/"
echo "" && echo "Install kotaemon's requirements" echo "" && echo "Install kotaemon's requirements"

View File

@ -114,7 +114,7 @@ IF %ERRORLEVEL% == 0 (
ECHO Dependencies are already installed ECHO Dependencies are already installed
) ELSE ( ) ELSE (
ECHO Install kotaemon's requirements ECHO Install kotaemon's requirements
CALL python -m pip install -e "%CD%\libs\kotaemon\.[dev]" CALL python -m pip install -e "%CD%\libs\kotaemon"
ECHO Install ktem's requirements ECHO Install ktem's requirements
CALL python -m pip install -e "%CD%\libs\ktem" CALL python -m pip install -e "%CD%\libs\ktem"