make default installation faster (#2)
* remove cohere as default * refactor dependencies * use llama-index pdf reader as default (pypdf) * fix some lazy docstring * update install scripts * minor fix
This commit is contained in:
parent
a8f92b3f9e
commit
d22ae88c7a
2
.github/workflows/unit-test.yaml
vendored
2
.github/workflows/unit-test.yaml
vendored
|
@ -89,7 +89,7 @@ jobs:
|
||||||
run: |
|
run: |
|
||||||
python -m pip install --upgrade pip
|
python -m pip install --upgrade pip
|
||||||
cd libs/kotaemon
|
cd libs/kotaemon
|
||||||
pip install -U --upgrade-strategy eager -e .[dev]
|
pip install -U --upgrade-strategy eager -e .[all]
|
||||||
|
|
||||||
- name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }}
|
- name: New dependencies cache for key ${{ steps.restore-dependencies.outputs.cache-primary-key }}
|
||||||
if: |
|
if: |
|
||||||
|
|
|
@ -7,7 +7,6 @@ from kotaemon.base import BaseComponent, Document, Param
|
||||||
from kotaemon.indices.extractors import BaseDocParser
|
from kotaemon.indices.extractors import BaseDocParser
|
||||||
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
|
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
|
||||||
from kotaemon.loaders import (
|
from kotaemon.loaders import (
|
||||||
AutoReader,
|
|
||||||
DirectoryReader,
|
DirectoryReader,
|
||||||
MathpixPDFReader,
|
MathpixPDFReader,
|
||||||
OCRReader,
|
OCRReader,
|
||||||
|
@ -59,7 +58,7 @@ class DocumentIngestor(BaseComponent):
|
||||||
file_extractors[ext] = cls()
|
file_extractors[ext] = cls()
|
||||||
|
|
||||||
if self.pdf_mode == "normal":
|
if self.pdf_mode == "normal":
|
||||||
file_extractors[".pdf"] = AutoReader("UnstructuredReader") # type: ignore
|
pass # use default loader of llama-index which is pypdf
|
||||||
elif self.pdf_mode == "ocr":
|
elif self.pdf_mode == "ocr":
|
||||||
file_extractors[".pdf"] = OCRReader()
|
file_extractors[".pdf"] = OCRReader()
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -55,7 +55,7 @@ class LIReaderMixin(BaseComponent):
|
||||||
|
|
||||||
def _get_wrapped_class(self) -> Type["LIBaseReader"]:
|
def _get_wrapped_class(self) -> Type["LIBaseReader"]:
|
||||||
raise NotImplementedError(
|
raise NotImplementedError(
|
||||||
"Please return the relevant Langchain class in in _get_lc_class"
|
"Please return the relevant llama-index class in in _get_wrapped_class"
|
||||||
)
|
)
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
|
|
|
@ -33,7 +33,7 @@ class DocxReader(BaseReader):
|
||||||
"""Load data using Docx reader
|
"""Load data using Docx reader
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path (Path): Path to PDF file
|
file_path (Path): Path to .docx file
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List[Document]: list of documents extracted from the HTML file
|
List[Document]: list of documents extracted from the HTML file
|
||||||
|
|
|
@ -37,7 +37,7 @@ class HtmlReader(BaseReader):
|
||||||
"""Load data using Html reader
|
"""Load data using Html reader
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: path to pdf file
|
file_path: path to HTML file
|
||||||
extra_info: extra information passed to this reader during extracting data
|
extra_info: extra information passed to this reader during extracting data
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
|
@ -17,6 +17,8 @@ description = "Kotaemon core library for AI development."
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"langchain",
|
"langchain",
|
||||||
"langchain-community",
|
"langchain-community",
|
||||||
|
"langchain-openai",
|
||||||
|
"openai",
|
||||||
"theflow",
|
"theflow",
|
||||||
"llama-index>=0.9.0,<0.10.0",
|
"llama-index>=0.9.0,<0.10.0",
|
||||||
"llama-hub",
|
"llama-hub",
|
||||||
|
@ -27,6 +29,11 @@ dependencies = [
|
||||||
"pandas",
|
"pandas",
|
||||||
"trogon",
|
"trogon",
|
||||||
"tenacity",
|
"tenacity",
|
||||||
|
"python-dotenv", # currently used to read configs from file, should be remove in the future
|
||||||
|
"chromadb",
|
||||||
|
"unstructured",
|
||||||
|
"pypdf",
|
||||||
|
"html2text",
|
||||||
]
|
]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
license = { text = "MIT License" }
|
license = { text = "MIT License" }
|
||||||
|
@ -42,6 +49,18 @@ classifiers = [
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.optional-dependencies]
|
[project.optional-dependencies]
|
||||||
|
adv = [
|
||||||
|
"wikipedia",
|
||||||
|
"duckduckgo-search",
|
||||||
|
"googlesearch-python",
|
||||||
|
"python-docx",
|
||||||
|
"pytest-mock",
|
||||||
|
"unstructured[pdf]",
|
||||||
|
"sentence_transformers",
|
||||||
|
"cohere",
|
||||||
|
"elasticsearch",
|
||||||
|
"llama-cpp-python",
|
||||||
|
]
|
||||||
dev = [
|
dev = [
|
||||||
"ipython",
|
"ipython",
|
||||||
"pytest",
|
"pytest",
|
||||||
|
@ -50,23 +69,8 @@ dev = [
|
||||||
"flake8",
|
"flake8",
|
||||||
"sphinx",
|
"sphinx",
|
||||||
"coverage",
|
"coverage",
|
||||||
"openai",
|
|
||||||
"langchain-openai",
|
|
||||||
"chromadb",
|
|
||||||
"wikipedia",
|
|
||||||
"duckduckgo-search",
|
|
||||||
"googlesearch-python",
|
|
||||||
"python-docx",
|
|
||||||
"python-dotenv",
|
|
||||||
"pytest-mock",
|
|
||||||
"unstructured[pdf]",
|
|
||||||
"sentence_transformers",
|
|
||||||
"cohere",
|
|
||||||
"elasticsearch",
|
|
||||||
"pypdf",
|
|
||||||
"html2text",
|
|
||||||
"llama-cpp-python",
|
|
||||||
]
|
]
|
||||||
|
all = ["kotaemon[adv,dev]"]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
kh = "kotaemon.cli:main"
|
kh = "kotaemon.cli:main"
|
||||||
|
|
|
@ -25,7 +25,7 @@ from theflow.utils.modules import import_dotted_string
|
||||||
from kotaemon.base import RetrievedDocument
|
from kotaemon.base import RetrievedDocument
|
||||||
from kotaemon.indices import VectorIndexing, VectorRetrieval
|
from kotaemon.indices import VectorIndexing, VectorRetrieval
|
||||||
from kotaemon.indices.ingests import DocumentIngestor
|
from kotaemon.indices.ingests import DocumentIngestor
|
||||||
from kotaemon.indices.rankings import BaseReranking, CohereReranking, LLMReranking
|
from kotaemon.indices.rankings import BaseReranking, LLMReranking
|
||||||
|
|
||||||
from .base import BaseFileIndexIndexing, BaseFileIndexRetriever
|
from .base import BaseFileIndexIndexing, BaseFileIndexRetriever
|
||||||
|
|
||||||
|
@ -67,9 +67,7 @@ class DocumentRetrievalPipeline(BaseFileIndexRetriever):
|
||||||
vector_retrieval: VectorRetrieval = VectorRetrieval.withx(
|
vector_retrieval: VectorRetrieval = VectorRetrieval.withx(
|
||||||
embedding=embeddings.get_default(),
|
embedding=embeddings.get_default(),
|
||||||
)
|
)
|
||||||
reranker: BaseReranking = CohereReranking.withx(
|
reranker: BaseReranking = LLMReranking.withx(llm=llms.get_lowest_cost())
|
||||||
cohere_api_key=getattr(settings, "COHERE_API_KEY", "")
|
|
||||||
) >> LLMReranking.withx(llm=llms.get_lowest_cost())
|
|
||||||
get_extra_table: bool = False
|
get_extra_table: bool = False
|
||||||
|
|
||||||
def run(
|
def run(
|
||||||
|
|
|
@ -13,18 +13,14 @@ version = "0.2.0"
|
||||||
requires-python = ">= 3.10"
|
requires-python = ">= 3.10"
|
||||||
description = "RAG-based Question and Answering Application"
|
description = "RAG-based Question and Answering Application"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"chromadb",
|
|
||||||
"click",
|
"click",
|
||||||
"cohere",
|
|
||||||
"platformdirs",
|
"platformdirs",
|
||||||
"pluggy",
|
"pluggy",
|
||||||
"python-decouple",
|
"python-decouple",
|
||||||
"python-dotenv",
|
|
||||||
"python-pptx",
|
"python-pptx",
|
||||||
"sqlalchemy",
|
"sqlalchemy",
|
||||||
"sqlmodel",
|
"sqlmodel",
|
||||||
"tiktoken",
|
"tiktoken",
|
||||||
"unstructured[pdf]",
|
|
||||||
]
|
]
|
||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
license = { text = "MIT License" }
|
license = { text = "MIT License" }
|
||||||
|
|
|
@ -92,7 +92,7 @@ function install_dependencies() {
|
||||||
if pip list 2>/dev/null | grep -q "kotaemon"; then
|
if pip list 2>/dev/null | grep -q "kotaemon"; then
|
||||||
echo "Requirements are already installed"
|
echo "Requirements are already installed"
|
||||||
else
|
else
|
||||||
local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]"
|
local kotaemon_root="$(pwd)/libs/kotaemon"
|
||||||
local ktem_root="$(pwd)/libs/ktem/"
|
local ktem_root="$(pwd)/libs/ktem/"
|
||||||
|
|
||||||
echo "" && echo "Install kotaemon's requirements"
|
echo "" && echo "Install kotaemon's requirements"
|
||||||
|
|
|
@ -92,7 +92,7 @@ function install_dependencies() {
|
||||||
if pip list 2>/dev/null | grep -q "kotaemon"; then
|
if pip list 2>/dev/null | grep -q "kotaemon"; then
|
||||||
echo "Requirements are already installed"
|
echo "Requirements are already installed"
|
||||||
else
|
else
|
||||||
local kotaemon_root="$(pwd)/libs/kotaemon/.[dev]"
|
local kotaemon_root="$(pwd)/libs/kotaemon"
|
||||||
local ktem_root="$(pwd)/libs/ktem/"
|
local ktem_root="$(pwd)/libs/ktem/"
|
||||||
|
|
||||||
echo "" && echo "Install kotaemon's requirements"
|
echo "" && echo "Install kotaemon's requirements"
|
||||||
|
|
|
@ -114,7 +114,7 @@ IF %ERRORLEVEL% == 0 (
|
||||||
ECHO Dependencies are already installed
|
ECHO Dependencies are already installed
|
||||||
) ELSE (
|
) ELSE (
|
||||||
ECHO Install kotaemon's requirements
|
ECHO Install kotaemon's requirements
|
||||||
CALL python -m pip install -e "%CD%\libs\kotaemon\.[dev]"
|
CALL python -m pip install -e "%CD%\libs\kotaemon"
|
||||||
|
|
||||||
ECHO Install ktem's requirements
|
ECHO Install ktem's requirements
|
||||||
CALL python -m pip install -e "%CD%\libs\ktem"
|
CALL python -m pip install -e "%CD%\libs\ktem"
|
||||||
|
|
Loading…
Reference in New Issue
Block a user