diff --git a/libs/kotaemon/kotaemon/indices/ingests/files.py b/libs/kotaemon/kotaemon/indices/ingests/files.py index 22e7db9..5800554 100644 --- a/libs/kotaemon/kotaemon/indices/ingests/files.py +++ b/libs/kotaemon/kotaemon/indices/ingests/files.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import Type from llama_index.readers.base import BaseReader @@ -14,6 +15,13 @@ from kotaemon.loaders import ( UnstructuredReader, ) +KH_DEFAULT_FILE_EXTRACTORS: dict[str, Type[BaseReader]] = { + ".xlsx": PandasExcelReader, + ".docx": UnstructuredReader, + ".xls": UnstructuredReader, + ".doc": UnstructuredReader, +} + class DocumentIngestor(BaseComponent): """Ingest common office document types into Document for indexing @@ -30,6 +38,8 @@ class DocumentIngestor(BaseComponent): - ocr: parse pdf image using flax doc_parsers: list of document parsers to parse the document text_splitter: splitter to split the document into text nodes + override_file_extractors: override file extractors for specific file extensions + The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS` """ pdf_mode: str = "normal" # "normal", "mathpix", "ocr" @@ -38,26 +48,26 @@ class DocumentIngestor(BaseComponent): chunk_size=1024, chunk_overlap=256, ) + override_file_extractors: dict[str, Type[BaseReader]] = {} def _get_reader(self, input_files: list[str | Path]): """Get appropriate readers for the input files based on file extension""" - file_extractor: dict[str, AutoReader | BaseReader] = { - ".xlsx": PandasExcelReader(), - ".docx": UnstructuredReader(), - ".xls": UnstructuredReader(), - ".doc": UnstructuredReader(), + file_extractors: dict[str, BaseReader] = { + ext: cls() for ext, cls in KH_DEFAULT_FILE_EXTRACTORS.items() } + for ext, cls in self.override_file_extractors.items(): + file_extractors[ext] = cls() if self.pdf_mode == "normal": - file_extractor[".pdf"] = AutoReader("UnstructuredReader") + file_extractors[".pdf"] = AutoReader("UnstructuredReader") # type: ignore elif 
self.pdf_mode == "ocr": - file_extractor[".pdf"] = OCRReader() + file_extractors[".pdf"] = OCRReader() else: - file_extractor[".pdf"] = MathpixPDFReader() + file_extractors[".pdf"] = MathpixPDFReader() main_reader = DirectoryReader( input_files=input_files, - file_extractor=file_extractor, + file_extractor=file_extractors, # type: ignore ) return main_reader diff --git a/libs/kotaemon/kotaemon/loaders/__init__.py b/libs/kotaemon/kotaemon/loaders/__init__.py index 369242e..d742b52 100644 --- a/libs/kotaemon/kotaemon/loaders/__init__.py +++ b/libs/kotaemon/kotaemon/loaders/__init__.py @@ -1,4 +1,5 @@ -from .base import AutoReader, DirectoryReader +from .base import AutoReader, BaseReader +from .composite_loader import DirectoryReader from .docx_loader import DocxReader from .excel_loader import PandasExcelReader from .html_loader import HtmlReader @@ -8,6 +9,7 @@ from .unstructured_loader import UnstructuredReader __all__ = [ "AutoReader", + "BaseReader", "PandasExcelReader", "MathpixPDFReader", "OCRReader", diff --git a/libs/kotaemon/kotaemon/loaders/base.py b/libs/kotaemon/kotaemon/loaders/base.py index cb92d5b..ca27e49 100644 --- a/libs/kotaemon/kotaemon/loaders/base.py +++ b/libs/kotaemon/kotaemon/loaders/base.py @@ -1,19 +1,25 @@ from pathlib import Path -from typing import Any, List, Type, Union - -from llama_index import SimpleDirectoryReader, download_loader -from llama_index.readers.base import BaseReader +from typing import TYPE_CHECKING, Any, List, Type, Union from kotaemon.base import BaseComponent, Document +if TYPE_CHECKING: + from llama_index.readers.base import BaseReader as LIBaseReader -class AutoReader(BaseComponent): + +class BaseReader(BaseComponent): + ... + + +class AutoReader(BaseReader): """General auto reader for a variety of files. 
(based on llama-hub)""" - def __init__(self, reader_type: Union[str, Type[BaseReader]]) -> None: + def __init__(self, reader_type: Union[str, Type["LIBaseReader"]]) -> None: """Init reader using string identifier or class name from llama-hub""" if isinstance(reader_type, str): + from llama_index import download_loader + self._reader = download_loader(reader_type)() else: self._reader = reader_type() @@ -30,15 +36,30 @@ class AutoReader(BaseComponent): return self.load_data(file=file, **kwargs) -class LIBaseReader(BaseComponent): - _reader_class: Type[BaseReader] +class LIReaderMixin(BaseComponent): + """Base wrapper around llama-index reader + + To use the LIReaderMixin, you need to implement the _get_wrapped_class method to + return the relevant llama-index reader class that you want to wrap. + + Example: + + ```python + class DirectoryReader(LIReaderMixin, BaseReader): + def _get_wrapped_class(self) -> Type["LIBaseReader"]: + from llama_index import SimpleDirectoryReader + + return SimpleDirectoryReader + ``` + """ + + def _get_wrapped_class(self) -> Type["LIBaseReader"]: + raise NotImplementedError( + "Please return the relevant llama-index reader class in _get_wrapped_class" + ) def __init__(self, *args, **kwargs): - if self._reader_class is None: - raise AttributeError( - "Require `_reader_class` to set a BaseReader class from LlamarIndex" - ) - + self._reader_class = self._get_wrapped_class() self._reader = self._reader_class(*args, **kwargs) super().__init__() @@ -60,7 +81,3 @@ class LIBaseReader(BaseComponent): def run(self, *args, **kwargs: Any) -> List[Document]: return self.load_data(*args, **kwargs) - - -class DirectoryReader(LIBaseReader): - _reader_class = SimpleDirectoryReader diff --git a/libs/kotaemon/kotaemon/loaders/composite_loader.py b/libs/kotaemon/kotaemon/loaders/composite_loader.py new file mode 100644 index 0000000..9d35e26 --- /dev/null +++ b/libs/kotaemon/kotaemon/loaders/composite_loader.py @@ -0,0 +1,53 @@ +from typing import Callable, List, Optional, Type + 
+from llama_index.readers.base import BaseReader as LIBaseReader + +from .base import BaseReader, LIReaderMixin + + +class DirectoryReader(LIReaderMixin, BaseReader): + """Wrap around llama-index SimpleDirectoryReader + + Args: + input_dir (str): Path to the directory. + input_files (List): List of file paths to read + (Optional; overrides input_dir, exclude) + exclude (List): glob of python file paths to exclude (Optional) + exclude_hidden (bool): Whether to exclude hidden files (dotfiles). + encoding (str): Encoding of the files. + Default is utf-8. + errors (str): how encoding and decoding errors are to be handled, + see https://docs.python.org/3/library/functions.html#open + recursive (bool): Whether to recursively search in subdirectories. + False by default. + filename_as_id (bool): Whether to use the filename as the document id. + False by default. + required_exts (Optional[List[str]]): List of required extensions. + Default is None. + file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file + extension to a BaseReader class that specifies how to convert that file + to text. If not specified, use default from DEFAULT_FILE_READER_CLS. + num_files_limit (Optional[int]): Maximum number of files to read. + Default is None. + file_metadata (Optional[Callable[[str], Dict]]): A function that takes + in a filename and returns a Dict of metadata for the Document. + Default is None. 
+ """ + + input_dir: Optional[str] = None + input_files: Optional[List] = None + exclude: Optional[List] = None + exclude_hidden: bool = True + errors: str = "ignore" + recursive: bool = False + encoding: str = "utf-8" + filename_as_id: bool = False + required_exts: Optional[list[str]] = None + file_extractor: Optional[dict[str, "LIBaseReader"]] = None + num_files_limit: Optional[int] = None + file_metadata: Optional[Callable[[str], dict]] = None + + def _get_wrapped_class(self) -> Type["LIBaseReader"]: + from llama_index import SimpleDirectoryReader + + return SimpleDirectoryReader diff --git a/libs/kotaemon/pyproject.toml b/libs/kotaemon/pyproject.toml index 944e7e2..547ed12 100644 --- a/libs/kotaemon/pyproject.toml +++ b/libs/kotaemon/pyproject.toml @@ -11,14 +11,14 @@ packages.find.exclude = ["tests*", "env*"] # metadata and dependencies [project] name = "kotaemon" -version = "0.3.6" +version = "0.3.7" requires-python = ">= 3.10" description = "Kotaemon core library for AI development." dependencies = [ "langchain", "langchain-community", "theflow", - "llama-index>=0.9.0", + "llama-index>=0.9.0,<0.10.0", "llama-hub", "gradio>=4.0.0", "openpyxl", diff --git a/libs/kotaemon/tests/test_agent.py b/libs/kotaemon/tests/test_agent.py index dad9a33..06a89c4 100644 --- a/libs/kotaemon/tests/test_agent.py +++ b/libs/kotaemon/tests/test_agent.py @@ -93,7 +93,7 @@ _openai_chat_completion_responses_react_langchain_tool = [ ( "I don't have prior knowledge about Cinnamon AI company, " "so I should gather information about it.\n" - "Action: Wikipedia\n" + "Action: wikipedia\n" "Action Input: Cinnamon AI company\n" ), (