refactor: replace llama-index based loader, to a llama-index mixin loader (#142)

This commit is contained in:
Duc Nguyen (john) 2024-02-20 02:33:28 +07:00 committed by GitHub
parent 7fc54d52e4
commit d36522129f
6 changed files with 112 additions and 30 deletions

View File

@ -1,4 +1,5 @@
from pathlib import Path from pathlib import Path
from typing import Type
from llama_index.readers.base import BaseReader from llama_index.readers.base import BaseReader
@ -14,6 +15,13 @@ from kotaemon.loaders import (
UnstructuredReader, UnstructuredReader,
) )
# Default mapping from file extension to the reader *class* (not instance)
# used for that extension. `DocumentIngestor._get_reader` instantiates each
# class on every call and then lets `override_file_extractors` replace
# individual entries.
KH_DEFAULT_FILE_EXTRACTORS: dict[str, Type[BaseReader]] = {
    ".xlsx": PandasExcelReader,
    ".docx": UnstructuredReader,
    ".xls": UnstructuredReader,
    ".doc": UnstructuredReader,
}
class DocumentIngestor(BaseComponent): class DocumentIngestor(BaseComponent):
"""Ingest common office document types into Document for indexing """Ingest common office document types into Document for indexing
@ -30,6 +38,8 @@ class DocumentIngestor(BaseComponent):
- ocr: parse pdf image using flax - ocr: parse pdf image using flax
doc_parsers: list of document parsers to parse the document doc_parsers: list of document parsers to parse the document
text_splitter: splitter to split the document into text nodes text_splitter: splitter to split the document into text nodes
override_file_extractors: override file extractors for specific file extensions
The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS`
""" """
pdf_mode: str = "normal" # "normal", "mathpix", "ocr" pdf_mode: str = "normal" # "normal", "mathpix", "ocr"
@ -38,26 +48,26 @@ class DocumentIngestor(BaseComponent):
chunk_size=1024, chunk_size=1024,
chunk_overlap=256, chunk_overlap=256,
) )
override_file_extractors: dict[str, Type[BaseReader]] = {}
def _get_reader(self, input_files: list[str | Path]): def _get_reader(self, input_files: list[str | Path]):
"""Get appropriate readers for the input files based on file extension""" """Get appropriate readers for the input files based on file extension"""
file_extractor: dict[str, AutoReader | BaseReader] = { file_extractors: dict[str, BaseReader] = {
".xlsx": PandasExcelReader(), ext: cls() for ext, cls in KH_DEFAULT_FILE_EXTRACTORS.items()
".docx": UnstructuredReader(),
".xls": UnstructuredReader(),
".doc": UnstructuredReader(),
} }
for ext, cls in self.override_file_extractors.items():
file_extractors[ext] = cls()
if self.pdf_mode == "normal": if self.pdf_mode == "normal":
file_extractor[".pdf"] = AutoReader("UnstructuredReader") file_extractors[".pdf"] = AutoReader("UnstructuredReader") # type: ignore
elif self.pdf_mode == "ocr": elif self.pdf_mode == "ocr":
file_extractor[".pdf"] = OCRReader() file_extractors[".pdf"] = OCRReader()
else: else:
file_extractor[".pdf"] = MathpixPDFReader() file_extractors[".pdf"] = MathpixPDFReader()
main_reader = DirectoryReader( main_reader = DirectoryReader(
input_files=input_files, input_files=input_files,
file_extractor=file_extractor, file_extractor=file_extractors, # type: ignore
) )
return main_reader return main_reader

View File

@ -1,4 +1,5 @@
from .base import AutoReader, DirectoryReader from .base import AutoReader, BaseReader
from .composite_loader import DirectoryReader
from .docx_loader import DocxReader from .docx_loader import DocxReader
from .excel_loader import PandasExcelReader from .excel_loader import PandasExcelReader
from .html_loader import HtmlReader from .html_loader import HtmlReader
@ -8,6 +9,7 @@ from .unstructured_loader import UnstructuredReader
__all__ = [ __all__ = [
"AutoReader", "AutoReader",
"BaseReader",
"PandasExcelReader", "PandasExcelReader",
"MathpixPDFReader", "MathpixPDFReader",
"OCRReader", "OCRReader",

View File

@ -1,19 +1,25 @@
from pathlib import Path from pathlib import Path
from typing import Any, List, Type, Union from typing import TYPE_CHECKING, Any, List, Type, Union
from llama_index import SimpleDirectoryReader, download_loader
from llama_index.readers.base import BaseReader
from kotaemon.base import BaseComponent, Document from kotaemon.base import BaseComponent, Document
if TYPE_CHECKING:
from llama_index.readers.base import BaseReader as LIBaseReader
class AutoReader(BaseComponent):
class BaseReader(BaseComponent):
    """Common base class for the readers in this package.

    Declares no behaviour of its own; it only gives the readers a shared
    kotaemon type on top of `BaseComponent`.
    """

    ...
class AutoReader(BaseReader):
"""General auto reader for a variety of files. (based on llama-hub)""" """General auto reader for a variety of files. (based on llama-hub)"""
def __init__(self, reader_type: Union[str, Type[BaseReader]]) -> None: def __init__(self, reader_type: Union[str, Type["LIBaseReader"]]) -> None:
"""Init reader using string identifier or class name from llama-hub""" """Init reader using string identifier or class name from llama-hub"""
if isinstance(reader_type, str): if isinstance(reader_type, str):
from llama_index import download_loader
self._reader = download_loader(reader_type)() self._reader = download_loader(reader_type)()
else: else:
self._reader = reader_type() self._reader = reader_type()
@ -30,15 +36,30 @@ class AutoReader(BaseComponent):
return self.load_data(file=file, **kwargs) return self.load_data(file=file, **kwargs)
class LIBaseReader(BaseComponent): class LIReaderMixin(BaseComponent):
_reader_class: Type[BaseReader] """Base wrapper around llama-index reader
To use the LIReaderMixin, you need to implement the _get_wrapped_class method to
return the relevant llama-index reader class that you want to wrap.
Example:
```python
class DirectoryReader(LIReaderMixin, BaseReader):
    def _get_wrapped_class(self) -> Type["LIBaseReader"]:
        from llama_index import SimpleDirectoryReader
        return SimpleDirectoryReader
```
"""
def _get_wrapped_class(self) -> Type["LIBaseReader"]:
raise NotImplementedError(
"Please return the relevant Langchain class in in _get_lc_class"
)
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
if self._reader_class is None: self._reader_class = self._get_wrapped_class()
raise AttributeError(
"Require `_reader_class` to set a BaseReader class from LlamarIndex"
)
self._reader = self._reader_class(*args, **kwargs) self._reader = self._reader_class(*args, **kwargs)
super().__init__() super().__init__()
@ -60,7 +81,3 @@ class LIBaseReader(BaseComponent):
def run(self, *args, **kwargs: Any) -> List[Document]: def run(self, *args, **kwargs: Any) -> List[Document]:
return self.load_data(*args, **kwargs) return self.load_data(*args, **kwargs)
class DirectoryReader(LIBaseReader):
_reader_class = SimpleDirectoryReader

View File

@ -0,0 +1,53 @@
from typing import Callable, List, Optional, Type
from llama_index.readers.base import BaseReader as LIBaseReader
from .base import BaseReader, LIReaderMixin
class DirectoryReader(LIReaderMixin, BaseReader):
    """Kotaemon wrapper over llama-index's ``SimpleDirectoryReader``.

    The attributes below mirror the options of the wrapped reader.

    Args:
        input_dir (str): Directory to read from.
        input_files (List): Explicit list of file paths to read
            (Optional; takes precedence over input_dir and exclude).
        exclude (List): Glob of python file paths to leave out (Optional).
        exclude_hidden (bool): Whether hidden files (dotfiles) are skipped.
            True by default.
        encoding (str): File encoding. Default is utf-8.
        errors (str): How encoding and decoding errors are handled,
            see https://docs.python.org/3/library/functions.html#open
        recursive (bool): Whether subdirectories are searched recursively.
            False by default.
        filename_as_id (bool): Whether the filename is used as the document
            id. False by default.
        required_exts (Optional[List[str]]): Extensions a file must have to
            be read. Default is None.
        file_extractor (Optional[Dict[str, BaseReader]]): Mapping from file
            extension to the BaseReader that converts such a file to text.
            If not specified, use default from DEFAULT_FILE_READER_CLS.
        num_files_limit (Optional[int]): Upper bound on the number of files
            read. Default is None.
        file_metadata (Optional[Callable[str, Dict]]): Function that takes a
            filename and returns a Dict of metadata for the Document.
            Default is None.
    """

    # What to read
    input_dir: Optional[str] = None
    input_files: Optional[List] = None
    exclude: Optional[List] = None
    exclude_hidden: bool = True

    # How to read it
    errors: str = "ignore"
    recursive: bool = False
    encoding: str = "utf-8"
    filename_as_id: bool = False
    required_exts: Optional[list[str]] = None
    file_extractor: Optional[dict[str, "LIBaseReader"]] = None
    num_files_limit: Optional[int] = None
    file_metadata: Optional[Callable[[str], dict]] = None

    def _get_wrapped_class(self) -> Type["LIBaseReader"]:
        """Return the llama-index class this reader delegates to."""
        # Local import: the class is only needed when this hook is called.
        from llama_index import SimpleDirectoryReader

        return SimpleDirectoryReader

View File

@ -11,14 +11,14 @@ packages.find.exclude = ["tests*", "env*"]
# metadata and dependencies # metadata and dependencies
[project] [project]
name = "kotaemon" name = "kotaemon"
version = "0.3.6" version = "0.3.7"
requires-python = ">= 3.10" requires-python = ">= 3.10"
description = "Kotaemon core library for AI development." description = "Kotaemon core library for AI development."
dependencies = [ dependencies = [
"langchain", "langchain",
"langchain-community", "langchain-community",
"theflow", "theflow",
"llama-index>=0.9.0", "llama-index>=0.9.0,<0.10.0",
"llama-hub", "llama-hub",
"gradio>=4.0.0", "gradio>=4.0.0",
"openpyxl", "openpyxl",

View File

@ -93,7 +93,7 @@ _openai_chat_completion_responses_react_langchain_tool = [
( (
"I don't have prior knowledge about Cinnamon AI company, " "I don't have prior knowledge about Cinnamon AI company, "
"so I should gather information about it.\n" "so I should gather information about it.\n"
"Action: Wikipedia\n" "Action: wikipedia\n"
"Action Input: Cinnamon AI company\n" "Action Input: Cinnamon AI company\n"
), ),
( (