refactor: replace the llama-index based loader with a llama-index mixin loader (#142)
This commit is contained in:
parent
7fc54d52e4
commit
d36522129f
|
@ -1,4 +1,5 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Type
|
||||||
|
|
||||||
from llama_index.readers.base import BaseReader
|
from llama_index.readers.base import BaseReader
|
||||||
|
|
||||||
|
@ -14,6 +15,13 @@ from kotaemon.loaders import (
|
||||||
UnstructuredReader,
|
UnstructuredReader,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
KH_DEFAULT_FILE_EXTRACTORS: dict[str, Type[BaseReader]] = {
|
||||||
|
".xlsx": PandasExcelReader,
|
||||||
|
".docx": UnstructuredReader,
|
||||||
|
".xls": UnstructuredReader,
|
||||||
|
".doc": UnstructuredReader,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class DocumentIngestor(BaseComponent):
|
class DocumentIngestor(BaseComponent):
|
||||||
"""Ingest common office document types into Document for indexing
|
"""Ingest common office document types into Document for indexing
|
||||||
|
@ -30,6 +38,8 @@ class DocumentIngestor(BaseComponent):
|
||||||
- ocr: parse pdf image using flax
|
- ocr: parse pdf image using flax
|
||||||
doc_parsers: list of document parsers to parse the document
|
doc_parsers: list of document parsers to parse the document
|
||||||
text_splitter: splitter to split the document into text nodes
|
text_splitter: splitter to split the document into text nodes
|
||||||
|
override_file_extractors: override file extractors for specific file extensions
|
||||||
|
The default file extractors are stored in `KH_DEFAULT_FILE_EXTRACTORS`
|
||||||
"""
|
"""
|
||||||
|
|
||||||
pdf_mode: str = "normal" # "normal", "mathpix", "ocr"
|
pdf_mode: str = "normal" # "normal", "mathpix", "ocr"
|
||||||
|
@ -38,26 +48,26 @@ class DocumentIngestor(BaseComponent):
|
||||||
chunk_size=1024,
|
chunk_size=1024,
|
||||||
chunk_overlap=256,
|
chunk_overlap=256,
|
||||||
)
|
)
|
||||||
|
override_file_extractors: dict[str, Type[BaseReader]] = {}
|
||||||
|
|
||||||
def _get_reader(self, input_files: list[str | Path]):
|
def _get_reader(self, input_files: list[str | Path]):
|
||||||
"""Get appropriate readers for the input files based on file extension"""
|
"""Get appropriate readers for the input files based on file extension"""
|
||||||
file_extractor: dict[str, AutoReader | BaseReader] = {
|
file_extractors: dict[str, BaseReader] = {
|
||||||
".xlsx": PandasExcelReader(),
|
ext: cls() for ext, cls in KH_DEFAULT_FILE_EXTRACTORS.items()
|
||||||
".docx": UnstructuredReader(),
|
|
||||||
".xls": UnstructuredReader(),
|
|
||||||
".doc": UnstructuredReader(),
|
|
||||||
}
|
}
|
||||||
|
for ext, cls in self.override_file_extractors.items():
|
||||||
|
file_extractors[ext] = cls()
|
||||||
|
|
||||||
if self.pdf_mode == "normal":
|
if self.pdf_mode == "normal":
|
||||||
file_extractor[".pdf"] = AutoReader("UnstructuredReader")
|
file_extractors[".pdf"] = AutoReader("UnstructuredReader") # type: ignore
|
||||||
elif self.pdf_mode == "ocr":
|
elif self.pdf_mode == "ocr":
|
||||||
file_extractor[".pdf"] = OCRReader()
|
file_extractors[".pdf"] = OCRReader()
|
||||||
else:
|
else:
|
||||||
file_extractor[".pdf"] = MathpixPDFReader()
|
file_extractors[".pdf"] = MathpixPDFReader()
|
||||||
|
|
||||||
main_reader = DirectoryReader(
|
main_reader = DirectoryReader(
|
||||||
input_files=input_files,
|
input_files=input_files,
|
||||||
file_extractor=file_extractor,
|
file_extractor=file_extractors, # type: ignore
|
||||||
)
|
)
|
||||||
|
|
||||||
return main_reader
|
return main_reader
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
from .base import AutoReader, DirectoryReader
|
from .base import AutoReader, BaseReader
|
||||||
|
from .composite_loader import DirectoryReader
|
||||||
from .docx_loader import DocxReader
|
from .docx_loader import DocxReader
|
||||||
from .excel_loader import PandasExcelReader
|
from .excel_loader import PandasExcelReader
|
||||||
from .html_loader import HtmlReader
|
from .html_loader import HtmlReader
|
||||||
|
@ -8,6 +9,7 @@ from .unstructured_loader import UnstructuredReader
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"AutoReader",
|
"AutoReader",
|
||||||
|
"BaseReader",
|
||||||
"PandasExcelReader",
|
"PandasExcelReader",
|
||||||
"MathpixPDFReader",
|
"MathpixPDFReader",
|
||||||
"OCRReader",
|
"OCRReader",
|
||||||
|
|
|
@ -1,19 +1,25 @@
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, List, Type, Union
|
from typing import TYPE_CHECKING, Any, List, Type, Union
|
||||||
|
|
||||||
from llama_index import SimpleDirectoryReader, download_loader
|
|
||||||
from llama_index.readers.base import BaseReader
|
|
||||||
|
|
||||||
from kotaemon.base import BaseComponent, Document
|
from kotaemon.base import BaseComponent, Document
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from llama_index.readers.base import BaseReader as LIBaseReader
|
||||||
|
|
||||||
class AutoReader(BaseComponent):
|
|
||||||
|
class BaseReader(BaseComponent):
|
||||||
|
...
|
||||||
|
|
||||||
|
|
||||||
|
class AutoReader(BaseReader):
|
||||||
"""General auto reader for a variety of files. (based on llama-hub)"""
|
"""General auto reader for a variety of files. (based on llama-hub)"""
|
||||||
|
|
||||||
def __init__(self, reader_type: Union[str, Type[BaseReader]]) -> None:
|
def __init__(self, reader_type: Union[str, Type["LIBaseReader"]]) -> None:
|
||||||
"""Init reader using string identifier or class name from llama-hub"""
|
"""Init reader using string identifier or class name from llama-hub"""
|
||||||
|
|
||||||
if isinstance(reader_type, str):
|
if isinstance(reader_type, str):
|
||||||
|
from llama_index import download_loader
|
||||||
|
|
||||||
self._reader = download_loader(reader_type)()
|
self._reader = download_loader(reader_type)()
|
||||||
else:
|
else:
|
||||||
self._reader = reader_type()
|
self._reader = reader_type()
|
||||||
|
@ -30,15 +36,30 @@ class AutoReader(BaseComponent):
|
||||||
return self.load_data(file=file, **kwargs)
|
return self.load_data(file=file, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class LIBaseReader(BaseComponent):
|
class LIReaderMixin(BaseComponent):
|
||||||
_reader_class: Type[BaseReader]
|
"""Base wrapper around llama-index reader
|
||||||
|
|
||||||
|
To use the LIReaderMixin, you need to implement the _get_wrapped_class method to
|
||||||
|
return the relevant llama-index reader class that you want to wrap.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class DirectoryReader(LIReaderMixin, BaseReader):
|
||||||
|
def _get_wrapped_class(self) -> Type["LIBaseReader"]:
|
||||||
|
from llama_index import SimpleDirectoryReader
|
||||||
|
|
||||||
|
return SimpleDirectoryReader
|
||||||
|
```
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _get_wrapped_class(self) -> Type["LIBaseReader"]:
|
||||||
|
raise NotImplementedError(
|
||||||
|
"Please return the relevant Langchain class in in _get_lc_class"
|
||||||
|
)
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
if self._reader_class is None:
|
self._reader_class = self._get_wrapped_class()
|
||||||
raise AttributeError(
|
|
||||||
"Require `_reader_class` to set a BaseReader class from LlamarIndex"
|
|
||||||
)
|
|
||||||
|
|
||||||
self._reader = self._reader_class(*args, **kwargs)
|
self._reader = self._reader_class(*args, **kwargs)
|
||||||
super().__init__()
|
super().__init__()
|
||||||
|
|
||||||
|
@ -60,7 +81,3 @@ class LIBaseReader(BaseComponent):
|
||||||
|
|
||||||
def run(self, *args, **kwargs: Any) -> List[Document]:
|
def run(self, *args, **kwargs: Any) -> List[Document]:
|
||||||
return self.load_data(*args, **kwargs)
|
return self.load_data(*args, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
class DirectoryReader(LIBaseReader):
|
|
||||||
_reader_class = SimpleDirectoryReader
|
|
||||||
|
|
53
libs/kotaemon/kotaemon/loaders/composite_loader.py
Normal file
53
libs/kotaemon/kotaemon/loaders/composite_loader.py
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
from typing import Callable, List, Optional, Type
|
||||||
|
|
||||||
|
from llama_index.readers.base import BaseReader as LIBaseReader
|
||||||
|
|
||||||
|
from .base import BaseReader, LIReaderMixin
|
||||||
|
|
||||||
|
|
||||||
|
class DirectoryReader(LIReaderMixin, BaseReader):
|
||||||
|
"""Wrap around llama-index SimpleDirectoryReader
|
||||||
|
|
||||||
|
Args:
|
||||||
|
input_dir (str): Path to the directory.
|
||||||
|
input_files (List): List of file paths to read
|
||||||
|
(Optional; overrides input_dir, exclude)
|
||||||
|
exclude (List): glob of python file paths to exclude (Optional)
|
||||||
|
exclude_hidden (bool): Whether to exclude hidden files (dotfiles).
|
||||||
|
encoding (str): Encoding of the files.
|
||||||
|
Default is utf-8.
|
||||||
|
errors (str): how encoding and decoding errors are to be handled,
|
||||||
|
see https://docs.python.org/3/library/functions.html#open
|
||||||
|
recursive (bool): Whether to recursively search in subdirectories.
|
||||||
|
False by default.
|
||||||
|
filename_as_id (bool): Whether to use the filename as the document id.
|
||||||
|
False by default.
|
||||||
|
required_exts (Optional[List[str]]): List of required extensions.
|
||||||
|
Default is None.
|
||||||
|
file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file
|
||||||
|
extension to a BaseReader instance that specifies how to convert that file
|
||||||
|
to text. If not specified, use default from DEFAULT_FILE_READER_CLS.
|
||||||
|
num_files_limit (Optional[int]): Maximum number of files to read.
|
||||||
|
Default is None.
|
||||||
|
file_metadata (Optional[Callable[[str], Dict]]): A function that takes
|
||||||
|
in a filename and returns a Dict of metadata for the Document.
|
||||||
|
Default is None.
|
||||||
|
"""
|
||||||
|
|
||||||
|
input_dir: Optional[str] = None
|
||||||
|
input_files: Optional[List] = None
|
||||||
|
exclude: Optional[List] = None
|
||||||
|
exclude_hidden: bool = True
|
||||||
|
errors: str = "ignore"
|
||||||
|
recursive: bool = False
|
||||||
|
encoding: str = "utf-8"
|
||||||
|
filename_as_id: bool = False
|
||||||
|
required_exts: Optional[list[str]] = None
|
||||||
|
file_extractor: Optional[dict[str, "LIBaseReader"]] = None
|
||||||
|
num_files_limit: Optional[int] = None
|
||||||
|
file_metadata: Optional[Callable[[str], dict]] = None
|
||||||
|
|
||||||
|
def _get_wrapped_class(self) -> Type["LIBaseReader"]:
|
||||||
|
from llama_index import SimpleDirectoryReader
|
||||||
|
|
||||||
|
return SimpleDirectoryReader
|
|
@ -11,14 +11,14 @@ packages.find.exclude = ["tests*", "env*"]
|
||||||
# metadata and dependencies
|
# metadata and dependencies
|
||||||
[project]
|
[project]
|
||||||
name = "kotaemon"
|
name = "kotaemon"
|
||||||
version = "0.3.6"
|
version = "0.3.7"
|
||||||
requires-python = ">= 3.10"
|
requires-python = ">= 3.10"
|
||||||
description = "Kotaemon core library for AI development."
|
description = "Kotaemon core library for AI development."
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"langchain",
|
"langchain",
|
||||||
"langchain-community",
|
"langchain-community",
|
||||||
"theflow",
|
"theflow",
|
||||||
"llama-index>=0.9.0",
|
"llama-index>=0.9.0,<0.10.0",
|
||||||
"llama-hub",
|
"llama-hub",
|
||||||
"gradio>=4.0.0",
|
"gradio>=4.0.0",
|
||||||
"openpyxl",
|
"openpyxl",
|
||||||
|
|
|
@ -93,7 +93,7 @@ _openai_chat_completion_responses_react_langchain_tool = [
|
||||||
(
|
(
|
||||||
"I don't have prior knowledge about Cinnamon AI company, "
|
"I don't have prior knowledge about Cinnamon AI company, "
|
||||||
"so I should gather information about it.\n"
|
"so I should gather information about it.\n"
|
||||||
"Action: Wikipedia\n"
|
"Action: wikipedia\n"
|
||||||
"Action Input: Cinnamon AI company\n"
|
"Action Input: Cinnamon AI company\n"
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
|
|
Loading…
Reference in New Issue
Block a user