67 lines
2.2 KiB
Python
67 lines
2.2 KiB
Python
from pathlib import Path
|
|
from typing import Any, List, Type, Union
|
|
|
|
from llama_index import SimpleDirectoryReader, download_loader
|
|
from llama_index.readers.base import BaseReader
|
|
|
|
from ..base import BaseComponent, Document
|
|
|
|
|
|
class AutoReader(BaseComponent):
    """Generic file reader backed by a llama-hub loader."""

    def __init__(self, reader_type: Union[str, Type[BaseReader]]) -> None:
        """Build the wrapped reader.

        Args:
            reader_type: Either a string identifier, which is resolved to a
                loader class through `download_loader`, or a reader class
                that is instantiated directly with no arguments.
        """
        # A string has to be resolved to a class first; anything else is
        # treated as the class itself.
        loader_cls = (
            download_loader(reader_type)
            if isinstance(reader_type, str)
            else reader_type
        )
        self._reader = loader_cls()
        super().__init__()

    def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:
        """Load `file` with the wrapped reader and return converted documents.

        Args:
            file: Path handed to the wrapped reader as its `file` keyword.
            **kwargs: Extra keyword arguments forwarded to the reader's
                `load_data`.

        Returns:
            One `Document` per item returned by the wrapped reader.
        """
        raw_docs = self._reader.load_data(file=file, **kwargs)

        # Round-trip through a dict so each item becomes the `Document`
        # class imported by this module.
        return [Document.from_dict(raw.to_dict()) for raw in raw_docs]

    def run(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:
        """Component entry point; delegates to `load_data`."""
        return self.load_data(file=file, **kwargs)
|
|
|
|
|
|
class LIBaseReader(BaseComponent):
    """Base wrapper that exposes a LlamaIndex reader as a component.

    Subclasses assign `_reader_class`. Constructor arguments are forwarded
    to that class, writes to non-underscore attributes and reads of
    attributes not found on the wrapper are delegated to the wrapped reader
    instance, and loaded documents are converted with `Document.from_dict`.
    """

    # Reader class to instantiate; must be assigned by subclasses. This bare
    # annotation does not create a class attribute on its own.
    _reader_class: Type[BaseReader]

    def __init__(self, *args, **kwargs):
        """Instantiate the wrapped reader.

        Args:
            *args: Positional arguments forwarded to `_reader_class`.
            **kwargs: Keyword arguments forwarded to `_reader_class`.

        Raises:
            AttributeError: If `_reader_class` is unset or None.
        """
        # Look up with a default: when no subclass assigned `_reader_class`,
        # plain attribute access would go through `__getattr__` before
        # `_reader` exists instead of reaching the check below.
        reader_class = getattr(self, "_reader_class", None)
        if reader_class is None:
            raise AttributeError(
                "Require `_reader_class` to set a BaseReader class from LlamarIndex"
            )

        self._reader = reader_class(*args, **kwargs)
        super().__init__()

    def __setattr__(self, name: str, value: Any) -> None:
        """Keep underscore-prefixed attributes here; forward the rest."""
        if name.startswith("_"):
            return super().__setattr__(name, value)

        return setattr(self._reader, name, value)

    def __getattr__(self, name: str) -> Any:
        """Delegate attributes missing on the wrapper to the wrapped reader.

        Raises:
            AttributeError: If `_reader` itself is not set yet, or the
                wrapped reader has no attribute called `name`.
        """
        # This hook only runs after normal lookup failed. If the missing
        # name is `_reader`, reading `self._reader` below would re-enter
        # this method without end, so fail fast instead.
        if name == "_reader":
            raise AttributeError(
                f"{type(self).__name__!r} object has no attribute {name!r}"
            )
        return getattr(self._reader, name)

    def load_data(self, *args, **kwargs: Any) -> List[Document]:
        """Load data with the wrapped reader and return converted documents.

        Args:
            *args: Positional arguments forwarded to the reader's `load_data`.
            **kwargs: Keyword arguments forwarded to the reader's `load_data`.

        Returns:
            One `Document` per item returned by the wrapped reader.
        """
        documents = self._reader.load_data(*args, **kwargs)

        # Round-trip through a dict so each item becomes the `Document`
        # class imported by this module.
        return [Document.from_dict(doc.to_dict()) for doc in documents]

    def run(self, *args, **kwargs: Any) -> List[Document]:
        """Component entry point; delegates to `load_data`."""
        return self.load_data(*args, **kwargs)
|
|
|
|
|
|
class DirectoryReader(LIBaseReader):
    """`LIBaseReader` whose wrapped reader class is `SimpleDirectoryReader`."""

    # Reader class instantiated by the base-class constructor.
    _reader_class = SimpleDirectoryReader
|