[AUR-432] Add layout-aware table parsing PDF reader (#27)
* add OCRReader, MathPixReader and ExcelReader * update test case for ocr reader * reformat * minor fix
This commit is contained in:
committed by
GitHub
parent
6207f4332a
commit
6c3d614973
96
knowledgehub/loaders/excel_loader.py
Normal file
96
knowledgehub/loaders/excel_loader.py
Normal file
@@ -0,0 +1,96 @@
|
||||
"""Pandas Excel reader.
|
||||
|
||||
Pandas parser for .xlsx files.
|
||||
|
||||
"""
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Optional, Union
|
||||
|
||||
from llama_index.readers.base import BaseReader
|
||||
|
||||
from kotaemon.documents import Document
|
||||
|
||||
|
||||
class PandasExcelReader(BaseReader):
|
||||
r"""Pandas-based CSV parser.
|
||||
|
||||
Parses CSVs using the separator detection from Pandas `read_csv`function.
|
||||
If special parameters are required, use the `pandas_config` dict.
|
||||
|
||||
Args:
|
||||
|
||||
pandas_config (dict): Options for the `pandas.read_excel` function call.
|
||||
Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
|
||||
for more information. Set to empty dict by default,
|
||||
this means defaults will be used.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*args: Any,
|
||||
pandas_config: Optional[dict] = None,
|
||||
row_joiner: str = "\n",
|
||||
**kwargs: Any,
|
||||
) -> None:
|
||||
"""Init params."""
|
||||
super().__init__(*args, **kwargs)
|
||||
self._pandas_config = pandas_config or {}
|
||||
self._row_joiner = row_joiner if row_joiner else "\n"
|
||||
|
||||
def load_data(
|
||||
self,
|
||||
file: Path,
|
||||
include_sheetname: bool = False,
|
||||
sheet_name: Optional[Union[str, int, list]] = None,
|
||||
**kwargs,
|
||||
) -> List[Document]:
|
||||
"""Parse file and extract values from a specific column.
|
||||
|
||||
Args:
|
||||
file (Path): The path to the Excel file to read.
|
||||
include_sheetname (bool): Whether to include the sheet name in the output.
|
||||
sheet_name (Union[str, int, None]): The specific sheet to read from,
|
||||
default is None which reads all sheets.
|
||||
|
||||
Returns:
|
||||
List[Document]: A list of`Document objects containing the
|
||||
values from the specified column in the Excel file.
|
||||
"""
|
||||
import itertools
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"install pandas using `pip3 install pandas` to use this loader"
|
||||
)
|
||||
|
||||
if sheet_name is not None:
|
||||
sheet_name = (
|
||||
[sheet_name] if not isinstance(sheet_name, list) else sheet_name
|
||||
)
|
||||
|
||||
dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)
|
||||
sheet_names = dfs.keys()
|
||||
df_sheets = []
|
||||
|
||||
for key in sheet_names:
|
||||
sheet = []
|
||||
if include_sheetname:
|
||||
sheet.append([key])
|
||||
sheet.extend(dfs[key].values.astype(str).tolist())
|
||||
df_sheets.append(sheet)
|
||||
|
||||
text_list = list(
|
||||
itertools.chain.from_iterable(df_sheets)
|
||||
) # flatten list of lists
|
||||
|
||||
output = [
|
||||
Document(
|
||||
text=self._row_joiner.join(" ".join(sublist) for sublist in text_list),
|
||||
metadata={"source": file.stem},
|
||||
)
|
||||
]
|
||||
|
||||
return output
|
Reference in New Issue
Block a user