[AUR-432] Add layout-aware table parsing PDF reader (#27)

* add OCRReader, MathPixReader and ExcelReader * update test case for ocr reader * reformat * minor fix
2023-09-26 15:52:44 +07:00
parent 6207f4332a
commit 6c3d614973
12 changed files with 888 additions and 2 deletions
--- a/knowledgehub/loaders/excel_loader.py
+++ b/knowledgehub/loaders/excel_loader.py
@@ -0,0 +1,96 @@
+"""Pandas Excel reader.
+
+Pandas parser for .xlsx files.
+
+"""
+from pathlib import Path
+from typing import Any, List, Optional, Union
+
+from llama_index.readers.base import BaseReader
+
+from kotaemon.documents import Document
+
+
+class PandasExcelReader(BaseReader):
+    r"""Pandas-based Excel reader.
+
+    Reads one or more sheets of a workbook with `pandas.read_excel` and
+    flattens their rows into a single text `Document`.
+    If special parameters are required, use the `pandas_config` dict.
+
+    Args:
+
+        pandas_config (dict): Options for the `pandas.read_excel` function call.
+            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
+            for more information. Set to empty dict by default,
+            this means defaults will be used.
+            Do not put `sheet_name` here: `load_data` always passes its own
+            `sheet_name` argument, so a second one is a duplicate keyword.
+        row_joiner (str): Separator placed between rows of the output text.
+            An empty value falls back to a newline.
+
+    """
+
+    def __init__(
+        self,
+        *args: Any,
+        pandas_config: Optional[dict] = None,
+        row_joiner: str = "\n",
+        **kwargs: Any,
+    ) -> None:
+        """Store the pandas options and the row separator."""
+        super().__init__(*args, **kwargs)
+        self._pandas_config = pandas_config or {}
+        self._row_joiner = row_joiner if row_joiner else "\n"
+
+    def load_data(
+        self,
+        file: Path,
+        include_sheetname: bool = False,
+        sheet_name: Optional[Union[str, int, list]] = None,
+        **kwargs,
+    ) -> List[Document]:
+        """Read the selected sheets and return them as one text document.
+
+        Args:
+            file (Path): The path to the Excel file to read.
+            include_sheetname (bool): Whether to emit each sheet's name on a
+                line of its own before that sheet's rows.
+            sheet_name (Union[str, int, list, None]): Sheet(s) to read, by
+                name or position. Default is None, which reads all sheets.
+            **kwargs: Accepted but not used.
+
+        Returns:
+            List[Document]: A one-element list whose `Document` text holds
+                every row (cells joined by spaces, rows by `row_joiner`).
+
+        Raises:
+            ImportError: If pandas is not installed.
+        """
+        try:
+            import pandas as pd
+        except ImportError as err:
+            raise ImportError(
+                "install pandas using `pip3 install pandas` to use this loader"
+            ) from err
+
+        # Always pass a list (or None) so `read_excel` returns a dict of
+        # DataFrames keyed by sheet rather than a bare DataFrame.
+        if sheet_name is not None and not isinstance(sheet_name, list):
+            sheet_name = [sheet_name]
+
+        dfs = pd.read_excel(file, sheet_name=sheet_name, **self._pandas_config)
+
+        rows = []
+        for key, df in dfs.items():
+            if include_sheetname:
+                # Keys are ints when sheets are selected by position, and
+                # `str.join` below only accepts strings.
+                rows.append([str(key)])
+            rows.extend(df.values.astype(str).tolist())
+
+        text = self._row_joiner.join(" ".join(row) for row in rows)
+        # Path() lets callers pass a plain string path as well.
+        return [
+            Document(text=text, metadata={"source": Path(file).stem}),
+        ]