Refactor Excel loader (#79)

This commit is contained in:
Nguyen Trung Duc (john) 2023-11-16 11:30:11 +07:00 committed by GitHub
parent cc1e75b3c6
commit 98c76c4700

View File

@ -14,7 +14,7 @@ from kotaemon.base import Document
class PandasExcelReader(BaseReader): class PandasExcelReader(BaseReader):
r"""Pandas-based CSV parser. r"""Pandas-based CSV parser.
Parses CSVs using the separator detection from Pandas `read_csv`function. Parses CSVs using the separator detection from Pandas `read_csv` function.
If special parameters are required, use the `pandas_config` dict. If special parameters are required, use the `pandas_config` dict.
Args: Args:
@ -31,12 +31,14 @@ class PandasExcelReader(BaseReader):
*args: Any, *args: Any,
pandas_config: Optional[dict] = None, pandas_config: Optional[dict] = None,
row_joiner: str = "\n", row_joiner: str = "\n",
col_joiner: str = " ",
**kwargs: Any, **kwargs: Any,
) -> None: ) -> None:
"""Init params.""" """Init params."""
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
self._pandas_config = pandas_config or {} self._pandas_config = pandas_config or {}
self._row_joiner = row_joiner if row_joiner else "\n" self._row_joiner = row_joiner if row_joiner else "\n"
self._col_joiner = col_joiner if col_joiner else " "
def load_data( def load_data(
self, self,
@ -88,7 +90,9 @@ class PandasExcelReader(BaseReader):
output = [ output = [
Document( Document(
text=self._row_joiner.join(" ".join(sublist) for sublist in text_list), text=self._row_joiner.join(
self._col_joiner.join(sublist) for sublist in text_list
),
metadata={"source": file.stem}, metadata={"source": file.stem},
) )
] ]