From 80ec21410770e69ffea3d8366c6eafb738e18422 Mon Sep 17 00:00:00 2001 From: trducng Date: Sat, 27 Jan 2024 22:52:46 +0700 Subject: [PATCH] Fix loaders' file_path and other metadata --- .../kotaemon/kotaemon/loaders/excel_loader.py | 3 ++- .../kotaemon/loaders/mathpix_loader.py | 9 ++++++--- libs/kotaemon/kotaemon/loaders/ocr_loader.py | 20 +++++++------------ .../kotaemon/loaders/unstructured_loader.py | 10 +++++----- libs/ktem/ktem/reasoning/simple.py | 4 ++-- 5 files changed, 22 insertions(+), 24 deletions(-) diff --git a/libs/kotaemon/kotaemon/loaders/excel_loader.py b/libs/kotaemon/kotaemon/loaders/excel_loader.py index 72fd0b3..5fab7c5 100644 --- a/libs/kotaemon/kotaemon/loaders/excel_loader.py +++ b/libs/kotaemon/kotaemon/loaders/excel_loader.py @@ -44,6 +44,7 @@ class PandasExcelReader(BaseReader): file: Path, include_sheetname: bool = False, sheet_name: Optional[Union[str, int, list]] = None, + extra_info: Optional[dict] = None, **kwargs, ) -> List[Document]: """Parse file and extract values from a specific column. @@ -92,7 +93,7 @@ class PandasExcelReader(BaseReader): text=self._row_joiner.join( self._col_joiner.join(sublist) for sublist in text_list ), - metadata={"source": file.stem}, + metadata=extra_info or {}, ) ] diff --git a/libs/kotaemon/kotaemon/loaders/mathpix_loader.py b/libs/kotaemon/kotaemon/loaders/mathpix_loader.py index 4bbf769..1f5b602 100644 --- a/libs/kotaemon/kotaemon/loaders/mathpix_loader.py +++ b/libs/kotaemon/kotaemon/loaders/mathpix_loader.py @@ -2,7 +2,7 @@ import json import re import time from pathlib import Path -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import requests from kotaemon.base import Document @@ -138,7 +138,9 @@ class MathpixPDFReader(BaseReader): contents = re.sub(markup_regex, "", contents) return contents - def load_data(self, file_path: Path, **kwargs) -> List[Document]: + def load_data( + self, file_path: Path, extra_info: Optional[dict] = None, **kwargs + ) -> List[Document]: if "response_content" in kwargs: # overriding response content if specified content = kwargs["response_content"] @@ -154,10 +156,11 @@ class MathpixPDFReader(BaseReader): for table in tables: text = strip_special_chars_markdown(table) metadata = { - "source": file_path.name, "table_origin": table, "type": "table", } + if extra_info: + metadata.update(extra_info) documents.append( Document( text=text, diff --git a/libs/kotaemon/kotaemon/loaders/ocr_loader.py b/libs/kotaemon/kotaemon/loaders/ocr_loader.py index 13bd907..608405f 100644 --- a/libs/kotaemon/kotaemon/loaders/ocr_loader.py +++ b/libs/kotaemon/kotaemon/loaders/ocr_loader.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import List +from typing import List, Optional from uuid import uuid4 import requests @@ -25,7 +25,9 @@ class OCRReader(BaseReader): self.ocr_endpoint = endpoint self.use_ocr = use_ocr - def load_data(self, file_path: Path, **kwargs) -> List[Document]: + def load_data( + self, file_path: Path, extra_info: Optional[dict] = None, **kwargs + ) -> List[Document]: """Load data using OCR reader Args: @@ -63,6 +65,7 @@ class OCRReader(BaseReader): debug_path=debug_path, artifact_path=artifact_path, ) + extra_info = extra_info or {} # create output Document with metadata from table documents = [ @@ -72,10 +75,7 @@ class OCRReader(BaseReader): "table_origin": table_text, "type": "table", "page_label": page_id + 1, - "source": file_path.name, - "file_path": str(file_path), - "file_name": file_path.name, - "filename": str(file_path), + **extra_info, }, metadata_template="", metadata_seperator="", @@ -87,13 +87,7 @@ class OCRReader(BaseReader): [ Document( text=non_table_text, - metadata={ - "page_label": page_id + 1, - "source": file_path.name, - "file_path": str(file_path), - "file_name": file_path.name, - "filename": str(file_path), - }, + metadata={"page_label": page_id + 1, **extra_info}, ) for page_id, non_table_text in texts ] diff --git a/libs/kotaemon/kotaemon/loaders/unstructured_loader.py b/libs/kotaemon/kotaemon/loaders/unstructured_loader.py index 1c3e526..8568d95 100644 --- a/libs/kotaemon/kotaemon/loaders/unstructured_loader.py +++ b/libs/kotaemon/kotaemon/loaders/unstructured_loader.py @@ -50,7 +50,7 @@ class UnstructuredReader(BaseReader): def load_data( self, file: Path, - additional_metadata: Optional[Dict] = None, + extra_info: Optional[Dict] = None, split_documents: Optional[bool] = False, **kwargs, ) -> List[Document]: @@ -91,8 +91,8 @@ class UnstructuredReader(BaseReader): continue metadata[field] = val - if additional_metadata is not None: - metadata.update(additional_metadata) + if extra_info is not None: + metadata.update(extra_info) metadata["file_name"] = file_name docs.append(Document(text=node.text, metadata=metadata)) @@ -101,8 +101,8 @@ class UnstructuredReader(BaseReader): text_chunks = [" ".join(str(el).split()) for el in elements] metadata = {"file_name": file_name, "file_path": file_path} - if additional_metadata is not None: - metadata.update(additional_metadata) + if extra_info is not None: + metadata.update(extra_info) # Create a single document by joining all the texts docs.append(Document(text="\n\n".join(text_chunks), metadata=metadata)) diff --git a/libs/ktem/ktem/reasoning/simple.py b/libs/ktem/ktem/reasoning/simple.py index c41f41f..352868f 100644 --- a/libs/ktem/ktem/reasoning/simple.py +++ b/libs/ktem/ktem/reasoning/simple.py @@ -96,7 +96,7 @@ class PrepareEvidencePipeline(BaseComponent): evidence = texts[0].text print("len (trimmed)", len(evidence)) - print(f"PrepareEvidence with input {input}\nOutput: {evidence}\n") + print(f"PrepareEvidence with input {docs}\nOutput: {evidence}\n") return Document(content=(evidence_mode, evidence)) @@ -228,7 +228,7 @@ class FullQAPipeline(BaseComponent): answering_pipeline: AnswerWithContextPipeline = AnswerWithContextPipeline.withx() async def run( # type: ignore - self, message: str, cid: str, history: list, **kwargs # type: ignore + self, message: str, conv_id: str, history: list, **kwargs # type: ignore ) -> Document: # type: ignore docs = [] for retriever in self.retrievers: