Fix loaders' file_path and other metadata

This commit is contained in:
trducng 2024-01-27 22:52:46 +07:00
parent c6637ca56e
commit 80ec214107
5 changed files with 22 additions and 24 deletions

View File

@@ -44,6 +44,7 @@ class PandasExcelReader(BaseReader):
file: Path, file: Path,
include_sheetname: bool = False, include_sheetname: bool = False,
sheet_name: Optional[Union[str, int, list]] = None, sheet_name: Optional[Union[str, int, list]] = None,
extra_info: Optional[dict] = None,
**kwargs, **kwargs,
) -> List[Document]: ) -> List[Document]:
"""Parse file and extract values from a specific column. """Parse file and extract values from a specific column.
@@ -92,7 +93,7 @@ class PandasExcelReader(BaseReader):
text=self._row_joiner.join( text=self._row_joiner.join(
self._col_joiner.join(sublist) for sublist in text_list self._col_joiner.join(sublist) for sublist in text_list
), ),
metadata={"source": file.stem}, metadata=extra_info or {},
) )
] ]

View File

@@ -2,7 +2,7 @@ import json
import re import re
import time import time
from pathlib import Path from pathlib import Path
from typing import Any, Dict, List from typing import Any, Dict, List, Optional
import requests import requests
from kotaemon.base import Document from kotaemon.base import Document
@@ -138,7 +138,9 @@ class MathpixPDFReader(BaseReader):
contents = re.sub(markup_regex, "", contents) contents = re.sub(markup_regex, "", contents)
return contents return contents
def load_data(self, file_path: Path, **kwargs) -> List[Document]: def load_data(
self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
) -> List[Document]:
if "response_content" in kwargs: if "response_content" in kwargs:
# overriding response content if specified # overriding response content if specified
content = kwargs["response_content"] content = kwargs["response_content"]
@@ -154,10 +156,11 @@ class MathpixPDFReader(BaseReader):
for table in tables: for table in tables:
text = strip_special_chars_markdown(table) text = strip_special_chars_markdown(table)
metadata = { metadata = {
"source": file_path.name,
"table_origin": table, "table_origin": table,
"type": "table", "type": "table",
} }
if extra_info:
metadata.update(extra_info)
documents.append( documents.append(
Document( Document(
text=text, text=text,

View File

@@ -1,5 +1,5 @@
from pathlib import Path from pathlib import Path
from typing import List from typing import List, Optional
from uuid import uuid4 from uuid import uuid4
import requests import requests
@@ -25,7 +25,9 @@ class OCRReader(BaseReader):
self.ocr_endpoint = endpoint self.ocr_endpoint = endpoint
self.use_ocr = use_ocr self.use_ocr = use_ocr
def load_data(self, file_path: Path, **kwargs) -> List[Document]: def load_data(
self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
) -> List[Document]:
"""Load data using OCR reader """Load data using OCR reader
Args: Args:
@@ -63,6 +65,7 @@ class OCRReader(BaseReader):
debug_path=debug_path, debug_path=debug_path,
artifact_path=artifact_path, artifact_path=artifact_path,
) )
extra_info = extra_info or {}
# create output Document with metadata from table # create output Document with metadata from table
documents = [ documents = [
@@ -72,10 +75,7 @@
"table_origin": table_text, "table_origin": table_text,
"type": "table", "type": "table",
"page_label": page_id + 1, "page_label": page_id + 1,
"source": file_path.name, **extra_info,
"file_path": str(file_path),
"file_name": file_path.name,
"filename": str(file_path),
}, },
metadata_template="", metadata_template="",
metadata_seperator="", metadata_seperator="",
@@ -87,13 +87,7 @@
[ [
Document( Document(
text=non_table_text, text=non_table_text,
metadata={ metadata={"page_label": page_id + 1, **extra_info},
"page_label": page_id + 1,
"source": file_path.name,
"file_path": str(file_path),
"file_name": file_path.name,
"filename": str(file_path),
},
) )
for page_id, non_table_text in texts for page_id, non_table_text in texts
] ]

View File

@@ -50,7 +50,7 @@ class UnstructuredReader(BaseReader):
def load_data( def load_data(
self, self,
file: Path, file: Path,
additional_metadata: Optional[Dict] = None, extra_info: Optional[Dict] = None,
split_documents: Optional[bool] = False, split_documents: Optional[bool] = False,
**kwargs, **kwargs,
) -> List[Document]: ) -> List[Document]:
@@ -91,8 +91,8 @@ class UnstructuredReader(BaseReader):
continue continue
metadata[field] = val metadata[field] = val
if additional_metadata is not None: if extra_info is not None:
metadata.update(additional_metadata) metadata.update(extra_info)
metadata["file_name"] = file_name metadata["file_name"] = file_name
docs.append(Document(text=node.text, metadata=metadata)) docs.append(Document(text=node.text, metadata=metadata))
@@ -101,8 +101,8 @@ class UnstructuredReader(BaseReader):
text_chunks = [" ".join(str(el).split()) for el in elements] text_chunks = [" ".join(str(el).split()) for el in elements]
metadata = {"file_name": file_name, "file_path": file_path} metadata = {"file_name": file_name, "file_path": file_path}
if additional_metadata is not None: if extra_info is not None:
metadata.update(additional_metadata) metadata.update(extra_info)
# Create a single document by joining all the texts # Create a single document by joining all the texts
docs.append(Document(text="\n\n".join(text_chunks), metadata=metadata)) docs.append(Document(text="\n\n".join(text_chunks), metadata=metadata))

View File

@@ -96,7 +96,7 @@ class PrepareEvidencePipeline(BaseComponent):
evidence = texts[0].text evidence = texts[0].text
print("len (trimmed)", len(evidence)) print("len (trimmed)", len(evidence))
print(f"PrepareEvidence with input {input}\nOutput: {evidence}\n") print(f"PrepareEvidence with input {docs}\nOutput: {evidence}\n")
return Document(content=(evidence_mode, evidence)) return Document(content=(evidence_mode, evidence))
@@ -228,7 +228,7 @@ class FullQAPipeline(BaseComponent):
answering_pipeline: AnswerWithContextPipeline = AnswerWithContextPipeline.withx() answering_pipeline: AnswerWithContextPipeline = AnswerWithContextPipeline.withx()
async def run( # type: ignore async def run( # type: ignore
self, message: str, cid: str, history: list, **kwargs # type: ignore self, message: str, conv_id: str, history: list, **kwargs # type: ignore
) -> Document: # type: ignore ) -> Document: # type: ignore
docs = [] docs = []
for retriever in self.retrievers: for retriever in self.retrievers: