kotaemon/knowledgehub/loaders/ocr_loader.py
Tuan Anh Nguyen Dang (Tadashi_Cin) 6c3d614973 [AUR-432] Add layout-aware table parsing PDF reader (#27)
* add OCRReader, MathPixReader and ExcelReader

* update test case for ocr reader

* reformat

* minor fix
2023-09-26 15:52:44 +07:00

98 lines
3.0 KiB
Python

from pathlib import Path
from typing import List
from uuid import uuid4
import requests
from llama_index.readers.base import BaseReader
from kotaemon.documents import Document
from .utils.table import (
extract_tables_from_csv_string,
get_table_from_ocr,
strip_special_chars_markdown,
)
DEFAULT_OCR_ENDPOINT = "http://127.0.0.1:8000/v2/ai/infer/"
class OCRReader(BaseReader):
def __init__(self, endpoint: str = DEFAULT_OCR_ENDPOINT):
"""Init the OCR reader with OCR endpoint (FullOCR pipeline)
Args:
endpoint: URL to FullOCR endpoint. Defaults to OCR_ENDPOINT.
"""
super().__init__()
self.ocr_endpoint = endpoint
def load_data(
self,
file: Path,
**kwargs,
) -> List[Document]:
# create input params for the requests
content = open(file, "rb")
files = {"input": content}
data = {"job_id": uuid4()}
# init list of output documents
documents = []
all_table_csv_list = []
all_non_table_texts = []
# call the API from FullOCR endpoint
if "response_content" in kwargs:
# overriding response content if specified
results = kwargs["response_content"]
else:
# call original API
resp = requests.post(url=self.ocr_endpoint, files=files, data=data)
results = resp.json()["result"]
for _id, each in enumerate(results):
csv_content = each["csv_string"]
table = each["json"]["table"]
ocr = each["json"]["ocr"]
# using helper function to extract list of table texts from FullOCR output
table_texts = get_table_from_ocr(ocr, table)
# extract the formatted CSV table from specified text
csv_list, non_table_text = extract_tables_from_csv_string(
csv_content, table_texts
)
all_table_csv_list.extend([(csv, _id) for csv in csv_list])
all_non_table_texts.append((non_table_text, _id))
# create output Document with metadata from table
documents = [
Document(
text=strip_special_chars_markdown(csv),
metadata={
"table_origin": csv,
"type": "table",
"page_label": page_id + 1,
"source": file.name,
},
metadata_template="",
metadata_seperator="",
)
for csv, page_id in all_table_csv_list
]
# create Document from non-table text
documents.extend(
[
Document(
text=non_table_text,
metadata={
"page_label": page_id + 1,
"source": file.name,
},
)
for non_table_text, page_id in all_non_table_texts
]
)
return documents