This change speeds up OCR extraction by allowing OCR to be bypassed for text that is irrelevant (not in a table). --------- Co-authored-by: Nguyen Trung Duc (john) <trungduc1992@gmail.com>
101 lines
3.2 KiB
Python
101 lines
3.2 KiB
Python
from pathlib import Path
|
|
from typing import List
|
|
from uuid import uuid4
|
|
|
|
import requests
|
|
from llama_index.readers.base import BaseReader
|
|
|
|
from kotaemon.documents import Document
|
|
|
|
from .utils.pdf_ocr import parse_ocr_output, read_pdf_unstructured
|
|
from .utils.table import strip_special_chars_markdown
|
|
|
|
# URL of the FullOCR endpoint used by OCRReader when no endpoint is passed in.
DEFAULT_OCR_ENDPOINT = "http://127.0.0.1:8000/v2/ai/infer/"
|
|
|
|
|
|
class OCRReader(BaseReader):
    """Reader that extracts tables and text from a PDF via the FullOCR endpoint.

    The OCR endpoint output is merged with the text obtained from a plain
    PDF read (``read_pdf_unstructured``) and returned as ``Document`` objects.
    """

    def __init__(self, endpoint: str = DEFAULT_OCR_ENDPOINT, use_ocr=True):
        """Init the OCR reader with OCR endpoint (FullOCR pipeline)

        Args:
            endpoint: URL to FullOCR endpoint. Defaults to
                DEFAULT_OCR_ENDPOINT.
            use_ocr: whether to use OCR to read text
                (e.g: from images, tables) in the PDF. When False, the
                request is sent with ``table_only`` set to True.
        """
        super().__init__()
        self.ocr_endpoint = endpoint
        self.use_ocr = use_ocr

    def load_data(
        self,
        file_path: Path,
        **kwargs,
    ) -> List[Document]:
        """Load data using OCR reader

        Args:
            file_path (Path): Path to PDF file
            debug_path (Path): Path to store debug image output
            artifact_path (Path): Path to OCR endpoints artifacts directory
            response_content: optional pre-computed OCR result. When given,
                the OCR endpoint is not called and this value is used in
                place of the ``"result"`` field of the API response.

        Returns:
            List[Document]: list of documents extracted from the PDF file
        """
        # Accept plain string paths as well; `.name` is needed for metadata.
        file_path = Path(file_path)

        debug_path = kwargs.pop("debug_path", None)
        artifact_path = kwargs.pop("artifact_path", None)

        if "response_content" in kwargs:
            # overriding response content if specified: no request is made,
            # so the file does not need to be opened for upload
            ocr_results = kwargs["response_content"]
        else:
            # call the API from FullOCR endpoint; the context manager makes
            # sure the uploaded file handle is closed even if the call fails
            data = {"job_id": uuid4(), "table_only": not self.use_ocr}
            with open(file_path, "rb") as content:
                resp = requests.post(
                    url=self.ocr_endpoint,
                    files={"input": content},
                    data=data,
                )
            ocr_results = resp.json()["result"]

        # read PDF through normal reader (unstructured)
        pdf_page_items = read_pdf_unstructured(file_path)

        # merge PDF text output with OCR output
        tables, texts = parse_ocr_output(
            ocr_results,
            pdf_page_items,
            debug_path=debug_path,
            artifact_path=artifact_path,
        )

        source_name = file_path.name

        # create output Document with metadata from table; the raw table text
        # is kept in metadata while the document text has markdown special
        # characters stripped
        documents = [
            Document(
                text=strip_special_chars_markdown(table_text),
                metadata={
                    "table_origin": table_text,
                    "type": "table",
                    # presumably page_id is zero-based -- confirm against
                    # parse_ocr_output
                    "page_label": page_id + 1,
                    "source": source_name,
                },
                metadata_template="",
                metadata_seperator="",
            )
            for page_id, table_text in tables
        ]

        # create Document from non-table text
        documents.extend(
            Document(
                text=non_table_text,
                metadata={
                    "page_label": page_id + 1,
                    "source": source_name,
                },
            )
            for page_id, non_table_text in texts
        )

        return documents
|