[AUR-432] Add layout-aware table parsing PDF reader (#27)
* add OCRReader, MathPixReader and ExcelReader * update test case for ocr reader * reformat * minor fix
This commit is contained in:
committed by
GitHub
parent
6207f4332a
commit
6c3d614973
45
tests/test_table_reader.py
Normal file
45
tests/test_table_reader.py
Normal file
@@ -0,0 +1,45 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader
|
||||
|
||||
# Sample inputs live in the "resources" directory that sits next to this test module.
input_file = Path(__file__).parent.joinpath("resources", "dummy.pdf")
input_file_excel = Path(__file__).parent.joinpath("resources", "dummy.xlsx")
|
||||
|
||||
|
||||
@pytest.fixture
def fullocr_output():
    """Provide the decoded contents of the stored full-OCR sample response.

    Returns:
        The Python object produced by parsing
        ``resources/fullocr_sample_output.json`` (located next to this module).
    """
    sample_path = Path(__file__).parent / "resources" / "fullocr_sample_output.json"
    # Pin the encoding: without it open() falls back to the locale's preferred
    # encoding, which makes the fixture platform-dependent for non-ASCII text.
    with open(sample_path, encoding="utf-8") as f:
        return json.load(f)
|
||||
|
||||
|
||||
@pytest.fixture
def mathpix_output():
    """Provide the raw text of the stored Markdown sample.

    Returns:
        The full contents of ``resources/policy.md`` (located next to this
        module) as a single string.
    """
    sample_path = Path(__file__).parent / "resources" / "policy.md"
    # Pin the encoding: without it open() falls back to the locale's preferred
    # encoding, which makes the fixture platform-dependent for non-ASCII text.
    # NOTE(review): assumes the sample file is stored as UTF-8 -- confirm.
    with open(sample_path, encoding="utf-8") as f:
        return f.read()
|
||||
|
||||
|
||||
def test_ocr_reader(fullocr_output):
    """Check that OCRReader yields four table documents for the sample PDF.

    The stored OCR response is handed to ``load_data`` via ``response_content``.
    """
    loaded = OCRReader().load_data(input_file, response_content=fullocr_output)

    # Keep only the documents whose metadata marks them as tables.
    tables = []
    for candidate in loaded:
        if candidate.metadata.get("type", "") == "table":
            tables.append(candidate)

    assert len(tables) == 4
|
||||
|
||||
|
||||
def test_mathpix_reader(mathpix_output):
    """Check that MathpixPDFReader yields four table documents for the sample PDF.

    The stored Markdown text is handed to ``load_data`` via ``response_content``.
    """

    def is_table(document):
        # A document counts as a table when its metadata "type" says so.
        return document.metadata.get("type", "") == "table"

    loaded = MathpixPDFReader().load_data(input_file, response_content=mathpix_output)
    table_total = sum(1 for document in loaded if is_table(document))

    assert table_total == 4
|
||||
|
||||
|
||||
def test_excel_reader():
    """Check that PandasExcelReader yields exactly one document for the sample workbook."""
    excel_docs = PandasExcelReader().load_data(input_file_excel)
    assert len(excel_docs) == 1
|
Reference in New Issue
Block a user