[AUR-432] Add layout-aware table parsing PDF reader (#27)

* add OCRReader, MathPixReader and ExcelReader * update test case for ocr reader * reformat * minor fix
2023-09-26 15:52:44 +07:00
parent 6207f4332a
commit 6c3d614973
12 changed files with 888 additions and 2 deletions
--- a/tests/test_table_reader.py
+++ b/tests/test_table_reader.py
@@ -0,0 +1,45 @@
+import json
+from pathlib import Path
+
+import pytest
+
+from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader
+
+input_file = Path(__file__).parent.joinpath("resources", "dummy.pdf")
+input_file_excel = Path(__file__).parent.joinpath("resources", "dummy.xlsx")
+
+
+@pytest.fixture
+def fullocr_output():
+    """Return the full-OCR sample output parsed from its JSON file (UTF-8)."""
+    sample = Path(__file__).parent / "resources" / "fullocr_sample_output.json"
+    return json.loads(sample.read_text(encoding="utf-8"))
+
+
+@pytest.fixture
+def mathpix_output():
+    """Return the text of the policy.md sample, always decoded as UTF-8."""
+    sample = Path(__file__).parent / "resources" / "policy.md"
+    return sample.read_text(encoding="utf-8")
+
+
+def test_ocr_reader(fullocr_output):
+    """OCRReader should yield four table documents for the sample response."""
+    docs = OCRReader().load_data(input_file, response_content=fullocr_output)
+    tables = [d for d in docs if d.metadata.get("type", "") == "table"]
+    assert len(tables) == 4
+
+
+def test_mathpix_reader(mathpix_output):
+    """MathpixPDFReader should yield four table documents for the sample text."""
+    docs = MathpixPDFReader().load_data(input_file, response_content=mathpix_output)
+    tables = [d for d in docs if d.metadata.get("type", "") == "table"]
+    assert len(tables) == 4
+
+
+def test_excel_reader():
+    """PandasExcelReader should load the sample workbook as a single document."""
+    docs = PandasExcelReader().load_data(input_file_excel)
+    # Only the number of returned documents is checked, not their content.
+    doc_count = len(docs)
+    assert doc_count == 1