import json from pathlib import Path import pytest from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader input_file = Path(__file__).parent / "resources" / "dummy.pdf" input_file_excel = Path(__file__).parent / "resources" / "dummy.xlsx" @pytest.fixture def fullocr_output(): with open( Path(__file__).parent / "resources" / "fullocr_sample_output.json", encoding="utf-8", ) as f: fullocr = json.load(f) return fullocr @pytest.fixture def mathpix_output(): with open(Path(__file__).parent / "resources" / "policy.md", encoding="utf-8") as f: content = f.read() return content def test_ocr_reader(fullocr_output): reader = OCRReader() documents = reader.load_data(input_file, response_content=fullocr_output) table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"] assert len(table_docs) == 4 def test_mathpix_reader(mathpix_output): reader = MathpixPDFReader() documents = reader.load_data(input_file, response_content=mathpix_output) table_docs = [doc for doc in documents if doc.metadata.get("type", "") == "table"] assert len(table_docs) == 4 def test_excel_reader(): reader = PandasExcelReader() documents = reader.load_data( input_file_excel, ) assert len(documents) == 1