kotaemon/tests/test_reader.py
Tuan Anh Nguyen Dang (Tadashi_Cin) 21350153d4 [AUR-391, AUR-393] Add Document and DocumentReader base (#6)
* Declare BaseComponent

* Brainstorming base class for LLM call

* Define base LLM

* Add tests

* Clean telemetry environment for accurate testing

* Fix README

* Fix typing

* add base document reader

* update test

* update requirements

* Cosmetic change

* update requirements

* reformat

---------

Co-authored-by: trducng <trungduc1992@gmail.com>
2023-08-31 11:24:12 +07:00

33 lines
1.1 KiB
Python

from pathlib import Path
from langchain.schema import Document as LangchainDocument
from llama_index.node_parser import SimpleNodeParser
from kotaemon.documents.base import Document, HaystackDocument
from kotaemon.loaders import AutoReader
def test_pdf_reader():
    """Exercise the PDF reader end to end on the bundled dummy PDF.

    Covers three things in order: the reader's raw output, conversion of the
    resulting document to the Haystack and LangChain document types, and
    chunking of the loaded documents with llama-index's ``SimpleNodeParser``.
    """
    # The fixture is resolved relative to this test file.
    pdf_path = Path(__file__).parent / "resources/dummy.pdf"
    loaded = AutoReader("PDFReader").load_data(pdf_path)

    # Reader output: exactly one Document carrying the expected text.
    assert len(loaded) == 1
    doc = loaded[0]
    assert isinstance(doc, Document)
    # Compare case-insensitively and with spaces removed.
    normalized_text = doc.text.lower().replace(" ", "")
    assert normalized_text == "dummypdffile"

    # Conversion output: each converter must return its framework's type.
    assert isinstance(doc.to_haystack_format(), HaystackDocument)
    assert isinstance(doc.to_langchain_format(), LangchainDocument)

    # Chunking with the llama-index node parser must yield at least one node.
    splitter = SimpleNodeParser.from_defaults(chunk_size=100, chunk_overlap=20)
    chunks = splitter.get_nodes_from_documents(loaded)
    assert len(chunks) > 0