[AUR-391, AUR-393] Add Document and DocumentReader base (#6)
* Declare BaseComponent
* Brainstorming base class for LLM call
* Define base LLM
* Add tests
* Clean telemetry environment for accurate testing
* Fix README
* Fix typing
* add base document reader
* update test
* update requirements
* Cosmetic change
* update requirements
* reformat

---------

Co-authored-by: trducng <trungduc1992@gmail.com>
This commit is contained in:
committed by
GitHub
parent
4211315a54
commit
21350153d4
32
tests/test_reader.py
Normal file
32
tests/test_reader.py
Normal file
@@ -0,0 +1,32 @@
|
||||
from pathlib import Path
|
||||
|
||||
from langchain.schema import Document as LangchainDocument
|
||||
from llama_index.node_parser import SimpleNodeParser
|
||||
|
||||
from kotaemon.documents.base import Document, HaystackDocument
|
||||
from kotaemon.loaders import AutoReader
|
||||
|
||||
|
||||
def test_pdf_reader():
    """Read ``resources/dummy.pdf`` with the "PDFReader" AutoReader and check it.

    The test asserts that exactly one ``Document`` is returned with the
    expected text, that it converts to the Haystack and LangChain document
    types, and that llama-index's ``SimpleNodeParser`` yields at least one
    node from the loaded documents.
    """
    reader = AutoReader("PDFReader")
    pdf_path = Path(__file__).parent / "resources/dummy.pdf"
    documents = reader.load_data(pdf_path)

    # check document reader output
    assert len(documents) == 1

    first_doc = documents[0]
    assert isinstance(first_doc, Document)
    # Compare case-insensitively and without spaces, as the original did.
    normalized_text = first_doc.text.lower().replace(" ", "")
    assert normalized_text == "dummypdffile"

    # check conversion output
    assert isinstance(first_doc.to_haystack_format(), HaystackDocument)
    assert isinstance(first_doc.to_langchain_format(), LangchainDocument)

    # test chunking using NodeParser from llama-index
    splitter = SimpleNodeParser.from_defaults(chunk_size=100, chunk_overlap=20)
    chunks = splitter.get_nodes_from_documents(documents)
    assert len(chunks) > 0
|
Reference in New Issue
Block a user