Add UnstructuredReader with support for various legacy files (.doc, .xls) (#99)
This commit is contained in:
committed by
GitHub
parent
37c744b616
commit
d9e925eb75
@@ -4,7 +4,7 @@ from langchain.schema import Document as LangchainDocument
|
||||
from llama_index.node_parser import SimpleNodeParser
|
||||
|
||||
from kotaemon.base import Document
|
||||
from kotaemon.loaders import AutoReader
|
||||
from kotaemon.loaders import AutoReader, UnstructuredReader
|
||||
|
||||
|
||||
def test_pdf_reader():
|
||||
@@ -26,3 +26,22 @@ def test_pdf_reader():
|
||||
node_parser = SimpleNodeParser.from_defaults(chunk_size=100, chunk_overlap=20)
|
||||
nodes = node_parser.get_nodes_from_documents(documents)
|
||||
assert len(nodes) > 0
|
||||
|
||||
|
||||
def test_unstructured_pdf_reader():
    """Exercise UnstructuredReader on the bundled dummy PDF.

    The file is loaded twice: once with default arguments and once with
    ``split_documents=True``. Both calls are expected to return exactly
    one document.
    """
    unstructured_reader = UnstructuredReader()
    pdf_path = Path(__file__).parent / "resources/dummy.pdf"

    # Default mode: a single Document carrying the PDF text.
    loaded = unstructured_reader.load_data(pdf_path)
    assert len(loaded) == 1

    head = loaded[0]
    assert isinstance(head, Document)
    # Compare case-insensitively and ignoring spaces.
    normalized_text = head.text.lower().replace(" ", "")
    assert normalized_text == "dummypdffile"

    # Split mode: the same file is still expected to give one document.
    loaded = unstructured_reader.load_data(pdf_path, split_documents=True)
    assert len(loaded) == 1
|
||||
|
Reference in New Issue
Block a user