[AUR-391, AUR-393] Add Document and DocumentReader base (#6)

* Declare BaseComponent * Brainstorming base class for LLM call * Define base LLM * Add tests * Clean telemetry environment for accurate testing * Fix README * Fix typing * add base document reader * update test * update requirements * Cosmetic change * update requirements * reformat --------- Co-authored-by: trducng <trungduc1992@gmail.com>
2023-08-31 11:24:12 +07:00
parent 4211315a54
commit 21350153d4
6 changed files with 82 additions and 6 deletions
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -0,0 +1,32 @@
+from pathlib import Path
+
+from langchain.schema import Document as LangchainDocument
+from llama_index.node_parser import SimpleNodeParser
+
+from kotaemon.documents.base import Document, HaystackDocument
+from kotaemon.loaders import AutoReader
+
+
+def test_pdf_reader():
+    reader = AutoReader("PDFReader")
+    dirpath = Path(__file__).parent
+    documents = reader.load_data(dirpath / "resources/dummy.pdf")
+
+    # reader output: one Document; text is compared ignoring case and spaces
+    assert len(documents) == 1
+
+    first_doc = documents[0]
+    assert isinstance(first_doc, Document)
+    assert first_doc.text.lower().replace(" ", "") == "dummypdffile"
+
+    # conversion output: Haystack and LangChain document types
+    haystack_doc = first_doc.to_haystack_format()
+    assert isinstance(haystack_doc, HaystackDocument)
+
+    langchain_doc = first_doc.to_langchain_format()
+    assert isinstance(langchain_doc, LangchainDocument)
+
+    # chunking: llama-index's SimpleNodeParser must yield at least one node
+    node_parser = SimpleNodeParser.from_defaults(chunk_size=100, chunk_overlap=20)
+    nodes = node_parser.get_nodes_from_documents(documents)
+    assert len(nodes) > 0