[AUR-391, AUR-393] Add Document and DocumentReader base (#6)

* Declare BaseComponent

* Brainstorming base class for LLM call

* Define base LLM

* Add tests

* Clean telemetry environment for accurate testing

* Fix README

* Fix typing

* Add base document reader

* Update test

* Update requirements

* Cosmetic change

* Update requirements

* Reformat

---------

Co-authored-by: trducng <trungduc1992@gmail.com>
This commit is contained in:
Tuan Anh Nguyen Dang (Tadashi_Cin) 2023-08-31 11:24:12 +07:00 committed by GitHub
parent 4211315a54
commit 21350153d4
6 changed files with 82 additions and 6 deletions

View File

@ -0,0 +1,22 @@
from haystack.schema import Document as HaystackDocument
from llama_index.schema import Document as BaseDocument
SAMPLE_TEXT = "A sample Document from kotaemon"
class Document(BaseDocument):
    """Base document class, mostly inherited from Document class from llama-index"""

    @classmethod
    def example(cls) -> "Document":
        """Return a small sample document for demos and tests."""
        # Use ``cls`` rather than the hard-coded class name so that
        # subclasses calling ``example()`` get an instance of their own type.
        document = cls(
            text=SAMPLE_TEXT,
            metadata={"filename": "README.md", "category": "codebase"},
        )
        return document

    def to_haystack_format(self) -> HaystackDocument:
        """Convert struct to Haystack document format."""
        # ``metadata`` may be falsy/None on a bare document; Haystack
        # expects a dict, so fall back to an empty one.
        metadata = self.metadata or {}
        text = self.text
        return HaystackDocument(content=text, meta=metadata)

View File

@ -0,0 +1,3 @@
from .base import AutoReader
__all__ = ["AutoReader"]

View File

@ -1,10 +1,26 @@
class DocumentLoader: from pathlib import Path
"""Document loader""" from typing import Any, List, Type, Union
from llama_index import download_loader
from llama_index.readers.base import BaseReader
from ..documents.base import Document
class AutoReader(BaseReader):
    """General auto reader for a variety of files. (based on llama-hub)"""

    def __init__(self, reader_type: Union[str, Type[BaseReader]]) -> None:
        """Init reader using string identifier or class name from llama-hub"""
        # A string identifier is resolved through llama-hub's loader registry;
        # a class is used directly. Either way we instantiate it once.
        reader_cls = (
            download_loader(reader_type)
            if isinstance(reader_type, str)
            else reader_type
        )
        self._reader = reader_cls()

    def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:
        """Read *file* with the wrapped reader and return kotaemon Documents."""
        raw_docs = self._reader.load_data(file=file, **kwargs)
        # Re-wrap each llama-index document into the kotaemon Document base class
        # via its dict representation.
        return [Document.from_dict(raw.to_dict()) for raw in raw_docs]

View File

@ -31,6 +31,9 @@ setuptools.setup(
"farm-haystack==1.19.0", "farm-haystack==1.19.0",
"langchain", "langchain",
"theflow", "theflow",
"llama-index",
"llama-hub",
"nltk",
], ],
extras_require={ extras_require={
"dev": [ "dev": [

BIN
tests/resources/dummy.pdf Normal file

Binary file not shown.

32
tests/test_reader.py Normal file
View File

@ -0,0 +1,32 @@
from pathlib import Path
from langchain.schema import Document as LangchainDocument
from llama_index.node_parser import SimpleNodeParser
from kotaemon.documents.base import Document, HaystackDocument
from kotaemon.loaders import AutoReader
def test_pdf_reader():
    """End-to-end check of AutoReader on the bundled dummy PDF fixture."""
    pdf_path = Path(__file__).parent / "resources/dummy.pdf"
    docs = AutoReader("PDFReader").load_data(pdf_path)

    # The reader must yield exactly one kotaemon Document holding the known text.
    assert len(docs) == 1
    doc = docs[0]
    assert isinstance(doc, Document)
    assert doc.text.lower().replace(" ", "") == "dummypdffile"

    # Conversions to other frameworks' document types must round out correctly.
    assert isinstance(doc.to_haystack_format(), HaystackDocument)
    assert isinstance(doc.to_langchain_format(), LangchainDocument)

    # Chunking via llama-index's node parser should produce at least one node.
    parser = SimpleNodeParser.from_defaults(chunk_size=100, chunk_overlap=20)
    assert len(parser.get_nodes_from_documents(docs)) > 0