[AUR-391, AUR-393] Add Document and DocumentReader base (#6)

* Declare BaseComponent

* Brainstorming base class for LLM call

* Define base LLM

* Add tests

* Clean telemetry environment for accurate testing

* Fix README

* Fix typing

* Add base document reader

* Update test

* Update requirements

* Cosmetic change

* Update requirements

* Reformat

---------

Co-authored-by: trducng <trungduc1992@gmail.com>
This commit is contained in:
Tuan Anh Nguyen Dang (Tadashi_Cin) 2023-08-31 11:24:12 +07:00 committed by GitHub
parent 4211315a54
commit 21350153d4
6 changed files with 82 additions and 6 deletions

View File

@ -0,0 +1,22 @@
from haystack.schema import Document as HaystackDocument
from llama_index.schema import Document as BaseDocument
SAMPLE_TEXT = "A sample Document from kotaemon"
class Document(BaseDocument):
    """Base document class, mostly inherited from Document class from llama-index"""

    @classmethod
    def example(cls) -> "Document":
        """Return a small sample document for demos and tests."""
        # Use ``cls`` rather than the hard-coded class name so that
        # subclasses calling ``example()`` get an instance of their own type.
        document = cls(
            text=SAMPLE_TEXT,
            metadata={"filename": "README.md", "category": "codebase"},
        )
        return document

    def to_haystack_format(self) -> HaystackDocument:
        """Convert struct to Haystack document format."""
        # ``metadata`` may be falsy/None on a bare document; Haystack
        # expects a dict, so fall back to an empty one.
        metadata = self.metadata or {}
        text = self.text
        return HaystackDocument(content=text, meta=metadata)

View File

@ -0,0 +1,3 @@
from .base import AutoReader
__all__ = ["AutoReader"]

View File

@ -1,10 +1,26 @@
class DocumentLoader: from pathlib import Path
"""Document loader""" from typing import Any, List, Type, Union
from llama_index import download_loader
from llama_index.readers.base import BaseReader
from ..documents.base import Document
class AutoReader(BaseReader):
    """General auto reader for a variety of files. (based on llama-hub)"""

    def __init__(self, reader_type: Union[str, Type[BaseReader]]) -> None:
        """Init reader using string identifier or class name from llama-hub"""
        # A string identifier is resolved through llama-hub's loader registry;
        # a class is used directly. Either way we instantiate it once.
        reader_cls = (
            download_loader(reader_type)
            if isinstance(reader_type, str)
            else reader_type
        )
        self._reader = reader_cls()

    def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:
        """Read *file* with the wrapped reader and return kotaemon Documents."""
        raw_docs = self._reader.load_data(file=file, **kwargs)
        # Re-wrap each llama-index document into the kotaemon Document base class
        # via its dict representation.
        return [Document.from_dict(raw.to_dict()) for raw in raw_docs]

View File

@ -31,6 +31,9 @@ setuptools.setup(
"farm-haystack==1.19.0", "farm-haystack==1.19.0",
"langchain", "langchain",
"theflow", "theflow",
"llama-index",
"llama-hub",
"nltk",
], ],
extras_require={ extras_require={
"dev": [ "dev": [

BIN
tests/resources/dummy.pdf Normal file

Binary file not shown.

32
tests/test_reader.py Normal file
View File

@ -0,0 +1,32 @@
from pathlib import Path
from langchain.schema import Document as LangchainDocument
from llama_index.node_parser import SimpleNodeParser
from kotaemon.documents.base import Document, HaystackDocument
from kotaemon.loaders import AutoReader
def test_pdf_reader():
    """End-to-end check of AutoReader on the bundled dummy PDF fixture."""
    pdf_path = Path(__file__).parent / "resources/dummy.pdf"
    docs = AutoReader("PDFReader").load_data(pdf_path)

    # The reader must yield exactly one kotaemon Document holding the known text.
    assert len(docs) == 1
    doc = docs[0]
    assert isinstance(doc, Document)
    assert doc.text.lower().replace(" ", "") == "dummypdffile"

    # Conversions to other frameworks' document types must round out correctly.
    assert isinstance(doc.to_haystack_format(), HaystackDocument)
    assert isinstance(doc.to_langchain_format(), LangchainDocument)

    # Chunking via llama-index's node parser should produce at least one node.
    parser = SimpleNodeParser.from_defaults(chunk_size=100, chunk_overlap=20)
    assert len(parser.get_nodes_from_documents(docs)) > 0