[AUR-391, AUR-393] Add Document and DocumentReader base (#6)
* Declare BaseComponent
* Brainstorming base class for LLM call
* Define base LLM
* Add tests
* Clean telemetry environment for accurate testing
* Fix README
* Fix typing
* Add base document reader
* Update test
* Update requirements
* Cosmetic change
* Update requirements
* Reformat

---------

Co-authored-by: trducng <trungduc1992@gmail.com>
This commit is contained in:
parent
4211315a54
commit
21350153d4
22
knowledgehub/documents/base.py
Normal file
22
knowledgehub/documents/base.py
Normal file
|
@ -0,0 +1,22 @@
|
||||||
|
from haystack.schema import Document as HaystackDocument
|
||||||
|
from llama_index.schema import Document as BaseDocument
|
||||||
|
|
||||||
|
# Text of the sample document built by Document.example().
SAMPLE_TEXT = "A sample Document from kotaemon"
|
||||||
|
|
||||||
|
|
||||||
|
class Document(BaseDocument):
    """Base document class, mostly inherited from Document class from llama-index.

    On top of the inherited behaviour it provides a sample-document
    constructor and a converter to the Haystack document format.
    """

    @classmethod
    def example(cls) -> "Document":
        """Build a small sample document.

        Returns:
            An instance of ``cls`` whose text is ``SAMPLE_TEXT`` and whose
            metadata holds a sample filename and category.
        """
        # Instantiate through ``cls`` (not the hard-coded base class) so that
        # calling ``example()`` on a subclass yields an instance of that
        # subclass.
        return cls(
            text=SAMPLE_TEXT,
            metadata={"filename": "README.md", "category": "codebase"},
        )

    def to_haystack_format(self) -> HaystackDocument:
        """Convert this document to the Haystack document format.

        Returns:
            A Haystack document whose ``content`` is this document's text and
            whose ``meta`` is this document's metadata (an empty dict when the
            metadata is falsy).
        """
        return HaystackDocument(content=self.text, meta=self.metadata or {})
|
|
@ -0,0 +1,3 @@
|
||||||
|
from .base import AutoReader
|
||||||
|
|
||||||
|
__all__ = ["AutoReader"]
|
|
@ -1,10 +1,26 @@
|
||||||
class DocumentLoader:
|
from pathlib import Path
|
||||||
"""Document loader"""
|
from typing import Any, List, Type, Union
|
||||||
|
|
||||||
|
from llama_index import download_loader
|
||||||
|
from llama_index.readers.base import BaseReader
|
||||||
|
|
||||||
|
from ..documents.base import Document
|
||||||
|
|
||||||
|
|
||||||
class TextManipulator:
|
class AutoReader(BaseReader):
    """General auto reader for a variety of files. (based on llama-hub)"""

    def __init__(self, reader_type: Union[str, Type[BaseReader]]) -> None:
        """Set up the wrapped reader.

        Args:
            reader_type: either a string identifier, which is resolved to a
                reader class through ``download_loader``, or a reader class
                that is instantiated directly.
        """
        reader_cls = (
            download_loader(reader_type)
            if isinstance(reader_type, str)
            else reader_type
        )
        self._reader = reader_cls()

    def load_data(self, file: Union[Path, str], **kwargs: Any) -> List[Document]:
        """Read ``file`` with the wrapped reader.

        Args:
            file: path of the file to read.
            **kwargs: forwarded unchanged to the wrapped reader's ``load_data``.

        Returns:
            The loaded documents, each rebuilt as a kotaemon ``Document``.
        """
        loaded = self._reader.load_data(file=file, **kwargs)

        # Round-trip through the dict form to re-create every document as the
        # new base class from kotaemon.
        return [Document.from_dict(item.to_dict()) for item in loaded]
|
||||||
|
|
3
setup.py
3
setup.py
|
@ -31,6 +31,9 @@ setuptools.setup(
|
||||||
"farm-haystack==1.19.0",
|
"farm-haystack==1.19.0",
|
||||||
"langchain",
|
"langchain",
|
||||||
"theflow",
|
"theflow",
|
||||||
|
"llama-index",
|
||||||
|
"llama-hub",
|
||||||
|
"nltk",
|
||||||
],
|
],
|
||||||
extras_require={
|
extras_require={
|
||||||
"dev": [
|
"dev": [
|
||||||
|
|
BIN
tests/resources/dummy.pdf
Normal file
BIN
tests/resources/dummy.pdf
Normal file
Binary file not shown.
32
tests/test_reader.py
Normal file
32
tests/test_reader.py
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from langchain.schema import Document as LangchainDocument
|
||||||
|
from llama_index.node_parser import SimpleNodeParser
|
||||||
|
|
||||||
|
from kotaemon.documents.base import Document, HaystackDocument
|
||||||
|
from kotaemon.loaders import AutoReader
|
||||||
|
|
||||||
|
|
||||||
|
def test_pdf_reader():
    """Read the dummy PDF and check reader output, conversions and chunking."""
    pdf_path = Path(__file__).parent / "resources/dummy.pdf"
    documents = AutoReader("PDFReader").load_data(pdf_path)

    # the reader is expected to produce exactly one document
    assert len(documents) == 1

    doc = documents[0]
    assert isinstance(doc, Document)
    normalized_text = doc.text.lower().replace(" ", "")
    assert normalized_text == "dummypdffile"

    # conversions to the Haystack and Langchain document types
    assert isinstance(doc.to_haystack_format(), HaystackDocument)
    assert isinstance(doc.to_langchain_format(), LangchainDocument)

    # chunking with the NodeParser from llama-index must yield some nodes
    splitter = SimpleNodeParser.from_defaults(chunk_size=100, chunk_overlap=20)
    chunks = splitter.get_nodes_from_documents(documents)
    assert len(chunks) > 0
|
Loading…
Reference in New Issue
Block a user