from typing import TYPE_CHECKING, Any, Optional, TypeVar from llama_index.bridge.pydantic import Field from llama_index.schema import Document as BaseDocument if TYPE_CHECKING: from haystack.schema import Document as HaystackDocument IO_Type = TypeVar("IO_Type", "Document", str) SAMPLE_TEXT = "A sample Document from kotaemon" class Document(BaseDocument): """ Base document class, mostly inherited from Document class from llama-index. This class accept one positional argument `content` of an arbitrary type, which will store the raw content of the document. If specified, the class will use `content` to initialize the base llama_index class. """ content: Any def __init__(self, content: Optional[Any] = None, *args, **kwargs): if content is None: if kwargs.get("text", None) is not None: kwargs["content"] = kwargs["text"] elif kwargs.get("embedding", None) is not None: kwargs["content"] = kwargs["embedding"] elif isinstance(content, Document): kwargs = content.dict() else: kwargs["content"] = content if content: kwargs["text"] = str(content) else: kwargs["text"] = "" super().__init__(*args, **kwargs) def __bool__(self): return bool(self.content) @classmethod def example(cls) -> "Document": document = Document( text=SAMPLE_TEXT, metadata={"filename": "README.md", "category": "codebase"}, ) return document def to_haystack_format(self) -> "HaystackDocument": """Convert struct to Haystack document format.""" from haystack.schema import Document as HaystackDocument metadata = self.metadata or {} text = self.text return HaystackDocument(content=text, meta=metadata) def __str__(self): return str(self.content) class RetrievedDocument(Document): """Subclass of Document with retrieval-related information Attributes: score (float): score of the document (from 0.0 to 1.0) retrieval_metadata (dict): metadata from the retrieval process, can be used by different components in a retrieved pipeline to communicate with each other """ score: float = Field(default=0.0) retrieval_metadata: dict = Field(default={})