Move Document and other interface into base/schema (#69)
This commit is contained in:
committed by
GitHub
parent
4704e2c11a
commit
8532138842
92
knowledgehub/base/schema.py
Normal file
92
knowledgehub/base/schema.py
Normal file
@@ -0,0 +1,92 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Optional, TypeVar
|
||||
|
||||
from llama_index.bridge.pydantic import Field
|
||||
from llama_index.schema import Document as BaseDocument
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from haystack.schema import Document as HaystackDocument
|
||||
|
||||
# Type variable constrained to exactly two types: `Document` or `str`.
IO_Type = TypeVar("IO_Type", "Document", str)
|
||||
# Placeholder text used by `Document.example()` to build a sample document.
SAMPLE_TEXT = "A sample Document from kotaemon"
|
||||
|
||||
|
||||
class Document(BaseDocument):
    """Base document class, mostly inherited from llama-index's Document.

    The constructor accepts one optional positional argument, `content`, of
    an arbitrary type, which is stored as the raw content of the document.
    When `content` is supplied it is also used to initialise the underlying
    llama-index fields (`text` is set to `str(content)`).
    """

    content: Any

    def __init__(self, content: Optional[Any] = None, *args, **kwargs):
        """Create a document from raw content, another Document, or keywords.

        Args:
            content: Raw payload of the document. May be:
                * ``None`` - `content` is then taken from the `text` keyword
                  if present, otherwise from the `embedding` keyword;
                * another ``Document`` - its fields are copied, and any
                  explicitly passed keyword overrides the copied value;
                * any other object - stored as-is, with `text` set to
                  ``str(content)`` (or ``""`` when `content` is falsy).
            *args: Forwarded unchanged to the base class constructor.
            **kwargs: Field values forwarded to the base class constructor.
        """
        if content is None:
            # No explicit content: fall back to `text`, then to `embedding`.
            if kwargs.get("text") is not None:
                kwargs["content"] = kwargs["text"]
            elif kwargs.get("embedding") is not None:
                kwargs["content"] = kwargs["embedding"]
        elif isinstance(content, Document):
            # Copy-construct from an existing document. Caller-supplied
            # keywords must win over the copied fields; previously they were
            # dropped, so e.g. `RetrievedDocument(doc, score=0.9)` lost `score`.
            kwargs = {**content.dict(), **kwargs}
        else:
            kwargs["content"] = content
            # Falsy content (None is handled above; "", 0, [] ...) yields an
            # empty text rather than its string representation.
            kwargs["text"] = str(content) if content else ""
        super().__init__(*args, **kwargs)

    def __bool__(self):
        """Return the truthiness of the stored `content`."""
        return bool(self.content)

    @classmethod
    def example(cls) -> "Document":
        """Return a sample `Document` built from `SAMPLE_TEXT`."""
        return Document(
            text=SAMPLE_TEXT,
            metadata={"filename": "README.md", "category": "codebase"},
        )

    def to_haystack_format(self) -> "HaystackDocument":
        """Convert this document to a Haystack `Document`.

        The haystack import is done inside the method, so haystack is only
        needed when this conversion is actually called.
        """
        from haystack.schema import Document as HaystackDocument

        return HaystackDocument(content=self.text, meta=self.metadata or {})

    def __str__(self):
        """Return the string form of the stored `content`."""
        return str(self.content)
|
||||
|
||||
|
||||
class RetrievedDocument(Document):
    """Subclass of Document with retrieval-related information.

    Attributes:
        score (float): score of the document (from 0.0 to 1.0)
        retrieval_metadata (dict): metadata from the retrieval process, can be
            used by different components in a retrieval pipeline to
            communicate with each other
    """

    score: float = Field(default=0.0)
    # A factory (rather than a `{}` literal) makes the per-instance default
    # explicit and matches how the other schema classes declare containers.
    retrieval_metadata: dict = Field(default_factory=dict)
|
||||
|
||||
|
||||
class LLMInterface(Document):
    """Document subclass with extra fields for candidates, tokens and logits.

    Judging by the field names this presumably carries the result of an LLM
    call - confirm against the callers.

    Attributes:
        candidates: list of strings; defaults to an empty list.
        completion_tokens: integer count; defaults to -1.
        total_tokens: integer count; defaults to -1.
        prompt_tokens: integer count; defaults to -1.
        logits: list of lists of floats; defaults to an empty list.
    """

    candidates: list[str] = Field(default_factory=list)
    # NOTE(review): -1 looks like a "not reported" sentinel - confirm.
    completion_tokens: int = Field(default=-1)
    total_tokens: int = Field(default=-1)
    prompt_tokens: int = Field(default=-1)
    logits: list[list[float]] = Field(default_factory=list)
|
||||
|
||||
|
||||
class ExtractorOutput(Document):
    """Represents the output of an extractor.

    Attributes:
        matches: list of strings; required, as no default is declared.
    """

    matches: list[str]
|
Reference in New Issue
Block a user