feat: integrate with docling (#471) bump:patch
* feat: add docling reader implementation
* feat: expose docling to UI
* fix: improve docling output parsing
* docs: update README

---------

Co-authored-by: Tadashi <tadashi@cinnamon.is>
parent 5b828c213c
commit 56c40f1c05

README.md (+11 lines)
@@ -216,6 +216,17 @@ documents and developers who want to build their own RAG pipeline.
 
 See [Local model setup](docs/local_model.md).
 
+### Setup multimodal document parsing (OCR, table parsing, figure extraction)
+
+These options are available:
+
+- [Azure Document Intelligence (API)](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence)
+- [Adobe PDF Extract (API)](https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/)
+- [Docling (local, open-source)](https://github.com/DS4SD/docling)
+  - To use Docling, first install the required dependencies: `pip install docling`
+
+Select the corresponding loaders in `Settings -> Retrieval Settings -> File loader`
+
 ### Customize your application
 
 - By default, all application data is stored in the `./ktem_app_data` folder. You can back up or copy this folder to transfer your installation to a new machine.
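As a quick smoke test outside the UI, the new reader can also be driven directly from Python. A minimal sketch, assuming `docling` is installed; `sample.pdf` is a hypothetical input file:

```python
# Minimal sketch: exercise DoclingReader directly ("sample.pdf" is hypothetical).
from kotaemon.loaders import DoclingReader

reader = DoclingReader()  # optionally set reader.vlm_endpoint for figure captions
docs = reader.load_data("sample.pdf")
for doc in docs:
    # plain-text chunks carry no "type" key; tables and figures set it explicitly
    print(doc.metadata.get("type", "text"), "page", doc.metadata.get("page_label"))
```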
@@ -13,6 +13,7 @@ from kotaemon.loaders import (
     AdobeReader,
     AzureAIDocumentIntelligenceLoader,
     DirectoryReader,
+    DoclingReader,
     HtmlReader,
     MathpixPDFReader,
     MhtmlReader,
@@ -32,9 +33,10 @@ azure_reader = AzureAIDocumentIntelligenceLoader(
     credential=str(config("AZURE_DI_CREDENTIAL", default="")),
     cache_dir=getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None),
 )
-adobe_reader.vlm_endpoint = azure_reader.vlm_endpoint = getattr(
-    flowsettings, "KH_VLM_ENDPOINT", ""
-)
+docling_reader = DoclingReader()
+adobe_reader.vlm_endpoint = (
+    azure_reader.vlm_endpoint
+) = docling_reader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "")
 
 
 KH_DEFAULT_FILE_EXTRACTORS: dict[str, BaseReader] = {
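A note on the chained assignment above: Python binds every target of a multiple-target assignment to the same object, left to right, so all three readers end up sharing one endpoint string. A tiny illustrative sketch (names and URL hypothetical):

```python
# Each target receives the same object, assigned left to right.
endpoint = "http://localhost:8000"  # stand-in for the KH_VLM_ENDPOINT value
a = b = c = endpoint
assert a is b is c  # the three readers share one endpoint string
```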
@@ -2,6 +2,7 @@ from .adobe_loader import AdobeReader
 from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader
 from .base import AutoReader, BaseReader
 from .composite_loader import DirectoryReader
+from .docling_loader import DoclingReader
 from .docx_loader import DocxReader
 from .excel_loader import ExcelReader, PandasExcelReader
 from .html_loader import HtmlReader, MhtmlReader
@@ -30,4 +31,5 @@ __all__ = [
     "TxtReader",
     "PDFThumbnailReader",
     "WebReader",
+    "DoclingReader",
 ]
@@ -25,6 +25,9 @@ def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Image.Image:
     """
     left, upper, right, lower = bbox
 
+    left, right = min(left, right), max(left, right)
+    upper, lower = min(upper, lower), max(upper, lower)
+
     img: Image.Image
     suffix = file_path.suffix.lower()
     if suffix == ".pdf":
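The two added lines make `crop_image` tolerant of boxes whose coordinates arrive out of order. A worked sketch with illustrative relative coordinates:

```python
# A bbox with left/right swapped would otherwise yield a negative-width crop;
# the min/max normalization repairs it before cropping.
bbox = [0.8, 0.1, 0.2, 0.5]  # left > right here
left, upper, right, lower = bbox
left, right = min(left, right), max(left, right)     # -> 0.2, 0.8
upper, lower = min(upper, lower), max(upper, lower)  # -> 0.1, 0.5
```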
libs/kotaemon/kotaemon/loaders/docling_loader.py (new file, 232 lines)
@@ -0,0 +1,232 @@
import base64
from collections import defaultdict
from io import BytesIO
from pathlib import Path
from typing import List, Optional

from kotaemon.base import Document, Param

from .azureai_document_intelligence_loader import crop_image
from .base import BaseReader
from .utils.adobe import generate_single_figure_caption, make_markdown_table


class DoclingReader(BaseReader):
    """Using Docling to extract document structure and content"""

    _dependencies = ["docling"]

    vlm_endpoint: str = Param(
        help=(
            "Default VLM endpoint for figure captioning. "
            "If not provided, will not caption the figures"
        )
    )

    max_figure_to_caption: int = Param(
        100,
        help=(
            "The maximum number of figures to caption. "
            "The rest will be indexed without captions."
        ),
    )

    figure_friendly_filetypes: list[str] = Param(
        [".pdf", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif", ".tif"],
        help=(
            "File types that we can reliably open and extract figures. "
            "For files like .docx or .html, the visual layout may be different "
            "when viewed from different tools, hence we cannot use the bbox "
            "location to extract figures."
        ),
    )

    @Param.auto(cache=True)
    def converter_(self):
        try:
            from docling.document_converter import DocumentConverter
        except ImportError:
            raise ImportError("Please install docling: 'pip install docling'")

        return DocumentConverter()

    def run(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        return self.load_data(file_path, extra_info, **kwargs)

    def load_data(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Extract the input file, allowing multi-modal extraction"""
        metadata = extra_info or {}

        result = self.converter_.convert(file_path)
        result_dict = result.document.export_to_dict()

        file_path = Path(file_path)
        file_name = file_path.name

        # extract the figures
        figures = []
        gen_caption_count = 0
        for figure_obj in result_dict.get("pictures", []):
            if not self.vlm_endpoint:
                continue
            if file_path.suffix.lower() not in self.figure_friendly_filetypes:
                continue

            # retrieve extractive captions provided by docling
            caption_refs = [caption["$ref"] for caption in figure_obj["captions"]]
            extractive_captions = []
            for caption_ref in caption_refs:
                text_id = caption_ref.split("/")[-1]
                try:
                    caption_text = result_dict["texts"][int(text_id)]["text"]
                    extractive_captions.append(caption_text)
                except (ValueError, TypeError, IndexError) as e:
                    print(e)
                    continue

            # read & crop image
            page_number = figure_obj["prov"][0]["page_no"]

            try:
                page_number_text = str(page_number)
                page_width = result_dict["pages"][page_number_text]["size"]["width"]
                page_height = result_dict["pages"][page_number_text]["size"]["height"]

                bbox_obj = figure_obj["prov"][0]["bbox"]
                bbox: list[float] = [
                    bbox_obj["l"],
                    bbox_obj["t"],
                    bbox_obj["r"],
                    bbox_obj["b"],
                ]
                if bbox_obj["coord_origin"] == "BOTTOMLEFT":
                    bbox = self._convert_bbox_bl_tl(bbox, page_width, page_height)

                img = crop_image(file_path, bbox, page_number - 1)
            except KeyError as e:
                print(e, list(result_dict["pages"].keys()))
                continue

            # convert img to base64
            img_bytes = BytesIO()
            img.save(img_bytes, format="PNG")
            img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
            img_base64 = f"data:image/png;base64,{img_base64}"

            # generate the generative caption
            if gen_caption_count >= self.max_figure_to_caption:
                gen_caption = ""
            else:
                gen_caption_count += 1
                # pass by keyword: the helper's signature is (vlm_endpoint, figure)
                gen_caption = generate_single_figure_caption(
                    vlm_endpoint=self.vlm_endpoint, figure=img_base64
                )

            # join the extractive and generative captions
            caption = "\n".join(extractive_captions + [gen_caption])

            # store the image into document
            figure_metadata = {
                "image_origin": img_base64,
                "type": "image",
                "page_label": page_number,
                "file_name": file_name,
                "file_path": file_path,
            }
            figure_metadata.update(metadata)

            figures.append(
                Document(
                    text=caption,
                    metadata=figure_metadata,
                )
            )

        # extract the tables
        tables = []
        for table_obj in result_dict.get("tables", []):
            # convert the tables into markdown format
            markdown_table = self._parse_table(table_obj)
            caption_refs = [caption["$ref"] for caption in table_obj["captions"]]

            extractive_captions = []
            for caption_ref in caption_refs:
                text_id = caption_ref.split("/")[-1]
                try:
                    caption_text = result_dict["texts"][int(text_id)]["text"]
                    extractive_captions.append(caption_text)
                except (ValueError, TypeError, IndexError) as e:
                    print(e)
                    continue
            # prepend the extractive captions to the table
            caption = "\n".join(extractive_captions)
            markdown_table = f"{caption}\n{markdown_table}"

            page_number = table_obj["prov"][0].get("page_no", 1)

            table_metadata = {
                "type": "table",
                "page_label": page_number,
                "table_origin": markdown_table,
                "file_name": file_name,
                "file_path": file_path,
            }
            table_metadata.update(metadata)

            tables.append(
                Document(
                    text=markdown_table,
                    metadata=table_metadata,
                )
            )

        # join plain text elements per page
        texts = []
        page_number_to_text = defaultdict(list)

        for text_obj in result_dict["texts"]:
            page_number = text_obj["prov"][0].get("page_no", 1)
            page_number_to_text[page_number].append(text_obj["text"])

        for page_number, txts in page_number_to_text.items():
            texts.append(
                Document(
                    text="\n".join(txts),
                    metadata={
                        "page_label": page_number,
                        "file_name": file_name,
                        "file_path": file_path,
                        **metadata,
                    },
                )
            )

        return texts + tables + figures

    def _convert_bbox_bl_tl(
        self, bbox: list[float], page_width: int, page_height: int
    ) -> list[float]:
        """Convert bbox from bottom-left to top-left origin, normalized to [0, 1]"""
        x0, y0, x1, y1 = bbox
        return [
            x0 / page_width,
            (page_height - y1) / page_height,
            x1 / page_width,
            (page_height - y0) / page_height,
        ]

    def _parse_table(self, table_obj: dict) -> str:
        """Convert docling table object to markdown table"""
        table_as_list: List[List[str]] = []
        grid = table_obj["data"]["grid"]
        for row in grid:
            table_as_list.append([])
            for cell in row:
                table_as_list[-1].append(cell["text"])

        return make_markdown_table(table_as_list)
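Docling reports figure boxes in absolute page units with a bottom-left origin, while `crop_image` works with top-left, page-normalized coordinates; `_convert_bbox_bl_tl` bridges the two. A worked example with illustrative numbers:

```python
# Hypothetical 600x800pt page with a figure near its top band.
page_width, page_height = 600, 800
x0, y0, x1, y1 = 100, 700, 300, 780    # bottom-left origin: y grows upward
converted = [
    x0 / page_width,                   # left  -> ~0.167
    (page_height - y1) / page_height,  # upper -> 0.025 (near the top edge)
    x1 / page_width,                   # right -> 0.5
    (page_height - y0) / page_height,  # lower -> 0.125
]
```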
@@ -110,7 +110,7 @@ def request_adobe_service(file_path: str, output_path: str = "") -> str:
     return output_path
 
 
-def make_markdown_table(table_as_list: List[str]) -> str:
+def make_markdown_table(table_as_list: List[List[str]]) -> str:
     """
     Convert table from python list representation to markdown format.
     The input list consists of rows of tables, the first row is the header.
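The corrected type hint reflects the shape both the Adobe and Docling paths actually pass: a list of rows, first row the header. A minimal sketch of that shape (the exact markdown emitted is up to the helper):

```python
# Input shape implied by the docstring: rows of cells, first row = header.
table = [
    ["Name", "Score"],
    ["Alice", "0.9"],
    ["Bob", "0.7"],
]
markdown = make_markdown_table(table)  # -> a markdown table string
```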
@@ -203,8 +203,11 @@ def parse_figure_paths(file_paths: List[Path]) -> Union[bytes, str]:
 
 
 def generate_single_figure_caption(vlm_endpoint: str, figure: str) -> str:
     """Summarize a single figure using GPT-4V"""
+    output = ""
+
     if figure:
+        try:
             output = generate_gpt4v(
                 endpoint=vlm_endpoint,
                 prompt="Provide a short 2 sentence summary of this image?",
@@ -212,8 +215,9 @@ def generate_single_figure_caption(vlm_endpoint: str, figure: str) -> str:
             )
             if "sorry" in output.lower():
                 output = ""
-        else:
-            output = ""
+        except Exception as e:
+            print(f"Error generating caption: {e}")
 
     return output
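With the `try`/`except` in place, a failed VLM call degrades to an empty caption instead of aborting indexing. A hedged sketch of that behavior, with a deliberately unreachable hypothetical endpoint and a truncated payload:

```python
caption = generate_single_figure_caption(
    vlm_endpoint="http://unreachable.local",        # hypothetical bad endpoint
    figure="data:image/png;base64,iVBORw0KGgo...",  # truncated example payload
)
assert caption == ""  # the exception is printed, not raised
```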
@@ -39,6 +39,7 @@ from kotaemon.indices.ingests.files import (
     KH_DEFAULT_FILE_EXTRACTORS,
     adobe_reader,
     azure_reader,
+    docling_reader,
     unstructured,
     web_reader,
 )
@@ -673,6 +674,8 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
             readers[".pdf"] = adobe_reader
         elif self.reader_mode == "azure-di":
             readers[".pdf"] = azure_reader
+        elif self.reader_mode == "docling":
+            readers[".pdf"] = docling_reader
 
         dev_readers, _, _ = dev_settings()
         readers.update(dev_readers)
@@ -692,6 +695,7 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
                     "Azure AI Document Intelligence (figure+table extraction)",
                     "azure-di",
                 ),
+                ("Docling", "docling"),
             ],
             "component": "dropdown",
         },
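Taken together, picking "Docling" in the file-loader dropdown routes PDFs through the local reader at index time. A condensed sketch of the routing, with the surrounding pipeline code elided:

```python
# Condensed from IndexDocumentPipeline: route PDFs by the selected reader mode.
readers = {**KH_DEFAULT_FILE_EXTRACTORS}
reader_mode = "docling"  # value stored by the Settings dropdown
if reader_mode == "docling":
    readers[".pdf"] = docling_reader
```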