feat: integrate with docling (#471) bump:patch

* feat: add docling reader implementation

* feat: expose docling to UI

* fix: improve docling output parsing

* docs: update README

---------

Co-authored-by: Tadashi <tadashi@cinnamon.is>
Quang (Albert) 2024-11-16 10:04:57 +07:00 committed by GitHub
parent 5b828c213c
commit 56c40f1c05
7 changed files with 271 additions and 13 deletions

View File

@@ -216,6 +216,17 @@ documents and developers who want to build their own RAG pipeline.
See [Local model setup](docs/local_model.md).
### Setup multimodal document parsing (OCR, table parsing, figure extraction)
These options are available:
- [Azure Document Intelligence (API)](https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence)
- [Adobe PDF Extract (API)](https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/)
- [Docling (local, open-source)](https://github.com/DS4SD/docling)
- To use Docling, first install required dependencies: `pip install docling`
Select the corresponding loader in `Settings -> Retrieval Settings -> File loader`
### Customize your application
- By default, all application data is stored in the `./ktem_app_data` folder. You can back up or copy this folder to transfer your installation to a new machine.
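For reference, a minimal sketch of driving the new loader outside the UI, assuming `docling` is installed and using the `DoclingReader` export added in this commit (the file name is made up):

```python
from kotaemon.loaders import DoclingReader

reader = DoclingReader()
# Returns kotaemon Documents: plain text per page, plus tables and, when a VLM
# endpoint is configured, captioned figures.
docs = reader.load_data("sample.pdf", extra_info={"source": "demo"})

for doc in docs:
    # non-text chunks carry "type" == "table" or "image" in their metadata
    print(doc.metadata.get("type", "text"), doc.metadata.get("page_label"))
```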

View File

@@ -13,6 +13,7 @@ from kotaemon.loaders import (
    AdobeReader,
    AzureAIDocumentIntelligenceLoader,
    DirectoryReader,
    DoclingReader,
    HtmlReader,
    MathpixPDFReader,
    MhtmlReader,
@@ -32,9 +33,10 @@ azure_reader = AzureAIDocumentIntelligenceLoader(
    credential=str(config("AZURE_DI_CREDENTIAL", default="")),
    cache_dir=getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None),
)
-adobe_reader.vlm_endpoint = azure_reader.vlm_endpoint = getattr(
-    flowsettings, "KH_VLM_ENDPOINT", ""
-)
+docling_reader = DoclingReader()
+adobe_reader.vlm_endpoint = (
+    azure_reader.vlm_endpoint
+) = docling_reader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "")

KH_DEFAULT_FILE_EXTRACTORS: dict[str, BaseReader] = {
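The chained assignment above can be read more easily unrolled; an equivalent sketch using the same names defined in this file:

```python
# Every captioning-capable reader shares the single KH_VLM_ENDPOINT setting;
# an empty string leaves figure captioning disabled for each of them.
vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "")
adobe_reader.vlm_endpoint = vlm_endpoint
azure_reader.vlm_endpoint = vlm_endpoint
docling_reader.vlm_endpoint = vlm_endpoint
```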

View File

@@ -2,6 +2,7 @@ from .adobe_loader import AdobeReader
from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader
from .base import AutoReader, BaseReader
from .composite_loader import DirectoryReader
from .docling_loader import DoclingReader
from .docx_loader import DocxReader
from .excel_loader import ExcelReader, PandasExcelReader
from .html_loader import HtmlReader, MhtmlReader
@@ -30,4 +31,5 @@ __all__ = [
    "TxtReader",
    "PDFThumbnailReader",
    "WebReader",
    "DoclingReader",
]

View File

@@ -25,6 +25,9 @@ def crop_image(file_path: Path, bbox: list[float], page_number: int = 0) -> Imag
    """
    left, upper, right, lower = bbox
    left, right = min(left, right), max(left, right)
    upper, lower = min(upper, lower), max(upper, lower)

    img: Image.Image
    suffix = file_path.suffix.lower()
    if suffix == ".pdf":
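The two added lines guard against boxes whose vertical pair arrives flipped; a small illustration with made-up normalized coordinates:

```python
# Pillow crops expect left <= right and upper <= lower; a bbox converted from a
# bottom-left page origin can arrive with the vertical pair inverted.
bbox = [0.2, 0.8, 0.6, 0.3]  # left, upper, right, lower (upper > lower here)
left, upper, right, lower = bbox
left, right = min(left, right), max(left, right)
upper, lower = min(upper, lower), max(upper, lower)
print(left, upper, right, lower)  # 0.2 0.3 0.6 0.8
```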

View File

@@ -0,0 +1,232 @@
import base64
from collections import defaultdict
from io import BytesIO
from pathlib import Path
from typing import List, Optional

from kotaemon.base import Document, Param

from .azureai_document_intelligence_loader import crop_image
from .base import BaseReader
from .utils.adobe import generate_single_figure_caption, make_markdown_table


class DoclingReader(BaseReader):
    """Using Docling to extract document structure and content"""

    _dependencies = ["docling"]

    vlm_endpoint: str = Param(
        help=(
            "Default VLM endpoint for figure captioning. "
            "If not provided, will not caption the figures"
        )
    )
    max_figure_to_caption: int = Param(
        100,
        help=(
            "The maximum number of figures to caption. "
            "The rest will be indexed without captions."
        ),
    )
    figure_friendly_filetypes: list[str] = Param(
        [".pdf", ".jpeg", ".jpg", ".png", ".bmp", ".tiff", ".heif", ".tif"],
        help=(
            "File types that we can reliably open and extract figures. "
            "For files like .docx or .html, the visual layout may be different "
            "when viewed from different tools, hence we cannot use Azure DI location "
            "to extract figures."
        ),
    )

    @Param.auto(cache=True)
    def converter_(self):
        try:
            from docling.document_converter import DocumentConverter
        except ImportError:
            raise ImportError("Please install docling: 'pip install docling'")

        return DocumentConverter()

    def run(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        return self.load_data(file_path, extra_info, **kwargs)

    def load_data(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> List[Document]:
        """Extract the input file, allowing multi-modal extraction"""
        metadata = extra_info or {}
        result = self.converter_.convert(file_path)
        result_dict = result.document.export_to_dict()

        file_path = Path(file_path)
        file_name = file_path.name

        # extract the figures
        figures = []
        gen_caption_count = 0
        for figure_obj in result_dict.get("pictures", []):
            if not self.vlm_endpoint:
                continue
            if file_path.suffix.lower() not in self.figure_friendly_filetypes:
                continue

            # retrieve extractive captions provided by docling
            caption_refs = [caption["$ref"] for caption in figure_obj["captions"]]
            extractive_captions = []
            for caption_ref in caption_refs:
                text_id = caption_ref.split("/")[-1]
                try:
                    caption_text = result_dict["texts"][int(text_id)]["text"]
                    extractive_captions.append(caption_text)
                except (ValueError, TypeError, IndexError) as e:
                    print(e)
                    continue

            # read & crop image
            page_number = figure_obj["prov"][0]["page_no"]
            try:
                page_number_text = str(page_number)
                page_width = result_dict["pages"][page_number_text]["size"]["width"]
                page_height = result_dict["pages"][page_number_text]["size"]["height"]

                bbox_obj = figure_obj["prov"][0]["bbox"]
                bbox: list[float] = [
                    bbox_obj["l"],
                    bbox_obj["t"],
                    bbox_obj["r"],
                    bbox_obj["b"],
                ]
                if bbox_obj["coord_origin"] == "BOTTOMLEFT":
                    bbox = self._convert_bbox_bl_tl(bbox, page_width, page_height)

                img = crop_image(file_path, bbox, page_number - 1)
            except KeyError as e:
                print(e, list(result_dict["pages"].keys()))
                continue

            # convert img to base64
            img_bytes = BytesIO()
            img.save(img_bytes, format="PNG")
            img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
            img_base64 = f"data:image/png;base64,{img_base64}"

            # generate the generative caption
            if gen_caption_count >= self.max_figure_to_caption:
                gen_caption = ""
            else:
                gen_caption_count += 1
                gen_caption = generate_single_figure_caption(
                    figure=img_base64, vlm_endpoint=self.vlm_endpoint
                )
            # join the extractive and generative captions
            caption = "\n".join(extractive_captions + [gen_caption])

            # store the image into document
            figure_metadata = {
                "image_origin": img_base64,
                "type": "image",
                "page_label": page_number,
                "file_name": file_name,
                "file_path": file_path,
            }
            figure_metadata.update(metadata)

            figures.append(
                Document(
                    text=caption,
                    metadata=figure_metadata,
                )
            )

        # extract the tables
        tables = []
        for table_obj in result_dict.get("tables", []):
            # convert the tables into markdown format
            markdown_table = self._parse_table(table_obj)
            caption_refs = [caption["$ref"] for caption in table_obj["captions"]]

            extractive_captions = []
            for caption_ref in caption_refs:
                text_id = caption_ref.split("/")[-1]
                try:
                    caption_text = result_dict["texts"][int(text_id)]["text"]
                    extractive_captions.append(caption_text)
                except (ValueError, TypeError, IndexError) as e:
                    print(e)
                    continue

            # join the extractive and generative captions
            caption = "\n".join(extractive_captions)
            markdown_table = f"{caption}\n{markdown_table}"

            page_number = table_obj["prov"][0].get("page_no", 1)

            table_metadata = {
                "type": "table",
                "page_label": page_number,
                "table_origin": markdown_table,
                "file_name": file_name,
                "file_path": file_path,
            }
            table_metadata.update(metadata)

            tables.append(
                Document(
                    text=markdown_table,
                    metadata=table_metadata,
                )
            )

        # join plain text elements
        texts = []
        page_number_to_text = defaultdict(list)
        for text_obj in result_dict["texts"]:
            page_number = text_obj["prov"][0].get("page_no", 1)
            page_number_to_text[page_number].append(text_obj["text"])

        for page_number, txts in page_number_to_text.items():
            texts.append(
                Document(
                    text="\n".join(txts),
                    metadata={
                        "page_label": page_number,
                        "file_name": file_name,
                        "file_path": file_path,
                        **metadata,
                    },
                )
            )

        return texts + tables + figures

    def _convert_bbox_bl_tl(
        self, bbox: list[float], page_width: int, page_height: int
    ) -> list[float]:
        """Convert bbox from bottom-left to top-left"""
        x0, y0, x1, y1 = bbox
        return [
            x0 / page_width,
            (page_height - y1) / page_height,
            x1 / page_width,
            (page_height - y0) / page_height,
        ]

    def _parse_table(self, table_obj: dict) -> str:
        """Convert docling table object to markdown table"""
        table_as_list: List[List[str]] = []
        grid = table_obj["data"]["grid"]
        for row in grid:
            table_as_list.append([])
            for cell in row:
                table_as_list[-1].append(cell["text"])

        return make_markdown_table(table_as_list)
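To make the coordinate handling concrete, a worked example of `_convert_bbox_bl_tl` with made-up page dimensions:

```python
# Docling reports a BOTTOMLEFT bbox as (l, t, r, b) with y measured upward from
# the bottom edge of the page; the helper flips the axis and normalizes to [0, 1].
page_width, page_height = 600, 800
l, t, r, b = 100.0, 700.0, 300.0, 500.0  # the figure spans y in [500, 700]

x0, y0, x1, y1 = l, t, r, b  # unpacked exactly as inside the helper
converted = [
    x0 / page_width,                   # 0.1667 (left)
    (page_height - y1) / page_height,  # 0.375  (figure's bottom edge, from the top)
    x1 / page_width,                   # 0.5    (right)
    (page_height - y0) / page_height,  # 0.125  (figure's top edge, from the top)
]
# The second value is larger than the fourth, i.e. "upper" > "lower" from
# crop_image's point of view, which is exactly the case the new min/max guard
# in crop_image (earlier hunk) normalizes before cropping.
```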

View File

@@ -110,7 +110,7 @@ def request_adobe_service(file_path: str, output_path: str = "") -> str:
    return output_path


-def make_markdown_table(table_as_list: List[str]) -> str:
+def make_markdown_table(table_as_list: List[List[str]]) -> str:
    """
    Convert table from python list representation to markdown format.
    The input list consists of rows of tables, the first row is the header.
@@ -203,17 +203,21 @@ def parse_figure_paths(file_paths: List[Path]) -> Union[bytes, str]:
def generate_single_figure_caption(vlm_endpoint: str, figure: str) -> str:
    """Summarize a single figure using GPT-4V"""
+    output = ""
    if figure:
-        output = generate_gpt4v(
-            endpoint=vlm_endpoint,
-            prompt="Provide a short 2 sentence summary of this image?",
-            images=figure,
-        )
-        if "sorry" in output.lower():
-            output = ""
-    else:
-        output = ""
+        try:
+            output = generate_gpt4v(
+                endpoint=vlm_endpoint,
+                prompt="Provide a short 2 sentence summary of this image?",
+                images=figure,
+            )
+            if "sorry" in output.lower():
+                output = ""
+        except Exception as e:
+            print(f"Error generating caption: {e}")
    return output
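The practical effect of the new try/except: a failing VLM call now degrades to an empty caption instead of raising out of the indexing pipeline. A hedged sketch of the expected behaviour (endpoint and payload are made up):

```python
# The error is printed and swallowed, so callers such as DoclingReader simply
# index the figure without a generative caption.
caption = generate_single_figure_caption(
    vlm_endpoint="http://localhost:9999/unreachable",  # made-up endpoint
    figure="data:image/png;base64,iVBORw0KGgo=",       # placeholder payload
)
print(repr(caption))  # expected: ''
```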

View File

@@ -39,6 +39,7 @@ from kotaemon.indices.ingests.files import (
    KH_DEFAULT_FILE_EXTRACTORS,
    adobe_reader,
    azure_reader,
    docling_reader,
    unstructured,
    web_reader,
)
@@ -673,6 +674,8 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
            readers[".pdf"] = adobe_reader
        elif self.reader_mode == "azure-di":
            readers[".pdf"] = azure_reader
        elif self.reader_mode == "docling":
            readers[".pdf"] = docling_reader

        dev_readers, _, _ = dev_settings()
        readers.update(dev_readers)
@@ -692,6 +695,7 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
                        "Azure AI Document Intelligence (figure+table extraction)",
                        "azure-di",
                    ),
                    ("Docling", "docling"),
                ],
                "component": "dropdown",
            },