Add Azure AI Document Intelligence loader (#52)
* Add Azure AI Document Intelligence loader
* Add load_data interface to Azure DI
* Bump version
* Access Azure credentials from environment variables
This commit is contained in:
parent
bbe862fe47
commit
ec11b54ff2
|
@ -1,4 +1,5 @@
|
|||
from .adobe_loader import AdobeReader
|
||||
from .azureai_document_intelligence_loader import AzureAIDocumentIntelligenceLoader
|
||||
from .base import AutoReader, BaseReader
|
||||
from .composite_loader import DirectoryReader
|
||||
from .docx_loader import DocxReader
|
||||
|
@ -10,6 +11,7 @@ from .unstructured_loader import UnstructuredReader
|
|||
|
||||
__all__ = [
|
||||
"AutoReader",
|
||||
"AzureAIDocumentIntelligenceLoader",
|
||||
"BaseReader",
|
||||
"PandasExcelReader",
|
||||
"MathpixPDFReader",
|
||||
|
|
|
@ -0,0 +1,68 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from kotaemon.base import Document, Param
|
||||
|
||||
from .base import BaseReader
|
||||
|
||||
|
||||
class AzureAIDocumentIntelligenceLoader(BaseReader):
    """Parse a document with Azure AI Document Intelligence.

    The file is sent to the configured Azure endpoint, analysed with the
    selected model, and the analysis content is requested in markdown format.
    The whole file is returned as a single ``Document``.

    As of April 2024, the supported file formats are: pdf, jpeg/jpg, png, bmp,
    tiff, heif, docx, xlsx, pptx and html.
    """

    _dependencies = ["azure-ai-documentintelligence"]

    # NOTE: both environment variables are read once, when this class body is
    # executed (i.e. at import time); later changes to the environment are not
    # picked up by these defaults. The variable names are spelled
    # "...INTELLIGENT..." and are kept as-is so existing configuration keeps
    # working.
    endpoint: str = Param(
        os.environ.get("AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT", None),
        help="Endpoint of Azure AI Document Intelligence",
    )
    credential: str = Param(
        os.environ.get("AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL", None),
        help="Credential of Azure AI Document Intelligence",
    )
    model: str = Param(
        "prebuilt-layout",
        help=(
            "Model to use for document analysis. Default is prebuilt-layout. "
            "As of April 24, you can view the supported models [here]"
            "(https://learn.microsoft.com/en-us/azure/ai-services/"
            "document-intelligence/concept-model-overview?view=doc-intel-4.0.0"
            "#model-analysis-features)"
        ),
    )

    @Param.auto(depends_on=["endpoint", "credential"])
    def client_(self):
        """Build the Azure SDK client from ``endpoint`` and ``credential``.

        Returns:
            A ``DocumentIntelligenceClient`` authenticated with an
            ``AzureKeyCredential`` wrapping ``credential``.

        Raises:
            ImportError: if ``azure-ai-documentintelligence`` is not installed.
            ValueError: if ``endpoint`` or ``credential`` is missing or empty.
        """
        # Imported lazily so the package stays importable without the
        # optional Azure dependency.
        try:
            from azure.ai.documentintelligence import DocumentIntelligenceClient
            from azure.core.credentials import AzureKeyCredential
        except ImportError as exc:
            raise ImportError("Please install azure-ai-documentintelligence") from exc

        # Fail early with an actionable message instead of letting the SDK
        # reject a None/empty endpoint or key with a less specific error.
        if not self.endpoint or not self.credential:
            raise ValueError(
                "Azure AI Document Intelligence requires both `endpoint` and "
                "`credential`; pass them explicitly or set the "
                "AZUREAI_DOCUMENT_INTELLIGENT_ENDPOINT and "
                "AZUREAI_DOCUMENT_INTELLIGENT_CREDENTIAL environment variables"
            )

        return DocumentIntelligenceClient(
            self.endpoint, AzureKeyCredential(self.credential)
        )

    def run(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        """Analyse ``file_path`` and return the parsed document.

        Thin wrapper that normalises ``file_path`` to a ``Path`` and delegates
        to ``load_data``.
        """
        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)

    def load_data(
        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        """Upload the file for analysis and wrap the result in a ``Document``.

        Args:
            file_path: path of the file to analyse; opened in binary mode.
            extra_info: optional metadata attached to the returned document.
            **kwargs: accepted for interface compatibility; not used here.

        Returns:
            A one-element list whose document content is the analysis result's
            ``content`` and whose metadata is a copy of ``extra_info``.
        """
        # Shallow copy so the returned document does not share (and cannot
        # mutate) the caller's dictionary.
        metadata = dict(extra_info) if extra_info else {}

        with open(file_path, "rb") as fi:
            poller = self.client_.begin_analyze_document(
                self.model,
                analyze_request=fi,
                content_type="application/octet-stream",
                output_content_format="markdown",
            )
            # Wait for the analysis while the file handle is still open.
            result = poller.result()

        return [Document(content=result.content, metadata=metadata)]
|
|
@ -11,7 +11,7 @@ packages.find.exclude = ["tests*", "env*"]
|
|||
# metadata and dependencies
|
||||
[project]
|
||||
name = "kotaemon"
|
||||
version = "0.3.11"
|
||||
version = "0.3.12"
|
||||
requires-python = ">= 3.10"
|
||||
description = "Kotaemon core library for AI development."
|
||||
dependencies = [
|
||||
|
@ -64,6 +64,7 @@ adv = [
|
|||
"pdfservices-sdk @ git+https://github.com/niallcm/pdfservices-python-sdk.git@bump-and-unfreeze-requirements",
|
||||
"fastembed",
|
||||
"beautifulsoup4",
|
||||
"azure-ai-documentintelligence",
|
||||
]
|
||||
dev = [
|
||||
"ipython",
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from langchain.schema import Document as LangchainDocument
|
||||
from llama_index.node_parser import SimpleNodeParser
|
||||
|
@ -6,6 +7,7 @@ from llama_index.node_parser import SimpleNodeParser
|
|||
from kotaemon.base import Document
|
||||
from kotaemon.loaders import (
|
||||
AutoReader,
|
||||
AzureAIDocumentIntelligenceLoader,
|
||||
DocxReader,
|
||||
HtmlReader,
|
||||
MhtmlReader,
|
||||
|
@ -76,3 +78,15 @@ def test_mhtml_reader():
|
|||
|
||||
assert len(docs) == 1
|
||||
assert docs[0].text.startswith("This is a test")
|
||||
|
||||
|
||||
@patch("azure.ai.documentintelligence.DocumentIntelligenceClient")
def test_azureai_document_intelligence_reader(mock_client):
    """The loader builds the (patched) Azure client once and returns one doc.

    ``DocumentIntelligenceClient`` is replaced by a mock for the duration of
    the test, so the endpoint and credential below are placeholders.
    """
    sample_pdf = Path(__file__).parent / "resources" / "dummy.pdf"
    loader = AzureAIDocumentIntelligenceLoader(
        endpoint="https://endpoint.com",
        credential="credential",
    )

    parsed = loader(sample_pdf)

    assert len(parsed) == 1
    mock_client.assert_called_once()
|
||||
|
|
Loading…
Reference in New Issue
Block a user