feat: add web URL loader & refine indexing logic (#397)

* feat: add web URL loader & refine indexing logic

* fix: appease mypy
This commit is contained in:
Tuan Anh Nguyen Dang (Tadashi_Cin)
2024-10-15 22:42:24 +07:00
committed by GitHub
parent 41966fcd5b
commit b113efc855
7 changed files with 183 additions and 60 deletions

View File

@@ -21,8 +21,10 @@ from kotaemon.loaders import (
PDFThumbnailReader,
TxtReader,
UnstructuredReader,
WebReader,
)
# Reader instances constructed with their default arguments.
web_reader = WebReader()
unstructured = UnstructuredReader()
adobe_reader = AdobeReader()
azure_reader = AzureAIDocumentIntelligenceLoader(

View File

@@ -10,6 +10,7 @@ from .ocr_loader import ImageReader, OCRReader
from .pdf_loader import PDFThumbnailReader
from .txt_loader import TxtReader
from .unstructured_loader import UnstructuredReader
from .web_loader import WebReader
__all__ = [
"AutoReader",
@@ -28,4 +29,5 @@ __all__ = [
"AdobeReader",
"TxtReader",
"PDFThumbnailReader",
"WebReader",
]

View File

@@ -0,0 +1,43 @@
from pathlib import Path
from typing import Optional
import requests
from decouple import config
from kotaemon.base import Document
from .base import BaseReader
JINA_API_KEY = config("JINA_API_KEY", default="")
JINA_URL = config("JINA_URL", default="https://r.jina.ai/")
class WebReader(BaseReader):
    """Load the content of a web page as a single ``Document``.

    The page is not downloaded directly: the target URL is appended to the
    reader endpoint configured in ``JINA_URL`` and the text of that response
    becomes the document text. When ``JINA_API_KEY`` is non-empty it is sent
    as a bearer token.
    """

    def run(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        """Fetch the page at ``file_path`` and return it as documents.

        Args:
            file_path: URL of the page to load.
            extra_info: optional metadata attached to the returned document.
            **kwargs: forwarded to ``load_data``.

        Returns:
            A one-element list holding the fetched document.
        """
        # Deliberately NOT wrapped in Path(): pathlib collapses repeated
        # slashes, which would turn "https://host" into "https:/host".
        return self.load_data(file_path, extra_info=extra_info, **kwargs)

    def fetch_url(self, url: str, timeout: float = 60.0) -> str:
        """Request ``url`` through the configured reader endpoint.

        Args:
            url: the page URL to fetch.
            timeout: seconds to wait for the endpoint before giving up, so a
                stalled connection cannot block the caller forever.

        Returns:
            The response body as text.

        Raises:
            requests.HTTPError: if the endpoint answers with an error status.
            requests.Timeout: if no response arrives within ``timeout``.
        """
        # Honour the configured endpoint; tolerate a missing trailing slash.
        base_url = str(JINA_URL).rstrip("/")
        api_url = f"{base_url}/{url}"

        headers = {
            "X-With-Links-Summary": "true",
        }
        if JINA_API_KEY:
            headers["Authorization"] = f"Bearer {JINA_API_KEY}"

        response = requests.get(api_url, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.text

    def load_data(
        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
    ) -> list[Document]:
        """Fetch ``file_path`` and wrap the result in a ``Document``.

        Args:
            file_path: URL of the page to load. A ``Path`` is accepted for
                interface compatibility and converted back to a string.
            extra_info: optional metadata attached to the returned document.
            **kwargs: accepted for interface compatibility; unused here.

        Returns:
            A one-element list holding the fetched document.
        """
        url = str(file_path)

        # A caller may already have passed the URL through Path(), which
        # leaves "http:/host" or "https:/host"; restore the "//" separator.
        for scheme in ("http", "https"):
            collapsed = f"{scheme}:/"
            if url.startswith(collapsed) and not url.startswith(f"{collapsed}/"):
                url = f"{scheme}://{url[len(collapsed):]}"
                break

        output = self.fetch_url(url)
        metadata = extra_info or {}

        return [Document(text=output, metadata=metadata)]