feat: add web URL loader & refine indexing logics (#397)
* feat: add web URL loader & refine indexing logics
* fix: comfort mypy
parent 41966fcd5b
commit b113efc855
@@ -21,8 +21,10 @@ from kotaemon.loaders import (
     PDFThumbnailReader,
     TxtReader,
     UnstructuredReader,
+    WebReader,
 )
 
+web_reader = WebReader()
 unstructured = UnstructuredReader()
 adobe_reader = AdobeReader()
 azure_reader = AzureAIDocumentIntelligenceLoader(
@@ -10,6 +10,7 @@ from .ocr_loader import ImageReader, OCRReader
 from .pdf_loader import PDFThumbnailReader
 from .txt_loader import TxtReader
 from .unstructured_loader import UnstructuredReader
+from .web_loader import WebReader
 
 __all__ = [
     "AutoReader",
@@ -28,4 +29,5 @@ __all__ = [
     "AdobeReader",
     "TxtReader",
     "PDFThumbnailReader",
+    "WebReader",
 ]
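Once exported here, the new loader is importable from the package root alongside the other readers. A minimal usage sketch, with an illustrative URL:

    from kotaemon.loaders import WebReader

    reader = WebReader()
    docs = reader.load_data("https://example.com")  # returns list[Document]
    print(docs[0].text[:200])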
libs/kotaemon/kotaemon/loaders/web_loader.py (new file, 43 lines)
@@ -0,0 +1,43 @@
+from pathlib import Path
+from typing import Optional
+
+import requests
+from decouple import config
+
+from kotaemon.base import Document
+
+from .base import BaseReader
+
+JINA_API_KEY = config("JINA_API_KEY", default="")
+JINA_URL = config("JINA_URL", default="https://r.jina.ai/")
+
+
+class WebReader(BaseReader):
+    def run(
+        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> list[Document]:
+        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)
+
+    def fetch_url(self, url: str):
+        # setup the request
+        api_url = f"https://r.jina.ai/{url}"
+        headers = {
+            "X-With-Links-Summary": "true",
+        }
+        if JINA_API_KEY:
+            headers["Authorization"] = f"Bearer {JINA_API_KEY}"
+
+        response = requests.get(api_url, headers=headers)
+        response.raise_for_status()
+
+        data = response.text
+        return data
+
+    def load_data(
+        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> list[Document]:
+        file_path = str(file_path)
+        output = self.fetch_url(file_path)
+        metadata = extra_info or {}
+
+        return [Document(text=output, metadata=metadata)]
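WebReader does no HTML parsing of its own: fetch_url prefixes the target URL with the Jina Reader endpoint (https://r.jina.ai/) and returns the response body, so the extracted page text arrives ready to wrap in a Document. A sketch of the equivalent raw request, assuming no API key is configured (the target URL is illustrative):

    import requests

    target = "https://example.com/article"
    response = requests.get(
        f"https://r.jina.ai/{target}",
        headers={"X-With-Links-Summary": "true"},  # same header the loader sends
    )
    response.raise_for_status()
    print(response.text[:200])  # extracted page content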
@@ -57,7 +57,7 @@ def prepare_graph_index_path(graph_id: str):
 class GraphRAGIndexingPipeline(IndexDocumentPipeline):
     """GraphRAG specific indexing pipeline"""
 
-    def route(self, file_path: Path) -> IndexPipeline:
+    def route(self, file_path: str | Path) -> IndexPipeline:
         """Simply disable the splitter (chunking) for this pipeline"""
         pipeline = super().route(file_path)
         pipeline.splitter = None
@@ -32,7 +32,7 @@ class KnetIndexingPipeline(IndexDocumentPipeline):
         },
     }
 
-    def route(self, file_path: Path) -> IndexPipeline:
+    def route(self, file_path: str | Path) -> IndexPipeline:
         """Simply disable the splitter (chunking) for this pipeline"""
         pipeline = super().route(file_path)
         pipeline.splitter = None
@@ -39,6 +39,7 @@ from kotaemon.indices.ingests.files import (
     adobe_reader,
     azure_reader,
     unstructured,
+    web_reader,
 )
 from kotaemon.indices.rankings import BaseReranking, LLMReranking, LLMTrulensScoring
 from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
@@ -444,7 +445,7 @@ class IndexPipeline(BaseComponent):
         session.add_all(nodes)
         session.commit()
 
-    def get_id_if_exists(self, file_path: Path) -> Optional[str]:
+    def get_id_if_exists(self, file_path: str | Path) -> Optional[str]:
         """Check if the file is already indexed
 
         Args:
@@ -453,13 +454,14 @@ class IndexPipeline(BaseComponent):
         Returns:
             the file id if the file is indexed, otherwise None
         """
+        file_name = file_path.name if isinstance(file_path, Path) else file_path
         if self.private:
             cond: tuple = (
-                self.Source.name == file_path.name,
+                self.Source.name == file_name,
                 self.Source.user == self.user_id,
             )
         else:
-            cond = (self.Source.name == file_path.name,)
+            cond = (self.Source.name == file_name,)
 
         with Session(engine) as session:
             stmt = select(self.Source).where(*cond)
@@ -469,6 +471,29 @@
 
         return None
 
+    def store_url(self, url: str) -> str:
+        """Store URL into the database and storage, return the file id
+
+        Args:
+            url: the URL
+
+        Returns:
+            the file id
+        """
+        file_hash = sha256(url.encode()).hexdigest()
+        source = self.Source(
+            name=url,
+            path=file_hash,
+            size=0,
+            user=self.user_id,  # type: ignore
+        )
+        with Session(engine) as session:
+            session.add(source)
+            session.commit()
+            file_id = source.id
+
+        return file_id
+
     def store_file(self, file_path: Path) -> str:
         """Store file into the database and storage, return the file id
 
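store_url mirrors store_file but has no bytes to copy into storage: the sha256 of the URL stands in for the storage path and the size is recorded as 0, while dedup upstream keys on Source.name. A quick sketch of the hashing step (URL illustrative):

    from hashlib import sha256

    url = "https://example.com/article"
    print(sha256(url.encode()).hexdigest())  # stable stand-in for a file path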
@@ -495,7 +520,7 @@
 
         return file_id
 
-    def finish(self, file_id: str, file_path: Path) -> str:
+    def finish(self, file_id: str, file_path: str | Path) -> str:
         """Finish the indexing"""
         with Session(engine) as session:
             stmt = select(self.Source).where(self.Source.id == file_id)
@@ -561,9 +586,13 @@
     def stream(
         self, file_path: str | Path, reindex: bool, **kwargs
     ) -> Generator[Document, None, tuple[str, list[Document]]]:
-        # check for duplication
-        file_path = Path(file_path).resolve()
+        # check if the file is already indexed
+        if isinstance(file_path, Path):
+            file_path = file_path.resolve()
+
         file_id = self.get_id_if_exists(file_path)
-        if file_id is not None:
-            if not reindex:
-                raise ValueError(
+
+        if isinstance(file_path, Path):
+            if file_id is not None:
+                if not reindex:
+                    raise ValueError(
@@ -572,26 +601,40 @@
                     )
                 else:
                     # remove the existing records
-                    yield Document(f" => Removing old {file_path.name}", channel="debug")
+                    yield Document(
+                        f" => Removing old {file_path.name}", channel="debug"
+                    )
                     self.delete_file(file_id)
                     file_id = self.store_file(file_path)
             else:
                 # add record to db
                 file_id = self.store_file(file_path)
+        else:
+            if file_id is not None:
+                raise ValueError(f"URL {file_path} already indexed.")
+            else:
+                # add record to db
+                file_id = self.store_url(file_path)
 
         # extract the file
-        extra_info = default_file_metadata_func(str(file_path))
+        if isinstance(file_path, Path):
+            extra_info = default_file_metadata_func(str(file_path))
+            file_name = file_path.name
+        else:
+            extra_info = {"file_name": file_path}
+            file_name = file_path
+
         extra_info["file_id"] = file_id
         extra_info["collection_name"] = self.collection_name
 
-        yield Document(f" => Converting {file_path.name} to text", channel="debug")
+        yield Document(f" => Converting {file_name} to text", channel="debug")
         docs = self.loader.load_data(file_path, extra_info=extra_info)
-        yield Document(f" => Converted {file_path.name} to text", channel="debug")
-        yield from self.handle_docs(docs, file_id, file_path.name)
+        yield Document(f" => Converted {file_name} to text", channel="debug")
+        yield from self.handle_docs(docs, file_id, file_name)
 
         self.finish(file_id, file_path)
 
-        yield Document(f" => Finished indexing {file_path.name}", channel="debug")
+        yield Document(f" => Finished indexing {file_name}", channel="debug")
         return file_id, docs
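With both branches in place, the same stream() entry point accepts either a local Path or a URL string; only the record-keeping and metadata differ. A hypothetical driver loop, assuming an already-constructed IndexPipeline instance named pipeline (construction elided):

    from pathlib import Path

    for target in [Path("report.pdf"), "https://example.com/article"]:
        for progress in pipeline.stream(target, reindex=False):
            print(progress.text)  # debug-channel Documents narrate each step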
@@ -658,13 +701,23 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
         )
         return obj
 
-    def route(self, file_path: Path) -> IndexPipeline:
+    def is_url(self, file_path: str | Path) -> bool:
+        return isinstance(file_path, str) and (
+            file_path.startswith("http://") or file_path.startswith("https://")
+        )
+
+    def route(self, file_path: str | Path) -> IndexPipeline:
         """Decide the pipeline based on the file type
 
         Can subclass this method for a more elaborate pipeline routing strategy.
         """
         _, chunk_size, chunk_overlap = dev_settings()
 
-        ext = file_path.suffix.lower()
-        reader = self.readers.get(ext, unstructured)
+        # check if file_path is a URL
+        if self.is_url(file_path):
+            reader = web_reader
+        else:
+            assert isinstance(file_path, Path)
+            ext = file_path.suffix.lower()
+            reader = self.readers.get(ext, unstructured)
         if reader is None:
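Routing stays a plain scheme-prefix check: only string inputs starting with http:// or https:// reach web_reader; everything else must be a Path and is routed by extension as before. Assuming a constructed pipeline instance:

    from pathlib import Path

    pipeline.is_url("https://example.com")  # True  -> web_reader
    pipeline.is_url(Path("report.pdf"))     # False -> routed by suffix
    pipeline.is_url("report.pdf")           # False: a scheme-less string would
                                            # later fail route()'s Path assert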
@@ -715,9 +768,14 @@
 
         n_files = len(file_paths)
         for idx, file_path in enumerate(file_paths):
-            file_path = Path(file_path)
+            if self.is_url(file_path):
+                file_name = file_path
+            else:
+                file_path = Path(file_path)
+                file_name = file_path.name
+
             yield Document(
-                content=f"Indexing [{idx + 1}/{n_files}]: {file_path.name}",
+                content=f"Indexing [{idx + 1}/{n_files}]: {file_name}",
                 channel="debug",
             )
 
@@ -730,7 +788,11 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
                 file_ids.append(file_id)
                 errors.append(None)
                 yield Document(
-                    content={"file_path": file_path, "status": "success"},
+                    content={
+                        "file_path": file_path,
+                        "file_name": file_name,
+                        "status": "success",
+                    },
                     channel="index",
                 )
             except Exception as e:
@@ -740,6 +802,7 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
                 yield Document(
                     content={
                         "file_path": file_path,
+                        "file_name": file_name,
                         "status": "failed",
                         "message": str(e),
                     },
@@ -111,8 +111,8 @@ class FileIndexPage(BasePage):
         """Build the UI of the app"""
         with gr.Row():
             with gr.Column(scale=1):
-                gr.Markdown("## File Upload")
                 with gr.Column() as self.upload:
+                    with gr.Tab("Upload Files"):
                         self.files = File(
                             file_types=self._supported_file_types,
                             file_count="multiple",
@@ -124,6 +124,13 @@ class FileIndexPage(BasePage):
                     if msg:
                         gr.Markdown(msg)
 
+                    with gr.Tab("Use Web Links"):
+                        self.urls = gr.Textbox(
+                            label="Input web URLs",
+                            lines=8,
+                        )
+                        gr.Markdown("(separated by new line)")
+
             with gr.Accordion("Advanced indexing options", open=True):
                 with gr.Row():
                     self.reindex = gr.Checkbox(
@@ -525,6 +532,7 @@ class FileIndexPage(BasePage):
             fn=self.index_fn,
             inputs=[
                 self.files,
+                self.urls,
                 self.reindex,
                 self._app.settings_state,
                 self._app.user_id,
@@ -670,16 +678,21 @@
         return remaining_files
 
     def index_fn(
-        self, files, reindex: bool, settings, user_id
+        self, files, urls, reindex: bool, settings, user_id
     ) -> Generator[tuple[str, str], None, None]:
         """Upload and index the files
 
         Args:
             files: the list of files to be uploaded
+            urls: list of web URLs to be indexed
             reindex: whether to reindex the files
             selected_files: the list of files already selected
             settings: the settings of the app
         """
-        if not files:
-            gr.Info("No uploaded file")
-            yield "", ""
+        if urls:
+            files = [it.strip() for it in urls.split("\n")]
+            errors = []
+        else:
+            if not files:
+                gr.Info("No uploaded file")
+                yield "", ""
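When the "Use Web Links" textbox is non-empty it takes precedence over uploaded files: each line becomes one input, stripped of surrounding whitespace. A minimal sketch of that split (URLs illustrative):

    urls = "https://example.com/a\nhttps://example.com/b"
    files = [it.strip() for it in urls.split("\n")]
    print(files)  # ['https://example.com/a', 'https://example.com/b']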
@@ -708,10 +721,10 @@
                 continue
             if response.channel == "index":
                 if response.content["status"] == "success":
-                    outputs.append(f"\u2705 | {response.content['file_path'].name}")
+                    outputs.append(f"\u2705 | {response.content['file_name']}")
                 elif response.content["status"] == "failed":
                     outputs.append(
-                        f"\u274c | {response.content['file_path'].name}: "
+                        f"\u274c | {response.content['file_name']}: "
                         f"{response.content['message']}"
                     )
             elif response.channel == "debug":
@@ -764,7 +777,7 @@
         settings[f"index.options.{self._index.id}.reader_mode"] = "default"
         settings[f"index.options.{self._index.id}.quick_index_mode"] = True
         if to_process_files:
-            _iter = self.index_fn(to_process_files, reindex, settings, user_id)
+            _iter = self.index_fn(to_process_files, [], reindex, settings, user_id)
             try:
                 while next(_iter):
                     pass
@@ -844,7 +857,7 @@
         for p in exclude_patterns:
             files = [f for f in files if not fnmatch.fnmatch(name=f, pat=p)]
 
-        yield from self.index_fn(files, reindex, settings, user_id)
+        yield from self.index_fn(files, [], reindex, settings, user_id)
 
     def format_size_human_readable(self, num: float | str, suffix="B"):
         try: