feat: add web URL loader & refine indexing logics (#397)
* feat: add web URL loader & refine indexing logics
* fix: comfort mypy
parent 41966fcd5b
commit b113efc855
@@ -21,8 +21,10 @@ from kotaemon.loaders import (
     PDFThumbnailReader,
     TxtReader,
     UnstructuredReader,
+    WebReader,
 )
 
+web_reader = WebReader()
 unstructured = UnstructuredReader()
 adobe_reader = AdobeReader()
 azure_reader = AzureAIDocumentIntelligenceLoader(
@@ -10,6 +10,7 @@ from .ocr_loader import ImageReader, OCRReader
 from .pdf_loader import PDFThumbnailReader
 from .txt_loader import TxtReader
 from .unstructured_loader import UnstructuredReader
+from .web_loader import WebReader
 
 __all__ = [
     "AutoReader",
@@ -28,4 +29,5 @@ __all__ = [
     "AdobeReader",
     "TxtReader",
     "PDFThumbnailReader",
+    "WebReader",
 ]
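Once exported here, the new loader is importable from the package root alongside the other readers. A minimal usage sketch, with an illustrative URL:

    from kotaemon.loaders import WebReader

    reader = WebReader()
    docs = reader.load_data("https://example.com")  # returns list[Document]
    print(docs[0].text[:200])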
libs/kotaemon/kotaemon/loaders/web_loader.py (new file, 43 lines)
@@ -0,0 +1,43 @@
+from pathlib import Path
+from typing import Optional
+
+import requests
+from decouple import config
+
+from kotaemon.base import Document
+
+from .base import BaseReader
+
+JINA_API_KEY = config("JINA_API_KEY", default="")
+JINA_URL = config("JINA_URL", default="https://r.jina.ai/")
+
+
+class WebReader(BaseReader):
+    def run(
+        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> list[Document]:
+        return self.load_data(Path(file_path), extra_info=extra_info, **kwargs)
+
+    def fetch_url(self, url: str):
+        # setup the request
+        api_url = f"https://r.jina.ai/{url}"
+        headers = {
+            "X-With-Links-Summary": "true",
+        }
+        if JINA_API_KEY:
+            headers["Authorization"] = f"Bearer {JINA_API_KEY}"
+
+        response = requests.get(api_url, headers=headers)
+        response.raise_for_status()
+
+        data = response.text
+        return data
+
+    def load_data(
+        self, file_path: str | Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> list[Document]:
+        file_path = str(file_path)
+        output = self.fetch_url(file_path)
+        metadata = extra_info or {}
+
+        return [Document(text=output, metadata=metadata)]
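WebReader does no HTML parsing of its own: fetch_url prefixes the target URL with the Jina Reader endpoint (https://r.jina.ai/) and returns the response body, so the extracted page text arrives ready to wrap in a Document. A sketch of the equivalent raw request, assuming no API key is configured (the target URL is illustrative):

    import requests

    target = "https://example.com/article"
    response = requests.get(
        f"https://r.jina.ai/{target}",
        headers={"X-With-Links-Summary": "true"},  # same header the loader sends
    )
    response.raise_for_status()
    print(response.text[:200])  # extracted page content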
@@ -57,7 +57,7 @@ def prepare_graph_index_path(graph_id: str):
 class GraphRAGIndexingPipeline(IndexDocumentPipeline):
     """GraphRAG specific indexing pipeline"""
 
-    def route(self, file_path: Path) -> IndexPipeline:
+    def route(self, file_path: str | Path) -> IndexPipeline:
         """Simply disable the splitter (chunking) for this pipeline"""
         pipeline = super().route(file_path)
         pipeline.splitter = None
@@ -32,7 +32,7 @@ class KnetIndexingPipeline(IndexDocumentPipeline):
         },
     }
 
-    def route(self, file_path: Path) -> IndexPipeline:
+    def route(self, file_path: str | Path) -> IndexPipeline:
         """Simply disable the splitter (chunking) for this pipeline"""
         pipeline = super().route(file_path)
         pipeline.splitter = None
@@ -39,6 +39,7 @@ from kotaemon.indices.ingests.files import (
     adobe_reader,
     azure_reader,
     unstructured,
+    web_reader,
 )
 from kotaemon.indices.rankings import BaseReranking, LLMReranking, LLMTrulensScoring
 from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
@@ -444,7 +445,7 @@ class IndexPipeline(BaseComponent):
         session.add_all(nodes)
         session.commit()
 
-    def get_id_if_exists(self, file_path: Path) -> Optional[str]:
+    def get_id_if_exists(self, file_path: str | Path) -> Optional[str]:
         """Check if the file is already indexed
 
         Args:
@@ -453,13 +454,14 @@ class IndexPipeline(BaseComponent):
         Returns:
             the file id if the file is indexed, otherwise None
         """
+        file_name = file_path.name if isinstance(file_path, Path) else file_path
         if self.private:
             cond: tuple = (
-                self.Source.name == file_path.name,
+                self.Source.name == file_name,
                 self.Source.user == self.user_id,
             )
         else:
-            cond = (self.Source.name == file_path.name,)
+            cond = (self.Source.name == file_name,)
 
         with Session(engine) as session:
             stmt = select(self.Source).where(*cond)
@@ -469,6 +471,29 @@
 
         return None
 
+    def store_url(self, url: str) -> str:
+        """Store URL into the database and storage, return the file id
+
+        Args:
+            url: the URL
+
+        Returns:
+            the file id
+        """
+        file_hash = sha256(url.encode()).hexdigest()
+        source = self.Source(
+            name=url,
+            path=file_hash,
+            size=0,
+            user=self.user_id,  # type: ignore
+        )
+        with Session(engine) as session:
+            session.add(source)
+            session.commit()
+            file_id = source.id
+
+        return file_id
+
     def store_file(self, file_path: Path) -> str:
         """Store file into the database and storage, return the file id
 
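store_url mirrors store_file but has no bytes to copy into storage: the sha256 of the URL stands in for the storage path and the size is recorded as 0, while dedup upstream keys on Source.name. A quick sketch of the hashing step (URL illustrative):

    from hashlib import sha256

    url = "https://example.com/article"
    print(sha256(url.encode()).hexdigest())  # stable stand-in for a file path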
@@ -495,7 +520,7 @@
 
         return file_id
 
-    def finish(self, file_id: str, file_path: Path) -> str:
+    def finish(self, file_id: str, file_path: str | Path) -> str:
         """Finish the indexing"""
         with Session(engine) as session:
             stmt = select(self.Source).where(self.Source.id == file_id)
@@ -561,9 +586,13 @@
     def stream(
         self, file_path: str | Path, reindex: bool, **kwargs
     ) -> Generator[Document, None, tuple[str, list[Document]]]:
-        # check for duplication
-        file_path = Path(file_path).resolve()
+        # check if the file is already indexed
+        if isinstance(file_path, Path):
+            file_path = file_path.resolve()
+
         file_id = self.get_id_if_exists(file_path)
-        if file_id is not None:
-            if not reindex:
-                raise ValueError(
+
+        if isinstance(file_path, Path):
+            if file_id is not None:
+                if not reindex:
+                    raise ValueError(
@@ -572,26 +601,40 @@
                     )
                 else:
                     # remove the existing records
-                    yield Document(f" => Removing old {file_path.name}", channel="debug")
+                    yield Document(
+                        f" => Removing old {file_path.name}", channel="debug"
+                    )
                     self.delete_file(file_id)
                     file_id = self.store_file(file_path)
             else:
                 # add record to db
                 file_id = self.store_file(file_path)
+        else:
+            if file_id is not None:
+                raise ValueError(f"URL {file_path} already indexed.")
+            else:
+                # add record to db
+                file_id = self.store_url(file_path)
 
         # extract the file
-        extra_info = default_file_metadata_func(str(file_path))
+        if isinstance(file_path, Path):
+            extra_info = default_file_metadata_func(str(file_path))
+            file_name = file_path.name
+        else:
+            extra_info = {"file_name": file_path}
+            file_name = file_path
+
         extra_info["file_id"] = file_id
         extra_info["collection_name"] = self.collection_name
 
-        yield Document(f" => Converting {file_path.name} to text", channel="debug")
+        yield Document(f" => Converting {file_name} to text", channel="debug")
         docs = self.loader.load_data(file_path, extra_info=extra_info)
-        yield Document(f" => Converted {file_path.name} to text", channel="debug")
-        yield from self.handle_docs(docs, file_id, file_path.name)
+        yield Document(f" => Converted {file_name} to text", channel="debug")
+        yield from self.handle_docs(docs, file_id, file_name)
 
         self.finish(file_id, file_path)
 
-        yield Document(f" => Finished indexing {file_path.name}", channel="debug")
+        yield Document(f" => Finished indexing {file_name}", channel="debug")
         return file_id, docs
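With both branches in place, the same stream() entry point accepts either a local Path or a URL string; only the record-keeping and metadata differ. A hypothetical driver loop, assuming an already-constructed IndexPipeline instance named pipeline (construction elided):

    from pathlib import Path

    for target in [Path("report.pdf"), "https://example.com/article"]:
        for progress in pipeline.stream(target, reindex=False):
            print(progress.text)  # debug-channel Documents narrate each step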
@@ -658,13 +701,23 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
         )
         return obj
 
-    def route(self, file_path: Path) -> IndexPipeline:
+    def is_url(self, file_path: str | Path) -> bool:
+        return isinstance(file_path, str) and (
+            file_path.startswith("http://") or file_path.startswith("https://")
+        )
+
+    def route(self, file_path: str | Path) -> IndexPipeline:
         """Decide the pipeline based on the file type
 
         Can subclass this method for a more elaborate pipeline routing strategy.
         """
         _, chunk_size, chunk_overlap = dev_settings()
 
-        ext = file_path.suffix.lower()
-        reader = self.readers.get(ext, unstructured)
+        # check if file_path is a URL
+        if self.is_url(file_path):
+            reader = web_reader
+        else:
+            assert isinstance(file_path, Path)
+            ext = file_path.suffix.lower()
+            reader = self.readers.get(ext, unstructured)
         if reader is None:
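Routing stays a plain scheme-prefix check: only string inputs starting with http:// or https:// reach web_reader; everything else must be a Path and is routed by extension as before. Assuming a constructed pipeline instance:

    from pathlib import Path

    pipeline.is_url("https://example.com")  # True  -> web_reader
    pipeline.is_url(Path("report.pdf"))     # False -> routed by suffix
    pipeline.is_url("report.pdf")           # False: a scheme-less string would
                                            # later fail route()'s Path assert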
@@ -715,9 +768,14 @@
 
         n_files = len(file_paths)
         for idx, file_path in enumerate(file_paths):
-            file_path = Path(file_path)
+            if self.is_url(file_path):
+                file_name = file_path
+            else:
+                file_path = Path(file_path)
+                file_name = file_path.name
+
             yield Document(
-                content=f"Indexing [{idx + 1}/{n_files}]: {file_path.name}",
+                content=f"Indexing [{idx + 1}/{n_files}]: {file_name}",
                 channel="debug",
             )
 
@@ -730,7 +788,11 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
                 file_ids.append(file_id)
                 errors.append(None)
                 yield Document(
-                    content={"file_path": file_path, "status": "success"},
+                    content={
+                        "file_path": file_path,
+                        "file_name": file_name,
+                        "status": "success",
+                    },
                     channel="index",
                 )
             except Exception as e:
@@ -740,6 +802,7 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
                 yield Document(
                     content={
                         "file_path": file_path,
+                        "file_name": file_name,
                         "status": "failed",
                         "message": str(e),
                     },
@@ -111,8 +111,8 @@ class FileIndexPage(BasePage):
         """Build the UI of the app"""
         with gr.Row():
             with gr.Column(scale=1):
-                gr.Markdown("## File Upload")
                 with gr.Column() as self.upload:
+                    with gr.Tab("Upload Files"):
                         self.files = File(
                             file_types=self._supported_file_types,
                             file_count="multiple",
@@ -124,6 +124,13 @@ class FileIndexPage(BasePage):
                     if msg:
                         gr.Markdown(msg)
 
+                    with gr.Tab("Use Web Links"):
+                        self.urls = gr.Textbox(
+                            label="Input web URLs",
+                            lines=8,
+                        )
+                        gr.Markdown("(separated by new line)")
+
             with gr.Accordion("Advanced indexing options", open=True):
                 with gr.Row():
                     self.reindex = gr.Checkbox(
@@ -525,6 +532,7 @@ class FileIndexPage(BasePage):
             fn=self.index_fn,
             inputs=[
                 self.files,
+                self.urls,
                 self.reindex,
                 self._app.settings_state,
                 self._app.user_id,
@@ -670,16 +678,21 @@
         return remaining_files
 
     def index_fn(
-        self, files, reindex: bool, settings, user_id
+        self, files, urls, reindex: bool, settings, user_id
     ) -> Generator[tuple[str, str], None, None]:
         """Upload and index the files
 
         Args:
             files: the list of files to be uploaded
+            urls: list of web URLs to be indexed
             reindex: whether to reindex the files
             selected_files: the list of files already selected
             settings: the settings of the app
         """
-        if not files:
-            gr.Info("No uploaded file")
-            yield "", ""
+        if urls:
+            files = [it.strip() for it in urls.split("\n")]
+            errors = []
+        else:
+            if not files:
+                gr.Info("No uploaded file")
+                yield "", ""
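When the "Use Web Links" textbox is non-empty it takes precedence over uploaded files: each line becomes one input, stripped of surrounding whitespace. A minimal sketch of that split (URLs illustrative):

    urls = "https://example.com/a\nhttps://example.com/b"
    files = [it.strip() for it in urls.split("\n")]
    print(files)  # ['https://example.com/a', 'https://example.com/b']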
@@ -708,10 +721,10 @@
                 continue
             if response.channel == "index":
                 if response.content["status"] == "success":
-                    outputs.append(f"\u2705 | {response.content['file_path'].name}")
+                    outputs.append(f"\u2705 | {response.content['file_name']}")
                 elif response.content["status"] == "failed":
                     outputs.append(
-                        f"\u274c | {response.content['file_path'].name}: "
+                        f"\u274c | {response.content['file_name']}: "
                         f"{response.content['message']}"
                     )
             elif response.channel == "debug":
@@ -764,7 +777,7 @@
         settings[f"index.options.{self._index.id}.reader_mode"] = "default"
         settings[f"index.options.{self._index.id}.quick_index_mode"] = True
         if to_process_files:
-            _iter = self.index_fn(to_process_files, reindex, settings, user_id)
+            _iter = self.index_fn(to_process_files, [], reindex, settings, user_id)
             try:
                 while next(_iter):
                     pass
@@ -844,7 +857,7 @@
         for p in exclude_patterns:
             files = [f for f in files if not fnmatch.fnmatch(name=f, pat=p)]
 
-        yield from self.index_fn(files, reindex, settings, user_id)
+        yield from self.index_fn(files, [], reindex, settings, user_id)
 
     def format_size_human_readable(self, num: float | str, suffix="B"):
         try: