(pump:minor) Allow the indexing pipeline to report indexing progress to the UI (#81)

* Turn the file indexing event into a generator to report progress
* Fix React text's trimming function
* Refactor delete file into a method
2024-05-25 22:09:41 +07:00
parent 56dfc8fb53
commit ebf1315569
11 changed files with 520 additions and 299 deletions
--- a/libs/kotaemon/kotaemon/base/component.py
+++ b/libs/kotaemon/kotaemon/base/component.py
@@ -1,5 +1,5 @@
 from abc import abstractmethod
-from typing import AsyncGenerator, Iterator, Optional
+from typing import Any, AsyncGenerator, Iterator, Optional

 from theflow import Function, Node, Param, lazy

@@ -58,7 +58,7 @@ class BaseComponent(Function):
    @abstractmethod
    def run(
        self, *args, **kwargs
-    ) -> Document | list[Document] | Iterator[Document] | None:
+    ) -> Document | list[Document] | Iterator[Document] | None | Any:
        """Run the component."""
        ...

--- a/libs/kotaemon/kotaemon/base/schema.py
+++ b/libs/kotaemon/kotaemon/base/schema.py
@@ -32,12 +32,13 @@ class Document(BaseDocument):
        channel: the channel to show the document. Optional.:
            - chat: show in chat message
            - info: show in information panel
+            - index: show in index panel
            - debug: show in debug panel
    """

    content: Any = None
    source: Optional[str] = None
-    channel: Optional[Literal["chat", "info", "debug"]] = None
+    channel: Optional[Literal["chat", "info", "index", "debug"]] = None

    def __init__(self, content: Optional[Any] = None, *args, **kwargs):
        if content is None:
--- a/libs/kotaemon/kotaemon/indices/ingests/files.py
+++ b/libs/kotaemon/kotaemon/indices/ingests/files.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 from typing import Type

+from llama_index.readers import PDFReader
 from llama_index.readers.base import BaseReader

 from kotaemon.base import BaseComponent, Document, Param
@@ -17,18 +18,20 @@ from kotaemon.loaders import (
    UnstructuredReader,
 )

-KH_DEFAULT_FILE_EXTRACTORS: dict[str, Type[BaseReader]] = {
-    ".xlsx": PandasExcelReader,
-    ".docx": UnstructuredReader,
-    ".xls": UnstructuredReader,
-    ".doc": UnstructuredReader,
-    ".html": HtmlReader,
-    ".mhtml": MhtmlReader,
-    ".png": UnstructuredReader,
-    ".jpeg": UnstructuredReader,
-    ".jpg": UnstructuredReader,
-    ".tiff": UnstructuredReader,
-    ".tif": UnstructuredReader,
+unstructured = UnstructuredReader()
+KH_DEFAULT_FILE_EXTRACTORS: dict[str, BaseReader] = {
+    ".xlsx": PandasExcelReader(),
+    ".docx": unstructured,
+    ".xls": unstructured,
+    ".doc": unstructured,
+    ".html": HtmlReader(),
+    ".mhtml": MhtmlReader(),
+    ".png": unstructured,
+    ".jpeg": unstructured,
+    ".jpg": unstructured,
+    ".tiff": unstructured,
+    ".tif": unstructured,
+    ".pdf": PDFReader(),
 }


@@ -64,7 +67,7 @@ class DocumentIngestor(BaseComponent):
    def _get_reader(self, input_files: list[str | Path]):
        """Get appropriate readers for the input files based on file extension"""
        file_extractors: dict[str, BaseReader] = {
-            ext: cls() for ext, cls in KH_DEFAULT_FILE_EXTRACTORS.items()
+            ext: reader for ext, reader in KH_DEFAULT_FILE_EXTRACTORS.items()
        }
        for ext, cls in self.override_file_extractors.items():
            file_extractors[ext] = cls()
--- a/libs/kotaemon/kotaemon/loaders/base.py
+++ b/libs/kotaemon/kotaemon/loaders/base.py
@@ -8,6 +8,8 @@ if TYPE_CHECKING:


 class BaseReader(BaseComponent):
+    """The base class for all readers"""
+
    ...