(pump:minor) Allow the indexing pipeline to report indexing progress to the UI (#81)

* Turn the file indexing event into a generator to report progress

* Fix React text's trimming function

* Refactor delete file into a method
This commit is contained in:
trducng
2024-05-25 22:09:41 +07:00
committed by GitHub
parent 56dfc8fb53
commit ebf1315569
11 changed files with 520 additions and 299 deletions

View File

@@ -1,5 +1,5 @@
from abc import abstractmethod
from typing import AsyncGenerator, Iterator, Optional
from typing import Any, AsyncGenerator, Iterator, Optional
from theflow import Function, Node, Param, lazy
@@ -58,7 +58,7 @@ class BaseComponent(Function):
@abstractmethod
def run(
self, *args, **kwargs
) -> Document | list[Document] | Iterator[Document] | None:
) -> Document | list[Document] | Iterator[Document] | None | Any:
"""Run the component."""
...

View File

@@ -32,12 +32,13 @@ class Document(BaseDocument):
channel: the channel to show the document. Optional.:
- chat: show in chat message
- info: show in information panel
- index: show in index panel
- debug: show in debug panel
"""
content: Any = None
source: Optional[str] = None
channel: Optional[Literal["chat", "info", "debug"]] = None
channel: Optional[Literal["chat", "info", "index", "debug"]] = None
def __init__(self, content: Optional[Any] = None, *args, **kwargs):
if content is None:

View File

@@ -1,6 +1,7 @@
from pathlib import Path
from typing import Type
from llama_index.readers import PDFReader
from llama_index.readers.base import BaseReader
from kotaemon.base import BaseComponent, Document, Param
@@ -17,18 +18,20 @@ from kotaemon.loaders import (
UnstructuredReader,
)
KH_DEFAULT_FILE_EXTRACTORS: dict[str, Type[BaseReader]] = {
".xlsx": PandasExcelReader,
".docx": UnstructuredReader,
".xls": UnstructuredReader,
".doc": UnstructuredReader,
".html": HtmlReader,
".mhtml": MhtmlReader,
".png": UnstructuredReader,
".jpeg": UnstructuredReader,
".jpg": UnstructuredReader,
".tiff": UnstructuredReader,
".tif": UnstructuredReader,
unstructured = UnstructuredReader()
KH_DEFAULT_FILE_EXTRACTORS: dict[str, BaseReader] = {
".xlsx": PandasExcelReader(),
".docx": unstructured,
".xls": unstructured,
".doc": unstructured,
".html": HtmlReader(),
".mhtml": MhtmlReader(),
".png": unstructured,
".jpeg": unstructured,
".jpg": unstructured,
".tiff": unstructured,
".tif": unstructured,
".pdf": PDFReader(),
}
@@ -64,7 +67,7 @@ class DocumentIngestor(BaseComponent):
def _get_reader(self, input_files: list[str | Path]):
"""Get appropriate readers for the input files based on file extension"""
file_extractors: dict[str, BaseReader] = {
ext: cls() for ext, cls in KH_DEFAULT_FILE_EXTRACTORS.items()
ext: reader for ext, reader in KH_DEFAULT_FILE_EXTRACTORS.items()
}
for ext, cls in self.override_file_extractors.items():
file_extractors[ext] = cls()

View File

@@ -8,6 +8,8 @@ if TYPE_CHECKING:
class BaseReader(BaseComponent):
"""The base class for all readers"""
...