(pump:minor) Allow the indexing pipeline to report indexing progress to the UI (#81)
* Turn the file indexing event into a generator to report progress * Fix the React text trimming function * Refactor file deletion into a method
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
from abc import abstractmethod
|
||||
from typing import AsyncGenerator, Iterator, Optional
|
||||
from typing import Any, AsyncGenerator, Iterator, Optional
|
||||
|
||||
from theflow import Function, Node, Param, lazy
|
||||
|
||||
@@ -58,7 +58,7 @@ class BaseComponent(Function):
|
||||
@abstractmethod
|
||||
def run(
|
||||
self, *args, **kwargs
|
||||
) -> Document | list[Document] | Iterator[Document] | None:
|
||||
) -> Document | list[Document] | Iterator[Document] | None | Any:
|
||||
"""Run the component."""
|
||||
...
|
||||
|
||||
|
@@ -32,12 +32,13 @@ class Document(BaseDocument):
|
||||
channel: the channel to show the document. Optional.:
|
||||
- chat: show in chat message
|
||||
- info: show in information panel
|
||||
- index: show in index panel
|
||||
- debug: show in debug panel
|
||||
"""
|
||||
|
||||
content: Any = None
|
||||
source: Optional[str] = None
|
||||
channel: Optional[Literal["chat", "info", "debug"]] = None
|
||||
channel: Optional[Literal["chat", "info", "index", "debug"]] = None
|
||||
|
||||
def __init__(self, content: Optional[Any] = None, *args, **kwargs):
|
||||
if content is None:
|
||||
|
@@ -1,6 +1,7 @@
|
||||
from pathlib import Path
|
||||
from typing import Type
|
||||
|
||||
from llama_index.readers import PDFReader
|
||||
from llama_index.readers.base import BaseReader
|
||||
|
||||
from kotaemon.base import BaseComponent, Document, Param
|
||||
@@ -17,18 +18,20 @@ from kotaemon.loaders import (
|
||||
UnstructuredReader,
|
||||
)
|
||||
|
||||
KH_DEFAULT_FILE_EXTRACTORS: dict[str, Type[BaseReader]] = {
|
||||
".xlsx": PandasExcelReader,
|
||||
".docx": UnstructuredReader,
|
||||
".xls": UnstructuredReader,
|
||||
".doc": UnstructuredReader,
|
||||
".html": HtmlReader,
|
||||
".mhtml": MhtmlReader,
|
||||
".png": UnstructuredReader,
|
||||
".jpeg": UnstructuredReader,
|
||||
".jpg": UnstructuredReader,
|
||||
".tiff": UnstructuredReader,
|
||||
".tif": UnstructuredReader,
|
||||
unstructured = UnstructuredReader()
|
||||
KH_DEFAULT_FILE_EXTRACTORS: dict[str, BaseReader] = {
|
||||
".xlsx": PandasExcelReader(),
|
||||
".docx": unstructured,
|
||||
".xls": unstructured,
|
||||
".doc": unstructured,
|
||||
".html": HtmlReader(),
|
||||
".mhtml": MhtmlReader(),
|
||||
".png": unstructured,
|
||||
".jpeg": unstructured,
|
||||
".jpg": unstructured,
|
||||
".tiff": unstructured,
|
||||
".tif": unstructured,
|
||||
".pdf": PDFReader(),
|
||||
}
|
||||
|
||||
|
||||
@@ -64,7 +67,7 @@ class DocumentIngestor(BaseComponent):
|
||||
def _get_reader(self, input_files: list[str | Path]):
|
||||
"""Get appropriate readers for the input files based on file extension"""
|
||||
file_extractors: dict[str, BaseReader] = {
|
||||
ext: cls() for ext, cls in KH_DEFAULT_FILE_EXTRACTORS.items()
|
||||
ext: reader for ext, reader in KH_DEFAULT_FILE_EXTRACTORS.items()
|
||||
}
|
||||
for ext, cls in self.override_file_extractors.items():
|
||||
file_extractors[ext] = cls()
|
||||
|
@@ -8,6 +8,8 @@ if TYPE_CHECKING:
|
||||
|
||||
|
||||
class BaseReader(BaseComponent):
|
||||
"""The base class for all readers"""
|
||||
|
||||
...
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user