Migrate the MVP into kotaemon (#108)

- Migrate the MVP into kotaemon. - Preliminarily include the pipeline within the chatbot interface. - Organize the MVP as an application. Todo: - Add an info panel to view the planning of agents -> Fix streaming agents' output. Resolve: #60 Resolve: #61 Resolve: #62
2024-01-10 15:28:09 +07:00
parent 230328c62f
commit 5a9d6f75be
31 changed files with 273 additions and 92 deletions
--- a/knowledgehub/indices/ingests/files.py
+++ b/knowledgehub/indices/ingests/files.py
@@ -11,6 +11,7 @@ from kotaemon.loaders import (
    MathpixPDFReader,
    OCRReader,
    PandasExcelReader,
+    UnstructuredReader,
 )


@@ -19,8 +20,16 @@ class DocumentIngestor(BaseComponent):

    Document types:
        - pdf
-        - xlsx
-        - docx
+        - xlsx, xls
+        - docx, doc
+
+    Args:
+        pdf_mode: mode for pdf extraction, one of "normal", "mathpix", "ocr"
+            - normal: parse pdf text
+            - mathpix: parse pdf text using mathpix
+            - ocr: parse pdf image using flax
+        doc_parsers: list of document parsers to parse the document
+        text_splitter: splitter to split the document into text nodes
    """

    pdf_mode: str = "normal"  # "normal", "mathpix", "ocr"
@@ -34,6 +43,9 @@ class DocumentIngestor(BaseComponent):
        """Get appropriate readers for the input files based on file extension"""
        file_extractor: dict[str, AutoReader | BaseReader] = {
            ".xlsx": PandasExcelReader(),
+            ".docx": UnstructuredReader(),
+            ".xls": UnstructuredReader(),
+            ".doc": UnstructuredReader(),
        }

        if self.pdf_mode == "normal":