Migrate the MVP into kotaemon (#108)

- Migrate the MVP into kotaemon.
- Preliminarily include the pipeline within the chatbot interface.
- Organize MVP as an application.

Todo:

- Add an info panel to view the agents' planning -> Fix the streaming of agents' output.

Resolve: #60
Resolve: #61 
Resolve: #62
This commit is contained in:
Duc Nguyen (john)
2024-01-10 15:28:09 +07:00
committed by GitHub
parent 230328c62f
commit 5a9d6f75be
31 changed files with 273 additions and 92 deletions

View File

@@ -11,6 +11,7 @@ from kotaemon.loaders import (
MathpixPDFReader,
OCRReader,
PandasExcelReader,
UnstructuredReader,
)
@@ -19,8 +20,16 @@ class DocumentIngestor(BaseComponent):
Document types:
- pdf
- xlsx
- docx
- xlsx, xls
- docx, doc
Args:
pdf_mode: mode for pdf extraction, one of "normal", "mathpix", "ocr"
- normal: parse pdf text
- mathpix: parse pdf text using mathpix
- ocr: parse pdf image using flax
doc_parsers: list of document parsers to parse the document
text_splitter: splitter to split the document into text nodes
"""
pdf_mode: str = "normal" # "normal", "mathpix", "ocr"
@@ -34,6 +43,9 @@ class DocumentIngestor(BaseComponent):
"""Get appropriate readers for the input files based on file extension"""
file_extractor: dict[str, AutoReader | BaseReader] = {
".xlsx": PandasExcelReader(),
".docx": UnstructuredReader(),
".xls": UnstructuredReader(),
".doc": UnstructuredReader(),
}
if self.pdf_mode == "normal":