Fix UI bugs (#8)

* Auto create conversation when the user starts * Add conversation rename rule check * Fix empty name during save * Confirm deleting conversation * Show warning if users don't select a file when uploading files in the File Index * Feedback when user uploads duplicated file * Limit the file types * Fix valid username * Allow login when username has leading and trailing whitespaces * Improve the user * Disable admin panel for non-admin users * Refresh user lists after creating/deleting users * Auto logging in * Clear admin information upon signing out * Fix unable to receive uploaded filenames that include special characters, like !@#$%^&*().pdf * Set upload validation for FileIndex * Improve user management UI/UX * Show extraction error when indexing file * Return selected user -1 when signing out * Fix default supported file types in file index * Validate changing password * Allow the selector to contain multiple gradio components * A more tolerable placeholder screen * Allow chat suggestion box * Increase concurrency limit * Make adobe loader optional * Use BaseReasoning --------- Co-authored-by: trducng <trungduc1992@gmail.com>
2024-04-03 16:33:54 +07:00
parent 43a18ba070
commit ecf09b275f
23 changed files with 936 additions and 255 deletions
--- a/libs/kotaemon/kotaemon/indices/qa/citation.py
+++ b/libs/kotaemon/kotaemon/indices/qa/citation.py
@@ -104,18 +104,16 @@ class CitationPipeline(BaseComponent):
            print("CitationPipeline: invoking LLM")
            llm_output = self.get_from_path("llm").invoke(messages, **llm_kwargs)
            print("CitationPipeline: finish invoking LLM")
+            if not llm_output.messages:
+                return None
+            function_output = llm_output.messages[0].additional_kwargs["function_call"][
+                "arguments"
+            ]
+            output = QuestionAnswer.parse_raw(function_output)
        except Exception as e:
            print(e)
            return None

-        if not llm_output.messages:
-            return None
-
-        function_output = llm_output.messages[0].additional_kwargs["function_call"][
-            "arguments"
-        ]
-        output = QuestionAnswer.parse_raw(function_output)
-
        return output

    async def ainvoke(self, context: str, question: str):
--- a/libs/kotaemon/kotaemon/loaders/init.py
+++ b/libs/kotaemon/kotaemon/loaders/init.py
@@ -5,7 +5,7 @@ from .docx_loader import DocxReader
 from .excel_loader import PandasExcelReader
 from .html_loader import HtmlReader
 from .mathpix_loader import MathpixPDFReader
-from .ocr_loader import OCRReader
+from .ocr_loader import ImageReader, OCRReader
 from .unstructured_loader import UnstructuredReader

 __all__ = [
@@ -13,6 +13,7 @@ __all__ = [
    "BaseReader",
    "PandasExcelReader",
    "MathpixPDFReader",
+    "ImageReader",
    "OCRReader",
    "DirectoryReader",
    "UnstructuredReader",
--- a/libs/kotaemon/kotaemon/loaders/adobe_loader.py
+++ b/libs/kotaemon/kotaemon/loaders/adobe_loader.py
@@ -10,14 +10,6 @@ from llama_index.readers.base import BaseReader

 from kotaemon.base import Document

-from .utils.adobe import (
-    generate_figure_captions,
-    load_json,
-    parse_figure_paths,
-    parse_table_paths,
-    request_adobe_service,
-)
-
 logger = logging.getLogger(__name__)

 DEFAULT_VLM_ENDPOINT = (
@@ -74,6 +66,13 @@ class AdobeReader(BaseReader):
                includes 3 types: text, table, and image

        """
+        from .utils.adobe import (
+            generate_figure_captions,
+            load_json,
+            parse_figure_paths,
+            parse_table_paths,
+            request_adobe_service,
+        )

        filename = file.name
        filepath = str(Path(file).resolve())
--- a/libs/kotaemon/kotaemon/loaders/ocr_loader.py
+++ b/libs/kotaemon/kotaemon/loaders/ocr_loader.py
@@ -125,3 +125,70 @@ class OCRReader(BaseReader):
        )

        return documents
+
+
+class ImageReader(BaseReader):
+    """Read a file through the FullOCR endpoint, one Document per OCR result
+
+    Example:
+        ```python
+        >> from kotaemon.loaders import ImageReader
+        >> reader = ImageReader()
+        >> documents = reader.load_data("path/to/image")
+        ```
+
+    Args:
+        endpoint: URL to FullOCR endpoint. If not provided, will look for
+            environment variable `OCR_READER_ENDPOINT` or use the default
+            module-level `DEFAULT_OCR_ENDPOINT`.
+
+    Note:
+        The request is always sent with `table_only=False`.
+    """
+
+    def __init__(self, endpoint: Optional[str] = None):
+        """Init the image reader with the OCR endpoint (FullOCR pipeline)"""
+        super().__init__()
+        self.ocr_endpoint = endpoint or os.getenv(
+            "OCR_READER_ENDPOINT", DEFAULT_OCR_ENDPOINT
+        )
+
+    def load_data(
+        self, file_path: Path, extra_info: Optional[dict] = None, **kwargs
+    ) -> List[Document]:
+        """Load data using the OCR endpoint
+
+        Args:
+            file_path (Path): Path to the file to send to the OCR endpoint
+            extra_info (dict): metadata attached to every returned document
+            response_content (kwarg): precomputed OCR results; skips the API call
+
+        Returns:
+            List[Document]: one document per OCR result, holding its `csv_string`
+        """
+        file_path = Path(file_path).resolve()
+
+        with file_path.open("rb") as content:
+            files = {"input": content}
+            data = {"job_id": uuid4(), "table_only": False}
+
+            # call the API from FullOCR endpoint
+            if "response_content" in kwargs:
+                # overriding response content if specified
+                ocr_results = kwargs["response_content"]
+            else:
+                # call original API
+                resp = tenacious_api_post(url=self.ocr_endpoint, files=files, data=data)
+                ocr_results = resp.json()["result"]
+
+        extra_info = extra_info or {}
+        result = []
+        for ocr_result in ocr_results:
+            result.append(
+                Document(
+                    content=ocr_result["csv_string"],
+                    metadata=extra_info,
+                )
+            )
+
+        return result