[AUR-408] Export logs to Excel (#23)

This CL: - Implements the logic to export logs to Excel. - Routes the export logic into the UI. - Demonstrates this functionality in the `./examples/promptui` project.
2023-09-25 17:20:03 +07:00 · 2023-09-25 17:20:03 +07:00 · 4f189dc931
commit 4f189dc931
parent 08b6e5d3fb
5 changed files with 265 additions and 64 deletions
--- a/knowledgehub/contribs/promptui/export.py
+++ b/knowledgehub/contribs/promptui/export.py
@ -1 +1,138 @@
 """Export logs into Excel file"""
+import os
+import pickle
+from pathlib import Path
+from typing import Any, Dict, List, Type, Union
+
+import pandas as pd
+import yaml
+from theflow.storage import storage
+from theflow.utils.modules import import_dotted_string
+
+from kotaemon.base import BaseComponent
+
+
+def from_log_to_dict(pipeline_cls: Type[BaseComponent], log_config: dict) -> dict:
+    """Collect the logged runs of a pipeline into a column-oriented dict
+
+    Each sub-directory of the pipeline's result store is treated as one run.
+    For every run, the optional `params.pkl` and `progress.pkl` files are read
+    and the values requested in `log_config` are gathered column by column.
+
+    Args:
+        pipeline_cls (Type[BaseComponent]): Pipeline class; it is instantiated
+            without arguments to read `config.store_result`
+        log_config (dict): Log config with optional "inputs" and "outputs"
+            lists. Each entry has a "name" (column name) and a "step" (key into
+            the progress log). An input may have a "variable" (key into the
+            step's input kwargs); an output may have an "item" (key or index
+            into the step's output)
+
+    Returns:
+        dict: {"ids": [...], <column name>: [...], ...}. Every column has one
+            entry per run, `None` where a run has no value for that column.
+            The result can be passed directly to `pd.DataFrame`
+    """
+    # each sub-directory of the result store holds the log of one run
+    pipeline_log_path = storage.url(pipeline_cls().config.store_result)
+    with os.scandir(pipeline_log_path) as entries:
+        dirs = sorted(entry.path for entry in entries if entry.is_dir())
+    n_runs = len(dirs)
+
+    ids = []
+    params: Dict[str, List[Any]] = {}
+    inputs: Dict[str, List[Any]] = {}
+    outputs: Dict[str, List[Any]] = {}
+
+    for idx, each_dir in enumerate(dirs):
+        ids.append(str(Path(each_dir).name))
+
+        # NOTE(review): pickle must only be used on trusted files; revisit this
+        # if the result store can contain files from an untrusted source
+
+        # get the params
+        params_file = os.path.join(each_dir, "params.pkl")
+        if os.path.exists(params_file):
+            with open(params_file, "rb") as f:
+                each_params = pickle.load(f)
+            for key, value in each_params.items():
+                if key not in params:
+                    params[key] = [None] * n_runs
+                params[key][idx] = value
+
+        progress_file = os.path.join(each_dir, "progress.pkl")
+        if not os.path.exists(progress_file):
+            continue
+        with open(progress_file, "rb") as f:
+            progress = pickle.load(f)
+
+        # get the inputs
+        for each_input in log_config.get("inputs", []):
+            name = each_input["name"]
+            step = each_input["step"]
+            if name not in inputs:
+                inputs[name] = [None] * n_runs
+            variable = each_input.get("variable", "")
+            if variable:
+                inputs[name][idx] = progress[step]["input"]["kwargs"][variable]
+            else:
+                inputs[name][idx] = progress[step]["input"]
+
+        # get the outputs
+        for each_output in log_config.get("outputs", []):
+            name = each_output["name"]
+            step = each_output["step"]
+            if name not in outputs:
+                outputs[name] = [None] * n_runs
+            value = progress[step]["output"]
+            item = each_output.get("item", "")
+            # `item` selects one element of *this run's* output (not of the
+            # column); compare with the "unset" values explicitly so that
+            # index 0 can be selected
+            if item is not None and item != "":
+                value = value[item]
+            outputs[name][idx] = value
+
+    return {"ids": ids, **params, **inputs, **outputs}
+
+
+def export(config: dict, pipeline_def, output_path):
+    """Write each log declared in `config["logs"]` to its own Excel sheet
+
+    Args:
+        config (dict): Config of a single pipeline; must have a non-empty "logs"
+            mapping of sheet name to log definition
+        pipeline_def: Pipeline class whose logs are exported
+        output_path: Path of the Excel file to write
+
+    Raises:
+        ValueError: if the config declares no logs
+    """
+    pipeline_name = f"{pipeline_def.__module__}.{pipeline_def.__name__}"
+
+    log_defs = config.get("logs", {})
+    if not log_defs:
+        raise ValueError(f"Pipeline {pipeline_name} has no logs to export")
+
+    # build every dataframe before the Excel writer is opened
+    sheets: Dict[str, pd.DataFrame] = {
+        sheet_name: pd.DataFrame(from_log_to_dict(pipeline_def, log_def))
+        for sheet_name, log_def in log_defs.items()
+    }
+
+    # one sheet per log, named after the log
+    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:  # type: ignore
+        for sheet_name, frame in sheets.items():
+            frame.to_excel(writer, sheet_name=sheet_name)
+
+
+def export_from_dict(
+    config: Union[str, dict],
+    pipeline: Union[str, Type[BaseComponent]],
+    output_path: str,
+):
+    """Export the logs of a pipeline into Excel file
+
+    Args:
+        config (Union[str, dict]): Path to the YAML config file, or the already
+            loaded config dict, keyed by the dotted name of each pipeline
+        pipeline (Union[str, Type[BaseComponent]]): Dotted import path of the
+            pipeline class, or the pipeline class itself
+        output_path (str): Path to the output Excel file
+
+    Raises:
+        TypeError: if `config` or `pipeline` has an unsupported type
+        ValueError: if the pipeline is not declared in the config
+    """
+    # get the pipeline class and the relevant config dict
+    config_dict: dict
+    if isinstance(config, str):
+        with open(config, encoding="utf-8") as f:
+            # an empty YAML file loads as None; treat it as an empty config so
+            # that the "not found" error below is raised instead of a TypeError
+            config_dict = yaml.safe_load(f) or {}
+    elif isinstance(config, dict):
+        config_dict = config
+    else:
+        raise TypeError(f"`config` must be str or dict, not {type(config)}")
+
+    pipeline_name: str
+    pipeline_cls: Type[BaseComponent]
+    if isinstance(pipeline, str):
+        pipeline_name = pipeline
+    elif isinstance(pipeline, type) and issubclass(pipeline, BaseComponent):
+        pipeline_name = f"{pipeline.__module__}.{pipeline.__name__}"
+    else:
+        raise TypeError(
+            f"`pipeline` must be str or subclass of BaseComponent, not {type(pipeline)}"
+        )
+
+    # check the config before importing, so that an unknown name is reported
+    # without trying to import it
+    if pipeline_name not in config_dict:
+        raise ValueError(f"Pipeline {pipeline_name} not found in config file")
+    pipeline_config: dict = config_dict[pipeline_name]
+
+    if isinstance(pipeline, str):
+        pipeline_cls = import_dotted_string(pipeline, safe=False)
+    else:
+        pipeline_cls = pipeline
+
+    export(pipeline_config, pipeline_cls, output_path)
--- a/knowledgehub/contribs/promptui/ui.py
+++ b/knowledgehub/contribs/promptui/ui.py
@ -1,13 +1,20 @@
+import pickle
+from datetime import datetime
+from pathlib import Path
 from typing import Union

 import gradio as gr
 import yaml
+from theflow.storage import storage
 from theflow.utils.modules import import_dotted_string

 from kotaemon.contribs.promptui.base import COMPONENTS_CLASS, SUPPORTED_COMPONENTS
+from kotaemon.contribs.promptui.export import export

 USAGE_INSTRUCTION = """In case of errors, you can:

+- PromptUI instruction:
+    https://github.com/Cinnamon/kotaemon/wiki/Utilities#prompt-engineering-ui
 - Create bug fix and make PR at: https://github.com/Cinnamon/kotaemon
 - Ping any of @john @tadashi @ian @jacky in Slack channel #llm-productization"""

@ -73,6 +80,8 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:

        outputs.append(component)

+    exported_file = gr.File(label="Output file", show_label=True)
+
    temp = gr.Tab
    with gr.Blocks(analytics_enabled=False, title="Welcome to PromptUI") as demo:
        with gr.Accordion(label="Usage", open=False):
@ -80,8 +89,10 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:
        with gr.Row():
            run_btn = gr.Button("Run")
            run_btn.click(func_run, inputs=inputs + params, outputs=outputs)
-            export_btn = gr.Button("Export")
-            export_btn.click(func_export, inputs=None, outputs=None)
+            export_btn = gr.Button(
+                "Export (Result will be in Exported file next to Output)"
+            )
+            export_btn.click(func_export, inputs=None, outputs=exported_file)
        with gr.Row():
            with gr.Column():
                with temp("Inputs"):
@ -91,8 +102,11 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:
                    for component in params:
                        component.render()
            with gr.Column():
-                for component in outputs:
-                    component.render()
+                with temp("Outputs"):
+                    for component in outputs:
+                        component.render()
+                with temp("Exported file"):
+                    exported_file.render()

    return demo

@ -103,6 +117,10 @@ def build_pipeline_ui(config: dict, pipeline_def):
    params_name = list(config.get("params", {}).keys())
    outputs_def = config.get("outputs", [])

+    output_dir: Path = Path(storage.url(pipeline_def().config.store_result))
+    exported_dir = output_dir.parent / "exported"
+    exported_dir.mkdir(parents=True, exist_ok=True)
+
    def run_func(*args):
        inputs = {
            name: value for name, value in zip(inputs_name, args[: len(inputs_name)])
@ -113,6 +131,13 @@ def build_pipeline_ui(config: dict, pipeline_def):
        pipeline = pipeline_def()
        pipeline.set(params)
        pipeline(**inputs)
+        with storage.open(
+            storage.url(
+                pipeline.config.store_result, pipeline.last_run.id(), "params.pkl"
+            ),
+            "wb",
+        ) as f:
+            pickle.dump(params, f)
        if outputs_def:
            outputs = []
            for output_def in outputs_def:
@ -122,8 +147,20 @@ def build_pipeline_ui(config: dict, pipeline_def):
                outputs.append(output)
            return outputs

-    # TODO: export_func is None for now
-    return construct_ui(config, run_func, None)
+    def export_func():
+        """Export the logs of this pipeline to a timestamped Excel file
+
+        Returns:
+            str: path of the exported file inside `exported_dir`
+
+        Raises:
+            gr.Error: if the export fails for any reason
+        """
+        # str(datetime.now()) contains spaces and colons; colons are not allowed
+        # in Windows file names, so format a filesystem-safe timestamp instead
+        timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")
+        name = f"{pipeline_def.__module__}.{pipeline_def.__name__}_{timestamp}.xlsx"
+        path = str(exported_dir / name)
+        gr.Info(f"Begin exporting {name}...")
+        try:
+            export(config=config, pipeline_def=pipeline_def, output_path=path)
+        except Exception as e:
+            # chain the original exception as the cause of the UI error
+            raise gr.Error(
+                f"Failed to export. Please contact project's AIR: {e}"
+            ) from e
+        gr.Info(f"Exported {name}. Please go to the `Exported file` tab to download")
+        return path
+
+    return construct_ui(config, run_func, export_func)


 def build_from_dict(config: Union[str, dict]):
@ -148,4 +185,6 @@ def build_from_dict(config: Union[str, dict]):
    else:
        demo = gr.TabbedInterface(demos, list(config_dict.keys()))

+    demo.queue()
+
    return demo
--- a/setup.py
+++ b/setup.py
@ -35,6 +35,7 @@ setuptools.setup(
        "llama-hub",
        "nltk",
        "gradio",
+        "openpyxl",
    ],
    extras_require={
        "dev": [
--- a/tests/simple_pipeline.py
+++ b/tests/simple_pipeline.py
@ -0,0 +1,43 @@
+import tempfile
+from typing import List
+
+from theflow import Node
+
+from kotaemon.base import BaseComponent
+from kotaemon.embeddings import AzureOpenAIEmbeddings
+from kotaemon.llms.completions.openai import AzureOpenAI
+from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
+from kotaemon.vectorstores import ChromaVectorStore
+
+
+# Minimal pipeline used as a test fixture: it retrieves texts for the input and
+# passes them to an LLM (see `run_raw`).
+class Pipeline(BaseComponent):
+    # the default is evaluated once, when the class body runs, so a temporary
+    # directory is created as soon as this module is imported
+    vectorstore_path: str = str(tempfile.mkdtemp())
+    # LLM node defaulting to AzureOpenAI; the endpoint and key below are
+    # presumably placeholders rather than real credentials -- TODO confirm
+    llm: Node[AzureOpenAI] = Node(
+        default=AzureOpenAI,
+        default_kwargs={
+            "openai_api_base": "https://test.openai.azure.com/",
+            "openai_api_key": "some-key",
+            "openai_api_version": "2023-03-15-preview",
+            "deployment_name": "gpt35turbo",
+            "temperature": 0,
+            "request_timeout": 60,
+        },
+    )
+
+    # node declared as depending on `vectorstore_path`; builds the retrieving
+    # pipeline from a Chroma vector store and Azure OpenAI embeddings
+    @Node.decorate(depends_on=["vectorstore_path"])
+    def retrieving_pipeline(self):
+        vector_store = ChromaVectorStore(self.vectorstore_path)
+        embedding = AzureOpenAIEmbeddings(
+            model="text-embedding-ada-002",
+            deployment="embedding-deployment",
+            openai_api_base="https://test.openai.azure.com/",
+            openai_api_key="some-key",
+        )
+
+        return RetrieveDocumentFromVectorStorePipeline(
+            vector_store=vector_store, embedding=embedding
+        )
+
+    def run_raw(self, text: str) -> str:
+        # retrieve the texts matching `text`, join them with newlines, send the
+        # result to the LLM and return the first element of its `.text`
+        matched_texts: List[str] = self.retrieving_pipeline(text)
+        return self.llm("\n".join(matched_texts)).text[0]
--- a/tests/test_promptui.py
+++ b/tests/test_promptui.py
@ -1,66 +1,14 @@
-import pytest
-
 from kotaemon.contribs.promptui.config import export_pipeline_to_config
+from kotaemon.contribs.promptui.export import export_from_dict
 from kotaemon.contribs.promptui.ui import build_from_dict

-
-@pytest.fixture()
-def simple_pipeline_cls(tmp_path):
-    """Create a pipeline class that can be used"""
-    from typing import List
-
-    from theflow import Node
-
-    from kotaemon.base import BaseComponent
-    from kotaemon.embeddings import AzureOpenAIEmbeddings
-    from kotaemon.llms.completions.openai import AzureOpenAI
-    from kotaemon.pipelines.retrieving import (
-        RetrieveDocumentFromVectorStorePipeline,
-    )
-    from kotaemon.vectorstores import ChromaVectorStore
-
-    class Pipeline(BaseComponent):
-        vectorstore_path: str = str(tmp_path)
-        llm: Node[AzureOpenAI] = Node(
-            default=AzureOpenAI,
-            default_kwargs={
-                "openai_api_base": "https://test.openai.azure.com/",
-                "openai_api_key": "some-key",
-                "openai_api_version": "2023-03-15-preview",
-                "deployment_name": "gpt35turbo",
-                "temperature": 0,
-                "request_timeout": 60,
-            },
-        )
-
-        @Node.decorate(depends_on=["vectorstore_path"])
-        def retrieving_pipeline(self):
-            vector_store = ChromaVectorStore(self.vectorstore_path)
-            embedding = AzureOpenAIEmbeddings(
-                model="text-embedding-ada-002",
-                deployment="embedding-deployment",
-                openai_api_base="https://test.openai.azure.com/",
-                openai_api_key="some-key",
-            )
-
-            return RetrieveDocumentFromVectorStorePipeline(
-                vector_store=vector_store, embedding=embedding
-            )
-
-        def run_raw(self, text: str) -> str:
-            matched_texts: List[str] = self.retrieving_pipeline(text)
-            return self.llm("\n".join(matched_texts)).text[0]
-
-    return Pipeline
-
-
-Pipeline = simple_pipeline_cls
+from .simple_pipeline import Pipeline


 class TestPromptConfig:
-    def test_export_prompt_config(self, simple_pipeline_cls):
+    def test_export_prompt_config(self):
        """Test if the prompt config is exported correctly"""
-        pipeline = simple_pipeline_cls()
+        pipeline = Pipeline()
        config_dict = export_pipeline_to_config(pipeline)
        config = list(config_dict.values())[0]

@ -78,9 +26,42 @@ class TestPromptConfig:


 class TestPromptUI:
-    def test_uigeneration(self, simple_pipeline_cls):
+    def test_uigeneration(self):
        """Test if the gradio UI is exposed without any problem"""
-        pipeline = simple_pipeline_cls()
+        pipeline = Pipeline()
        config = export_pipeline_to_config(pipeline)

        build_from_dict(config)
+
+
+class TestExport:
+    def test_export(self, tmp_path):
+        """Smoke test: exporting the logs of a pipeline must not raise"""
+        from pathlib import Path
+
+        import yaml
+        from theflow.storage import storage
+
+        config_path = tmp_path / "config.yaml"
+        pipeline = Pipeline()
+
+        # the export scans the result store, so the directory has to exist
+        store_dir = Path(storage.url(pipeline.config.store_result))
+        store_dir.mkdir(parents=True, exist_ok=True)
+
+        config_dict = export_pipeline_to_config(pipeline)
+        pipeline_name = next(iter(config_dict))
+
+        # declare a single sheet with one input column and one output column
+        sheet_def = {
+            "inputs": [{"name": "text", "step": ".", "variable": "text"}],
+            "outputs": [{"name": "answer", "step": "."}],
+        }
+        config_dict[pipeline_name]["logs"] = {"sheet1": sheet_def}
+        with open(config_path, "w") as stream:
+            yaml.safe_dump(config_dict, stream)
+
+        excel_path = tmp_path / "exported.xlsx"
+        export_from_dict(
+            config=str(config_path),
+            pipeline=pipeline_name,
+            output_path=str(excel_path),
+        )