[AUR-408] Export logs to Excel (#23)
This CL: - Implements the logic to export logs to Excel. - Routes the export logic into the UI. - Demonstrates this functionality in the `./examples/promptui` project.
This commit is contained in:
parent
08b6e5d3fb
commit
4f189dc931
|
@ -1 +1,138 @@
|
|||
"""Export logs into Excel file"""
|
||||
import os
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Type, Union
|
||||
|
||||
import pandas as pd
|
||||
import yaml
|
||||
from theflow.storage import storage
|
||||
from theflow.utils.modules import import_dotted_string
|
||||
|
||||
from kotaemon.base import BaseComponent
|
||||
|
||||
|
||||
def _load_pickle_if_exists(path: str) -> Any:
    """Return the unpickled content of `path`, or None when the file is absent."""
    if not os.path.exists(path):
        return None
    # NOTE(security): unpickling can execute arbitrary code. Only point this at
    # log directories that were written by trusted pipeline runs.
    with open(path, "rb") as f:
        return pickle.load(f)


def from_log_to_dict(pipeline_cls: Type[BaseComponent], log_config: dict) -> dict:
    """Collect the logged runs of a pipeline into column-oriented data

    Each sub-directory of the pipeline's result store is treated as one run and
    contributes one row. Values that a run does not provide are left as None.

    Args:
        pipeline_cls (Type[BaseComponent]): Pipeline class. A default instance is
            created to read `config.store_result`, which locates the logs.
        log_config (dict): Log config with two lists:
            - "inputs": items with "name", "step" and optional "variable"
            - "outputs": items with "name", "step" and optional "item"

    Returns:
        dict: column name -> list with one value per run. Contains "ids" (the run
            directory names) plus one column per param, input and output. It can
            be passed directly to `pd.DataFrame`.
    """
    # get the directory
    pipeline_log_path = storage.url(pipeline_cls().config.store_result)
    dirs = sorted(f.path for f in os.scandir(pipeline_log_path) if f.is_dir())
    n_runs = len(dirs)

    ids: List[str] = []
    params: Dict[str, List[Any]] = {}
    inputs: Dict[str, List[Any]] = {}
    outputs: Dict[str, List[Any]] = {}

    for idx, each_dir in enumerate(dirs):
        ids.append(str(Path(each_dir).name))

        # get the params
        each_params = _load_pickle_if_exists(os.path.join(each_dir, "params.pkl"))
        if each_params:
            for key, value in each_params.items():
                params.setdefault(key, [None] * n_runs)[idx] = value

        progress = _load_pickle_if_exists(os.path.join(each_dir, "progress.pkl"))
        if progress is None:
            # No progress recorded for this run: never fall back to the progress
            # of a previous run, just keep this row's inputs/outputs as None.
            continue

        # get the inputs
        for each_input in log_config["inputs"]:
            name = each_input["name"]
            step = each_input["step"]
            column = inputs.setdefault(name, [None] * n_runs)
            variable = each_input.get("variable", "")
            if variable:
                column[idx] = progress[step]["input"]["kwargs"][variable]
            else:
                column[idx] = progress[step]["input"]

        # get the outputs
        for each_output in log_config["outputs"]:
            name = each_output["name"]
            step = each_output["step"]
            value = progress[step]["output"]
            item = each_output.get("item", "")
            # `item` selects a part of THIS run's output. Compare explicitly so
            # that a legitimate index of 0 is not mistaken for "no item".
            if item is not None and item != "":
                value = value[item]
            outputs.setdefault(name, [None] * n_runs)[idx] = value

    return {"ids": ids, **params, **inputs, **outputs}
|
||||
|
||||
|
||||
def export(config: dict, pipeline_def, output_path):
    """Write every log declared in `config["logs"]` into one Excel workbook

    Each entry of `config["logs"]` becomes a sheet named after its key, filled
    with the data returned by `from_log_to_dict` for that entry.

    Args:
        config: pipeline config; its "logs" entry maps sheet name to log config
        pipeline_def: pipeline class whose logged runs are exported
        output_path: destination of the Excel file

    Raises:
        ValueError: if the config declares no logs
    """
    qualified_name = f"{pipeline_def.__module__}.{pipeline_def.__name__}"

    log_defs = config.get("logs", {})
    if not log_defs:
        raise ValueError(f"Pipeline {qualified_name} has no logs to export")

    # build all the dataframes first, so nothing is written if one of them fails
    frames: Dict[str, pd.DataFrame] = {
        sheet: pd.DataFrame(from_log_to_dict(pipeline_def, definition))
        for sheet, definition in log_defs.items()
    }

    # one sheet per log
    with pd.ExcelWriter(output_path, engine="openpyxl") as writer:  # type: ignore
        for sheet, frame in frames.items():
            frame.to_excel(writer, sheet_name=sheet)
|
||||
|
||||
|
||||
def export_from_dict(
    config: Union[str, dict],
    pipeline: Union[str, Type[BaseComponent]],
    output_path: str,
):
    """Export the logs of a pipeline into an Excel file

    Args:
        config (Union[str, dict]): Path to the YAML config file, or the already
            loaded config dict. It is keyed by the dotted name of each pipeline.
        pipeline (Union[str, Type[BaseComponent]]): Dotted name of the pipeline
            class, or the pipeline class itself
        output_path (str): Path to the output Excel file

    Raises:
        TypeError: if `config` or `pipeline` has an unsupported type
        ValueError: if the pipeline has no entry in the config
    """
    # get the relevant config dict
    config_dict: dict
    if isinstance(config, str):
        with open(config) as f:
            # an empty YAML file loads as None; treat it as an empty config so
            # the lookup below reports "not found" instead of a TypeError
            config_dict = yaml.safe_load(f) or {}
    elif isinstance(config, dict):
        config_dict = config
    else:
        raise TypeError(f"`config` must be str or dict, not {type(config)}")

    # the config is keyed by dotted pipeline name, whichever form was given
    pipeline_name: str
    if isinstance(pipeline, str):
        pipeline_name = pipeline
    elif isinstance(pipeline, type) and issubclass(pipeline, BaseComponent):
        pipeline_name = f"{pipeline.__module__}.{pipeline.__name__}"
    else:
        raise TypeError(
            f"`pipeline` must be str or subclass of BaseComponent, not {type(pipeline)}"
        )

    if pipeline_name not in config_dict:
        raise ValueError(f"Pipeline {pipeline_name} not found in config file")
    pipeline_config: dict = config_dict[pipeline_name]

    # get the pipeline class (imported only once the config lookup succeeded)
    pipeline_cls: Type[BaseComponent]
    if isinstance(pipeline, str):
        # NOTE(review): `safe=False` presumably lifts import restrictions;
        # confirm that `pipeline` only ever comes from trusted callers.
        pipeline_cls = import_dotted_string(pipeline, safe=False)
    else:
        pipeline_cls = pipeline

    export(pipeline_config, pipeline_cls, output_path)
|
||||
|
|
|
@ -1,13 +1,20 @@
|
|||
import pickle
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import gradio as gr
|
||||
import yaml
|
||||
from theflow.storage import storage
|
||||
from theflow.utils.modules import import_dotted_string
|
||||
|
||||
from kotaemon.contribs.promptui.base import COMPONENTS_CLASS, SUPPORTED_COMPONENTS
|
||||
from kotaemon.contribs.promptui.export import export
|
||||
|
||||
USAGE_INSTRUCTION = """In case of errors, you can:
|
||||
|
||||
- PromptUI instruction:
|
||||
https://github.com/Cinnamon/kotaemon/wiki/Utilities#prompt-engineering-ui
|
||||
- Create bug fix and make PR at: https://github.com/Cinnamon/kotaemon
|
||||
- Ping any of @john @tadashi @ian @jacky in Slack channel #llm-productization"""
|
||||
|
||||
|
@ -73,6 +80,8 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:
|
|||
|
||||
outputs.append(component)
|
||||
|
||||
exported_file = gr.File(label="Output file", show_label=True)
|
||||
|
||||
temp = gr.Tab
|
||||
with gr.Blocks(analytics_enabled=False, title="Welcome to PromptUI") as demo:
|
||||
with gr.Accordion(label="Usage", open=False):
|
||||
|
@ -80,8 +89,10 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:
|
|||
with gr.Row():
|
||||
run_btn = gr.Button("Run")
|
||||
run_btn.click(func_run, inputs=inputs + params, outputs=outputs)
|
||||
export_btn = gr.Button("Export")
|
||||
export_btn.click(func_export, inputs=None, outputs=None)
|
||||
export_btn = gr.Button(
|
||||
"Export (Result will be in Exported file next to Output)"
|
||||
)
|
||||
export_btn.click(func_export, inputs=None, outputs=exported_file)
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
with temp("Inputs"):
|
||||
|
@ -91,8 +102,11 @@ def construct_ui(config, func_run, func_export) -> gr.Blocks:
|
|||
for component in params:
|
||||
component.render()
|
||||
with gr.Column():
|
||||
for component in outputs:
|
||||
component.render()
|
||||
with temp("Outputs"):
|
||||
for component in outputs:
|
||||
component.render()
|
||||
with temp("Exported file"):
|
||||
exported_file.render()
|
||||
|
||||
return demo
|
||||
|
||||
|
@ -103,6 +117,10 @@ def build_pipeline_ui(config: dict, pipeline_def):
|
|||
params_name = list(config.get("params", {}).keys())
|
||||
outputs_def = config.get("outputs", [])
|
||||
|
||||
output_dir: Path = Path(storage.url(pipeline_def().config.store_result))
|
||||
exported_dir = output_dir.parent / "exported"
|
||||
exported_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def run_func(*args):
|
||||
inputs = {
|
||||
name: value for name, value in zip(inputs_name, args[: len(inputs_name)])
|
||||
|
@ -113,6 +131,13 @@ def build_pipeline_ui(config: dict, pipeline_def):
|
|||
pipeline = pipeline_def()
|
||||
pipeline.set(params)
|
||||
pipeline(**inputs)
|
||||
with storage.open(
|
||||
storage.url(
|
||||
pipeline.config.store_result, pipeline.last_run.id(), "params.pkl"
|
||||
),
|
||||
"wb",
|
||||
) as f:
|
||||
pickle.dump(params, f)
|
||||
if outputs_def:
|
||||
outputs = []
|
||||
for output_def in outputs_def:
|
||||
|
@ -122,8 +147,20 @@ def build_pipeline_ui(config: dict, pipeline_def):
|
|||
outputs.append(output)
|
||||
return outputs
|
||||
|
||||
# TODO: export_func is None for now
|
||||
return construct_ui(config, run_func, None)
|
||||
def export_func():
    """Export the pipeline logs to a timestamped Excel file

    Returns:
        str: path to the exported file inside `exported_dir`

    Raises:
        gr.Error: if the export fails, chained to the original exception
    """
    # Format the timestamp explicitly: `str(datetime.now())` contains spaces
    # and colons, and colons are not allowed in Windows file names.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    name = f"{pipeline_def.__module__}.{pipeline_def.__name__}_{timestamp}.xlsx"
    path = str(exported_dir / name)
    gr.Info(f"Begin exporting {name}...")
    try:
        export(config=config, pipeline_def=pipeline_def, output_path=path)
    except Exception as e:
        # surface the failure in the UI, keeping the original traceback
        raise gr.Error(f"Failed to export. Please contact project's AIR: {e}") from e
    gr.Info(f"Exported {name}. Please go to the `Exported file` tab to download")
    return path
|
||||
|
||||
return construct_ui(config, run_func, export_func)
|
||||
|
||||
|
||||
def build_from_dict(config: Union[str, dict]):
|
||||
|
@ -148,4 +185,6 @@ def build_from_dict(config: Union[str, dict]):
|
|||
else:
|
||||
demo = gr.TabbedInterface(demos, list(config_dict.keys()))
|
||||
|
||||
demo.queue()
|
||||
|
||||
return demo
|
||||
|
|
1
setup.py
1
setup.py
|
@ -35,6 +35,7 @@ setuptools.setup(
|
|||
"llama-hub",
|
||||
"nltk",
|
||||
"gradio",
|
||||
"openpyxl",
|
||||
],
|
||||
extras_require={
|
||||
"dev": [
|
||||
|
|
43
tests/simple_pipeline.py
Normal file
43
tests/simple_pipeline.py
Normal file
|
@ -0,0 +1,43 @@
|
|||
import tempfile
|
||||
from typing import List
|
||||
|
||||
from theflow import Node
|
||||
|
||||
from kotaemon.base import BaseComponent
|
||||
from kotaemon.embeddings import AzureOpenAIEmbeddings
|
||||
from kotaemon.llms.completions.openai import AzureOpenAI
|
||||
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
|
||||
from kotaemon.vectorstores import ChromaVectorStore
|
||||
|
||||
|
||||
class Pipeline(BaseComponent):
    """Small retrieve-then-ask pipeline shared by the tests

    The Azure endpoints and keys below appear to be placeholder values.
    """

    # NOTE: `tempfile.mkdtemp()` runs once, when this class body is executed, so
    # the temporary directory is created as soon as the module is imported.
    vectorstore_path: str = str(tempfile.mkdtemp())
    llm: Node[AzureOpenAI] = Node(
        default=AzureOpenAI,
        default_kwargs={
            "openai_api_base": "https://test.openai.azure.com/",
            "openai_api_key": "some-key",
            "openai_api_version": "2023-03-15-preview",
            "deployment_name": "gpt35turbo",
            "temperature": 0,
            "request_timeout": 60,
        },
    )

    @Node.decorate(depends_on=["vectorstore_path"])
    def retrieving_pipeline(self):
        """Build the retrieval pipeline over a Chroma store at `vectorstore_path`"""
        vector_store = ChromaVectorStore(self.vectorstore_path)
        embedding = AzureOpenAIEmbeddings(
            model="text-embedding-ada-002",
            deployment="embedding-deployment",
            openai_api_base="https://test.openai.azure.com/",
            openai_api_key="some-key",
        )

        return RetrieveDocumentFromVectorStorePipeline(
            vector_store=vector_store, embedding=embedding
        )

    def run_raw(self, text: str) -> str:
        """Retrieve the texts matching `text` and send them to the LLM

        The matched texts are joined with newlines into a single prompt; the
        first element of the LLM result's `text` is returned.
        """
        matched_texts: List[str] = self.retrieving_pipeline(text)
        return self.llm("\n".join(matched_texts)).text[0]
|
|
@ -1,66 +1,14 @@
|
|||
import pytest
|
||||
|
||||
from kotaemon.contribs.promptui.config import export_pipeline_to_config
|
||||
from kotaemon.contribs.promptui.export import export_from_dict
|
||||
from kotaemon.contribs.promptui.ui import build_from_dict
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def simple_pipeline_cls(tmp_path):
|
||||
"""Create a pipeline class that can be used"""
|
||||
from typing import List
|
||||
|
||||
from theflow import Node
|
||||
|
||||
from kotaemon.base import BaseComponent
|
||||
from kotaemon.embeddings import AzureOpenAIEmbeddings
|
||||
from kotaemon.llms.completions.openai import AzureOpenAI
|
||||
from kotaemon.pipelines.retrieving import (
|
||||
RetrieveDocumentFromVectorStorePipeline,
|
||||
)
|
||||
from kotaemon.vectorstores import ChromaVectorStore
|
||||
|
||||
class Pipeline(BaseComponent):
|
||||
vectorstore_path: str = str(tmp_path)
|
||||
llm: Node[AzureOpenAI] = Node(
|
||||
default=AzureOpenAI,
|
||||
default_kwargs={
|
||||
"openai_api_base": "https://test.openai.azure.com/",
|
||||
"openai_api_key": "some-key",
|
||||
"openai_api_version": "2023-03-15-preview",
|
||||
"deployment_name": "gpt35turbo",
|
||||
"temperature": 0,
|
||||
"request_timeout": 60,
|
||||
},
|
||||
)
|
||||
|
||||
@Node.decorate(depends_on=["vectorstore_path"])
|
||||
def retrieving_pipeline(self):
|
||||
vector_store = ChromaVectorStore(self.vectorstore_path)
|
||||
embedding = AzureOpenAIEmbeddings(
|
||||
model="text-embedding-ada-002",
|
||||
deployment="embedding-deployment",
|
||||
openai_api_base="https://test.openai.azure.com/",
|
||||
openai_api_key="some-key",
|
||||
)
|
||||
|
||||
return RetrieveDocumentFromVectorStorePipeline(
|
||||
vector_store=vector_store, embedding=embedding
|
||||
)
|
||||
|
||||
def run_raw(self, text: str) -> str:
|
||||
matched_texts: List[str] = self.retrieving_pipeline(text)
|
||||
return self.llm("\n".join(matched_texts)).text[0]
|
||||
|
||||
return Pipeline
|
||||
|
||||
|
||||
Pipeline = simple_pipeline_cls
|
||||
from .simple_pipeline import Pipeline
|
||||
|
||||
|
||||
class TestPromptConfig:
|
||||
def test_export_prompt_config(self, simple_pipeline_cls):
|
||||
def test_export_prompt_config(self):
|
||||
"""Test if the prompt config is exported correctly"""
|
||||
pipeline = simple_pipeline_cls()
|
||||
pipeline = Pipeline()
|
||||
config_dict = export_pipeline_to_config(pipeline)
|
||||
config = list(config_dict.values())[0]
|
||||
|
||||
|
@ -78,9 +26,42 @@ class TestPromptConfig:
|
|||
|
||||
|
||||
class TestPromptUI:
|
||||
def test_uigeneration(self, simple_pipeline_cls):
|
||||
def test_uigeneration(self):
|
||||
"""Test if the gradio UI is exposed without any problem"""
|
||||
pipeline = simple_pipeline_cls()
|
||||
pipeline = Pipeline()
|
||||
config = export_pipeline_to_config(pipeline)
|
||||
|
||||
build_from_dict(config)
|
||||
|
||||
|
||||
class TestExport:
    def test_export(self, tmp_path):
        """Test if the export functionality produces an Excel file without error"""
        from pathlib import Path

        import yaml
        from theflow.storage import storage

        config_path = tmp_path / "config.yaml"
        output_path = tmp_path / "exported.xlsx"

        pipeline = Pipeline()
        # the export scans the result directory, so it must exist even when the
        # pipeline has not been run yet
        Path(storage.url(pipeline.config.store_result)).mkdir(
            parents=True, exist_ok=True
        )

        config_dict = export_pipeline_to_config(pipeline)
        pipeline_name = list(config_dict.keys())[0]

        config_dict[pipeline_name]["logs"] = {
            "sheet1": {
                "inputs": [{"name": "text", "step": ".", "variable": "text"}],
                "outputs": [{"name": "answer", "step": "."}],
            },
        }
        with open(config_path, "w") as f:
            yaml.safe_dump(config_dict, f)

        export_from_dict(
            config=str(config_path),
            pipeline=pipeline_name,
            output_path=str(output_path),
        )

        # running without error is not enough: the workbook must exist
        assert output_path.is_file()
|
||||
|
|
Loading…
Reference in New Issue
Block a user