Update docs (#106)

ian_Cin 2024-01-30 18:50:17 +07:00 committed by GitHub
parent cbe40fac99
commit 116919b346
57 changed files with 133 additions and 66 deletions

View File

@ -3,9 +3,7 @@
Quick and easy AI components to build Kotaemon - applicable in client
projects.
## Documentation
https://docs.promptui.dm.cinnamon.is
[Documentation](https://docs.bleh-internal.cinnamon.is/)
## Install

View File

@ -139,8 +139,8 @@ Examples: https://github.com/Cinnamon/kotaemon/pull/2
- 1st line message is the PR title.
- The text area is the PR description.
![image](https://github.com/Cinnamon/kotaemon/assets/35283585/e2593010-d7ef-46e3-8719-6fcae0315b5d)
![image](https://github.com/Cinnamon/kotaemon/assets/35283585/bfe6a117-85cd-4dd4-b432-197c791a9901)
![image](images/274787925-e2593010-d7ef-46e3-8719-6fcae0315b5d.png)
![image](images/274787941-bfe6a117-85cd-4dd4-b432-197c791a9901.png)
## Develop pipelines

Binary files not shown: four new images added (21 KiB, 162 KiB, 107 KiB, 111 KiB).

View File

@ -3,8 +3,17 @@
The `kotaemon` library focuses on the AI building blocks used to implement Kotaemon. It can be used both in client projects and in product development. It consists of base interfaces, core components and a list of utilities:
- Base interfaces: `kotaemon` defines the base interface of a component in a pipeline. A pipeline is also a component. By clearly defining this interface, a pipeline of steps can be easily constructed and orchestrated (a minimal sketch follows this list).
- Core components: `kotaemon` implements (or wraps 3rd-party libraries like Langchain, llama-index,... when possible) commonly used components in kotaemon use cases. Some of these components are: LLM, vector store, document store, retriever... For a detailed list and description of these components, please refer to the [Pipeline Components](Pipeline-Components) and [Data & Data Structure Components](Data-&-Data-Structure-Components) sections.
- List of utilities: `lib-knowledge` provides utilities and tools that are usually needed in client project. For example, it provides a prompt engineering UI for AI developers in a project to quickly create a prompt engineering tool for DMs and QALs. It also provides a command to quickly spin up a project code base. For a full list and description of these utilities, please refer to the [Utilities](Utilities) section.
- Core components: `kotaemon` implements (or wraps 3rd-party libraries
like Langchain, llama-index,... when possible) commonly used components in
kotaemon use cases. Some of these components are: LLM, vector store,
document store, retriever... For a detailed list and description of these
components, please refer to the [API Reference](/reference/nav/) section.
- List of utilities: `lib-knowledge` provides utilities and tools that are
usually needed in client projects. For example, it provides a prompt
engineering UI for AI developers in a project to quickly create a prompt
engineering tool for DMs and QALs. It also provides a command to quickly spin
up a project code base. For a full list and description of these utilities,
please refer to the [Tutorial/Utilities](/ultilities) section.
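
To make the "a pipeline is also a component" idea concrete, here is a minimal sketch (not part of this commit; the class name and example strings are made up) of a component that subclasses `kotaemon.base.BaseComponent` and implements `run()`:

```python
from kotaemon.base import BaseComponent, Document


class UppercasePipeline(BaseComponent):
    """Toy pipeline: the pipeline itself is just another component with a run()."""

    def run(self, text: str) -> Document:
        # Any processing step goes here; a real pipeline would call child components.
        return Document(text=text.upper())


# Components are callable; calling one invokes its run() method.
print(UppercasePipeline()("hello kotaemon").text)
```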
```mermaid
mindmap

View File

@ -13,6 +13,8 @@ while doc_dir.name != doc_dir_name and doc_dir != doc_dir.parent:
if doc_dir == doc_dir.parent:
raise ValueError(f"root_name ({doc_dir_name}) not in path ({str(Path(__file__))}).")
nav_title_map = {"cli": "CLI", "llms": "LLMs"}
def generate_docs_for_src_code(
code_dir: Path, target_doc_folder: str, ignored_modules: Iterable[Any] = []
@ -53,7 +55,9 @@ def generate_docs_for_src_code(
if ignore:
continue
nav_titles = [name.replace("_", " ").title() for name in parts]
nav_titles = [
nav_title_map.get(name, name.replace("_", " ").title()) for name in parts
]
nav[nav_titles] = doc_path.as_posix()
with mkdocs_gen_files.open(full_doc_path, "w") as f:
@ -69,7 +73,7 @@ def generate_docs_for_src_code(
generate_docs_for_src_code(
code_dir=doc_dir.parent / "libs" / "kotaemon",
code_dir=doc_dir.parent / "libs" / "kotaemon" / "kotaemon",
target_doc_folder="reference",
ignored_modules={"contribs"},
)
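
For context, a small standalone illustration (not from the commit) of what the new `nav_title_map` lookup changes in the generated navigation titles:

```python
# Module path parts are normally prettified with .title(); entries in nav_title_map
# override that so acronyms render correctly ("cli" -> "CLI", "llms" -> "LLMs").
# The parts tuple below is an illustrative module path, not a real one.
nav_title_map = {"cli": "CLI", "llms": "LLMs"}
parts = ("llms", "chats", "openai")
nav_titles = [nav_title_map.get(name, name.replace("_", " ").title()) for name in parts]
print(nav_titles)  # ['LLMs', 'Chats', 'Openai']
```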

View File

@ -2,7 +2,7 @@ Utilities detail can be referred in the sub-pages of this section.
## Prompt engineering UI
![chat-ui](https://github.com/Cinnamon/kotaemon/assets/35283585/ac8f9aac-d853-4571-a48b-d866a99eaf3e)
![chat-ui](images/271332562-ac8f9aac-d853-4571-a48b-d866a99eaf3e.png)
**_Important:_** despite the name "prompt engineering UI", this tool allows DMs to test any kind of parameter that is exposed by AIRs. The prompt is one kind of param; there can be other types of params that DMs can tweak (e.g. top_k, temperature...).
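
As a rough sketch of how such a UI is wired up (not part of this commit's doc change; the dotted pipeline path and output filename below are hypothetical), the config that `kh promptui run` consumes is produced by the export helpers touched later in this commit:

```python
from kotaemon.contribs.promptui.config import export_pipeline_to_config
from theflow.utils.modules import import_dotted_string

# Resolve a pipeline class from its dotted path (made-up project/module names),
# then dump a promptui config exposing its tweakable params (prompt, top_k, ...).
cls = import_dotted_string("my_project.pipelines.QAPipeline", safe=False)
export_pipeline_to_config(cls, "promptui_config.yml")
```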
@ -146,7 +146,7 @@ $ kh promptui run <path/to/config/file.yml>
This will generate a UI as follows:
![Screenshot from 2023-09-20 12-20-31](https://github.com/Cinnamon/kotaemon/assets/35283585/9ac1b95a-b667-42e7-b318-98a1b805d6df)
![Screenshot from 2023-09-20 12-20-31](images/269170198-9ac1b95a-b667-42e7-b318-98a1b805d6df.png)
where:

View File

@ -5,9 +5,10 @@ from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, Literal, NamedTuple, Optional, Union
from kotaemon.base import LLMInterface
from pydantic import Extra
from kotaemon.base import LLMInterface
def check_log():
"""

View File

@ -1,10 +1,11 @@
from typing import List, Optional
from kotaemon.llms import LLM, ChatLLM
from langchain.agents import AgentType as LCAgentType
from langchain.agents import initialize_agent
from langchain.agents.agent import AgentExecutor as LCAgentExecutor
from kotaemon.llms import LLM, ChatLLM
from .base import BaseAgent
from .io import AgentOutput, AgentType
from .tools import BaseTool

View File

@ -1,9 +1,10 @@
from typing import Any, Callable, Dict, Optional, Tuple, Type, Union
from kotaemon.base import BaseComponent
from langchain.agents import Tool as LCTool
from pydantic import BaseModel
from kotaemon.base import BaseComponent
class ToolException(Exception):
"""An optional exception that tool throws when execution error occurs.

View File

@ -1,8 +1,9 @@
from typing import AnyStr, Optional, Type
from kotaemon.llms import BaseLLM
from pydantic import BaseModel, Field
from kotaemon.llms import BaseLLM
from .base import BaseTool, ToolException

View File

@ -1,8 +1,9 @@
from typing import Any, AnyStr, Optional, Type, Union
from kotaemon.base import Document
from pydantic import BaseModel, Field
from kotaemon.base import Document
from .base import BaseTool

View File

@ -1,9 +1,10 @@
from abc import abstractmethod
from typing import Iterator, Optional
from kotaemon.base.schema import Document
from theflow import Function, Node, Param, lazy
from kotaemon.base.schema import Document
class BaseComponent(Function):
"""A component is a class that can be used to compose a pipeline.

View File

@ -1,9 +1,10 @@
from abc import abstractmethod
from typing import List, Optional
from theflow import SessionFunction
from kotaemon.base import BaseComponent, LLMInterface
from kotaemon.base.schema import AIMessage, BaseMessage, HumanMessage, SystemMessage
from theflow import SessionFunction
class BaseChatBot(BaseComponent):

View File

@ -36,9 +36,10 @@ def export(export_path, output):
"""Export a pipeline to a config file"""
import sys
from kotaemon.contribs.promptui.config import export_pipeline_to_config
from theflow.utils.modules import import_dotted_string
from kotaemon.contribs.promptui.config import export_pipeline_to_config
sys.path.append(os.getcwd())
cls = import_dotted_string(export_path, safe=False)
export_pipeline_to_config(cls, output)

View File

@ -4,6 +4,7 @@ from pathlib import Path
from typing import Any, Dict, Optional, Type, Union
import yaml
from kotaemon.base import BaseComponent
from kotaemon.chatbot import BaseChatBot

View File

@ -6,10 +6,11 @@ from typing import Any, Dict, List, Type, Union
import pandas as pd
import yaml
from kotaemon.base import BaseComponent
from theflow.storage import storage
from theflow.utils.modules import import_dotted_string
from kotaemon.base import BaseComponent
from .logs import ResultLog

View File

@ -3,11 +3,12 @@ from datetime import datetime
from pathlib import Path
import gradio as gr
from theflow.storage import storage
from kotaemon.chatbot import ChatConversation
from kotaemon.contribs.promptui.base import get_component
from kotaemon.contribs.promptui.export import export
from kotaemon.contribs.promptui.ui.blocks import ChatBlock
from theflow.storage import storage
from ..logs import ResultLog

View File

@ -6,9 +6,10 @@ from typing import Any, Dict
import gradio as gr
import pandas as pd
from theflow.storage import storage
from kotaemon.contribs.promptui.base import get_component
from kotaemon.contribs.promptui.export import export
from theflow.storage import storage
from ..logs import ResultLog

View File

@ -3,9 +3,10 @@ from __future__ import annotations
from abc import abstractmethod
from typing import Any, Type
from kotaemon.base import BaseComponent, Document, RetrievedDocument
from llama_index.node_parser.interface import NodeParser
from kotaemon.base import BaseComponent, Document, RetrievedDocument
class DocTransformer(BaseComponent):
"""This is a base class for document transformers

View File

@ -1,5 +1,7 @@
from pathlib import Path
from llama_index.readers.base import BaseReader
from kotaemon.base import BaseComponent, Document, Param
from kotaemon.indices.extractors import BaseDocParser
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
@ -11,7 +13,6 @@ from kotaemon.loaders import (
PandasExcelReader,
UnstructuredReader,
)
from llama_index.readers.base import BaseReader
class DocumentIngestor(BaseComponent):

View File

@ -1,9 +1,10 @@
from typing import Iterator, List
from pydantic import BaseModel, Field
from kotaemon.base import BaseComponent
from kotaemon.base.schema import HumanMessage, SystemMessage
from kotaemon.llms import BaseLLM
from pydantic import BaseModel, Field
class FactWithEvidence(BaseModel):

View File

@ -2,9 +2,10 @@ from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor
from langchain.output_parsers.boolean import BooleanOutputParser
from kotaemon.base import Document
from kotaemon.llms import BaseLLM, PromptTemplate
from langchain.output_parsers.boolean import BooleanOutputParser
from .base import BaseReranking

View File

@ -1,6 +1,7 @@
from kotaemon.base import BaseComponent
from langchain_core.language_models.base import BaseLanguageModel
from kotaemon.base import BaseComponent
class BaseLLM(BaseComponent):
def to_langchain_format(self) -> BaseLanguageModel:

View File

@ -156,6 +156,7 @@ class GatedBranchingPipeline(SimpleBranchingPipeline):
if __name__ == "__main__":
import dotenv
from kotaemon.llms import AzureChatOpenAI, BasePromptComponent
from kotaemon.parsers import RegexExtractor

View File

@ -1,9 +1,10 @@
from copy import deepcopy
from typing import Callable, List
from kotaemon.base import BaseComponent, Document
from theflow import Function, Node, Param
from kotaemon.base import BaseComponent, Document
from .chats import AzureChatOpenAI
from .completions import LLM
from .prompts import BasePromptComponent
@ -84,7 +85,7 @@ class Thought(BaseComponent):
@Node.auto(depends_on="prompt")
def prompt_template(self):
"""Automatically wrap around param prompt. Can ignore"""
return BasePromptComponent(self.prompt)
return BasePromptComponent(template=self.prompt)
def run(self, **kwargs) -> Document:
"""Run the chain of thought"""

View File

@ -1,4 +1,6 @@
from typing import Callable, Union
from typing import Callable
from theflow import Param
from kotaemon.base import BaseComponent, Document
@ -19,14 +21,18 @@ class BasePromptComponent(BaseComponent):
middleware_switches = {"theflow.middleware.CachingMiddleware": False}
allow_extra = True
def __init__(self, template: Union[str, PromptTemplate], **kwargs):
super().__init__()
self.template = (
template
if isinstance(template, PromptTemplate)
else PromptTemplate(template)
template: str | PromptTemplate
@Param.auto(depends_on="template")
def template__(self):
return (
self.template
if isinstance(self.template, PromptTemplate)
else PromptTemplate(self.template)
)
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.__set(**kwargs)
def __check_redundant_kwargs(self, **kwargs):
@ -42,7 +48,7 @@ class BasePromptComponent(BaseComponent):
Returns:
None
"""
self.template.check_redundant_kwargs(**kwargs)
self.template__.check_redundant_kwargs(**kwargs)
def __check_unset_placeholders(self):
"""
@ -58,7 +64,7 @@ class BasePromptComponent(BaseComponent):
Returns:
None
"""
self.template.check_missing_kwargs(**self.__dict__)
self.template__.check_missing_kwargs(**self.__dict__)
def __validate_value_type(self, **kwargs):
"""
@ -76,6 +82,8 @@ class BasePromptComponent(BaseComponent):
"""
type_error = []
for k, v in kwargs.items():
if k.startswith("template"):
continue
if not isinstance(v, (str, int, Document, Callable)): # type: ignore
type_error.append((k, type(v)))
@ -122,7 +130,7 @@ class BasePromptComponent(BaseComponent):
)
kwargs = {}
for k in self.template.placeholders:
for k in self.template__.placeholders:
v = getattr(self, k)
# if get a callable, execute to get its output
@ -141,7 +149,7 @@ class BasePromptComponent(BaseComponent):
return kwargs
def set(self, **kwargs):
def set_value(self, **kwargs):
"""
Similar to `__set` but for external use.
@ -172,7 +180,7 @@ class BasePromptComponent(BaseComponent):
self.__check_unset_placeholders()
prepared_kwargs = self.__prepare_value()
text = self.template.populate(**prepared_kwargs)
text = self.template__.populate(**prepared_kwargs)
return Document(text=text, metadata={"origin": "PromptComponent"})
def flow(self):
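
A minimal usage sketch inferred from the hunks above and the updated test later in this commit: `template` is now a declared field passed as a keyword, and placeholder values go through `set_value` instead of `set`.

```python
from kotaemon.llms import BasePromptComponent, PromptTemplate

# New style: template must be passed by keyword (it is a declared field now).
prompt = BasePromptComponent(template=PromptTemplate("Hello, {name}!"))
prompt.set_value(name="Alice")   # formerly prompt.set(name="Alice")
print(prompt().text)             # expected: "Hello, Alice!"
```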

View File

@ -1,10 +1,11 @@
from pathlib import Path
from typing import Any, List, Type, Union
from kotaemon.base import BaseComponent, Document
from llama_index import SimpleDirectoryReader, download_loader
from llama_index.readers.base import BaseReader
from kotaemon.base import BaseComponent, Document
class AutoReader(BaseComponent):
"""General auto reader for a variety of files. (based on llama-hub)"""

View File

@ -6,9 +6,10 @@ Pandas parser for .xlsx files.
from pathlib import Path
from typing import Any, List, Optional, Union
from kotaemon.base import Document
from llama_index.readers.base import BaseReader
from kotaemon.base import Document
class PandasExcelReader(BaseReader):
r"""Pandas-based CSV parser.

View File

@ -5,10 +5,11 @@ from pathlib import Path
from typing import Any, Dict, List, Optional
import requests
from kotaemon.base import Document
from langchain.utils import get_from_dict_or_env
from llama_index.readers.base import BaseReader
from kotaemon.base import Document
from .utils.table import parse_markdown_text_to_tables, strip_special_chars_markdown

View File

@ -3,9 +3,10 @@ from typing import List, Optional
from uuid import uuid4
import requests
from kotaemon.base import Document
from llama_index.readers.base import BaseReader
from kotaemon.base import Document
from .utils.pdf_ocr import parse_ocr_output, read_pdf_unstructured
from .utils.table import strip_special_chars_markdown

View File

@ -12,9 +12,10 @@ pip install xlrd
from pathlib import Path
from typing import Any, Dict, List, Optional
from kotaemon.base import Document
from llama_index.readers.base import BaseReader
from kotaemon.base import Document
class UnstructuredReader(BaseReader):
"""General unstructured text reader for a variety of files."""

View File

@ -3,12 +3,13 @@ from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Any, Optional
from kotaemon.base import DocumentWithEmbedding
from llama_index.schema import NodeRelationship, RelatedNodeInfo
from llama_index.vector_stores.types import BasePydanticVectorStore
from llama_index.vector_stores.types import VectorStore as LIVectorStore
from llama_index.vector_stores.types import VectorStoreQuery
from kotaemon.base import DocumentWithEmbedding
class BaseVectorStore(ABC):
@abstractmethod

View File

@ -3,10 +3,11 @@ from pathlib import Path
from typing import Any, Optional, Type
import fsspec
from kotaemon.base import DocumentWithEmbedding
from llama_index.vector_stores import SimpleVectorStore as LISimpleVectorStore
from llama_index.vector_stores.simple import SimpleVectorStoreData
from kotaemon.base import DocumentWithEmbedding
from .base import LlamaIndexVectorStore

View File

@ -1,6 +1,8 @@
from unittest.mock import patch
import pytest
from openai.types.chat.chat_completion import ChatCompletion
from kotaemon.agents import (
AgentType,
BaseTool,
@ -12,7 +14,6 @@ from kotaemon.agents import (
WikipediaTool,
)
from kotaemon.llms import AzureChatOpenAI
from openai.types.chat.chat_completion import ChatCompletion
FINAL_RESPONSE_TEXT = "Final Answer: Hello Cinnamon AI!"
REWOO_VALID_PLAN = (

View File

@ -1,6 +1,8 @@
from copy import deepcopy
import pytest
from openai.types.chat.chat_completion import ChatCompletion
from kotaemon.llms import (
AzureChatOpenAI,
BasePromptComponent,
@ -10,7 +12,6 @@ from kotaemon.llms import (
SimpleLinearPipeline,
)
from kotaemon.parsers import RegexExtractor
from openai.types.chat.chat_completion import ChatCompletion
_openai_chat_completion_response = ChatCompletion.parse_obj(
{

View File

@ -1,8 +1,9 @@
from unittest.mock import patch
from openai.types.chat.chat_completion import ChatCompletion
from kotaemon.llms import AzureChatOpenAI
from kotaemon.llms.cot import ManualSequentialChainOfThought, Thought
from openai.types.chat.chat_completion import ChatCompletion
_openai_chat_completion_response = [
ChatCompletion.parse_obj(

View File

@ -3,6 +3,7 @@ from unittest.mock import patch
import pytest
from elastic_transport import ApiResponseMeta
from kotaemon.base import Document
from kotaemon.storages import (
ElasticsearchDocumentStore,

View File

@ -3,11 +3,12 @@ from pathlib import Path
from typing import cast
import pytest
from openai.resources.embeddings import Embeddings
from kotaemon.base import Document
from kotaemon.embeddings import AzureOpenAIEmbeddings
from kotaemon.indices import VectorIndexing, VectorRetrieval
from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore
from openai.resources.embeddings import Embeddings
with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
openai_embedding = json.load(f)

View File

@ -9,6 +9,7 @@ try:
except ImportError:
from langchain.llms import AzureOpenAI as AzureOpenAILC
from langchain.llms import OpenAI as OpenAILC
from openai.types.completion import Completion
_openai_completion_response = Completion.parse_obj(

View File

@ -1,4 +1,5 @@
import pytest
from kotaemon.base import Document
from kotaemon.parsers import RegexExtractor

View File

@ -1,4 +1,5 @@
import pytest
from kotaemon.base import Document
from kotaemon.llms import BasePromptComponent, PromptTemplate
from kotaemon.parsers import RegexExtractor
@ -58,5 +59,5 @@ def test_run():
def test_set_method():
template = PromptTemplate("Hello, {name}!")
prompt = BasePromptComponent(template=template)
prompt.set(name="Alice")
prompt.set_value(name="Alice")
assert prompt.name == "Alice"

View File

@ -1,9 +1,10 @@
from pathlib import Path
from langchain.schema import Document as LangchainDocument
from llama_index.node_parser import SimpleNodeParser
from kotaemon.base import Document
from kotaemon.loaders import AutoReader, UnstructuredReader
from langchain.schema import Document as LangchainDocument
from llama_index.node_parser import SimpleNodeParser
def test_pdf_reader():

View File

@ -1,10 +1,11 @@
from unittest.mock import patch
import pytest
from openai.types.chat.chat_completion import ChatCompletion
from kotaemon.base import Document
from kotaemon.indices.rankings import LLMReranking
from kotaemon.llms import AzureChatOpenAI
from openai.types.chat.chat_completion import ChatCompletion
_openai_chat_completion_responses = [
ChatCompletion.parse_obj(

View File

@ -1,6 +1,7 @@
from llama_index.schema import NodeRelationship
from kotaemon.base import Document
from kotaemon.indices.splitters import TokenSplitter
from llama_index.schema import NodeRelationship
source1 = Document(
content="The City Hall and Raffles Place MRT stations are paired cross-platform "

View File

@ -2,6 +2,7 @@ import json
from pathlib import Path
import pytest
from kotaemon.loaders import MathpixPDFReader, OCRReader, PandasExcelReader
input_file = Path(__file__).parent / "resources" / "table.pdf"

View File

@ -51,6 +51,7 @@ def test_disable_telemetry_import_haystack_after_kotaemon():
import os
import haystack.telemetry
import kotaemon # noqa: F401
assert haystack.telemetry.telemetry is None

View File

@ -1,4 +1,5 @@
import pytest
from kotaemon.llms import PromptTemplate

View File

@ -2,12 +2,13 @@ import json
from pathlib import Path
import pytest
from openai.resources.embeddings import Embeddings
from kotaemon.agents.tools import ComponentTool, GoogleSearchTool, WikipediaTool
from kotaemon.base import Document
from kotaemon.embeddings import AzureOpenAIEmbeddings
from kotaemon.indices.vectorindex import VectorIndexing, VectorRetrieval
from kotaemon.storages import ChromaVectorStore, InMemoryDocumentStore
from openai.resources.embeddings import Embeddings
with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
openai_embedding = json.load(f)

View File

@ -4,10 +4,11 @@ from unittest.mock import patch
import pytest
from index import ReaderIndexingPipeline
from kotaemon.llms import AzureChatOpenAI
from openai.resources.embeddings import Embeddings
from openai.types.chat.chat_completion import ChatCompletion
from kotaemon.llms import AzureChatOpenAI
with open(Path(__file__).parent / "resources" / "embedding_openai.json") as f:
openai_embedding = json.load(f)

View File

@ -3,11 +3,12 @@ import logging
from functools import cache
from pathlib import Path
from kotaemon.base import BaseComponent
from kotaemon.storages import BaseDocumentStore, BaseVectorStore
from theflow.settings import settings
from theflow.utils.modules import deserialize
from kotaemon.base import BaseComponent
from kotaemon.storages import BaseDocumentStore, BaseVectorStore
logger = logging.getLogger(__name__)

View File

@ -17,10 +17,6 @@ from ktem.components import (
from ktem.db.models import Index, Source, SourceTargetRelation, engine
from ktem.indexing.base import BaseIndexing, BaseRetriever
from ktem.indexing.exceptions import FileExistsError
from kotaemon.base import RetrievedDocument
from kotaemon.indices import VectorIndexing, VectorRetrieval
from kotaemon.indices.ingests import DocumentIngestor
from kotaemon.indices.rankings import BaseReranking, CohereReranking, LLMReranking
from llama_index.vector_stores import (
FilterCondition,
FilterOperator,
@ -31,6 +27,11 @@ from llama_index.vector_stores.types import VectorStoreQueryMode
from sqlmodel import Session, select
from theflow.settings import settings
from kotaemon.base import RetrievedDocument
from kotaemon.indices import VectorIndexing, VectorRetrieval
from kotaemon.indices.ingests import DocumentIngestor
from kotaemon.indices.rankings import BaseReranking, CohereReranking, LLMReranking
USER_SETTINGS = {
"index_parser": {
"name": "Index parser",

View File

@ -6,6 +6,7 @@ from functools import partial
import tiktoken
from ktem.components import llms
from ktem.indexing.base import BaseRetriever
from kotaemon.base import (
BaseComponent,
Document,

View File

@ -60,7 +60,7 @@ plugins:
- "!^_"
members_order: source
separate_signature: true
paths: [libs/kotaemon]
paths: [libs/kotaemon/kotaemon]
- git-revision-date-localized:
enable_creation_date: true
type: timeago

View File

@ -4,3 +4,6 @@ skip = "*.js,*.css,*.map"
ignore-words-list = "llm,fo"
quiet-level = 3
check-filenames = ""
[tool.isort]
known_first_party = ["kotaemon"]
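
This new isort setting is what drives the bulk of the import reshuffling above. A hedged illustration (made-up file, but the grouping rule is real) of the resulting order:

```python
# With known_first_party = ["kotaemon"], isort sorts imports into three blocks:
# standard library, third-party, then first-party, which is why every
# `from kotaemon...` import above moved below the third-party imports.
from pathlib import Path                          # 1. standard library

from llama_index.readers.base import BaseReader   # 2. third-party
from pydantic import BaseModel, Field

from kotaemon.base import Document                # 3. first-party (kotaemon)
```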