Best docs Cinnamon will probably ever have (#105)

This commit is contained in:
ian_Cin
2023-12-20 11:30:25 +07:00
committed by GitHub
parent 0e30dcbb06
commit 230328c62f
40 changed files with 1036 additions and 46 deletions

View File

@@ -67,7 +67,7 @@ class LangchainAgent(BaseAgent):
def run(self, instruction: str) -> AgentOutput:
assert (
self.agent is not None
), "Lanchain AgentExecutor is not correclty initialized"
), "Langchain AgentExecutor is not correctly initialized"
# Langchain AgentExecutor call
output = self.agent(instruction)["output"]

View File

@@ -6,16 +6,16 @@ from kotaemon.base.schema import Document
class BaseComponent(Function):
"""A component is a class that can be used to compose a pipeline
"""A component is a class that can be used to compose a pipeline.
Benefits of component:
!!! tip "Benefits of component"
- Auto caching, logging
- Allow deployment
For each component, the spirit is:
!!! tip "For each component, the spirit is"
- Tolerate multiple input types, e.g. str, Document, List[str], List[Document]
- Enforce single output type. Hence, the output type of a component should be
as generic as possible.
as generic as possible.
"""
inflow = None

View File

@@ -22,6 +22,9 @@ class Document(BaseDocument):
This class accept one positional argument `content` of an arbitrary type, which will
store the raw content of the document. If specified, the class will use
`content` to initialize the base llama_index class.
Args:
content: the raw content of the document.
"""
content: Any
@@ -99,7 +102,7 @@ class RetrievedDocument(Document):
"""Subclass of Document with retrieval-related information
Attributes:
score (float): score of the document (from 0.0 to 1.0)
score (float): score of the document (from 0.0 to 1.0)
retrieval_metadata (dict): metadata from the retrieval process, can be used
by different components in a retrieved pipeline to communicate with each
other

View File

@@ -4,6 +4,7 @@ from .base import BaseLLM
from .branching import GatedBranchingPipeline, SimpleBranchingPipeline
from .chats import AzureChatOpenAI, ChatLLM
from .completions import LLM, AzureOpenAI, OpenAI
from .cot import ManualSequentialChainOfThought, Thought
from .linear import GatedLinearPipeline, SimpleLinearPipeline
from .prompts import BasePromptComponent, PromptTemplate
@@ -28,4 +29,7 @@ __all__ = [
"GatedLinearPipeline",
"SimpleBranchingPipeline",
"GatedBranchingPipeline",
# chain-of-thoughts
"ManualSequentialChainOfThought",
"Thought",
]

View File

@@ -12,7 +12,8 @@ class SimpleBranchingPipeline(BaseComponent):
Attributes:
branches (List[BaseComponent]): The list of branches to be executed.
Example Usage:
Example:
```python
from kotaemon.llms import (
AzureChatOpenAI,
BasePromptComponent,
@@ -45,6 +46,7 @@ class SimpleBranchingPipeline(BaseComponent):
print(pipeline(condition_text="1"))
print(pipeline(condition_text="2"))
print(pipeline(condition_text="12"))
```
"""
branches: List[BaseComponent] = Param(default_callback=lambda *_: [])
@@ -87,7 +89,8 @@ class GatedBranchingPipeline(SimpleBranchingPipeline):
Attributes:
branches (List[BaseComponent]): The list of branches to be executed.
Example Usage:
Example:
```python
from kotaemon.llms import (
AzureChatOpenAI,
BasePromptComponent,
@@ -119,6 +122,7 @@ class GatedBranchingPipeline(SimpleBranchingPipeline):
)
print(pipeline(condition_text="1"))
print(pipeline(condition_text="2"))
```
"""
def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):
@@ -135,7 +139,7 @@ class GatedBranchingPipeline(SimpleBranchingPipeline):
Union[OutputType, None]: The output of the first branch that satisfies the
condition, or None if no branch satisfies the condition.
Raise:
Raises:
ValueError: If condition_text is None
"""
if condition_text is None:

View File

@@ -1,7 +1,9 @@
from copy import deepcopy
from typing import Callable, List
from kotaemon.base import BaseComponent, Document, Node, Param
from theflow import Function, Node, Param
from kotaemon.base import BaseComponent, Document
from .chats import AzureChatOpenAI
from .completions import LLM
@@ -66,13 +68,13 @@ class Thought(BaseComponent):
prompt: str = Param(
help=(
"The prompt template string. This prompt template has Python-like "
"variable placeholders, that then will be subsituted with real values when "
"this component is executed"
"The prompt template string. This prompt template has Python-like variable"
" placeholders, that then will be substituted with real values when this"
" component is executed"
)
)
llm: LLM = Node(AzureChatOpenAI, help="The LLM model to execute the input prompt")
post_process: BaseComponent = Node(
post_process: Function = Node(
help=(
"The function post-processor that post-processes LLM output prediction. "
"It should take a string as input (this is the LLM output text) and return "
@@ -83,7 +85,7 @@ class Thought(BaseComponent):
@Node.auto(depends_on="prompt")
def prompt_template(self):
"""Automatically wrap around param prompt. Can ignore"""
return BasePromptComponent(template=self.prompt)
return BasePromptComponent(self.prompt)
def run(self, **kwargs) -> Document:
"""Run the chain of thought"""
@@ -113,20 +115,19 @@ class ManualSequentialChainOfThought(BaseComponent):
**Create and run a chain of thought without "+" operator:**
```python
>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought
>> llm = AzureChatOpenAI(...)
>> thought1 = Thought(
prompt="Word {word} in {language} is ",
post_process=lambda string: {"translated": string},
)
>> thought2 = Thought(
prompt="Translate {translated} to Japanese",
post_process=lambda string: {"output": string},
)
>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)
>> thought(word="hello", language="French")
```pycon
>>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought
>>> llm = AzureChatOpenAI(...)
>>> thought1 = Thought(
>>> prompt="Word {word} in {language} is ",
>>> post_process=lambda string: {"translated": string},
>>> )
>>> thought2 = Thought(
>>> prompt="Translate {translated} to Japanese",
>>> post_process=lambda string: {"output": string},
>>> )
>>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)
>>> thought(word="hello", language="French")
{'word': 'hello',
'language': 'French',
'translated': '"Bonjour"',

View File

@@ -21,6 +21,7 @@ class SimpleLinearPipeline(BaseComponent):
post-processor component or function.
Example Usage:
```python
from kotaemon.llms import AzureChatOpenAI, BasePromptComponent
def identity(x):
@@ -41,6 +42,7 @@ class SimpleLinearPipeline(BaseComponent):
post_processor=identity,
)
print(pipeline(word="lone"))
```
"""
prompt: BasePromptComponent
@@ -85,7 +87,8 @@ class GatedLinearPipeline(SimpleLinearPipeline):
condition (Callable[[IO_Type], Any]): A callable function that represents the
condition.
Example Usage:
Usage:
```{.py3 title="Example Usage"}
from kotaemon.llms import AzureChatOpenAI, BasePromptComponent
from kotaemon.parsers import RegexExtractor
@@ -109,6 +112,7 @@ class GatedLinearPipeline(SimpleLinearPipeline):
)
print(pipeline(condition_text="some pattern", word="lone"))
print(pipeline(condition_text="other pattern", word="lone"))
```
"""
condition: Callable[[IO_Type], Any]

View File

@@ -72,7 +72,7 @@ class PromptTemplate:
UserWarning,
)
def populate(self, **kwargs):
def populate(self, **kwargs) -> str:
"""
Strictly populate the template with the given keyword arguments.
@@ -81,7 +81,7 @@ class PromptTemplate:
Each keyword corresponds to a placeholder in the template.
Returns:
str: The populated template.
The populated template.
Raises:
ValueError: If an unknown placeholder is provided.

View File

@@ -4,7 +4,7 @@ from typing import Any, List, Type, Union
from llama_index import SimpleDirectoryReader, download_loader
from llama_index.readers.base import BaseReader
from ..base import BaseComponent, Document
from kotaemon.base import BaseComponent, Document
class AutoReader(BaseComponent):

View File

@@ -93,7 +93,7 @@ def get_rect_iou(gt_box: List[tuple], pd_box: List[tuple], iou_type=0) -> int:
# compute the intersection over union by taking the intersection
# area and dividing it by the sum of prediction + ground-truth
# areas - the interesection area
# areas - the intersection area
if iou_type == 0:
iou = interArea / float(gt_area + pd_area - interArea)
elif iou_type == 1:

View File

@@ -34,8 +34,7 @@ def read_pdf_unstructured(input_path: Union[Path, str]):
from unstructured.partition.auto import partition
except ImportError:
raise ImportError(
"Please install unstructured PDF reader \
`pip install unstructured[pdf]`"
"Please install unstructured PDF reader `pip install unstructured[pdf]`"
)
page_items = defaultdict(list)
@@ -60,7 +59,7 @@ def read_pdf_unstructured(input_path: Union[Path, str]):
def merge_ocr_and_pdf_texts(
ocr_list: List[dict], pdf_text_list: List[dict], debug_info=None
):
"""Merge PDF and OCR text using IOU overlaping location
"""Merge PDF and OCR text using IOU overlapping location
Args:
ocr_list: List of OCR items {"text", "box", "location"}
pdf_text_list: List of PDF items {"text", "box", "location"}
@@ -115,7 +114,7 @@ def merge_ocr_and_pdf_texts(
def merge_table_cell_and_ocr(
table_list: List[dict], ocr_list: List[dict], pdf_list: List[dict], debug_info=None
):
"""Merge table items with OCR text using IOU overlaping location
"""Merge table items with OCR text using IOU overlapping location
Args:
table_list: List of table items
{"type": ("table", "cell", "text"), "text", "box", "location"}
@@ -123,7 +122,7 @@ def merge_table_cell_and_ocr(
pdf_list: List of PDF items {"text", "box", "location"}
Returns:
all_table_cells: List of tables, each of table is reprented
all_table_cells: List of tables, each of table is represented
by list of cells with combined text from OCR
not_matched_items: List of PDF text which is not overlapped by table region
"""

View File

@@ -100,11 +100,14 @@ class RegexExtractor(BaseComponent):
A list contains the output ExtractorOutput for each input
Example:
document1 = Document(...)
document2 = Document(...)
document_batch = [document1, document2]
batch_output = self(document_batch)
# batch_output will be [output1_document1, output1_document2]
```pycon
>>> document1 = Document(...)
>>> document2 = Document(...)
>>> document_batch = [document1, document2]
>>> batch_output = self(document_batch)
>>> print(batch_output)
[output1_document1, output1_document2]
```
"""
# TODO: this conversion seems common
input_: list[str] = []