Best docs Cinnamon will probably ever have (#105)

This commit is contained in:
ian_Cin
2023-12-20 11:30:25 +07:00
committed by GitHub
parent 0e30dcbb06
commit 230328c62f
40 changed files with 1036 additions and 46 deletions

View File

@@ -67,7 +67,7 @@ class LangchainAgent(BaseAgent):
def run(self, instruction: str) -> AgentOutput:
assert (
self.agent is not None
), "Lanchain AgentExecutor is not correclty initialized"
), "Langchain AgentExecutor is not correctly initialized"
# Langchain AgentExecutor call
output = self.agent(instruction)["output"]

View File

@@ -6,16 +6,16 @@ from kotaemon.base.schema import Document
class BaseComponent(Function):
"""A component is a class that can be used to compose a pipeline
"""A component is a class that can be used to compose a pipeline.
Benefits of component:
!!! tip "Benefits of component"
- Auto caching, logging
- Allow deployment
For each component, the spirit is:
!!! tip "For each component, the spirit is"
- Tolerate multiple input types, e.g. str, Document, List[str], List[Document]
- Enforce single output type. Hence, the output type of a component should be
as generic as possible.
as generic as possible.
"""
inflow = None

View File

@@ -22,6 +22,9 @@ class Document(BaseDocument):
This class accept one positional argument `content` of an arbitrary type, which will
store the raw content of the document. If specified, the class will use
`content` to initialize the base llama_index class.
Args:
content: the raw content of the document.
"""
content: Any
@@ -99,7 +102,7 @@ class RetrievedDocument(Document):
"""Subclass of Document with retrieval-related information
Attributes:
score (float): score of the document (from 0.0 to 1.0)
score (float): score of the document (from 0.0 to 1.0)
retrieval_metadata (dict): metadata from the retrieval process, can be used
by different components in a retrieved pipeline to communicate with each
other

View File

@@ -4,6 +4,7 @@ from .base import BaseLLM
from .branching import GatedBranchingPipeline, SimpleBranchingPipeline
from .chats import AzureChatOpenAI, ChatLLM
from .completions import LLM, AzureOpenAI, OpenAI
from .cot import ManualSequentialChainOfThought, Thought
from .linear import GatedLinearPipeline, SimpleLinearPipeline
from .prompts import BasePromptComponent, PromptTemplate
@@ -28,4 +29,7 @@ __all__ = [
"GatedLinearPipeline",
"SimpleBranchingPipeline",
"GatedBranchingPipeline",
# chain-of-thoughts
"ManualSequentialChainOfThought",
"Thought",
]

View File

@@ -12,7 +12,8 @@ class SimpleBranchingPipeline(BaseComponent):
Attributes:
branches (List[BaseComponent]): The list of branches to be executed.
Example Usage:
Example:
```python
from kotaemon.llms import (
AzureChatOpenAI,
BasePromptComponent,
@@ -45,6 +46,7 @@ class SimpleBranchingPipeline(BaseComponent):
print(pipeline(condition_text="1"))
print(pipeline(condition_text="2"))
print(pipeline(condition_text="12"))
```
"""
branches: List[BaseComponent] = Param(default_callback=lambda *_: [])
@@ -87,7 +89,8 @@ class GatedBranchingPipeline(SimpleBranchingPipeline):
Attributes:
branches (List[BaseComponent]): The list of branches to be executed.
Example Usage:
Example:
```python
from kotaemon.llms import (
AzureChatOpenAI,
BasePromptComponent,
@@ -119,6 +122,7 @@ class GatedBranchingPipeline(SimpleBranchingPipeline):
)
print(pipeline(condition_text="1"))
print(pipeline(condition_text="2"))
```
"""
def run(self, *, condition_text: Optional[str] = None, **prompt_kwargs):
@@ -135,7 +139,7 @@ class GatedBranchingPipeline(SimpleBranchingPipeline):
Union[OutputType, None]: The output of the first branch that satisfies the
condition, or None if no branch satisfies the condition.
Raise:
Raises:
ValueError: If condition_text is None
"""
if condition_text is None:

View File

@@ -1,7 +1,9 @@
from copy import deepcopy
from typing import Callable, List
from kotaemon.base import BaseComponent, Document, Node, Param
from theflow import Function, Node, Param
from kotaemon.base import BaseComponent, Document
from .chats import AzureChatOpenAI
from .completions import LLM
@@ -66,13 +68,13 @@ class Thought(BaseComponent):
prompt: str = Param(
help=(
"The prompt template string. This prompt template has Python-like "
"variable placeholders, that then will be subsituted with real values when "
"this component is executed"
"The prompt template string. This prompt template has Python-like variable"
" placeholders, that then will be substituted with real values when this"
" component is executed"
)
)
llm: LLM = Node(AzureChatOpenAI, help="The LLM model to execute the input prompt")
post_process: BaseComponent = Node(
post_process: Function = Node(
help=(
"The function post-processor that post-processes LLM output prediction. "
"It should take a string as input (this is the LLM output text) and return "
@@ -83,7 +85,7 @@ class Thought(BaseComponent):
@Node.auto(depends_on="prompt")
def prompt_template(self):
"""Automatically wrap around param prompt. Can ignore"""
return BasePromptComponent(template=self.prompt)
return BasePromptComponent(self.prompt)
def run(self, **kwargs) -> Document:
"""Run the chain of thought"""
@@ -113,20 +115,19 @@ class ManualSequentialChainOfThought(BaseComponent):
**Create and run a chain of thought without "+" operator:**
```python
>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought
>> llm = AzureChatOpenAI(...)
>> thought1 = Thought(
prompt="Word {word} in {language} is ",
post_process=lambda string: {"translated": string},
)
>> thought2 = Thought(
prompt="Translate {translated} to Japanese",
post_process=lambda string: {"output": string},
)
>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)
>> thought(word="hello", language="French")
```pycon
>>> from kotaemon.pipelines.cot import Thought, ManualSequentialChainOfThought
>>> llm = AzureChatOpenAI(...)
>>> thought1 = Thought(
>>> prompt="Word {word} in {language} is ",
>>> post_process=lambda string: {"translated": string},
>>> )
>>> thought2 = Thought(
>>> prompt="Translate {translated} to Japanese",
>>> post_process=lambda string: {"output": string},
>>> )
>>> thought = ManualSequentialChainOfThought(thoughts=[thought1, thought2], llm=llm)
>>> thought(word="hello", language="French")
{'word': 'hello',
'language': 'French',
'translated': '"Bonjour"',

View File

@@ -21,6 +21,7 @@ class SimpleLinearPipeline(BaseComponent):
post-processor component or function.
Example Usage:
```python
from kotaemon.llms import AzureChatOpenAI, BasePromptComponent
def identity(x):
@@ -41,6 +42,7 @@ class SimpleLinearPipeline(BaseComponent):
post_processor=identity,
)
print(pipeline(word="lone"))
```
"""
prompt: BasePromptComponent
@@ -85,7 +87,8 @@ class GatedLinearPipeline(SimpleLinearPipeline):
condition (Callable[[IO_Type], Any]): A callable function that represents the
condition.
Example Usage:
Usage:
```{.py3 title="Example Usage"}
from kotaemon.llms import AzureChatOpenAI, BasePromptComponent
from kotaemon.parsers import RegexExtractor
@@ -109,6 +112,7 @@ class GatedLinearPipeline(SimpleLinearPipeline):
)
print(pipeline(condition_text="some pattern", word="lone"))
print(pipeline(condition_text="other pattern", word="lone"))
```
"""
condition: Callable[[IO_Type], Any]

View File

@@ -72,7 +72,7 @@ class PromptTemplate:
UserWarning,
)
def populate(self, **kwargs):
def populate(self, **kwargs) -> str:
"""
Strictly populate the template with the given keyword arguments.
@@ -81,7 +81,7 @@ class PromptTemplate:
Each keyword corresponds to a placeholder in the template.
Returns:
str: The populated template.
The populated template.
Raises:
ValueError: If an unknown placeholder is provided.

View File

@@ -4,7 +4,7 @@ from typing import Any, List, Type, Union
from llama_index import SimpleDirectoryReader, download_loader
from llama_index.readers.base import BaseReader
from ..base import BaseComponent, Document
from kotaemon.base import BaseComponent, Document
class AutoReader(BaseComponent):

View File

@@ -93,7 +93,7 @@ def get_rect_iou(gt_box: List[tuple], pd_box: List[tuple], iou_type=0) -> int:
# compute the intersection over union by taking the intersection
# area and dividing it by the sum of prediction + ground-truth
# areas - the interesection area
# areas - the intersection area
if iou_type == 0:
iou = interArea / float(gt_area + pd_area - interArea)
elif iou_type == 1:

View File

@@ -34,8 +34,7 @@ def read_pdf_unstructured(input_path: Union[Path, str]):
from unstructured.partition.auto import partition
except ImportError:
raise ImportError(
"Please install unstructured PDF reader \
`pip install unstructured[pdf]`"
"Please install unstructured PDF reader `pip install unstructured[pdf]`"
)
page_items = defaultdict(list)
@@ -60,7 +59,7 @@ def read_pdf_unstructured(input_path: Union[Path, str]):
def merge_ocr_and_pdf_texts(
ocr_list: List[dict], pdf_text_list: List[dict], debug_info=None
):
"""Merge PDF and OCR text using IOU overlaping location
"""Merge PDF and OCR text using IOU overlapping location
Args:
ocr_list: List of OCR items {"text", "box", "location"}
pdf_text_list: List of PDF items {"text", "box", "location"}
@@ -115,7 +114,7 @@ def merge_ocr_and_pdf_texts(
def merge_table_cell_and_ocr(
table_list: List[dict], ocr_list: List[dict], pdf_list: List[dict], debug_info=None
):
"""Merge table items with OCR text using IOU overlaping location
"""Merge table items with OCR text using IOU overlapping location
Args:
table_list: List of table items
{"type": ("table", "cell", "text"), "text", "box", "location"}
@@ -123,7 +122,7 @@ def merge_table_cell_and_ocr(
pdf_list: List of PDF items {"text", "box", "location"}
Returns:
all_table_cells: List of tables, each of table is reprented
all_table_cells: List of tables, each of table is represented
by list of cells with combined text from OCR
not_matched_items: List of PDF text which is not overlapped by table region
"""

View File

@@ -100,11 +100,14 @@ class RegexExtractor(BaseComponent):
A list contains the output ExtractorOutput for each input
Example:
document1 = Document(...)
document2 = Document(...)
document_batch = [document1, document2]
batch_output = self(document_batch)
# batch_output will be [output1_document1, output1_document2]
```pycon
>>> document1 = Document(...)
>>> document2 = Document(...)
>>> document_batch = [document1, document2]
>>> batch_output = self(document_batch)
>>> print(batch_output)
[output1_document1, output1_document2]
```
"""
# TODO: this conversion seems common
input_: list[str] = []