[AUR-387, AUR-425] Add start-project to CLI (#29)

This commit is contained in:
cin-jacky 2023-10-03 13:55:34 +09:00 committed by GitHub
parent d83c22aa4e
commit 205955b8a3
13 changed files with 718 additions and 8 deletions

View File

@ -49,3 +49,4 @@ repos:
- id: mypy - id: mypy
additional_dependencies: [types-PyYAML==6.0.12.11, "types-requests"] additional_dependencies: [types-PyYAML==6.0.12.11, "types-requests"]
args: ["--check-untyped-defs", "--ignore-missing-imports"] args: ["--check-untyped-defs", "--ignore-missing-imports"]
exclude: "^templates/"

View File

@ -22,4 +22,4 @@ try:
except ImportError: except ImportError:
pass pass
__version__ = "0.0.2" __version__ = "0.0.3"

View File

@ -3,9 +3,6 @@ import os
import click import click
import yaml import yaml
from kotaemon.contribs.promptui.config import export_pipeline_to_config
from kotaemon.contribs.promptui.ui import build_from_dict
# check if the output is not a .yml file -> raise error # check if the output is not a .yml file -> raise error
def check_config_format(config): def check_config_format(config):
@ -39,6 +36,8 @@ def export(export_path, output):
from theflow.utils.modules import import_dotted_string from theflow.utils.modules import import_dotted_string
from kotaemon.contribs.promptui.config import export_pipeline_to_config
sys.path.append(os.getcwd()) sys.path.append(os.getcwd())
cls = import_dotted_string(export_path, safe=False) cls = import_dotted_string(export_path, safe=False)
export_pipeline_to_config(cls, output) export_pipeline_to_config(cls, output)
@ -48,9 +47,21 @@ def export(export_path, output):
@promptui.command() @promptui.command()
@click.argument("run_path", required=False, default="promptui.yml") @click.argument("run_path", required=False, default="promptui.yml")
def run(run_path): def run(run_path):
from kotaemon.contribs.promptui.ui import build_from_dict
build_from_dict(run_path) build_from_dict(run_path)
check_config_format(run_path) check_config_format(run_path)
@main.command()
def start_project():
    """Generate a new project skeleton from the default kotaemon template.

    Invokes the ``cookiecutter`` command-line tool against the kotaemon GitHub
    repository, selecting ``templates/project-default`` as the template
    directory inside that repository.
    """
    # The trailing space after the repository URL is required: adjacent string
    # literals are concatenated, and without it the URL and the ``--directory``
    # option would fuse into a single, invalid argument.
    os.system(
        "cookiecutter https://github.com/Cinnamon/kotaemon.git "
        "--directory='templates/project-default'"
    )
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -29,18 +29,16 @@ setuptools.setup(
packages=setuptools.find_packages( packages=setuptools.find_packages(
exclude=("tests", "tests.*", "examples", "examples.*") exclude=("tests", "tests.*", "examples", "examples.*")
), ),
dependencies=[
"click >= 8.1.7",
],
install_requires=[ install_requires=[
"farm-haystack==1.19.0", "farm-haystack==1.19.0",
"langchain", "langchain",
"theflow", "theflow",
"llama-index", "llama-index",
"llama-hub", "llama-hub",
"nltk",
"gradio", "gradio",
"openpyxl", "openpyxl",
"cookiecutter",
"click",
], ],
extras_require={ extras_require={
"dev": [ "dev": [

View File

@ -0,0 +1,4 @@
{
"project_name": "prj_kotaemon",
"ptl": "john"
}

View File

@ -0,0 +1,23 @@
.gitattributes text eol=lf
.gitignore text eol=lf
*.build text eol=lf
*.c text eol=lf
*.cmake text eol=lf
*.cpp text eol=lf
*.csv text eol=lf
*.f text eol=lf
*.f90 text eol=lf
*.for text eol=lf
*.grc text eol=lf
*.h text eol=lf
*.ipynb text eol=lf
*.m text eol=lf
*.md text eol=lf
*.pas text eol=lf
*.py text eol=lf
*.rst text eol=lf
*.sh text eol=lf
*.txt text eol=lf
*.yml text eol=lf
Makefile text eol=lf
*.html linguist-documentation

View File

@ -0,0 +1,459 @@
# Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
# Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
### Emacs ###
# -*- mode: gitignore; -*-
*~
\#*\#
/.emacs.desktop
/.emacs.desktop.lock
*.elc
auto-save-list
tramp
.\#*
# Org-mode
.org-id-locations
*_archive
# flymake-mode
*_flymake.*
# eshell files
/eshell/history
/eshell/lastdir
# elpa packages
/elpa/
# reftex files
*.rel
# AUCTeX auto folder
/auto/
# cask packages
.cask/
dist/
# Flycheck
flycheck_*.el
# server auth directory
/server/
# projectiles files
.projectile
# directory configuration
.dir-locals.el
# network security
/network-security.data
### Linux ###
# temporary files which can be created if a process still has a handle open of a deleted file
.fuse_hidden*
# KDE directory preferences
.directory
# Linux trash folder which might appear on any partition or disk
.Trash-*
# .nfs files are created when an open file is removed but is still being accessed
.nfs*
### macOS ###
# General
.DS_Store
.AppleDouble
.LSOverride
# Icon must end with two \r
Icon
# Thumbnails
._*
# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent
# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk
### macOS Patch ###
# iCloud generated files
*.icloud
### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# AWS User-specific
.idea/**/aws.xml
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# SonarLint plugin
.idea/sonarlint/
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr
# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/
# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml
# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/
# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$
# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml
# Azure Toolkit for IntelliJ plugin
# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
.idea/**/azureSettings.xml
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
### Vim ###
# Swap
[._]*.s[a-v][a-z]
# comment out if you don't need vector files
!*.svg
[._]*.sw[a-p]
[._]s[a-rt-v][a-z]
[._]ss[a-gi-z]
[._]sw[a-p]
# Session
Session.vim
Sessionx.vim
# Temporary
.netrwhist
# Auto-generated tag files
tags
# Persistent undo
[._]*.un~
### VisualStudioCode ###
.vscode/*
!.vscode/settings.json
!.vscode/tasks.json
!.vscode/launch.json
!.vscode/extensions.json
!.vscode/*.code-snippets
# Local History for Visual Studio Code
.history/
# Built Visual Studio Code Extensions
*.vsix
### VisualStudioCode Patch ###
# Ignore all local history of files
.history
.ionide
### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db
# Dump file
*.stackdump
# Folder config file
[Dd]esktop.ini
# Recycle Bin used on file shares
$RECYCLE.BIN/
# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp
# Windows shortcuts
*.lnk
.theflow/
# End of https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm
logs/
.gitsecret/keys/random_seed
!*.secret
credentials.txt
S.gpg-agent*
.vscode/settings.json

View File

@ -0,0 +1,51 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
- id: check-yaml
- id: check-toml
- id: end-of-file-fixer
- id: trailing-whitespace
- id: detect-aws-credentials
args: ["--allow-missing-credentials"]
- id: detect-private-key
- id: check-added-large-files
- repo: https://github.com/ambv/black
rev: 22.3.0
hooks:
- id: black
language_version: python3
- repo: https://github.com/pycqa/isort
rev: 5.12.0
hooks:
- id: isort
args: ["--profile", "black"]
language_version: python3.10
- repo: https://github.com/pycqa/flake8
rev: 4.0.1
hooks:
- id: flake8
args: ["--max-line-length", "88", "--extend-ignore", "E203"]
- repo: https://github.com/myint/autoflake
rev: v1.4
hooks:
- id: autoflake
args:
[
"--in-place",
"--remove-unused-variables",
"--remove-all-unused-imports",
"--ignore-init-module-imports",
"--exclude=tests/*",
]
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v2.7.1
hooks:
- id: prettier
types_or: [markdown, yaml]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: "v1.5.1"
hooks:
- id: mypy
additional_dependencies: [types-PyYAML==6.0.12.11, "types-requests"]
args: ["--check-untyped-defs", "--ignore-missing-imports"]

View File

@ -0,0 +1,37 @@
<div align="center">
# Project {{ cookiecutter.project_name }}
[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/Cinnamon/kotaemon)
</div>
# Install
```bash
# Create new conda env (optional)
conda create -n {{ cookiecutter.project_name }} python=3.10
conda activate {{ cookiecutter.project_name }}
# Clone and install the project
git clone "<{{ cookiecutter.project_name }}-repo>"
cd "<{{ cookiecutter.project_name }}-repo>"
pip install -e .
# Generate the project structure
cd ..
kh start-project
```
# Usage
- Build the pipeline in `pipeline.py`
For supported utilities and tools, refer to: https://github.com/Cinnamon/kotaemon/wiki/Utilities
# Contribute
- For issues and errors in this project, please report them in this repo's issue tracker.
- For kotaemon issues and errors, please report them or open a PR with a fix at https://github.com/Cinnamon/kotaemon.git
- If the template for this project has issues or errors, please report them or open
a PR with a fix at https://github.com/Cinnamon/kotaemon/tree/main/templates/project-default

View File

@ -0,0 +1,20 @@
import setuptools
# Package metadata for the generated project. The cookiecutter placeholders
# inside the string values are substituted when the template is rendered.
_PROJECT_SUMMARY = "Project {{ cookiecutter.project_name }}"

_CLASSIFIERS = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: MIT License",
    "Operating System :: OS Independent",
]

# kotaemon is installed straight from its Git repository over SSH.
_REQUIREMENTS = [
    "kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git",
]

_METADATA = dict(
    name="{{ cookiecutter.project_name }}",
    version="0.0.1",
    author="{{ cookiecutter.ptl }}",
    author_email="{{ cookiecutter.ptl }}@cinnamon.is",
    description=_PROJECT_SUMMARY,
    long_description=_PROJECT_SUMMARY,
    url="https://github.com/Cinnamon/kotaemon",
    python_requires=">=3",
    classifiers=_CLASSIFIERS,
    install_requires=_REQUIREMENTS,
)

setuptools.setup(**_METADATA)

View File

@ -0,0 +1,106 @@
import os
from typing import List
from theflow import Node, Param
from kotaemon.base import BaseComponent
from kotaemon.docstores import InMemoryDocumentStore
from kotaemon.embeddings import AzureOpenAIEmbeddings
from kotaemon.llms.completions.openai import AzureOpenAI
from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline
from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline
from kotaemon.vectorstores import ChromaVectorStore
class QuestionAnsweringPipeline(BaseComponent):
    """Answer a question using documents retrieved from a vector store.

    ``run_raw`` retrieves up to ``retrieval_top_k`` documents for the question,
    joins their text into a context string, builds a prompt from the question
    and that context, and returns the LLM's answer.
    """

    # Path handed to ``ChromaVectorStore`` by ``retrieving_pipeline``.
    vectorstore_path: str = str("./tmp")
    # Passed as ``top_k`` to the retrieving pipeline.
    retrieval_top_k: int = 1
    # Read from the environment when the class body is evaluated; empty string
    # when ``OPENAI_API_KEY`` is not set.
    openai_api_key: str = os.environ.get("OPENAI_API_KEY", "")

    @Node.decorate(depends_on="openai_api_key")
    def llm(self):
        """Build the Azure OpenAI completion model used to generate answers."""
        return AzureOpenAI(
            openai_api_base="https://bleh-dummy-2.openai.azure.com/",
            openai_api_key=self.openai_api_key,
            openai_api_version="2023-03-15-preview",
            deployment_name="dummy-q2-gpt35",
            temperature=0,
            request_timeout=60,
        )

    @Node.decorate(depends_on=["vectorstore_path", "openai_api_key"])
    def retrieving_pipeline(self):
        """Build the retrieval pipeline over the Chroma vector store."""
        vector_store = ChromaVectorStore(self.vectorstore_path)
        embedding = AzureOpenAIEmbeddings(
            model="text-embedding-ada-002",
            deployment="dummy-q2-text-embedding",
            openai_api_base="https://bleh-dummy-2.openai.azure.com/",
            openai_api_key=self.openai_api_key,
        )

        return RetrieveDocumentFromVectorStorePipeline(
            vector_store=vector_store,
            embedding=embedding,
        )

    def run_raw(self, text: str) -> str:
        """Answer ``text`` using retrieved documents as context.

        Args:
            text: the question to answer.

        Returns:
            The first element of the ``text`` attribute of the LLM output.
        """
        # reload the document store, in case it has been updated
        doc_store = InMemoryDocumentStore()
        # Only load when the file exists, mirroring IndexingPipeline.doc_store,
        # so that querying before anything has been indexed does not fail on a
        # missing "docstore.json".
        if os.path.isfile("docstore.json"):
            doc_store.load("docstore.json")
        self.retrieving_pipeline.doc_store = doc_store

        # retrieve relevant documents as context
        matched_texts: List[str] = [
            _.text
            for _ in self.retrieving_pipeline(text, top_k=int(self.retrieval_top_k))
        ]
        context = "\n".join(matched_texts)

        # generate the answer
        prompt = f'Answer the following question: "{text}". The context is: \n{context}'
        self.log_progress(".prompt", prompt=prompt)

        return self.llm(prompt).text[0]
class IndexingPipeline(IndexVectorStoreFromDocumentPipeline):
    """Index text into a Chroma vector store and persist the document store.

    Extends ``IndexVectorStoreFromDocumentPipeline`` by supplying the vector
    store, document store and embedding model, and by saving the document
    store to ``docstore.json`` after each run.
    """

    # Expose variables for users to switch in prompt ui
    vectorstore_path: str = str("./tmp")
    embedding_model: str = "text-embedding-ada-002"
    deployment: str = "dummy-q2-text-embedding"
    openai_api_base: str = "https://bleh-dummy-2.openai.azure.com/"
    # Read from the environment when the class body is evaluated; empty string
    # when ``OPENAI_API_KEY`` is not set.
    openai_api_key: str = os.environ.get("OPENAI_API_KEY", "")

    @Param.decorate(depends_on=["vectorstore_path"])
    def vector_store(self):
        """Build the Chroma vector store from ``vectorstore_path``."""
        return ChromaVectorStore(self.vectorstore_path)

    @Param.decorate()
    def doc_store(self):
        """Build the in-memory document store, loading ``docstore.json`` if present."""
        doc_store = InMemoryDocumentStore()
        if os.path.isfile("docstore.json"):
            doc_store.load("docstore.json")
        return doc_store

    # NOTE(review): this node reads embedding_model, deployment, openai_api_base
    # and openai_api_key, none of which appear in depends_on - confirm whether
    # they should be listed here.
    @Node.decorate(depends_on=["vector_store"])
    def embedding(self):
        """Build the Azure OpenAI embedding model from the exposed settings."""
        return AzureOpenAIEmbeddings(
            # Use the exposed parameter rather than a hard-coded model name so
            # that changing ``embedding_model`` actually takes effect; its
            # default is the previously hard-coded value.
            model=self.embedding_model,
            deployment=self.deployment,
            openai_api_base=self.openai_api_base,
            openai_api_key=self.openai_api_key,
        )

    def run_raw(self, text: str) -> int:  # type: ignore
        """Normally, this indexing pipeline returns nothing. For demonstration,
        we want it to return something, so let's return the number of documents
        in the vector store
        """
        super().run_raw(text)

        if self.doc_store is not None:
            # persist to local anytime an indexing is created
            # this can be bypassed when we have a FileDocumentStore
            self.doc_store.save("docstore.json")

        return self.vector_store._collection.count()