diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 90d0071..bb3084c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -49,3 +49,4 @@ repos: - id: mypy additional_dependencies: [types-PyYAML==6.0.12.11, "types-requests"] args: ["--check-untyped-defs", "--ignore-missing-imports"] + exclude: "^templates/" diff --git a/knowledgehub/__init__.py b/knowledgehub/__init__.py index e1eac21..fbc4235 100644 --- a/knowledgehub/__init__.py +++ b/knowledgehub/__init__.py @@ -22,4 +22,4 @@ try: except ImportError: pass -__version__ = "0.0.2" +__version__ = "0.0.3" diff --git a/knowledgehub/cli.py b/knowledgehub/cli.py index 3345fa2..f635df1 100644 --- a/knowledgehub/cli.py +++ b/knowledgehub/cli.py @@ -3,9 +3,6 @@ import os import click import yaml -from kotaemon.contribs.promptui.config import export_pipeline_to_config -from kotaemon.contribs.promptui.ui import build_from_dict - # check if the output is not a .yml file -> raise error def check_config_format(config): @@ -39,6 +36,8 @@ def export(export_path, output): from theflow.utils.modules import import_dotted_string + from kotaemon.contribs.promptui.config import export_pipeline_to_config + sys.path.append(os.getcwd()) cls = import_dotted_string(export_path, safe=False) export_pipeline_to_config(cls, output) @@ -48,9 +47,21 @@ def export(export_path, output): @promptui.command() @click.argument("run_path", required=False, default="promptui.yml") def run(run_path): + + from kotaemon.contribs.promptui.ui import build_from_dict + build_from_dict(run_path) check_config_format(run_path) +@main.command() +def start_project(): + """Generate a new project from the kotaemon cookiecutter template.""" + os.system( + "cookiecutter https://github.com/Cinnamon/kotaemon.git " + "--directory='templates/project-default'" + ) + + if __name__ == "__main__": main() diff --git a/setup.py b/setup.py index 7d906d6..065d344 100644 --- a/setup.py +++ b/setup.py @@ -29,18 +29,16 @@ setuptools.setup( packages=setuptools.find_packages( exclude=("tests", "tests.*", "examples",
"examples.*") ), - dependencies=[ - "click >= 8.1.7", - ], install_requires=[ "farm-haystack==1.19.0", "langchain", "theflow", "llama-index", "llama-hub", - "nltk", "gradio", "openpyxl", + "cookiecutter", + "click", ], extras_require={ "dev": [ diff --git a/templates/project-default/cookiecutter.json b/templates/project-default/cookiecutter.json new file mode 100644 index 0000000..f70990c --- /dev/null +++ b/templates/project-default/cookiecutter.json @@ -0,0 +1,4 @@ +{ + "project_name": "prj_kotaemon", + "ptl": "john" +} diff --git a/templates/project-default/{{cookiecutter.project_name}}/.gitattributes b/templates/project-default/{{cookiecutter.project_name}}/.gitattributes new file mode 100644 index 0000000..919df25 --- /dev/null +++ b/templates/project-default/{{cookiecutter.project_name}}/.gitattributes @@ -0,0 +1,23 @@ +.gitattributes text eol=lf +.gitignore text eol=lf +*.build text eol=lf +*.c text eol=lf +*.cmake text eol=lf +*.cpp text eol=lf +*.csv text eol=lf +*.f text eol=lf +*.f90 text eol=lf +*.for text eol=lf +*.grc text eol=lf +*.h text eol=lf +*.ipynb text eol=lf +*.m text eol=lf +*.md text eol=lf +*.pas text eol=lf +*.py text eol=lf +*.rst text eol=lf +*.sh text eol=lf +*.txt text eol=lf +*.yml text eol=lf +Makefile text eol=lf +*.html linguist-documentation diff --git a/templates/project-default/{{cookiecutter.project_name}}/.gitignore b/templates/project-default/{{cookiecutter.project_name}}/.gitignore new file mode 100644 index 0000000..0e80245 --- /dev/null +++ b/templates/project-default/{{cookiecutter.project_name}}/.gitignore @@ -0,0 +1,459 @@ +# Created by https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm +# Edit at https://www.toptal.com/developers/gitignore?templates=python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm + +### Emacs ### +# -*- mode: gitignore; -*- +*~ +\#*\# +/.emacs.desktop +/.emacs.desktop.lock +*.elc +auto-save-list +tramp +.\#* + +# Org-mode 
+.org-id-locations +*_archive + +# flymake-mode +*_flymake.* + +# eshell files +/eshell/history +/eshell/lastdir + +# elpa packages +/elpa/ + +# reftex files +*.rel + +# AUCTeX auto folder +/auto/ + +# cask packages +.cask/ +dist/ + +# Flycheck +flycheck_*.el + +# server auth directory +/server/ + +# projectiles files +.projectile + +# directory configuration +.dir-locals.el + +# network security +/network-security.data + +### Linux ### + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### macOS ### +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems +.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +### macOS Patch ### +# iCloud generated files +*.icloud + +### PyCharm ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and 
Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. +# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### PyCharm Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +# https://plugins.jetbrains.com/plugin/7973-sonarlint +.idea/**/sonarlint/ + +# SonarQube Plugin +# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin +.idea/**/sonarIssues.xml + +# Markdown Navigator plugin +# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced +.idea/**/markdown-navigator.xml +.idea/**/markdown-navigator-enh.xml +.idea/**/markdown-navigator/ + +# Cache file creation bug +# See https://youtrack.jetbrains.com/issue/JBR-2257 +.idea/$CACHE_FILE$ + +# CodeStream plugin +# https://plugins.jetbrains.com/plugin/12206-codestream +.idea/codestream.xml + +# Azure Toolkit for IntelliJ plugin +# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij +.idea/**/azureSettings.xml + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C 
extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
+#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +### Vim ### +# Swap +[._]*.s[a-v][a-z] +!*.svg # comment out if you don't need vector files +[._]*.sw[a-p] +[._]s[a-rt-v][a-z] +[._]ss[a-gi-z] +[._]sw[a-p] + +# Session +Session.vim +Sessionx.vim + +# Temporary +.netrwhist +# Auto-generated tag files +tags +# Persistent undo +[._]*.un~ + +### VisualStudioCode ### +.vscode/* +!.vscode/settings.json +!.vscode/tasks.json +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/*.code-snippets + +# Local History for Visual Studio Code +.history/ + +# Built Visual Studio Code Extensions +*.vsix + +### VisualStudioCode Patch ### +# Ignore all local history of files +.history +.ionide + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +Thumbs.db:encryptable +ehthumbs.db +ehthumbs_vista.db + +# Dump file +*.stackdump + +# Folder config file +[Dd]esktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msix +*.msm +*.msp + +# Windows shortcuts +*.lnk + +.theflow/ + +# End of https://www.toptal.com/developers/gitignore/api/python,linux,macos,windows,vim,emacs,visualstudiocode,pycharm + +logs/ +.gitsecret/keys/random_seed +!*.secret +credentials.txt + +S.gpg-agent* +.vscode/settings.json diff --git a/templates/project-default/{{cookiecutter.project_name}}/.pre-commit-config.yaml b/templates/project-default/{{cookiecutter.project_name}}/.pre-commit-config.yaml new file mode 100644 index 0000000..90d0071 --- /dev/null +++ b/templates/project-default/{{cookiecutter.project_name}}/.pre-commit-config.yaml @@ -0,0 +1,51 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: check-yaml + - id: check-toml + - id: end-of-file-fixer + - id: trailing-whitespace + - id: detect-aws-credentials + args: 
["--allow-missing-credentials"] + - id: detect-private-key + - id: check-added-large-files + - repo: https://github.com/ambv/black + rev: 22.3.0 + hooks: + - id: black + language_version: python3 + - repo: https://github.com/pycqa/isort + rev: 5.12.0 + hooks: + - id: isort + args: ["--profile", "black"] + language_version: python3.10 + - repo: https://github.com/pycqa/flake8 + rev: 4.0.1 + hooks: + - id: flake8 + args: ["--max-line-length", "88", "--extend-ignore", "E203"] + - repo: https://github.com/myint/autoflake + rev: v1.4 + hooks: + - id: autoflake + args: + [ + "--in-place", + "--remove-unused-variables", + "--remove-all-unused-imports", + "--ignore-init-module-imports", + "--exclude=tests/*", + ] + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v2.7.1 + hooks: + - id: prettier + types_or: [markdown, yaml] + - repo: https://github.com/pre-commit/mirrors-mypy + rev: "v1.5.1" + hooks: + - id: mypy + additional_dependencies: [types-PyYAML==6.0.12.11, "types-requests"] + args: ["--check-untyped-defs", "--ignore-missing-imports"] diff --git a/templates/project-default/{{cookiecutter.project_name}}/README.md b/templates/project-default/{{cookiecutter.project_name}}/README.md new file mode 100644 index 0000000..da7d160 --- /dev/null +++ b/templates/project-default/{{cookiecutter.project_name}}/README.md @@ -0,0 +1,37 @@ +
+ +# Project {{ cookiecutter.project_name }} + +[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/Cinnamon/kotaemon) + +
+ +# Install + +```bash +# Create new conda env (optional) +conda create -n {{ cookiecutter.project_name }} python=3.10 +conda activate {{ cookiecutter.project_name }} + +# Clone and install the project +git clone "<{{ cookiecutter.project_name }}-repo>" +cd "<{{ cookiecutter.project_name }}-repo>" +pip install -e . + +# Generate the project structure +cd .. +kh start-project +``` + +# Usage + +- Build the pipeline in `pipeline.py` + +For supported utilities and tools, refer to https://github.com/Cinnamon/kotaemon/wiki/Utilities + +# Contribute + +- For project issues and errors, please report them in this repo's issues. +- For kotaemon issues and errors, please report them or make PR fixes in https://github.com/Cinnamon/kotaemon.git +- If the template for this project has issues and errors, please report them or make + PR fixes in https://github.com/Cinnamon/kotaemon/tree/main/templates/project-default diff --git a/templates/project-default/{{cookiecutter.project_name}}/setup.py b/templates/project-default/{{cookiecutter.project_name}}/setup.py new file mode 100644 index 0000000..a032d53 --- /dev/null +++ b/templates/project-default/{{cookiecutter.project_name}}/setup.py @@ -0,0 +1,20 @@ +import setuptools + +setuptools.setup( + name="{{ cookiecutter.project_name }}", + version="0.0.1", + author="{{ cookiecutter.ptl }}", + author_email="{{ cookiecutter.ptl }}@cinnamon.is", + description="Project {{ cookiecutter.project_name }}", + long_description="Project {{ cookiecutter.project_name }}", + url="https://github.com/Cinnamon/kotaemon", + python_requires=">=3", + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + install_requires=[ + "kotaemon@git+ssh://git@github.com/Cinnamon/kotaemon.git", + ], +) diff --git a/templates/project-default/README.md b/templates/project-default/{{cookiecutter.project_name}}/tests/__init__.py similarity index 100% rename from templates/project-default/README.md
rename to templates/project-default/{{cookiecutter.project_name}}/tests/__init__.py diff --git a/templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py b/templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/pipeline.py b/templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/pipeline.py new file mode 100644 index 0000000..c3eb8ea --- /dev/null +++ b/templates/project-default/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/pipeline.py @@ -0,0 +1,106 @@ +import os +from typing import List + +from theflow import Node, Param + +from kotaemon.base import BaseComponent +from kotaemon.docstores import InMemoryDocumentStore +from kotaemon.embeddings import AzureOpenAIEmbeddings +from kotaemon.llms.completions.openai import AzureOpenAI +from kotaemon.pipelines.indexing import IndexVectorStoreFromDocumentPipeline +from kotaemon.pipelines.retrieving import RetrieveDocumentFromVectorStorePipeline +from kotaemon.vectorstores import ChromaVectorStore + + +class QuestionAnsweringPipeline(BaseComponent): + vectorstore_path: str = str("./tmp") + retrieval_top_k: int = 1 + openai_api_key: str = os.environ.get("OPENAI_API_KEY", "") + + @Node.decorate(depends_on="openai_api_key") + def llm(self): + return AzureOpenAI( + openai_api_base="https://bleh-dummy-2.openai.azure.com/", + openai_api_key=self.openai_api_key, + openai_api_version="2023-03-15-preview", + deployment_name="dummy-q2-gpt35", + temperature=0, + request_timeout=60, + ) + + @Node.decorate(depends_on=["vectorstore_path", "openai_api_key"]) + def retrieving_pipeline(self): + vector_store = ChromaVectorStore(self.vectorstore_path) + embedding = AzureOpenAIEmbeddings( + model="text-embedding-ada-002", + deployment="dummy-q2-text-embedding", +
openai_api_base="https://bleh-dummy-2.openai.azure.com/", + openai_api_key=self.openai_api_key, + ) + + return RetrieveDocumentFromVectorStorePipeline( + vector_store=vector_store, + embedding=embedding, + ) + + def run_raw(self, text: str) -> str: + # reload the document store, in case it has been updated + doc_store = InMemoryDocumentStore() + doc_store.load("docstore.json") + self.retrieving_pipeline.doc_store = doc_store + + # retrieve relevant documents as context + matched_texts: List[str] = [ + _.text + for _ in self.retrieving_pipeline(text, top_k=int(self.retrieval_top_k)) + ] + context = "\n".join(matched_texts) + + # generate the answer + prompt = f'Answer the following question: "{text}". The context is: \n{context}' + self.log_progress(".prompt", prompt=prompt) + + return self.llm(prompt).text[0] + + +class IndexingPipeline(IndexVectorStoreFromDocumentPipeline): + # Expose variables for users to switch in prompt ui + vectorstore_path: str = str("./tmp") + embedding_model: str = "text-embedding-ada-002" + deployment: str = "dummy-q2-text-embedding" + openai_api_base: str = "https://bleh-dummy-2.openai.azure.com/" + openai_api_key: str = os.environ.get("OPENAI_API_KEY", "") + + @Param.decorate(depends_on=["vectorstore_path"]) + def vector_store(self): + return ChromaVectorStore(self.vectorstore_path) + + @Param.decorate() + def doc_store(self): + doc_store = InMemoryDocumentStore() + if os.path.isfile("docstore.json"): + doc_store.load("docstore.json") + return doc_store + + @Node.decorate(depends_on=["vector_store"]) + def embedding(self): + return AzureOpenAIEmbeddings( + model=self.embedding_model, + deployment=self.deployment, + openai_api_base=self.openai_api_base, + openai_api_key=self.openai_api_key, + ) + + def run_raw(self, text: str) -> int: # type: ignore + """Normally, this indexing pipeline returns nothing.
For demonstration, + we want it to return something, so let's return the number of documents + in the vector store + """ + super().run_raw(text) + + if self.doc_store is not None: + # persist to local anytime an indexing is created + # this can be bypassed when we have a FileDocumentStore + self.doc_store.save("docstore.json") + + return self.vector_store._collection.count()