feat: allow users to set chunk size and chunk overlap for indices (#524)

* use tzlocal to get the local time
* delete tmp folder
* update date_created and date_updated with current timezone
* pass pre-commit
* update date_created field default to use local time
* add chunk size and chunk overlap params for indices
* refactor code to pass pre-commit
* fix: minor logic updates

---------

Co-authored-by: Tadashi <tadashi@cinnamon.is>
2024-12-04 09:04:50 +07:00
parent a1fecfac45
commit 32732c35de
3 changed files with 31 additions and 2 deletions
--- a/libs/ktem/ktem/index/file/base.py
+++ b/libs/ktem/ktem/index/file/base.py
@@ -55,6 +55,8 @@ class BaseFileIndexIndexing(BaseComponent):
    FSPath = Param(help="The file storage path")
    user_id = Param(help="The user id")
    private = Param(False, help="Whether this is private index")
    chunk_size = Param(help="Chunk size for this index")
    chunk_overlap = Param(help="Chunk overlap for this index")
    def run(
        self, file_paths: str | Path | list[str | Path], *args, **kwargs
--- a/libs/ktem/ktem/index/file/index.py
+++ b/libs/ktem/ktem/index/file/index.py
@@ -404,6 +404,25 @@ class FileIndex(BaseIndex):
                "choices": [("Yes", True), ("No", False)],
                "info": "If private, files will not be accessible across users.",
            },
            "chunk_size": {
                "name": "Size of chunk (number of tokens)",
                "value": 0,
                "component": "number",
                "info": (
                    "Number of tokens of each text segment. "
                    "Set 0 to use developer setting."
                ),
            },
            "chunk_overlap": {
                "name": "Number of overlapping tokens between chunks",
                "value": 0,
                "component": "number",
                "info": (
                    "Number of tokens that consecutive text segments "
                    "should overlap with each other. "
                    "Set 0 to use developer setting."
                ),
            },
        }
    def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
@@ -423,6 +442,8 @@ class FileIndex(BaseIndex):
        obj.FSPath = self._fs_path
        obj.user_id = user_id
        obj.private = self.config.get("private", False)
        obj.chunk_size = self.config.get("chunk_size", 0)
        obj.chunk_overlap = self.config.get("chunk_overlap", 0)
        return obj
--- a/libs/ktem/ktem/index/file/pipelines.py
+++ b/libs/ktem/ktem/index/file/pipelines.py
@@ -729,7 +729,11 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
        Can subclass this method for a more elaborate pipeline routing strategy.
        """
-        _, chunk_size, chunk_overlap = dev_settings()
+
        _, dev_chunk_size, dev_chunk_overlap = dev_settings()
        chunk_size = self.chunk_size or dev_chunk_size
        chunk_overlap = self.chunk_overlap or dev_chunk_overlap
        # check if file_path is a URL
        if self.is_url(file_path):
@@ -744,12 +748,14 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
                    "the suitable pipeline for this file type in the settings."
                )
        print(f"Chunk size: {chunk_size}, chunk overlap: {chunk_overlap}")
        print("Using reader", reader)
        pipeline: IndexPipeline = IndexPipeline(
            loader=reader,
            splitter=TokenSplitter(
                chunk_size=chunk_size or 1024,
-                chunk_overlap=chunk_overlap if chunk_overlap is not None else 256,
+                chunk_overlap=chunk_overlap or 256,
                separator="\n\n",
                backup_separators=["\n", ".", "\u200B"],
            ),