feat: allow users to set chunk size and overlap for indices (#524)

* use tzlocal to get the local time

* delete tmp folder

* update date_created and date_updated with current timezone

* pass pre-commit

* update date_created field default to use local time

* add chunk size and chunk overlap param for indices

* refactor code to pass pre-commit

* fix: minor logic updates

---------

Co-authored-by: Tadashi <tadashi@cinnamon.is>
This commit is contained in:
Tran Huu Hoang 2024-12-04 09:04:50 +07:00 committed by GitHub
parent a1fecfac45
commit 32732c35de
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 31 additions and 2 deletions

View File

@ -55,6 +55,8 @@ class BaseFileIndexIndexing(BaseComponent):
FSPath = Param(help="The file storage path") FSPath = Param(help="The file storage path")
user_id = Param(help="The user id") user_id = Param(help="The user id")
private = Param(False, help="Whether this is private index") private = Param(False, help="Whether this is private index")
chunk_size = Param(help="Chunk size for this index")
chunk_overlap = Param(help="Chunk overlap for this index")
def run( def run(
self, file_paths: str | Path | list[str | Path], *args, **kwargs self, file_paths: str | Path | list[str | Path], *args, **kwargs

View File

@ -404,6 +404,25 @@ class FileIndex(BaseIndex):
"choices": [("Yes", True), ("No", False)], "choices": [("Yes", True), ("No", False)],
"info": "If private, files will not be accessible across users.", "info": "If private, files will not be accessible across users.",
}, },
"chunk_size": {
"name": "Size of chunk (number of tokens)",
"value": 0,
"component": "number",
"info": (
"Number of tokens of each text segment. "
"Set 0 to use developer setting."
),
},
"chunk_overlap": {
"name": "Number of overlapping tokens between chunks",
"value": 0,
"component": "number",
"info": (
"Number of tokens that consecutive text segments "
"should overlap with each other. "
"Set 0 to use developer setting."
),
},
} }
def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing: def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
@ -423,6 +442,8 @@ class FileIndex(BaseIndex):
obj.FSPath = self._fs_path obj.FSPath = self._fs_path
obj.user_id = user_id obj.user_id = user_id
obj.private = self.config.get("private", False) obj.private = self.config.get("private", False)
obj.chunk_size = self.config.get("chunk_size", 0)
obj.chunk_overlap = self.config.get("chunk_overlap", 0)
return obj return obj

View File

@ -729,7 +729,11 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
Can subclass this method for a more elaborate pipeline routing strategy. Can subclass this method for a more elaborate pipeline routing strategy.
""" """
_, chunk_size, chunk_overlap = dev_settings()
_, dev_chunk_size, dev_chunk_overlap = dev_settings()
chunk_size = self.chunk_size or dev_chunk_size
chunk_overlap = self.chunk_overlap or dev_chunk_overlap
# check if file_path is a URL # check if file_path is a URL
if self.is_url(file_path): if self.is_url(file_path):
@ -744,12 +748,14 @@ class IndexDocumentPipeline(BaseFileIndexIndexing):
"the suitable pipeline for this file type in the settings." "the suitable pipeline for this file type in the settings."
) )
print(f"Chunk size: {chunk_size}, chunk overlap: {chunk_overlap}")
print("Using reader", reader) print("Using reader", reader)
pipeline: IndexPipeline = IndexPipeline( pipeline: IndexPipeline = IndexPipeline(
loader=reader, loader=reader,
splitter=TokenSplitter( splitter=TokenSplitter(
chunk_size=chunk_size or 1024, chunk_size=chunk_size or 1024,
chunk_overlap=chunk_overlap if chunk_overlap is not None else 256, chunk_overlap=chunk_overlap or 256,
separator="\n\n", separator="\n\n",
backup_separators=["\n", ".", "\u200B"], backup_separators=["\n", ".", "\u200B"],
), ),