kotaemon/knowledgehub/indices/splitters/__init__.py
Duc Nguyen (john) 5a9d6f75be Migrate the MVP into kotaemon (#108)
- Migrate the MVP into kotaemon.
- Preliminarily include the pipeline within the chatbot interface.
- Organize MVP as an application.

Todo:

- Add an info panel to view the planning of agents -> Fix streaming agents' output.

Resolve: #60
Resolve: #61 
Resolve: #62
2024-01-10 15:28:09 +07:00

50 lines
1.3 KiB
Python

from ..base import DocTransformer, LlamaIndexDocTransformerMixin
class BaseSplitter(DocTransformer):
    """Common base class for the document splitters defined in this module.

    It adds no attributes or methods of its own on top of ``DocTransformer``;
    it exists so that splitters share a single parent type.
    """
class TokenSplitter(LlamaIndexDocTransformerMixin, BaseSplitter):
    """Splitter backed by llama_index's ``TokenTextSplitter``.

    The constructor only records/forwards its options to the parent
    initialiser; the llama_index class itself is resolved lazily by
    ``_get_li_class``.
    """

    def __init__(
        self,
        chunk_size: int = 1024,
        chunk_overlap: int = 20,
        separator: str = " ",
        **params,
    ):
        """Forward the splitter options to the parent initialiser.

        Args:
            chunk_size: Forwarded as ``chunk_size`` (default 1024).
                Presumably the maximum chunk length in tokens -- confirm
                against the llama_index ``TokenTextSplitter`` docs.
            chunk_overlap: Forwarded as ``chunk_overlap`` (default 20).
                Presumably the overlap between consecutive chunks -- confirm.
            separator: Forwarded as ``separator`` (default a single space).
            **params: Any additional keyword arguments, passed through
                unchanged.
        """
        # Gather the explicitly named options first, then hand everything to
        # the next initialiser in the MRO in a single call.
        splitter_options = {
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "separator": separator,
        }
        super().__init__(**splitter_options, **params)

    def _get_li_class(self):
        """Return the llama_index class this splitter wraps.

        The import happens inside the method, so llama_index is only loaded
        when this hook is actually called.
        """
        from llama_index.text_splitter import TokenTextSplitter

        return TokenTextSplitter
class SentenceWindowSplitter(LlamaIndexDocTransformerMixin, BaseSplitter):
    """Splitter backed by llama_index's ``SentenceWindowNodeParser``.

    The constructor only records/forwards its options to the parent
    initialiser; the llama_index class itself is resolved lazily by
    ``_get_li_class``.
    """

    def __init__(
        self,
        window_size: int = 3,
        window_metadata_key: str = "window",
        original_text_metadata_key: str = "original_text",
        **params,
    ):
        """Forward the sentence-window options to the parent initialiser.

        Args:
            window_size: Forwarded as ``window_size`` (default 3).
                Presumably the number of neighbouring sentences captured
                around each sentence -- confirm against the llama_index
                ``SentenceWindowNodeParser`` docs.
            window_metadata_key: Forwarded as ``window_metadata_key``
                (default ``"window"``).
            original_text_metadata_key: Forwarded as
                ``original_text_metadata_key`` (default ``"original_text"``).
            **params: Any additional keyword arguments, passed through
                unchanged.
        """
        # Gather the explicitly named options first, then hand everything to
        # the next initialiser in the MRO in a single call.
        window_options = {
            "window_size": window_size,
            "window_metadata_key": window_metadata_key,
            "original_text_metadata_key": original_text_metadata_key,
        }
        super().__init__(**window_options, **params)

    def _get_li_class(self):
        """Return the llama_index class this splitter wraps.

        The import happens inside the method, so llama_index is only loaded
        when this hook is actually called.
        """
        from llama_index.node_parser import SentenceWindowNodeParser

        return SentenceWindowNodeParser