1
+ from typing import cast
2
+
3
+ from chonkie import SentenceChunker
4
+
1
5
from onyx .configs .app_configs import AVERAGE_SUMMARY_EMBEDDINGS
2
6
from onyx .configs .app_configs import BLURB_SIZE
3
7
from onyx .configs .app_configs import LARGE_CHUNK_RATIO
@@ -135,9 +139,6 @@ def __init__(
135
139
mini_chunk_size : int = MINI_CHUNK_SIZE ,
136
140
callback : IndexingHeartbeatInterface | None = None ,
137
141
) -> None :
138
- # importing llama_index uses a lot of RAM, so we only import it when needed.
139
- from llama_index .core .node_parser import SentenceSplitter
140
-
141
142
self .include_metadata = include_metadata
142
143
self .chunk_token_limit = chunk_token_limit
143
144
self .enable_multipass = enable_multipass
@@ -156,23 +157,30 @@ def __init__(
156
157
self .max_context = 0
157
158
self .prompt_tokens = 0
158
159
159
- self .blurb_splitter = SentenceSplitter (
160
- tokenizer = tokenizer .tokenize ,
160
+ # Create a token counter function that returns the count instead of the tokens
161
+ def token_counter (text : str ) -> int :
162
+ return len (tokenizer .encode (text ))
163
+
164
+ self .blurb_splitter = SentenceChunker (
165
+ tokenizer_or_token_counter = token_counter ,
161
166
chunk_size = blurb_size ,
162
167
chunk_overlap = 0 ,
168
+ return_type = "texts" ,
163
169
)
164
170
165
- self .chunk_splitter = SentenceSplitter (
166
- tokenizer = tokenizer . tokenize ,
171
+ self .chunk_splitter = SentenceChunker (
172
+ tokenizer_or_token_counter = token_counter ,
167
173
chunk_size = chunk_token_limit ,
168
174
chunk_overlap = chunk_overlap ,
175
+ return_type = "texts" ,
169
176
)
170
177
171
178
self .mini_chunk_splitter = (
172
- SentenceSplitter (
173
- tokenizer = tokenizer . tokenize ,
179
+ SentenceChunker (
180
+ tokenizer_or_token_counter = token_counter ,
174
181
chunk_size = mini_chunk_size ,
175
182
chunk_overlap = 0 ,
183
+ return_type = "texts" ,
176
184
)
177
185
if enable_multipass
178
186
else None
@@ -199,7 +207,8 @@ def _extract_blurb(self, text: str) -> str:
199
207
"""
200
208
Extract a short blurb from the text (first chunk of size `blurb_size`).
201
209
"""
202
- texts = self .blurb_splitter .split_text (text )
210
+ # blurb_splitter is built with return_type="texts", so chunk() yields plain strings (list[str])
211
+ texts = cast (list [str ], self .blurb_splitter .chunk (text ))
203
212
if not texts :
204
213
return ""
205
214
return texts [0 ]
@@ -209,7 +218,8 @@ def _get_mini_chunk_texts(self, chunk_text: str) -> list[str] | None:
209
218
For "multipass" mode: additional sub-chunks (mini-chunks) for use in certain embeddings.
210
219
"""
211
220
if self .mini_chunk_splitter and chunk_text .strip ():
212
- return self .mini_chunk_splitter .split_text (chunk_text )
221
+ # mini_chunk_splitter is built with return_type="texts", so chunk() yields plain strings (list[str])
222
+ return cast (list [str ], self .mini_chunk_splitter .chunk (chunk_text ))
213
223
return None
214
224
215
225
# ADDED: extra param image_url to store in the chunk
@@ -329,7 +339,8 @@ def _chunk_document_with_sections(
329
339
chunk_text = ""
330
340
link_offsets = {}
331
341
332
- split_texts = self .chunk_splitter .split_text (section_text )
342
+ # chunk_splitter is built with return_type="texts", so chunk() yields plain strings (list[str])
343
+ split_texts = cast (list [str ], self .chunk_splitter .chunk (section_text ))
333
344
for i , split_text in enumerate (split_texts ):
334
345
# If even the split_text is bigger than strict limit, further split
335
346
if (
0 commit comments