Skip to content

Commit a79b63d

Browse files
authored
Switch to chonkie from llamaindex chunker (onyx-dot-app#4838)
* Switch to chonkie from llamaindex chunker
* Remove unintended changes
* Order requirements
* Upgrade chonkie version
1 parent b899534 commit a79b63d

File tree

2 files changed

+24
-13
lines changed

2 files changed

+24
-13
lines changed

backend/onyx/indexing/chunker.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
from typing import cast
2+
3+
from chonkie import SentenceChunker
4+
15
from onyx.configs.app_configs import AVERAGE_SUMMARY_EMBEDDINGS
26
from onyx.configs.app_configs import BLURB_SIZE
37
from onyx.configs.app_configs import LARGE_CHUNK_RATIO
@@ -135,9 +139,6 @@ def __init__(
135139
mini_chunk_size: int = MINI_CHUNK_SIZE,
136140
callback: IndexingHeartbeatInterface | None = None,
137141
) -> None:
138-
# importing llama_index uses a lot of RAM, so we only import it when needed.
139-
from llama_index.core.node_parser import SentenceSplitter
140-
141142
self.include_metadata = include_metadata
142143
self.chunk_token_limit = chunk_token_limit
143144
self.enable_multipass = enable_multipass
@@ -156,23 +157,30 @@ def __init__(
156157
self.max_context = 0
157158
self.prompt_tokens = 0
158159

159-
self.blurb_splitter = SentenceSplitter(
160-
tokenizer=tokenizer.tokenize,
160+
# Create a token counter function that returns the count instead of the tokens
161+
def token_counter(text: str) -> int:
162+
return len(tokenizer.encode(text))
163+
164+
self.blurb_splitter = SentenceChunker(
165+
tokenizer_or_token_counter=token_counter,
161166
chunk_size=blurb_size,
162167
chunk_overlap=0,
168+
return_type="texts",
163169
)
164170

165-
self.chunk_splitter = SentenceSplitter(
166-
tokenizer=tokenizer.tokenize,
171+
self.chunk_splitter = SentenceChunker(
172+
tokenizer_or_token_counter=token_counter,
167173
chunk_size=chunk_token_limit,
168174
chunk_overlap=chunk_overlap,
175+
return_type="texts",
169176
)
170177

171178
self.mini_chunk_splitter = (
172-
SentenceSplitter(
173-
tokenizer=tokenizer.tokenize,
179+
SentenceChunker(
180+
tokenizer_or_token_counter=token_counter,
174181
chunk_size=mini_chunk_size,
175182
chunk_overlap=0,
183+
return_type="texts",
176184
)
177185
if enable_multipass
178186
else None
@@ -199,7 +207,8 @@ def _extract_blurb(self, text: str) -> str:
199207
"""
200208
Extract a short blurb from the text (first chunk of size `blurb_size`).
201209
"""
202-
texts = self.blurb_splitter.split_text(text)
210+
# blurb_splitter is configured with return_type="texts", so chunk() returns plain strings
211+
texts = cast(list[str], self.blurb_splitter.chunk(text))
203212
if not texts:
204213
return ""
205214
return texts[0]
@@ -209,7 +218,8 @@ def _get_mini_chunk_texts(self, chunk_text: str) -> list[str] | None:
209218
For "multipass" mode: additional sub-chunks (mini-chunks) for use in certain embeddings.
210219
"""
211220
if self.mini_chunk_splitter and chunk_text.strip():
212-
return self.mini_chunk_splitter.split_text(chunk_text)
221+
# mini_chunk_splitter is configured with return_type="texts", so chunk() returns plain strings
222+
return cast(list[str], self.mini_chunk_splitter.chunk(chunk_text))
213223
return None
214224

215225
# ADDED: extra param image_url to store in the chunk
@@ -329,7 +339,8 @@ def _chunk_document_with_sections(
329339
chunk_text = ""
330340
link_offsets = {}
331341

332-
split_texts = self.chunk_splitter.split_text(section_text)
342+
# chunk_splitter is configured with return_type="texts", so chunk() returns plain strings
343+
split_texts = cast(list[str], self.chunk_splitter.chunk(section_text))
333344
for i, split_text in enumerate(split_texts):
334345
# If even the split_text is bigger than strict limit, further split
335346
if (

backend/requirements/default.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ beautifulsoup4==4.12.3
77
boto3==1.36.23
88
celery==5.5.1
99
chardet==5.2.0
10+
chonkie==1.0.10
1011
dask==2023.8.1
1112
ddtrace==2.6.5
1213
discord.py==2.4.0
@@ -42,7 +43,6 @@ langgraph-sdk==0.1.44
4243
litellm==1.72.2
4344
lxml==5.3.0
4445
lxml_html_clean==0.2.2
45-
llama-index==0.12.28
4646
Mako==1.2.4
4747
msal==1.28.0
4848
nltk==3.9.1

0 commit comments

Comments
 (0)