@@ -59,6 +59,22 @@ def num_tokens_from_string(self, string: str) -> int:
59
59
60
60
return len (encoding .encode (string ))
61
61
62
def clean_chunks_and_map(self, chunks, is_table_or_figure_map):
    """Drop blank chunks and pad markdown headings, keeping the flags aligned.

    Each chunk is stripped of surrounding whitespace. Chunks that are empty
    after stripping are discarded together with their paired flag. Chunks
    that ``self.is_markdown_heading`` accepts are wrapped in newline padding
    on both sides.

    Args:
        chunks: Text chunks to clean.
        is_table_or_figure_map: Flags paired positionally with ``chunks``;
            pairing stops at the shorter of the two sequences.

    Returns:
        A ``(cleaned_chunks, cleaned_is_table_or_figure_map)`` tuple of two
        lists of equal length.
    """
    heading_padding = "\n \n "
    kept_chunks = []
    kept_flags = []

    for raw_chunk, flag in zip(chunks, is_table_or_figure_map):
        stripped = raw_chunk.strip()
        if not stripped:
            # Nothing left once whitespace is removed: skip chunk and flag.
            continue

        # The heading check is made against the unstripped chunk.
        if self.is_markdown_heading(raw_chunk):
            stripped = heading_padding + stripped + heading_padding

        kept_chunks.append(stripped)
        kept_flags.append(flag)

    return kept_chunks, kept_flags
77
+
62
78
async def chunk (self , text : str ) -> list [dict ]:
63
79
"""Attempts to chunk the text by:
64
80
Splitting into sentences
@@ -86,6 +102,10 @@ async def chunk(self, text: str) -> list[dict]:
86
102
grouped_sentences , is_table_or_figure_map
87
103
)
88
104
105
+ forward_pass_chunks , new_is_table_or_figure_map = self .clean_chunks_and_map (
106
+ forward_pass_chunks , new_is_table_or_figure_map
107
+ )
108
+
89
109
logging .info (
90
110
f"""Number of Forward pass chunks: {
91
111
len (forward_pass_chunks )} """
@@ -129,7 +149,7 @@ def filter_empty_figures(self, text):
129
149
130
150
def clean_new_lines (self , text ):
131
151
# Remove single newlines surrounded by < and >
132
- cleaned_text = re .sub (r"(?<=>)(\n)(?=<)" , "" , text )
152
+ cleaned_text = re .sub (r"(?<=>)(\n)(?=<)" , "" , text . strip () )
133
153
134
154
# Replace all other single newlines with space
135
155
cleaned_text = re .sub (r"(?<!\n)\n(?!\n)" , " " , cleaned_text )
@@ -190,7 +210,7 @@ def split_into_sentences(self, text: str) -> list[str]:
190
210
self .is_markdown_heading (part )
191
211
and part .endswith ("\n \n " ) is False
192
212
):
193
- part = part + "\n \n "
213
+ part = " \n \n " + part + "\n \n "
194
214
195
215
heading_split_sentences .append (part )
196
216
@@ -300,23 +320,36 @@ def retrive_current_chunk_at_n(n):
300
320
else :
301
321
return current_chunk [n ]
302
322
303
- current_chunk_tokens = self .num_tokens_from_string (" " .join (current_chunk ))
323
+ def get_current_chunk_tokens (chunk_segments ):
324
+ return self .num_tokens_from_string (" " .join (chunk_segments ))
325
+
326
+ current_chunk_tokens = get_current_chunk_tokens (current_chunk )
304
327
305
328
if len (current_chunk ) >= 2 and current_chunk_tokens >= self .min_chunk_tokens :
306
- logging .info ("Comparing chunks" )
307
- cosine_sim = self .sentence_similarity (
308
- retrieve_current_chunks_from_n (- 2 ), current_sentence
309
- )
329
+ # Calculate the tokens if we were to split
330
+ if len (current_chunk ) > 2 :
331
+ would_be_new_chunk = retrieve_current_chunk_up_to_n (1 )
332
+ would_be_current_chunk = [retrive_current_chunk_at_n (- 1 )]
333
+ else :
334
+ would_be_new_chunk = retrive_current_chunk_at_n (0 )
335
+ would_be_current_chunk = [retrive_current_chunk_at_n (1 )]
336
+
310
337
if (
311
- cosine_sim < self .similarity_threshold
312
- or current_chunk_tokens >= self .max_chunk_tokens
338
+ get_current_chunk_tokens (would_be_new_chunk ) >= self .min_chunk_tokens
339
+ and get_current_chunk_tokens (would_be_current_chunk )
340
+ >= self .min_chunk_tokens
313
341
):
314
- if len (current_chunk ) > 2 :
315
- new_chunk = retrieve_current_chunk_up_to_n (1 )
316
- current_chunk = [retrive_current_chunk_at_n (- 1 )]
317
- else :
318
- new_chunk = retrive_current_chunk_at_n (0 )
319
- current_chunk = [retrive_current_chunk_at_n (1 )]
342
+ logging .info ("Comparing chunks" )
343
+ if (
344
+ current_chunk_tokens >= self .max_chunk_tokens
345
+ or self .sentence_similarity (
346
+ retrieve_current_chunks_from_n (- 2 ), current_sentence
347
+ )
348
+ < self .similarity_threshold
349
+ ):
350
+ return would_be_new_chunk , would_be_current_chunk
351
+ else :
352
+ logging .info ("Chunk too small to compare" )
320
353
else :
321
354
logging .info ("Chunk too small to compare" )
322
355
0 commit comments