
Commit f941161

Update

1 parent 9102553 commit f941161

3 files changed (+154 additions, -116 deletions)

image_processing/src/image_processing/requirements.txt

Lines changed: 6 additions & 6 deletions
@@ -1,7 +1,7 @@
 # This file was autogenerated by uv via the following command:
 #    uv export --frozen --no-hashes --no-editable --no-sources --no-group dev --directory image_processing -o src/image_processing/requirements.txt
 aiohappyeyeballs==2.4.4
-aiohttp==3.11.11
+aiohttp==3.11.12
 aiosignal==1.3.2
 annotated-types==0.7.0
 anyio==4.8.0
@@ -16,7 +16,7 @@ azure-identity==1.19.0
 azure-search==1.0.0b2
 azure-search-documents==11.6.0b8
 azure-storage-blob==12.24.1
-beautifulsoup4==4.12.3
+beautifulsoup4==4.13.3
 blis==0.7.11
 bs4==0.0.2
 catalogue==2.0.10
@@ -34,7 +34,7 @@ en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_
 et-xmlfile==2.0.0
 filelock==3.17.0
 frozenlist==1.5.0
-fsspec==2024.12.0
+fsspec==2025.2.0
 h11==0.14.0
 httpcore==1.0.7
 httpx==0.28.1
@@ -50,15 +50,15 @@ marisa-trie==1.2.1
 markdown-it-py==3.0.0
 markupsafe==3.0.2
 mdurl==0.1.2
-model2vec==0.3.8
+model2vec==0.3.9
 msal==1.31.1
 msal-extensions==1.2.0
 msrest==0.7.1
 multidict==6.1.0
 murmurhash==1.0.12
 numpy==1.26.4
 oauthlib==3.2.2
-openai==1.60.2
+openai==1.61.1
 openpyxl==3.1.5
 packaging==24.2
 pandas==2.2.3
@@ -71,7 +71,7 @@ pydantic==2.10.6
 pydantic-core==2.27.2
 pygments==2.19.1
 pyjwt==2.10.1
-pymupdf==1.25.2
+pymupdf==1.25.3
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
 pytz==2025.1
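These six changes are routine dependency bumps (aiohttp, beautifulsoup4, fsspec, model2vec, openai, and pymupdf). As the header comment notes, this file is autogenerated, so it is regenerated with the uv export command shown above rather than edited by hand.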

image_processing/src/image_processing/semantic_text_chunker.py

Lines changed: 48 additions & 15 deletions
@@ -59,6 +59,22 @@ def num_tokens_from_string(self, string: str) -> int:
 
         return len(encoding.encode(string))
 
+    def clean_chunks_and_map(self, chunks, is_table_or_figure_map):
+        cleaned_chunks = []
+        cleaned_is_table_or_figure_map = []
+
+        for current_chunk, is_table_or_figure in zip(chunks, is_table_or_figure_map):
+            cleaned_chunk = current_chunk.strip()
+            if len(cleaned_chunk) > 0:
+                # Pad markdown headings with blank lines (the chunk was a title)
+                if self.is_markdown_heading(current_chunk):
+                    cleaned_chunk = "\n\n" + cleaned_chunk + "\n\n"
+
+                cleaned_chunks.append(cleaned_chunk)
+                cleaned_is_table_or_figure_map.append(is_table_or_figure)
+
+        return cleaned_chunks, cleaned_is_table_or_figure_map
+
     async def chunk(self, text: str) -> list[dict]:
         """Attempts to chunk the text by:
             Splitting into sentences
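The new helper drops empty chunks and pads markdown headings with blank lines so they stay separated after later joins, while keeping the table/figure map aligned with the surviving chunks. A minimal standalone sketch of the same behavior (a hypothetical free function; is_markdown_heading is approximated here with a leading-# regex, and it matches on the stripped chunk rather than the original):

import re

def clean_chunks_and_map(chunks, is_table_or_figure_map):
    cleaned_chunks = []
    cleaned_map = []
    for chunk, is_table_or_figure in zip(chunks, is_table_or_figure_map):
        cleaned = chunk.strip()
        if cleaned:
            # Approximation of is_markdown_heading: one or more '#' then a space
            if re.match(r"#+\s", cleaned):
                cleaned = "\n\n" + cleaned + "\n\n"
            cleaned_chunks.append(cleaned)
            cleaned_map.append(is_table_or_figure)
    return cleaned_chunks, cleaned_map

chunks, flags = clean_chunks_and_map(["  ## Title  ", "   ", "Body text."],
                                     [False, False, False])
# chunks == ["\n\n## Title\n\n", "Body text."], flags == [False, False]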
@@ -86,6 +102,10 @@ async def chunk(self, text: str) -> list[dict]:
             grouped_sentences, is_table_or_figure_map
         )
 
+        forward_pass_chunks, new_is_table_or_figure_map = self.clean_chunks_and_map(
+            forward_pass_chunks, new_is_table_or_figure_map
+        )
+
         logging.info(
             f"""Number of Forward pass chunks: {
                 len(forward_pass_chunks)}"""
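With this call in place, the forward-pass output is normalized before anything downstream sees it, so the chunk count logged immediately afterwards reflects only non-empty, heading-padded chunks.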
@@ -129,7 +149,7 @@ def filter_empty_figures(self, text):
 
     def clean_new_lines(self, text):
         # Remove single newlines surrounded by < and >
-        cleaned_text = re.sub(r"(?<=>)(\n)(?=<)", "", text)
+        cleaned_text = re.sub(r"(?<=>)(\n)(?=<)", "", text.strip())
 
         # Replace all other single newlines with space
         cleaned_text = re.sub(r"(?<!\n)\n(?!\n)", " ", cleaned_text)
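For reference, the first substitution deletes newlines that sit directly between a closing '>' and an opening '<', and the added strip() keeps leading and trailing whitespace from reaching the later substitutions. A quick illustration of the two patterns together:

import re

text = "\n<table>\n<tr>\nvalue\n</tr>\n</table>\n"
cleaned = re.sub(r"(?<=>)(\n)(?=<)", "", text.strip())
# "<table><tr>\nvalue\n</tr></table>" -- newlines between tags removed
cleaned = re.sub(r"(?<!\n)\n(?!\n)", " ", cleaned)
# "<table><tr> value </tr></table>" -- remaining single newlines become spaces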
@@ -190,7 +210,7 @@ def split_into_sentences(self, text: str) -> list[str]:
                 self.is_markdown_heading(part)
                 and part.endswith("\n\n") is False
             ):
-                part = part + "\n\n"
+                part = "\n\n" + part + "\n\n"
 
             heading_split_sentences.append(part)
 
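Padding with a leading blank line as well as a trailing one matches the treatment in clean_chunks_and_map above: when the parts are rejoined, a heading stays detached from the sentence before it, not just the one after it.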

@@ -300,23 +320,36 @@ def retrive_current_chunk_at_n(n):
             else:
                 return current_chunk[n]
 
-        current_chunk_tokens = self.num_tokens_from_string(" ".join(current_chunk))
+        def get_current_chunk_tokens(chunk_segments):
+            return self.num_tokens_from_string(" ".join(chunk_segments))
+
+        current_chunk_tokens = get_current_chunk_tokens(current_chunk)
 
         if len(current_chunk) >= 2 and current_chunk_tokens >= self.min_chunk_tokens:
-            logging.info("Comparing chunks")
-            cosine_sim = self.sentence_similarity(
-                retrieve_current_chunks_from_n(-2), current_sentence
-            )
+            # Calculate the tokens if we were to split
+            if len(current_chunk) > 2:
+                would_be_new_chunk = retrieve_current_chunk_up_to_n(1)
+                would_be_current_chunk = [retrive_current_chunk_at_n(-1)]
+            else:
+                would_be_new_chunk = retrive_current_chunk_at_n(0)
+                would_be_current_chunk = [retrive_current_chunk_at_n(1)]
+
             if (
-                cosine_sim < self.similarity_threshold
-                or current_chunk_tokens >= self.max_chunk_tokens
+                get_current_chunk_tokens(would_be_new_chunk) >= self.min_chunk_tokens
+                and get_current_chunk_tokens(would_be_current_chunk)
+                >= self.min_chunk_tokens
             ):
-                if len(current_chunk) > 2:
-                    new_chunk = retrieve_current_chunk_up_to_n(1)
-                    current_chunk = [retrive_current_chunk_at_n(-1)]
-                else:
-                    new_chunk = retrive_current_chunk_at_n(0)
-                    current_chunk = [retrive_current_chunk_at_n(1)]
+                logging.info("Comparing chunks")
+                if (
+                    current_chunk_tokens >= self.max_chunk_tokens
+                    or self.sentence_similarity(
+                        retrieve_current_chunks_from_n(-2), current_sentence
+                    )
+                    < self.similarity_threshold
+                ):
+                    return would_be_new_chunk, would_be_current_chunk
+            else:
+                logging.info("Chunk too small to compare")
         else:
             logging.info("Chunk too small to compare")
 
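The net effect is a reordering of the split decision: the would-be chunks are computed first, the cheap token-size guard runs before the similarity comparison, and because the similarity call now sits inside the short-circuiting "or", it is skipped entirely when the chunk is already over max_chunk_tokens. A condensed sketch of the new control flow, with hypothetical names and the token counts and similarity passed in directly rather than computed lazily:

def should_split(new_chunk_tokens: int, remainder_tokens: int,
                 current_chunk_tokens: int, cosine_sim: float,
                 min_tokens: int, max_tokens: int, threshold: float) -> bool:
    # Size guard first: only split if both would-be chunks are big enough
    if new_chunk_tokens >= min_tokens and remainder_tokens >= min_tokens:
        # Then split if the chunk is already too large, or the incoming
        # sentence is semantically dissimilar from the recent context
        return current_chunk_tokens >= max_tokens or cosine_sim < threshold
    return False

In the old code the sentence_similarity call ran unconditionally before any size check; the new ordering avoids that work for chunks that would produce an undersized split anyway.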
