Skip to content

Commit 9102553

Browse files
committed
Update merger
1 parent 5c9c086 commit 9102553

File tree

1 file changed

+61
-26
lines changed

1 file changed

+61
-26
lines changed

image_processing/src/image_processing/layout_and_figure_merger.py

Lines changed: 61 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
import logging
55
import re
66
from layout_holders import FigureHolder, LayoutHolder
7+
from typing import List
78

89

910
class LayoutAndFigureMerger:
@@ -18,37 +19,48 @@ def insert_figure_description(
1819
figure_holder (FigureHolder): The figure to be updated.
1920
2021
Returns:
21-
str: The updated Markdown content with the new figure description.
22+
tuple: The updated LayoutHolder and the change in length (int) of the Markdown content after updating the figure description.
2223
"""
23-
2424
# Calculate the end index of the content to be replaced
2525
end_index = figure_holder.offset + figure_holder.length
2626

27-
# Ensure that the end_index does not exceed the length of the Markdown content
27+
# Ensure the offset is valid
28+
if figure_holder.offset < 0 or figure_holder.offset > len(
29+
layout_holder.content
30+
):
31+
logging.error("Figure offset is out of bounds.")
32+
raise ValueError("Figure offset is out of bounds.")
33+
34+
# Ensure the end index does not exceed the length of the Markdown content
2835
if end_index > len(layout_holder.content):
2936
logging.info(
30-
"End index exceeds the length of the content. Adjusting the end index to the length of the content."
37+
"End index exceeds the length of the content. Adjusting to the length of the content."
3138
)
3239
end_index = len(layout_holder.content)
3340

41+
logging.info(f"Figure Markdown Content: {figure_holder.markdown}")
42+
3443
# Replace the old string with the new string
3544
layout_holder.content = (
3645
layout_holder.content[: figure_holder.offset]
3746
+ figure_holder.markdown
3847
+ layout_holder.content[end_index:]
3948
)
4049

41-
return len(figure_holder.markdown) - figure_holder.length
50+
inserted_length = len(figure_holder.markdown) - figure_holder.length
51+
logging.info(f"Inserted Length: {inserted_length}")
52+
53+
return layout_holder, inserted_length
4254

4355
async def merge_figures_into_layout(
44-
self, layout: LayoutHolder, figures: list[FigureHolder]
56+
self, layout_holder: LayoutHolder, figures: List[FigureHolder]
4557
) -> LayoutHolder:
4658
"""
4759
Merges the figures into the layout.
4860
4961
Args:
50-
layout (LayoutHolder): The layout text.
51-
figures (list): The list of figures.
62+
layout_holder (LayoutHolder): The layout text.
63+
figures (List[FigureHolder]): The list of figures.
5264
5365
Returns:
5466
LayoutHolder: The updated layout text with the figures.
@@ -59,30 +71,51 @@ async def merge_figures_into_layout(
5971
# Iterate over the figures
6072
for figure in figures:
6173
logging.info(f"Inserting Figure: {figure.figure_id}")
74+
logging.info(f"Figure Description: {figure.description}")
6275
# Update the figure description in the layout
6376
figure.offset += running_offset
64-
length = self.insert_figure_description(layout, figure)
77+
layout_holder, inserted_length = self.insert_figure_description(
78+
layout_holder, figure
79+
)
6580

6681
# Update the offset
67-
running_offset += length
82+
running_offset += inserted_length
83+
84+
logging.info("Merged figures into layout.")
85+
logging.info("Updated Layout with Figures: %s", layout_holder.content)
86+
# Precompile regex patterns
87+
irrelevant_figure_pattern = re.compile(
88+
r"<figure[^>]*>\s*(Irrelevant Image|\'Irrelevant Image\')\s*</figure>",
89+
re.DOTALL,
90+
)
91+
empty_or_whitespace_figure_pattern = re.compile(
92+
r"<figure[^>]*>\s*</figure>", re.DOTALL
93+
)
94+
html_comments_pattern = re.compile(r"<!--.*?-->", re.DOTALL)
6895

6996
# Remove irrelevant figures
70-
irrelevant_figure_pattern = r"<figure[^>]*>.*?Irrelevant Image.*?</figure>"
71-
layout.content = re.sub(
72-
irrelevant_figure_pattern, "", layout.content, flags=re.DOTALL
97+
layout_holder.content = irrelevant_figure_pattern.sub("", layout_holder.content)
98+
logging.info("Removed irrelevant figures from layout.")
99+
logging.info(
100+
"Updated Layout without Irrelevant Figures: %s", layout_holder.content
73101
)
74102

75-
empty_or_whitespace_figure_pattern = r"<figure[^>]*>\s*</figure>"
76-
layout.content = re.sub(
77-
empty_or_whitespace_figure_pattern, "", layout.content, flags=re.DOTALL
103+
# Remove empty or whitespace figures
104+
layout_holder.content = empty_or_whitespace_figure_pattern.sub(
105+
"", layout_holder.content
78106
)
79-
80-
html_comments_pattern = r"<!--.*?-->"
81-
layout.content = re.sub(
82-
html_comments_pattern, "", layout.content, flags=re.DOTALL
107+
logging.info("Removed empty or whitespace figures from layout.")
108+
logging.info(
109+
"Updated Layout without Empty or Whitespace Figures: %s",
110+
layout_holder.content,
83111
)
84112

85-
return layout
113+
# Remove HTML comments
114+
layout_holder.content = html_comments_pattern.sub("", layout_holder.content)
115+
logging.info("Removed HTML comments from layout.")
116+
logging.info("Updated Layout without HTML Comments: %s", layout_holder.content)
117+
118+
return layout_holder
86119

87120
async def merge(self, record: dict) -> dict:
88121
"""
@@ -94,19 +127,21 @@ async def merge(self, record: dict) -> dict:
94127
Returns:
95128
- record (dict): The record containing the image, its caption, and the generated description.
96129
"""
97-
layout = LayoutHolder(**record["data"]["layout"])
130+
layout_holder = LayoutHolder(**record["data"]["layout"])
98131

99132
figures = [FigureHolder(**figure) for figure in record["data"]["figures"]]
100133

101134
try:
102-
logging.info(f"Input Data: {layout}")
103-
updated_layout = await self.merge_figures_into_layout(layout, figures)
104-
logging.info(f"Updated Data: {updated_layout}")
135+
logging.info(f"Input Data: {layout_holder}")
136+
updated_layout = await self.merge_figures_into_layout(
137+
layout_holder, figures
138+
)
139+
logging.info(f"Updated Layout Data: {updated_layout}")
105140
except Exception as e:
106141
logging.error(f"Failed to merge figures into layout. Error: {e}")
107142
return {
108143
"recordId": record["recordId"],
109-
"data": {},
144+
"data": None,
110145
"errors": [
111146
{
112147
"message": "Failed to merge figures into layout.",

0 commit comments

Comments
 (0)