4
4
import logging
5
5
import re
6
6
from layout_holders import FigureHolder , LayoutHolder
7
+ from typing import List
7
8
8
9
9
10
class LayoutAndFigureMerger :
@@ -18,37 +19,48 @@ def insert_figure_description(
18
19
figure_holder (FigureHolder): The figure to be updated.
19
20
20
21
Returns:
21
- str : The updated Markdown content with the new figure description.
22
+ tuple[LayoutHolder, int] : The updated layout holder and the change in length of the Markdown content after updating the figure description.
22
23
"""
23
-
24
24
# Calculate the end index of the content to be replaced
25
25
end_index = figure_holder .offset + figure_holder .length
26
26
27
- # Ensure that the end_index does not exceed the length of the Markdown content
27
+ # Ensure the offset is valid
28
+ if figure_holder .offset < 0 or figure_holder .offset > len (
29
+ layout_holder .content
30
+ ):
31
+ logging .error ("Figure offset is out of bounds." )
32
+ raise ValueError ("Figure offset is out of bounds." )
33
+
34
+ # Ensure the end index does not exceed the length of the Markdown content
28
35
if end_index > len (layout_holder .content ):
29
36
logging .info (
30
- "End index exceeds the length of the content. Adjusting the end index to the length of the content."
37
+ "End index exceeds the length of the content. Adjusting to the length of the content."
31
38
)
32
39
end_index = len (layout_holder .content )
33
40
41
+ logging .info (f"Figure Markdown Content: { figure_holder .markdown } " )
42
+
34
43
# Replace the old string with the new string
35
44
layout_holder .content = (
36
45
layout_holder .content [: figure_holder .offset ]
37
46
+ figure_holder .markdown
38
47
+ layout_holder .content [end_index :]
39
48
)
40
49
41
- return len (figure_holder .markdown ) - figure_holder .length
50
+ inserted_length = len (figure_holder .markdown ) - figure_holder .length
51
+ logging .info (f"Inserted Length: { inserted_length } " )
52
+
53
+ return layout_holder , inserted_length
42
54
43
55
async def merge_figures_into_layout (
44
- self , layout : LayoutHolder , figures : list [FigureHolder ]
56
+ self , layout_holder : LayoutHolder , figures : List [FigureHolder ]
45
57
) -> LayoutHolder :
46
58
"""
47
59
Merges the figures into the layout.
48
60
49
61
Args:
50
- layout (LayoutHolder): The layout text.
51
- figures (list ): The list of figures.
62
+ layout_holder (LayoutHolder): The layout text.
63
+ figures (List[FigureHolder] ): The list of figures.
52
64
53
65
Returns:
54
66
LayoutHolder: The updated layout text with the figures.
@@ -59,30 +71,51 @@ async def merge_figures_into_layout(
59
71
# Iterate over the figures
60
72
for figure in figures :
61
73
logging .info (f"Inserting Figure: { figure .figure_id } " )
74
+ logging .info (f"Figure Description: { figure .description } " )
62
75
# Update the figure description in the layout
63
76
figure .offset += running_offset
64
- length = self .insert_figure_description (layout , figure )
77
+ layout_holder , inserted_length = self .insert_figure_description (
78
+ layout_holder , figure
79
+ )
65
80
66
81
# Update the offset
67
- running_offset += length
82
+ running_offset += inserted_length
83
+
84
+ logging .info ("Merged figures into layout." )
85
+ logging .info ("Updated Layout with Figures: %s" , layout_holder .content )
86
+ # Precompile regex patterns
87
+ irrelevant_figure_pattern = re .compile (
88
+ r"<figure[^>]*>\s*(Irrelevant Image|\'Irrelevant Image\')\s*</figure>" ,
89
+ re .DOTALL ,
90
+ )
91
+ empty_or_whitespace_figure_pattern = re .compile (
92
+ r"<figure[^>]*>\s*</figure>" , re .DOTALL
93
+ )
94
+ html_comments_pattern = re .compile (r"<!--.*?-->" , re .DOTALL )
68
95
69
96
# Remove irrelevant figures
70
- irrelevant_figure_pattern = r"<figure[^>]*>.*?Irrelevant Image.*?</figure>"
71
- layout .content = re .sub (
72
- irrelevant_figure_pattern , "" , layout .content , flags = re .DOTALL
97
+ layout_holder .content = irrelevant_figure_pattern .sub ("" , layout_holder .content )
98
+ logging .info ("Removed irrelevant figures from layout." )
99
+ logging .info (
100
+ "Updated Layout without Irrelevant Figures: %s" , layout_holder .content
73
101
)
74
102
75
- empty_or_whitespace_figure_pattern = r"<figure[^>]*>\s*</figure>"
76
- layout .content = re .sub (
77
- empty_or_whitespace_figure_pattern , "" , layout .content , flags = re . DOTALL
103
+ # Remove empty or whitespace figures
104
+ layout_holder .content = empty_or_whitespace_figure_pattern .sub (
105
+ "" , layout_holder .content
78
106
)
79
-
80
- html_comments_pattern = r"<!--.*?-->"
81
- layout . content = re . sub (
82
- html_comments_pattern , "" , layout .content , flags = re . DOTALL
107
+ logging . info ( "Removed empty or whitespace figures from layout." )
108
+ logging . info (
109
+ "Updated Layout without Empty or Whitespace Figures: %s" ,
110
+ layout_holder .content ,
83
111
)
84
112
85
- return layout
113
+ # Remove HTML comments
114
+ layout_holder .content = html_comments_pattern .sub ("" , layout_holder .content )
115
+ logging .info ("Removed HTML comments from layout." )
116
+ logging .info ("Updated Layout without HTML Comments: %s" , layout_holder .content )
117
+
118
+ return layout_holder
86
119
87
120
async def merge (self , record : dict ) -> dict :
88
121
"""
@@ -94,19 +127,21 @@ async def merge(self, record: dict) -> dict:
94
127
Returns:
95
128
- record (dict): The record containing the image, its caption, and the generated description.
96
129
"""
97
- layout = LayoutHolder (** record ["data" ]["layout" ])
130
+ layout_holder = LayoutHolder (** record ["data" ]["layout" ])
98
131
99
132
figures = [FigureHolder (** figure ) for figure in record ["data" ]["figures" ]]
100
133
101
134
try :
102
- logging .info (f"Input Data: { layout } " )
103
- updated_layout = await self .merge_figures_into_layout (layout , figures )
104
- logging .info (f"Updated Data: { updated_layout } " )
135
+ logging .info (f"Input Data: { layout_holder } " )
136
+ updated_layout = await self .merge_figures_into_layout (
137
+ layout_holder , figures
138
+ )
139
+ logging .info (f"Updated Layout Data: { updated_layout } " )
105
140
except Exception as e :
106
141
logging .error (f"Failed to merge figures into layout. Error: { e } " )
107
142
return {
108
143
"recordId" : record ["recordId" ],
109
- "data" : {} ,
144
+ "data" : None ,
110
145
"errors" : [
111
146
{
112
147
"message" : "Failed to merge figures into layout." ,
0 commit comments