Skip to content

Commit ee680df

Browse files
authored
Merge pull request #252 from transifex/PH-13655_fix-docx-hyperlinks
Ph 13655 fix docx hyperlinks
2 parents 18efed9 + 90cc90a commit ee680df

File tree

3 files changed

+193
-134
lines changed

3 files changed

+193
-134
lines changed

openformats/formats/office_open_xml/parser.py

Lines changed: 63 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,14 @@ def swap_hyperlink_elements(
5353
replacements[e] = added_format
5454

5555
for text_element, format in six.iteritems(replacements):
56-
text_element.parent.rPr.replaceWith(format)
56+
if text_element.parent.rPr:
57+
if format:
58+
text_element.parent.rPr.replaceWith(format)
59+
else:
60+
text_element.parent.rPr.extract()
61+
else:
62+
if format:
63+
text_element.insert_before(format)
5764

5865
@staticmethod
5966
def _escape_xml(translation):
@@ -164,82 +171,84 @@ def compile_paragraph(cls, paragraph, rels_soup, stringset):
164171
if stringset.get(txid, None) is None:
165172
return
166173

167-
translation = stringset[txid].string
168-
translation = cls._escape_xml(translation)
174+
translation_string = stringset[txid].string
175+
escaped_translation_string = cls._escape_xml(translation_string)
169176

170177
translation_soup = BeautifulSoup(
171-
u'<wrapper>{}</wrapper>'.format(translation), 'xml',
178+
u'<wrapper>{}</wrapper>'.format(escaped_translation_string), 'xml',
172179
).find_all(text=True)
173180

174-
leading_spaces = 0
175-
empty_text_element = None
176181
added_hl_text_elements = defaultdict(list)
177182
deleted_hl_text_elements = defaultdict(list)
183+
empty_text_element = None
184+
elements_for_removal = []
185+
last_element = None
178186

187+
leading_spaces = 0
188+
189+
# First of all try to replace each element translation
190+
# this is the happiest path
179191
for index, text_element in enumerate(text_elements):
180192
text = six.text_type(text_element.text)
193+
181194
# detect text elements that contain no text
182195
# and remove leading whitespace from the next string
183196
if not text.strip():
184197
leading_spaces = len(text) - len(text.strip())
185198
empty_text_element = text_element
186199
continue
200+
201+
last_element = text_element
202+
203+
hyperlink_url = cls.get_hyperlink_url(text_element, rels_soup)
204+
205+
# the text parts of the translation are fewer than the
206+
# text parts of the document, so we will just remove
207+
# any exceeding part from the document
208+
if len(translation_soup) == 0:
209+
elements_for_removal.append(text_element)
210+
continue
187211
else:
188-
hyperlink_url = cls.get_hyperlink_url(
189-
text_element, rels_soup
190-
)
191-
# the text parts of the translation are less that the
192-
# text parts of the document, so we will just remove
193-
# any excessing part from the document
194-
if len(translation_soup) == 0:
195-
cls.remove_text_element(text_element)
196-
continue
197212
translation_part = translation_soup.pop(0)
198213
translation = six.text_type(translation_part)
214+
translation_hyperlink_url = cls.get_translation_hyperlink(translation_part)
215+
199216
if not translation[:leading_spaces].strip():
200217
translation = translation[leading_spaces:]
218+
leading_spaces = 0
201219
else:
202220
if empty_text_element:
203-
cls.remove_text_element(empty_text_element)
221+
elements_for_removal.append(empty_text_element)
204222
empty_text_element = None
205223

206-
leading_spaces = 0
207-
208-
# the text parts of the translation are more that the
209-
# text parts of the document, so we will compress the
210-
# remaining translation parts into one string
211-
if (index == len(text_elements) - 1 and len(translation_soup) > 0):
212-
translation = "".join(
213-
[translation] +
214-
[six.text_type(t) for t in translation_soup]
215-
)
216-
217-
# attempt to find a parent containing `href` attribute
218-
# in order to extract the potential hyperlink url.
219-
translation_hyperlink_url = getattr(
220-
translation_part.find_parent(attrs={'href': True}
221-
), 'attrs', {}).get('href', None)
224+
text_element.clear()
225+
text_element.insert(0, translation)
222226

223227
# Edit in place hyperlink url
224228
if hyperlink_url and translation_hyperlink_url:
225229
cls.set_hyperlink_url(
226230
text_element, rels_soup, translation_hyperlink_url
227231
)
232+
else:
233+
if hyperlink_url:
234+
deleted_hl_text_elements[hyperlink_url]\
235+
.append(text_element)
236+
elif translation_hyperlink_url:
237+
added_hl_text_elements[translation_hyperlink_url]\
238+
.append(text_element)
239+
240+
# the text parts of the translation are more than the
241+
# text parts of the document, so we will compress the
242+
# remaining translation parts into one string
243+
if len(translation_soup) > 0:
244+
translation = last_element.contents[0] + \
245+
"".join([six.text_type(t) for t in translation_soup]
246+
)
247+
last_element.clear()
248+
last_element.insert(0, translation)
228249

229-
# remove hyperlink from source docx
230-
if hyperlink_url and not translation_hyperlink_url:
231-
deleted_hl_text_elements[hyperlink_url].append(text_element)
232-
233-
# create a new hyperlink
234-
if not hyperlink_url and translation_hyperlink_url:
235-
added_hl_text_elements[translation_hyperlink_url].append(
236-
text_element
237-
)
238-
239-
text_element.clear()
240-
text_element.insert(0, translation)
241-
242-
if len(added_hl_text_elements) == len(deleted_hl_text_elements):
250+
if len(added_hl_text_elements) == len(deleted_hl_text_elements)\
251+
and len(added_hl_text_elements) > 0:
243252
cls.swap_hyperlink_elements(
244253
added_hl_text_elements,
245254
deleted_hl_text_elements
@@ -252,3 +261,11 @@ def compile_paragraph(cls, paragraph, rels_soup, stringset):
252261
for url, text_elements in six.iteritems(added_hl_text_elements):
253262
for text_element in text_elements:
254263
cls.create_hyperlink_url(text_element, rels_soup, url)
264+
265+
for element in elements_for_removal:
266+
cls.remove_text_element(element)
267+
268+
def get_translation_hyperlink(self, translation_part):
269+
return getattr(
270+
translation_part.find_parent(attrs={'href': True}
271+
), 'attrs', {}).get('href', None)
4.62 KB
Binary file not shown.

0 commit comments

Comments
 (0)