Skip to content

Commit 29549e6

Browse files
authored
Merge pull request #361 from transifex/fix_space_with_format
Enclose spaces with tx tag so that they are restored as separate strings
2 parents 06c0bdb + 9fabfa0 commit 29549e6

File tree

2 files changed

+909
-1
lines changed

2 files changed

+909
-1
lines changed

openformats/formats/pptx.py

Lines changed: 174 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,8 @@
1212
from openformats.handlers import Handler
1313
from openformats.exceptions import MissingParentError
1414
from openformats.formats.office_open_xml.parser import OfficeOpenXmlHandler
15+
from openformats.strings import OpenString
16+
from collections import defaultdict
1517

1618

1719
class PptxFile(object):
@@ -402,3 +404,175 @@ def compile(self, template, stringset, **kwargs):
402404
result = pptx.compress()
403405
pptx.delete()
404406
return result
407+
408+
class PptxHandlerV2(PptxHandler):
    """
    New version of the PptxHandler that handles empty spaces in the text
    elements as normal text, instead of prepending it to the next text
    element.

    Each paragraph is turned into a single source string in which the
    individual text elements are delimited with ``<tx>`` tags (and
    ``<tx href="...">`` for hyperlinked runs), so that on compilation every
    part of the translation can be written back to its own text element.
    """
    name = "PPTX_V2"

    @classmethod
    def parse_paragraph(cls, paragraph, rels_soup):
        """Extract the translatable string of a paragraph.

        :param paragraph: the parsed paragraph element; on success it is
            mutated by storing the hash of the extracted string in its
            ``txid`` attribute.
        :param rels_soup: the parsed relationships document, only forwarded
            to ``cls.get_hyperlink_url``.
        :return: an ``OpenString`` built from the tagged paragraph text
            (passed as both of its first two arguments), or ``None`` when
            the paragraph has no text elements or only whitespace.
        """
        text_elements = paragraph.find_all(cls.TEXT_ELEMENT_TAG)
        if not text_elements:
            return None

        text_elements_count = len(text_elements)
        paragraph_text = []
        open_hyperlink = None

        for index, text_element in enumerate(text_elements):
            text = text_element.text

            try:
                hyperlink_url = cls.get_hyperlink_url(
                    text_element, rels_soup
                )
            except MissingParentError:
                # elements whose hyperlink cannot be resolved are left out
                # of the extracted string altogether
                continue

            # Surround the text with plain tx tags, unless it is:
            #   * the first or the last element of a paragraph that does
            #     not consist of exactly two elements
            #   * an element that opens a new hyperlink (the
            #     `<tx href="...">` tag is added right below instead)
            continues_run = (
                not hyperlink_url or hyperlink_url == open_hyperlink
            )
            is_inner_element = 0 < index < text_elements_count - 1
            if continues_run and (
                text_elements_count == 2 or is_inner_element
            ):
                text = u'<tx>{}</tx>'.format(text)

            if hyperlink_url:
                if not open_hyperlink:
                    # open a hyperlink tag
                    text = u'<tx href="{}">{}'.format(hyperlink_url, text)
                    open_hyperlink = hyperlink_url
                elif hyperlink_url != open_hyperlink:
                    # close the current hyperlink tag and open a new one
                    text = u'</tx><tx href="{}">{}'.format(
                        hyperlink_url, text
                    )
                    open_hyperlink = hyperlink_url
            elif open_hyperlink:
                # the hyperlinked run ended, close its tag
                text = u'</tx>{}'.format(text)
                open_hyperlink = None

            paragraph_text.append(text)

        if open_hyperlink:
            # the paragraph ended inside a hyperlink, close the open tag
            paragraph_text.append(u'</tx>')

        paragraph_text = u''.join(paragraph_text)
        if not paragraph_text.strip():
            return None

        open_string = OpenString(
            paragraph_text,
            paragraph_text,
        )
        paragraph.attrs['txid'] = open_string.string_hash

        return open_string

    @classmethod
    def compile_paragraph(cls, paragraph, rels_soup, stringset, is_rtl=False):
        """Write the translation of a paragraph back into its text elements.

        Declared as a classmethod, like ``parse_paragraph``: the body only
        uses class-level lookups through ``cls``, so it works both when
        invoked on the class and on an instance.

        :param paragraph: the parsed paragraph element; it is modified in
            place. Nothing happens if it has no text elements, no ``txid``
            attribute, or no entry for that ``txid`` in `stringset`.
        :param rels_soup: the parsed relationships document, forwarded to
            the hyperlink helpers.
        :param stringset: mapping from string hash to an object exposing
            the translation in its ``string`` attribute.
        :param is_rtl: when true, ``cls.set_rtl_orientation`` is applied to
            the paragraph.
        :return: ``None``
        """
        text_elements = paragraph.find_all(cls.TEXT_ELEMENT_TAG)
        if not text_elements:
            return

        txid = paragraph.attrs.get('txid')
        if not txid:
            return

        translation_entry = stringset.get(txid, None)
        if translation_entry is None:
            return

        escaped_translation_string = cls._escape_xml(
            translation_entry.string
        )

        # Split the translation into its text parts; wrapping it in a
        # single root element lets it be parsed as one XML document.
        translation_soup = BeautifulSoup(
            u'<wrapper>{}</wrapper>'.format(escaped_translation_string),
            'xml',
        ).find_all(text=True)

        added_hl_text_elements = defaultdict(list)
        deleted_hl_text_elements = defaultdict(list)
        elements_for_removal = []
        last_element = None

        if is_rtl:
            cls.set_rtl_orientation(paragraph)

        # First of all try to replace each element translation
        # this is the happiest path
        for text_element in text_elements:
            last_element = text_element
            try:
                hyperlink_url = cls.get_hyperlink_url(text_element, rels_soup)
            except MissingParentError:
                continue

            # the text parts of the translation are fewer than the
            # text parts of the document, so we will just remove
            # any exceeding part from the document
            if not translation_soup:
                elements_for_removal.append(text_element)
                continue

            translation_part = translation_soup.pop(0)
            translation = six.text_type(translation_part)
            translation_hyperlink_url = cls.get_translation_hyperlink(
                translation_part
            )

            text_element.clear()
            text_element.insert(0, translation)

            if hyperlink_url and translation_hyperlink_url:
                # Edit in place hyperlink url
                cls.set_hyperlink_url(
                    text_element, rels_soup, translation_hyperlink_url
                )
            elif hyperlink_url:
                # the document part had a hyperlink, the translation not
                deleted_hl_text_elements[hyperlink_url].append(text_element)
            elif translation_hyperlink_url:
                # the translation part has a hyperlink, the document not
                added_hl_text_elements[translation_hyperlink_url].append(
                    text_element
                )

        # the text parts of the translation are more than the
        # text parts of the document, so we will compress the
        # remaining translation parts into one string
        if translation_soup and last_element and last_element.contents:
            translation = last_element.contents[0] + "".join(
                [six.text_type(part) for part in translation_soup]
            )
            last_element.clear()
            last_element.insert(0, translation)

        if added_hl_text_elements and (
            len(added_hl_text_elements) == len(deleted_hl_text_elements)
        ):
            cls.swap_hyperlink_elements(
                added_hl_text_elements,
                deleted_hl_text_elements
            )

        for elements in six.itervalues(deleted_hl_text_elements):
            for text_element in elements:
                cls.remove_hyperlink(text_element)

        for url, elements in six.iteritems(added_hl_text_elements):
            for text_element in elements:
                cls.create_hyperlink_url(text_element, rels_soup, url)

        for element in elements_for_removal:
            cls.remove_text_element(element)

0 commit comments

Comments
 (0)