Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 347084c

Browse files
author
Ryan Sepassi
committed
Update text_encoder unicode utilities
PiperOrigin-RevId: 193703965
1 parent d41f0cc commit 347084c

File tree

2 files changed

+25
-13
lines changed

2 files changed

+25
-13
lines changed

tensor2tensor/data_generators/text_encoder.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -60,26 +60,37 @@
6060
_ESCAPE_CHARS = set(u"\\_u;0123456789")
6161

6262

63-
# Conversion between Unicode and UTF-8, if required (on Python2).
64-
if six.PY2:
65-
66-
def native_to_unicode(s):
67-
return s if isinstance(s, unicode) else s.decode("utf-8")
63+
# Unicode utility functions that work with both Python 2 and Python 3.
64+
def native_to_unicode(s):
    """Return `s` as a unicode string.

    A value that `is_unicode` already accepts is handed back unchanged;
    anything else is converted by `to_unicode`.
    """
    if is_unicode(s):
        return s
    return to_unicode(s)
6866

69-
def unicode_to_native(s):
70-
return s.encode("utf-8") if isinstance(s, unicode) else s
71-
else: # No conversion required on Python >= 3.
7267

73-
def native_to_unicode(s):
68+
def unicode_to_native(s):
    """Return `s` as the running interpreter's native string type.

    On Python 2 a unicode value is encoded to a UTF-8 byte string and any
    other value is returned unchanged. On Python 3 `s` is always returned
    unchanged.
    """
    if not six.PY2:
        return s
    if is_unicode(s):
        return s.encode("utf-8")
    return s
7573

76-
def unicode_to_native(s):
74+
75+
def is_unicode(s):
    """Return True if `s` is a text string for the running Python version.

    The text type checked is `unicode` on Python 2 and `str` on Python 3.
    """
    if six.PY2:
        # `unicode` only exists on Python 2, so it is referenced in this
        # branch only.
        return isinstance(s, unicode)
    return isinstance(s, str)
83+
84+
85+
def to_unicode(s, ignore_errors=False):
    """Decode `s` from UTF-8 unless it is already unicode.

    Args:
      s: value to convert; returned unchanged when `is_unicode(s)` is true.
      ignore_errors: when true, decode with the "ignore" error handler
        instead of "strict".

    Returns:
      `s` itself, or the result of `s.decode("utf-8", errors=...)`.
    """
    if not is_unicode(s):
        if ignore_errors:
            error_mode = "ignore"
        else:
            error_mode = "strict"
        return s.decode("utf-8", errors=error_mode)
    return s
7890

7991

8092
def to_unicode_ignore_errors(s):
81-
return (unicode(s, "utf-8", errors="ignore")
82-
if six.PY2 else s.decode("utf-8", "ignore"))
93+
return to_unicode(s, ignore_errors=True)
8394

8495

8596
class TextEncoder(object):

tensor2tensor/utils/cloud_tpu.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import time
3030

3131
from six.moves import input # pylint: disable=redefined-builtin
32+
from tensor2tensor.data_generators import text_encoder
3233
import tensorflow as tf
3334

3435
TPU_IP = "10.240.%d.2"
@@ -216,7 +217,7 @@ def shell_background(cmd_, **kwargs):
216217

217218

218219
def shell_output(cmd_, **kwargs):
219-
return sp.check_output(format_cmd(cmd_, **kwargs))
220+
return text_encoder.to_unicode(sp.check_output(format_cmd(cmd_, **kwargs)))
220221

221222

222223
def shell_run(cmd_, **kwargs):

0 commit comments

Comments
 (0)