
Commit 1fc6766

T2T Team authored and Ryan Sepassi committed
Create a Problem class for the lm1b dataset.
PiperOrigin-RevId: 166906734
1 parent a8ee62a commit 1fc6766

3 files changed: +115 additions, −33 deletions

tensor2tensor/data_generators/lm1b.py

Lines changed: 69 additions & 27 deletions
@@ -29,8 +29,10 @@
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensor2tensor.data_generators import generator_utils
+from tensor2tensor.data_generators import problem
 from tensor2tensor.data_generators import text_encoder
 from tensor2tensor.data_generators import tokenizer
+from tensor2tensor.utils import registry
 
 import tensorflow as tf

@@ -53,7 +55,7 @@ def _original_vocab(tmp_dir):
   """
   vocab_url = ("http://download.tensorflow.org/models/LM_LSTM_CNN/"
                "vocab-2016-09-10.txt")
-  vocab_filename = os.path.basename(vocab_url)
+  vocab_filename = os.path.basename(vocab_url + ".en")
   vocab_filepath = os.path.join(tmp_dir, vocab_filename)
   if not os.path.exists(vocab_filepath):
     generator_utils.maybe_download(tmp_dir, vocab_filename, vocab_url)
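
Why the rename: the new Problem class below locates its vocabulary via its vocab_name property, so the cached copy must be saved as "vocab-2016-09-10.txt.en" even though the download URL itself is unchanged. A minimal sketch of the resulting filename (standard library only; the print is illustrative, not from the commit):

import os

vocab_url = ("http://download.tensorflow.org/models/LM_LSTM_CNN/"
             "vocab-2016-09-10.txt")

# Appending ".en" before taking the basename yields the filename that the
# new Problem's vocab_name property expects.
print(os.path.basename(vocab_url + ".en"))  # -> vocab-2016-09-10.txt.en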
@@ -140,29 +142,69 @@ def _get_or_build_subword_text_encoder(tmp_dir):
   return ret
 
 
-def generator(tmp_dir, train, characters=False):
-  """Generator for lm1b sentences.
-
-  Args:
-    tmp_dir: a string.
-    train: a boolean.
-    characters: a boolean
-
-  Yields:
-    A dictionary {"inputs": [0], "targets": [<subword ids>]}
-  """
-  _maybe_download_corpus(tmp_dir)
-  original_vocab = _original_vocab(tmp_dir)
-  files = (_train_data_filenames(tmp_dir) if train
-           else [_dev_data_filename(tmp_dir)])
-  if characters:
-    encoder = text_encoder.ByteTextEncoder()
-  else:
-    encoder = _get_or_build_subword_text_encoder(tmp_dir)
-  for filepath in files:
-    tf.logging.info("filepath = %s", filepath)
-    for line in tf.gfile.Open(filepath):
-      tokens = encoder.encode(
-          _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
-      tokens.append(EOS)
-      yield {"inputs": [0], "targets": tokens}
+@registry.register_problem("languagemodel_1b32k")
+class LanguagemodelLm1b(problem.Text2TextProblem):
+  """A language model on the One Billion Word Benchmark (lm1b) corpus."""
+
+  @property
+  def is_character_level(self):
+    return False
+
+  @property
+  def has_inputs(self):
+    return True
+
+  @property
+  def input_space_id(self):
+    return problem.SpaceID.EN_TOK
+
+  @property
+  def target_space_id(self):
+    return problem.SpaceID.EN_TOK
+
+  @property
+  def num_shards(self):
+    return 10
+
+  @property
+  def vocab_name(self):
+    return "vocab-2016-09-10.txt.en"
+
+  @property
+  def use_subword_tokenizer(self):
+    return True
+
+  @property
+  def targeted_vocab_size(self):
+    return 2**15  # 32768
+
+  @property
+  def use_train_shards_for_dev(self):
+    return True
+
+  def generator(self, tmp_dir, train, characters=False):
+    """Generator for lm1b sentences.
+
+    Args:
+      tmp_dir: a string.
+      train: a boolean.
+      characters: a boolean.
+
+    Yields:
+      A dictionary {"inputs": [0], "targets": [<subword ids>]}
+    """
+    _maybe_download_corpus(tmp_dir)
+    original_vocab = _original_vocab(tmp_dir)
+    files = (_train_data_filenames(tmp_dir) if train
+             else [_dev_data_filename(tmp_dir)])
+    if characters:
+      encoder = text_encoder.ByteTextEncoder()
+    else:
+      encoder = _get_or_build_subword_text_encoder(tmp_dir)
+    for filepath in files:
+      tf.logging.info("filepath = %s", filepath)
+      for line in tf.gfile.Open(filepath):
+        tokens = encoder.encode(
+            _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
+        tokens.append(EOS)
+        yield {"inputs": [0], "targets": tokens}

tensor2tensor/data_generators/text_encoder.py

Lines changed: 19 additions & 6 deletions
@@ -647,19 +647,32 @@ def _init_alphabet_from_tokens(self, tokens):
     self._alphabet = {c for token in tokens for c in token}
     self._alphabet |= _ESCAPE_CHARS
 
-  def _load_from_file(self, filename):
-    """Load from a file.
+  def _load_from_file_object(self, f):
+    """Load from a file object.
 
     Args:
-      filename: filename to load vocabulary from
+      f: File object to load vocabulary from
     """
     subtoken_strings = []
-    with tf.gfile.Open(filename) as f:
-      for line in f:
-        subtoken_strings.append(native_to_unicode(line.strip()[1:-1]))
+    for line in f:
+      s = line.strip()
+      # Some vocab files wrap words in single quotes, but others don't
+      if (len(s) > 1 and ((s.startswith("'") and s.endswith("'")) or
+                          (s.startswith("\"") and s.endswith("\"")))):
+        s = s[1:-1]
+      subtoken_strings.append(native_to_unicode(s))
     self._init_subtokens_from_list(subtoken_strings)
     self._init_alphabet_from_tokens(subtoken_strings)
 
+  def _load_from_file(self, filename):
+    """Load from a file.
+
+    Args:
+      filename: Filename to load vocabulary from
+    """
+    with tf.gfile.Open(filename) as f:
+      self._load_from_file_object(f)
+
   def store_to_file(self, filename):
     with tf.gfile.Open(filename, "w") as f:
       for subtoken_string in self._all_subtoken_strings:
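
The effect of the refactor is easiest to see with an in-memory file: quoted and unquoted vocab lines now load to the same subtokens. A minimal sketch using the private _load_from_file_object from the diff above (a private helper, shown for illustration only):

import io

from tensor2tensor.data_generators import text_encoder

# Unquoted and single-quoted vocab files should now load identically.
for raw in ("the\nand\nof\n", "'the'\n'and'\n'of'\n"):
  enc = text_encoder.SubwordTextEncoder()
  enc._load_from_file_object(io.StringIO(raw))
  print(enc._all_subtoken_strings)  # ['the', 'and', 'of'] in both cases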

tensor2tensor/data_generators/text_encoder_test.py

Lines changed: 27 additions & 0 deletions
@@ -21,6 +21,7 @@
 from __future__ import unicode_literals
 
 import collections
+import io
 import os
 import shutil
 
@@ -31,6 +32,14 @@
 import tensorflow as tf
 
 
+class NativeToUnicodeTest(tf.test.TestCase):
+
+  def test_native_to_unicode(self):
+    s = r'foo bar'
+    self.assertIsInstance(text_encoder.native_to_unicode(s), unicode)
+    self.assertEqual(text_encoder.native_to_unicode(s), u'foo bar')
+
+
 class EscapeUnescapeTokenTest(tf.test.TestCase):
 
   def test_escape_token(self):
 
@@ -186,6 +195,24 @@ def test_raises_exception_when_not_encodable(self):
     with self.assertRaises(AssertionError):
       encoder.encode(original)
 
+  def test_load_from_file(self):
+    # Test a vocab file with words not wrapped with single quotes
+    encoder = text_encoder.SubwordTextEncoder()
+    correct_vocab = ['the', 'and', 'of']
+    vocab = io.StringIO('the\n'
+                        'and\n'
+                        'of\n')
+    encoder._load_from_file_object(vocab)
+    self.assertEqual(encoder._all_subtoken_strings, correct_vocab)
+
+    # Test a vocab file with words wrapped in single quotes
+    encoder = text_encoder.SubwordTextEncoder()
+    vocab = io.StringIO('\'the\'\n'
+                        '\'and\'\n'
+                        '\'of\'\n')
+    encoder._load_from_file_object(vocab)
+    self.assertEqual(encoder._all_subtoken_strings, correct_vocab)
+
 
 if __name__ == '__main__':
   tf.test.main()
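
One caveat: NativeToUnicodeTest asserts against the unicode builtin, which exists only under Python 2. A version-agnostic variant (my assumption, not part of this commit) could assert on six.text_type instead, since the codebase already depends on six:

import six

from tensor2tensor.data_generators import text_encoder

# six.text_type is unicode on Python 2 and str on Python 3, so this check
# passes under either interpreter.
assert isinstance(text_encoder.native_to_unicode("foo bar"), six.text_type)
assert text_encoder.native_to_unicode("foo bar") == u"foo bar"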
