This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 8353ef2

Lukasz Kaiser authored and Ryan Sepassi committed
Finish LM1B transfer to Problem, add CNN+DailyMail dataset, style corrections.
PiperOrigin-RevId: 166918589
1 parent 5bf1e82 commit 8353ef2

File tree: 9 files changed (+206, -98 lines)


tensor2tensor/bin/t2t-datagen

Lines changed: 0 additions & 9 deletions
@@ -42,7 +42,6 @@ from tensor2tensor.data_generators import algorithmic_math
 from tensor2tensor.data_generators import all_problems  # pylint: disable=unused-import
 from tensor2tensor.data_generators import audio
 from tensor2tensor.data_generators import generator_utils
-from tensor2tensor.data_generators import lm1b
 from tensor2tensor.data_generators import snli
 from tensor2tensor.data_generators import wmt
 from tensor2tensor.data_generators import wsj_parsing
@@ -92,14 +91,6 @@ _SUPPORTED_PROBLEM_GENERATORS = {
             FLAGS.data_dir, FLAGS.tmp_dir, True, 2**14, 2**9),
         lambda: wsj_parsing.parsing_token_generator(
             FLAGS.data_dir, FLAGS.tmp_dir, False, 2**14, 2**9)),
-    "languagemodel_1b32k": (
-        lambda: lm1b.generator(FLAGS.tmp_dir, True),
-        lambda: lm1b.generator(FLAGS.tmp_dir, False)
-    ),
-    "languagemodel_1b_characters": (
-        lambda: lm1b.generator(FLAGS.tmp_dir, True, characters=True),
-        lambda: lm1b.generator(FLAGS.tmp_dir, False, characters=True)
-    ),
     "inference_snli32k": (
         lambda: snli.snli_token_generator(FLAGS.tmp_dir, True, 2**15),
         lambda: snli.snli_token_generator(FLAGS.tmp_dir, False, 2**15),
tensor2tensor/data_generators/all_problems.py

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@
 from tensor2tensor.data_generators import algorithmic_math
 from tensor2tensor.data_generators import audio
 from tensor2tensor.data_generators import cipher
+from tensor2tensor.data_generators import cnn_dailymail
 from tensor2tensor.data_generators import desc2code
 from tensor2tensor.data_generators import ice_parsing
 from tensor2tensor.data_generators import image
tensor2tensor/data_generators/cnn_dailymail.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
+# coding=utf-8
+# Copyright 2017 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Data generators for the CNN and Daily Mail datasets."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tarfile
+
+# Dependency imports
+
+import six
+from tensor2tensor.data_generators import generator_utils
+from tensor2tensor.data_generators import problem
+from tensor2tensor.data_generators import text_encoder
+from tensor2tensor.utils import registry
+
+import tensorflow as tf
+
+
+# Links to data from http://cs.nyu.edu/~kcho/DMQA/
+_CNN_STORIES_DRIVE_URL = "https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ"
+
+_DAILYMAIL_STORIES_DRIVE_URL = "https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs"
+
+
+# End-of-sentence marker.
+EOS = text_encoder.EOS_ID
+
+
+def _maybe_download_corpora(tmp_dir):
+  """Download corpora if necessary and unzip them.
+
+  Args:
+    tmp_dir: directory containing dataset.
+
+  Returns:
+    filepath of the downloaded corpus file.
+  """
+  cnn_filename = "cnn_stories.tgz"
+  dailymail_filename = "dailymail_stories.tgz"
+  cnn_finalpath = os.path.join(tmp_dir, "cnn/stories/")
+  dailymail_finalpath = os.path.join(tmp_dir, "dailymail/stories/")
+  if not tf.gfile.Exists(cnn_finalpath):
+    cnn_file = generator_utils.maybe_download_from_drive(
+        tmp_dir, cnn_filename, _CNN_STORIES_DRIVE_URL)
+    with tarfile.open(cnn_file, "r:gz") as cnn_tar:
+      cnn_tar.extractall(tmp_dir)
+  if not tf.gfile.Exists(dailymail_finalpath):
+    dailymail_file = generator_utils.maybe_download_from_drive(
+        tmp_dir, dailymail_filename, _CNN_STORIES_DRIVE_URL)
+    with tarfile.open(dailymail_file, "r:gz") as dailymail_tar:
+      dailymail_tar.extractall(tmp_dir)
+  return [cnn_finalpath, dailymail_finalpath]
+
+
+def story_generator(tmp_dir):
+  paths = _maybe_download_corpora(tmp_dir)
+  for path in paths:
+    for story_file in tf.gfile.Glob(path + "*"):
+      story = u""
+      for line in tf.gfile.Open(story_file):
+        line = unicode(line, "utf-8") if six.PY2 else line.decode("utf-8")
+        story += line
+      yield story
+
+
+def _story_summary_split(story):
+  end_pos = story.find("\n\n")  # Upto first empty line.
+  assert end_pos != -1
+  return story[:end_pos], story[end_pos:].strip()
+
+
+@registry.register_problem
+class SummarizeCnnDailymail32k(problem.Text2TextProblem):
+  """Summarize CNN and Daily Mail articles to their first paragraph."""
+
+  @property
+  def is_character_level(self):
+    return False
+
+  @property
+  def has_inputs(self):
+    return True
+
+  @property
+  def input_space_id(self):
+    return problem.SpaceID.EN_TOK
+
+  @property
+  def target_space_id(self):
+    return problem.SpaceID.EN_TOK
+
+  @property
+  def num_shards(self):
+    return 100
+
+  @property
+  def vocab_name(self):
+    return "vocab.cnndailymail"
+
+  @property
+  def use_subword_tokenizer(self):
+    return True
+
+  @property
+  def targeted_vocab_size(self):
+    return 2**15  # 32768
+
+  @property
+  def use_train_shards_for_dev(self):
+    return True
+
+  def generator(self, data_dir, tmp_dir, _):
+    encoder = generator_utils.get_or_generate_vocab_inner(
+        data_dir, self.vocab_file, self.targeted_vocab_size,
+        lambda: story_generator(tmp_dir))
+    for story in story_generator(tmp_dir):
+      summary, rest = _story_summary_split(story)
+      encoded_summary = encoder.encode(summary) + [EOS]
+      encoded_story = encoder.encode(rest) + [EOS]
+      yield {"inputs": encoded_story, "targets": encoded_summary}

tensor2tensor/data_generators/imdb.py

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ def vocab_file(self):

   @property
   def targeted_vocab_size(self):
-    return 2**15
+    return 2**13  # 8k vocab suffices for this small dataset.

   def doc_generator(self, imdb_dir, dataset, include_label=False):
     dirs = [(os.path.join(imdb_dir, dataset, "pos"), True), (os.path.join(

tensor2tensor/data_generators/lm1b.py

Lines changed: 16 additions & 5 deletions
@@ -142,9 +142,9 @@ def _get_or_build_subword_text_encoder(tmp_dir):
   return ret


-@registry.register_problem("languagemodel_1b32k")
-class LanguagemodelLm1b(problem.Text2TextProblem):
-  """A language model on full English Wikipedia."""
+@registry.register_problem
+class LanguagemodelLm1b32k(problem.Text2TextProblem):
+  """A language model on the 1B words corpus."""

   @property
   def is_character_level(self):
@@ -156,6 +156,8 @@ def has_inputs(self):

   @property
   def input_space_id(self):
+    # Ratio of dev tokens (including eos) to dev words (including eos)
+    # 176884 / 159658 = 1.107893; multiply ppx by this to compare results.
     return problem.SpaceID.EN_TOK

   @property
@@ -164,11 +166,11 @@ def target_space_id(self):

   @property
   def num_shards(self):
-    return 10
+    return 100

   @property
   def vocab_name(self):
-    return "vocab-2016-09-10.txt.en"
+    return "vocab.lm1b.en"

   @property
   def use_subword_tokenizer(self):
@@ -208,3 +210,12 @@ def generator(self, tmp_dir, train, characters=False):
           _replace_oov(original_vocab, text_encoder.native_to_unicode(line)))
       tokens.append(EOS)
       yield {"inputs": [0], "targets": tokens}
+
+
+@registry.register_problem
+class LanguagemodelLm1bCharacters(LanguagemodelLm1b32k):
+  """A language model on the 1B words corpus, character level."""
+
+  @property
+  def is_character_level(self):
+    return True
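
The new comment above carries over the token-to-word ratio from the perplexity_exponent field this commit removes from problem_hparams.py (next diff). Since the total dev-set log-likelihood is the same whether it is normalized per subword token or per word, the ratio acts as an exponent on perplexity (equivalently, a multiplier on log-perplexity). A small arithmetic sketch; the per-token perplexity value is made up:

    # Converting per-subword-token perplexity to per-word perplexity on the
    # LM1B dev set, using the ratio quoted above (176884 tokens / 159658 words).
    ratio = 176884.0 / 159658.0      # ~1.107893
    per_token_ppl = 45.0             # hypothetical model result
    per_word_ppl = per_token_ppl ** ratio   # exponentiate, i.e. scale log-perplexity
    print("ratio=%.6f  per-word ppl=%.2f" % (ratio, per_word_ppl))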

tensor2tensor/data_generators/problem_hparams.py

Lines changed: 0 additions & 33 deletions
@@ -267,35 +267,6 @@ def audio_timit_tokens(model_hparams, wrong_vocab_size):
   return p


-def lm1b_32k(model_hparams):
-  """Billion-word language-modeling benchmark, 32k subword vocabulary."""
-  p = default_problem_hparams()
-  # ratio of dev tokens (including eos) to dev words (including eos)
-  # 176884 / 159658 = 1.107893
-  p.perplexity_exponent = 1.107893
-  p.input_modality = {}
-  encoder = text_encoder.SubwordTextEncoder(
-      os.path.join(model_hparams.data_dir, "lm1b_32k.subword_text_encoder"))
-  p.target_modality = (registry.Modalities.SYMBOL, encoder.vocab_size)
-  p.vocabulary = {"targets": encoder}
-  p.target_space_id = 3
-  return p
-
-
-def lm1b_characters(unused_model_hparams):
-  """Billion-word language-modeling benchmark, 32k subword vocabulary."""
-  p = default_problem_hparams()
-  # ratio of dev tokens (including eos) to dev words (including eos)
-  # 826189 / 159658 = 5.174742
-  p.perplexity_exponent = 5.174742
-  p.input_modality = {}
-  encoder = text_encoder.ByteTextEncoder()
-  p.target_modality = (registry.Modalities.SYMBOL, encoder.vocab_size)
-  p.vocabulary = {"targets": encoder}
-  p.target_space_id = 2
-  return p
-
-
 def wmt_parsing_characters(model_hparams):
   """English to parse tree translation benchmark."""
   del model_hparams  # Unused.
@@ -404,10 +375,6 @@ def img2img_imagenet(unused_model_hparams):
         lambda p: audio_timit_tokens(p, 2**13),
     "audio_timit_tokens_8k_test":
         lambda p: audio_timit_tokens(p, 2**13),
-    "languagemodel_1b_characters":
-        lm1b_characters,
-    "languagemodel_1b32k":
-        lm1b_32k,
     "parsing_english_ptb8k":
         lambda p: wmt_parsing_tokens(p, 2**13),
     "parsing_english_ptb16k":

tensor2tensor/data_generators/text_encoder.py

Lines changed: 2 additions & 2 deletions
@@ -657,8 +657,8 @@ def _load_from_file_object(self, f):
     for line in f:
       s = line.strip()
       # Some vocab files wrap words in single quotes, but others don't
-      if (len(s) > 1 and ((s.startswith("'") and s.endswith("'")) or
-                          (s.startswith("\"") and s.endswith("\"")))):
+      if ((s.startswith("'") and s.endswith("'")) or
+          (s.startswith("\"") and s.endswith("\""))):
         s = s[1:-1]
       subtoken_strings.append(native_to_unicode(s))
     self._init_subtokens_from_list(subtoken_strings)
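
The new condition drops the len(s) > 1 guard but keeps the unwrapping of vocab-file lines enclosed in matching single or double quotes. A minimal, self-contained restatement of that logic on sample lines; the helper name strip_wrapping_quotes is mine, for illustration only:

    def strip_wrapping_quotes(s):
      # Mirrors the new condition above: unwrap matching single or double quotes.
      if ((s.startswith("'") and s.endswith("'")) or
          (s.startswith("\"") and s.endswith("\""))):
        return s[1:-1]
      return s

    assert strip_wrapping_quotes("'the_'") == "the_"
    assert strip_wrapping_quotes("\"and_\"") == "and_"
    assert strip_wrapping_quotes("plain_token") == "plain_token"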
