
Commit a8ee62a

Author: Ryan Sepassi
Add IMDB sentiment classification dataset
PiperOrigin-RevId: 166905238
Parent: a2cf057

6 files changed: 150 additions, 25 deletions

tensor2tensor/data_generators/all_problems.py
Lines changed: 1 addition & 0 deletions

@@ -26,6 +26,7 @@
 from tensor2tensor.data_generators import desc2code
 from tensor2tensor.data_generators import ice_parsing
 from tensor2tensor.data_generators import image
+from tensor2tensor.data_generators import imdb
 from tensor2tensor.data_generators import lm1b
 from tensor2tensor.data_generators import ptb
 from tensor2tensor.data_generators import snli

tensor2tensor/data_generators/image.py
Lines changed: 2 additions & 1 deletion

@@ -272,7 +272,8 @@ def hparams(self, defaults, model_hparams):
     small_modality = "%s:small_image_modality" % registry.Modalities.IMAGE
     modality = small_modality if self.is_small else registry.Modalities.IMAGE
     p.input_modality = {"inputs": (modality, None)}
-    p.target_modality = (registry.Modalities.CLASS_LABEL, self.num_classes)
+    p.target_modality = ("%s:2d" % registry.Modalities.CLASS_LABEL,
+                         self.num_classes)
     p.batch_size_multiplier = 4 if self.is_small else 256
     p.max_expected_batch_size_per_shard = 8 if self.is_small else 2
     p.loss_multiplier = 3.0 if self.is_small else 1.0

tensor2tensor/data_generators/imdb.py
Lines changed: 124 additions & 0 deletions (new file)

@@ -0,0 +1,124 @@
+# coding=utf-8
+# Copyright 2017 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""IMDB Sentiment Classification Problem."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tarfile
+
+# Dependency imports
+
+from tensor2tensor.data_generators import generator_utils
+from tensor2tensor.data_generators import problem
+from tensor2tensor.data_generators import text_encoder
+from tensor2tensor.utils import registry
+
+import tensorflow as tf
+
+# End-of-sentence marker.
+EOS = text_encoder.EOS_ID
+
+
+@registry.register_problem
+class SentimentIMDB(problem.Problem):
+  """IMDB sentiment classification."""
+  URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
+
+  @property
+  def num_shards(self):
+    return 10
+
+  @property
+  def vocab_file(self):
+    return "sentiment_imdb.vocab"
+
+  @property
+  def targeted_vocab_size(self):
+    return 2**15
+
+  def doc_generator(self, imdb_dir, dataset, include_label=False):
+    dirs = [(os.path.join(imdb_dir, dataset, "pos"), True), (os.path.join(
+        imdb_dir, dataset, "neg"), False)]
+
+    for d, label in dirs:
+      for filename in os.listdir(d):
+        with tf.gfile.Open(os.path.join(d, filename)) as imdb_f:
+          doc = imdb_f.read().strip()
+          if include_label:
+            yield doc, label
+          else:
+            yield doc
+
+  def generator(self, data_dir, tmp_dir, train):
+    """Generate examples."""
+    # Download and extract
+    compressed_filename = os.path.basename(self.URL)
+    download_path = generator_utils.maybe_download(tmp_dir, compressed_filename,
+                                                   self.URL)
+    imdb_dir = os.path.join(tmp_dir, "aclImdb")
+    if not tf.gfile.Exists(imdb_dir):
+      with tarfile.open(download_path, "r:gz") as tar:
+        tar.extractall(tmp_dir)
+
+    # Generate vocab
+    encoder = generator_utils.get_or_generate_vocab_inner(
+        data_dir, self.vocab_file, self.targeted_vocab_size,
+        lambda: self.doc_generator(imdb_dir, "train"))
+
+    # Generate examples
+    dataset = "train" if train else "test"
+    for doc, label in self.doc_generator(imdb_dir, dataset, include_label=True):
+      yield {
+          "inputs": encoder.encode(doc) + [EOS],
+          "targets": [int(label)],
+      }
+
+  def generate_data(self, data_dir, tmp_dir, task_id=-1):
+    train_paths = self.training_filepaths(
+        data_dir, self.num_shards, shuffled=False)
+    dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False)
+    generator_utils.generate_dataset_and_shuffle(
+        self.generator(data_dir, tmp_dir, True), train_paths,
+        self.generator(data_dir, tmp_dir, False), dev_paths)
+
+  def hparams(self, defaults, model_hparams):
+    p = defaults
+    source_vocab_size = self._encoders["inputs"].vocab_size
+    p.input_modality = {
+        "inputs": (registry.Modalities.SYMBOL, source_vocab_size)
+    }
+    p.target_modality = (registry.Modalities.CLASS_LABEL, 2)
+    p.input_space_id = problem.SpaceID.EN_TOK
+    p.target_space_id = problem.SpaceID.GENERIC
+
+  def feature_encoders(self, data_dir):
+    vocab_filename = os.path.join(data_dir, self.vocab_file)
+    encoder = text_encoder.SubwordTextEncoder(vocab_filename)
+    return {
+        "inputs": encoder,
+        "targets": text_encoder.TextEncoder(),
+    }
+
+  def example_reading_spec(self):
+    data_fields = {
+        "inputs": tf.VarLenFeature(tf.int64),
+        "targets": tf.FixedLenFeature([1], tf.int64),
+    }
+    data_items_to_decoders = None
+    return (data_fields, data_items_to_decoders)
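
For orientation, a minimal usage sketch of the new problem follows. The registered name "sentiment_imdb" (the registry derives it from the SentimentIMDB class name) and the directory paths are assumptions for illustration, not part of this commit:

    # Hypothetical usage sketch; the problem name and paths are assumptions.
    from tensor2tensor.data_generators import all_problems  # triggers problem registration
    from tensor2tensor.utils import registry

    imdb_problem = registry.problem("sentiment_imdb")
    # Downloads and extracts aclImdb_v1.tar.gz, builds the subword vocab, and
    # writes sharded train/dev TFRecord files under data_dir.
    imdb_problem.generate_data("/tmp/t2t_data", "/tmp/t2t_tmp")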

tensor2tensor/data_generators/problem_hparams.py
Lines changed: 2 additions & 2 deletions

@@ -147,8 +147,8 @@ def default_problem_hparams():
       # Modalities used to map from input features to a space compatible with
       # chosen model architecture. One modality spec (which is a 2-tuple,
       # (modality_full_name, vocab_size)) per feature key. modality_full_name is
-      # a string type:name, e.g. class_label:class_label_2d. Leaving off the
-      # name uses the default modality for that type (e.g. class_label ==
+      # a string type:name, e.g. class_label:2d. Leaving off the name uses the
+      # default modality for that type (e.g. class_label ==
       # class_label:default).
       input_modality={},
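
To make the renamed convention concrete, here is a small illustrative sketch of modality specs (the vocab sizes are placeholders, not values from this commit):

    # A modality spec is a ("type:name", vocab_size) 2-tuple.
    input_modality = {"inputs": ("symbol", 2**15)}
    target_modality = ("class_label:2d", 2)  # the "2d" class-label variant
    # Leaving off ":name" selects the default for the type:
    # "class_label" is equivalent to "class_label:default".
    default_target_modality = ("class_label", 2)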

tensor2tensor/layers/modalities.py
Lines changed: 14 additions & 11 deletions

@@ -361,9 +361,9 @@ def xnet_resblock(x, filters, res_relu, name):
                              "compress_block_final")


-@registry.register_class_label_modality("default")
+@registry.register_class_label_modality("2d")
 class ClassLabelModality(modality.Modality):
-  """Used for label data."""
+  """Used for label data; if is2d=True, uses Xception flow to logits."""

   def __init__(self, model_hparams, vocab_size, is2d=True):
     super(ClassLabelModality, self).__init__(model_hparams, vocab_size)

@@ -397,9 +397,11 @@ def targets_bottom(self, x):
   def top(self, body_output, _):
     """Transform inputs from model space to target space.

-    Perform the Xception "Exit flow", consisting of a single residual block and
-    two separable convolutional upscalings followed by global spatial average
-    pooling.
+    If instantiated with is2d=True, perform the Xception "Exit flow", consisting
+    of a single residual block and two separable convolutional upscalings
+    followed by global spatial average pooling.
+
+    Otherwise, a single linear layer to logits.

     Args:
       body_output: A Tensor with shape [batch, ?, ?, body_output_size].

@@ -417,11 +419,12 @@ def top(self, body_output, _):
         spatial_dim = tf.to_int32(spatial_dim_float)
         x_depth = int(x.get_shape()[3])
         x = tf.reshape(x, [-1, spatial_dim, spatial_dim, x_depth])
-      x = common_layers.conv_block_downsample(x, self._kernel, self._strides,
-                                              self._padding)
-      x = tf.nn.relu(x)
-      x = tf.reduce_mean(x, axis=[1, 2], keep_dims=True)
-      res = common_layers.conv(x, self._vocab_size, (1, 1))
+        x = common_layers.conv_block_downsample(x, self._kernel, self._strides,
+                                                self._padding)
+        x = tf.nn.relu(x)
+        x = tf.reduce_mean(x, axis=[1, 2], keep_dims=True)
+
+      res = tf.layers.dense(x, self._vocab_size)
       return tf.expand_dims(res, 3)

   def loss(self, top_out, targets, weights_fn=common_layers.weights_all):

@@ -431,7 +434,7 @@ def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
         top_out, targets, weights_fn=weights_fn)


-@registry.register_class_label_modality("class_label_2d")
+@registry.register_class_label_modality("default")
 class ClassLabel1DModality(ClassLabelModality):
   """Used for label data."""
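
Since the non-2d path now ends in a plain linear projection, a standalone sketch of that computation may help (the function name and shape convention are assumptions for illustration):

    import tensorflow as tf

    def class_label_top_1d(body_output, vocab_size):
      # Hypothetical sketch of the non-2d top: a single linear layer to
      # logits, as the updated docstring describes.
      # body_output: [batch, length, 1, depth] -> [batch, length, 1, 1, vocab_size]
      res = tf.layers.dense(body_output, vocab_size)
      return tf.expand_dims(res, 3)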

tensor2tensor/utils/model_builder.py
Lines changed: 7 additions & 11 deletions

@@ -164,12 +164,6 @@ def model_fn(features, targets, mode):
       features = _interactive_input_tensor_to_features_dict(features, my_hp)
     elif FLAGS.decode_from_file:
       features = _decode_input_tensor_to_features_dict(features, my_hp)
-    # A dictionary containing:
-    # - problem_choice: A Tensor containing an integer indicating which problem
-    #   was selected for this run.
-    # - predictions: A Tensor containing the model's output predictions.
-    run_info = dict()
-    run_info["problem_choice"] = features["problem_choice"]

     if targets is not None:
       features["targets"] = targets

@@ -299,11 +293,13 @@ def nth_model(n):

     sharded_logits, total_loss = result_list[1:], result_list[0]
     if mode == tf.contrib.learn.ModeKeys.EVAL:
-      logits = tf.concat(sharded_logits, 0)
       # For evaluation, return the logits layer as our predictions.
-      run_info["predictions"] = logits
-      train_op = None
-      return run_info, total_loss, None
+      logits = tf.concat(sharded_logits, 0)
+      ret = {
+          "predictions": logits,
+          "problem_choice": features["problem_choice"],
+      }
+      return ret, total_loss, None

     assert mode == tf.contrib.learn.ModeKeys.TRAIN

@@ -385,7 +381,7 @@ def nth_model(n):
       del summaries[i]

     tf.logging.info("Global model_fn finished.")
-    return run_info, total_loss, train_op
+    return {"problem_choice": features["problem_choice"]}, total_loss, train_op

   return model_fn
