Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 7efdbee

Browse files
Lukasz Kaiser and Ryan Sepassi
authored and committed
Small transformer models (reasonable translations in 1h on 1080).
PiperOrigin-RevId: 164207044
1 parent 554973f commit 7efdbee

File tree

4 files changed

+272
-2
lines changed

4 files changed

+272
-2
lines changed

tensor2tensor/data_generators/all_problems.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from tensor2tensor.data_generators import algorithmic
2323
from tensor2tensor.data_generators import algorithmic_math
2424
from tensor2tensor.data_generators import audio
25+
from tensor2tensor.data_generators import cipher
2526
from tensor2tensor.data_generators import desc2code
2627
from tensor2tensor.data_generators import image
2728
from tensor2tensor.data_generators import lm1b
Lines changed: 251 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,251 @@
1+
# coding=utf-8
2+
# Copyright 2017 The Tensor2Tensor Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Cipher data generators."""
17+
from __future__ import absolute_import
18+
from __future__ import division
19+
from __future__ import print_function
20+
21+
from collections import deque
22+
23+
# Dependency imports
24+
25+
import numpy as np
26+
27+
from tensor2tensor.data_generators import algorithmic
28+
from tensor2tensor.utils import registry
29+
30+
31+
@registry.register_problem
class CipherShift5(algorithmic.AlgorithmicProblem):
  """Shift cipher over a 5-symbol vocabulary."""

  @property
  def num_symbols(self):
    # Size of the plaintext alphabet.
    return 5

  @property
  def distribution(self):
    # Sampling probability of each of the 5 symbols; sums to 1.
    return [0.4, 0.3, 0.2, 0.08, 0.02]

  @property
  def shift(self):
    # Amount by which the cipher rotates the alphabet.
    return 1

  @property
  def train_generator(self):
    """Generator; takes 3 args: nbr_symbols, max_length, nbr_cases."""

    def _make_examples(nbr_symbols, max_length, nbr_cases):
      vocab = range(nbr_symbols)
      plain = generate_plaintext_random(vocab, self.distribution, nbr_cases,
                                        max_length)
      enciphered = encipher_shift(plain, vocab, self.shift)
      for plaintext, ciphertext in zip(plain, enciphered):
        yield {"X": plaintext, "Y": ciphertext}

    return _make_examples

  @property
  def train_length(self):
    return 100

  @property
  def dev_length(self):
    return self.train_length
74+
@registry.register_problem
class CipherVigenere5(algorithmic.AlgorithmicProblem):
  """Vigenere cipher over a 5-symbol vocabulary."""

  @property
  def num_symbols(self):
    # Size of the plaintext alphabet.
    return 5

  @property
  def distribution(self):
    # Sampling probability of each of the 5 symbols; sums to 1.
    return [0.4, 0.3, 0.2, 0.08, 0.02]

  @property
  def key(self):
    # Cyclic Vigenere key: successive characters are shifted by 1 and 3.
    return [1, 3]

  @property
  def train_generator(self):
    """Generator; takes 3 args: nbr_symbols, max_length, nbr_cases."""

    def _gen(nbr_symbols, max_length, nbr_cases):
      plain_vocab = range(nbr_symbols)
      indices = generate_plaintext_random(plain_vocab, self.distribution,
                                          nbr_cases, max_length)
      codes = encipher_vigenere(indices, plain_vocab, self.key)

      for plain, code in zip(indices, codes):
        yield {
            "X": plain,
            "Y": code,
        }

    return _gen

  @property
  def train_length(self):
    return 200

  @property
  def dev_length(self):
    return self.train_length
117+
@registry.register_problem
class CipherShift200(CipherShift5):
  """Shift cipher over a 200-symbol vocabulary."""

  @property
  def num_symbols(self):
    return 200

  @property
  def distribution(self):
    # Probability of symbol v is proportional to v, normalized to sum to 1
    # (so symbol 0 is never sampled).
    weights = range(self.num_symbols)
    total = sum(weights)
    return [w / total for w in weights]
132+
@registry.register_problem
class CipherVigenere200(CipherVigenere5):
  """Vigenere cipher over a 200-symbol vocabulary."""

  @property
  def num_symbols(self):
    return 200

  @property
  def distribution(self):
    # Probability of symbol v is proportional to v, normalized to sum to 1
    # (so symbol 0 is never sampled).
    vals = range(self.num_symbols)
    val_sum = sum(vals)
    return [v / val_sum for v in vals]

  @property
  def key(self):
    # Cyclic Vigenere key: successive characters are shifted by 1 and 3.
    return [1, 3]
151+
class Layer(object):
  """One shift (rotation) layer of a cipher: a bijective character map."""

  def __init__(self, vocab, shift):
    """Build the forward and inverse character maps for a single shift.

    Args:
      vocab: (list of String) the vocabulary
      shift: (Integer) the amount of shift apply to the alphabet.
        Positive number implies shift to the right, negative number
        implies shift to the left.
    """
    self.shift = shift
    # Rotate a copy of the alphabet; deque.rotate(k) moves entries k places
    # to the right for positive k.
    rotated = deque(vocab)
    rotated.rotate(shift)
    rotated = list(rotated)
    # Forward map (encrypt) and its inverse (decrypt).
    self.encrypt = dict(zip(vocab, rotated))
    self.decrypt = dict(zip(rotated, vocab))

  def encrypt_character(self, character):
    return self.encrypt[character]

  def decrypt_character(self, character):
    return self.decrypt[character]
177+
def generate_plaintext_random(plain_vocab, distribution, train_samples,
                              length):
  """Generates samples of text from the provided vocabulary.

  Args:
    plain_vocab: vocabulary of symbols to sample from.
    distribution: optional probability of drawing each symbol; must have the
      same length as plain_vocab when given. If None, symbols are drawn
      uniformly.
    train_samples: number of samples (rows) to generate.
    length: length of each sample.

  Returns:
    train_indices (np.array of Integers): random indices into plain_vocab,
      shape = [train_samples, length].
  """
  if distribution is not None:
    assert len(distribution) == len(plain_vocab)

  train_indices = np.random.choice(
      range(len(plain_vocab)), (train_samples, length), p=distribution)

  return train_indices
203+
def encipher_shift(plaintext, plain_vocab, shift):
  """Encrypt plain text with a single shift layer.

  Args:
    plaintext (list of list of Strings): a list of plain text to encrypt.
    plain_vocab (list of Integer): unique vocabularies being used.
    shift (Integer): number of shift, shift to the right if shift is positive.
  Returns:
    ciphertext (list of Strings): encrypted plain text.
  """
  cipher = Layer(plain_vocab, shift)

  # Iterate over values directly: the indices produced by the previous
  # enumerate() calls were never used.
  ciphertext = []
  for sentence in plaintext:
    cipher_sentence = [cipher.encrypt_character(c) for c in sentence]
    ciphertext.append(cipher_sentence)

  return ciphertext
226+
def encipher_vigenere(plaintext, plain_vocab, key):
  """Encrypt plain text with given key.

  Args:
    plaintext (list of list of Strings): a list of plain text to encrypt.
    plain_vocab (list of Integer): unique vocabularies being used.
    key (list of Integer): key to encrypt cipher using Vigenere table.

  Returns:
    ciphertext (list of Strings): encrypted plain text.
  """
  # Generate the Vigenere table: one shift layer per possible shift amount.
  layers = [Layer(plain_vocab, i) for i in range(len(plain_vocab))]

  ciphertext = []
  key_length = len(key)  # hoisted out of the per-character loop
  for sentence in plaintext:
    cipher_sentence = []
    for j, character in enumerate(sentence):
      # The key repeats cyclically along the sentence; each key value selects
      # which shift layer encrypts this character.
      layer = layers[key[j % key_length]]
      cipher_sentence.append(layer.encrypt_character(character))
    ciphertext.append(cipher_sentence)

  return ciphertext

tensor2tensor/layers/common_layers.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,13 @@ def inverse_exp_decay(max_step, min_value=0.01):
5959
return inv_base**tf.maximum(float(max_step) - step, 0.0)
6060

6161

62+
def inverse_lin_decay(max_step, min_value=0.01):
  """Inverse-decay linearly from min_value (default 0.01) to 1.0 at max_step."""
  global_step = tf.to_float(tf.contrib.framework.get_global_step())
  # Fraction of the decay completed, clipped to 1.0 after max_step.
  fraction = tf.minimum(global_step / float(max_step), 1.0)
  return min_value + (1.0 - min_value) * fraction
6269
def shakeshake2_py(x, y, equal=False, individual=False):
6370
"""The shake-shake sum of 2 tensors, python version."""
6471
if equal:

tensor2tensor/models/transformer.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -386,8 +386,19 @@ def transformer_parsing_ice():
386386
@registry.register_hparams
def transformer_tiny():
  """Tiny transformer hparams: 2 layers, hidden size 128, 4 heads."""
  hparams = transformer_base()
  hparams.num_heads = 4
  hparams.num_hidden_layers = 2
  hparams.hidden_size = 128
  hparams.filter_size = 512
  return hparams
396+
@registry.register_hparams
def transformer_small():
  """Small transformer hparams: 2 layers, hidden size 256, 4 heads."""
  hparams = transformer_base()
  hparams.num_heads = 4
  hparams.num_hidden_layers = 2
  hparams.hidden_size = 256
  hparams.filter_size = 1024
  return hparams
393404

0 commit comments

Comments
 (0)