Added English-Setswana translation problem (#1178)

Jade Abbott · afrozenator · commit 6f1ffa0b8b89 · 2018-10-29T15:47:49.000-07:00
diff --git a/tensor2tensor/data_generators/translate_entn.py b/tensor2tensor/data_generators/translate_entn.py
@@ -0,0 +1,67 @@
+
+# coding=utf-8
+# Copyright 2018 The Tensor2Tensor Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Data generators for the English-Setswana translation data-set."""
+
+import os
+from tensor2tensor.data_generators import generator_utils
+from tensor2tensor.data_generators import problem
+from tensor2tensor.data_generators import text_encoder
+from tensor2tensor.data_generators import text_problems
+from tensor2tensor.data_generators import translate
+from tensor2tensor.utils import registry
+
+import tensorflow as tf
+
+# Module-level alias for text_encoder.EOS_ID (presumably the end-of-sequence
+# token id -- confirm in text_encoder).
+EOS = text_encoder.EOS_ID
+
+
+# Training data for the English-Setswana problem.
+# Each entry is [archive URL, (file name, file name)]; the two names end in
+# ".en" and ".tn", presumably the English and Setswana sides of the parallel
+# corpus inside the archive.
+# The archive is downloaded directly from github.com (the repository path is
+# LauraMartinus/ukuxhumana) rather than through a third-party mirror host.
+_ENTN_TRAIN_DATASETS = [
+    [
+        "https://github.com/LauraMartinus/ukuxhumana/blob/master/data/en_tn/eng_tswane.train.tar.gz?raw=true",
+        (
+            "entn_parallel.train.en",
+            "entn_parallel.train.tn",
+        ),
+    ],
+]
+
+# Evaluation ("dev") data for the English-Setswana problem.
+# Each entry is [archive URL, (file name, file name)]; the two names end in
+# ".en" and ".tn", presumably the English and Setswana sides of the parallel
+# corpus inside the archive.
+# The archive is downloaded directly from github.com (the repository path is
+# LauraMartinus/ukuxhumana) rather than through a third-party mirror host.
+_ENTN_TEST_DATASETS = [
+    [
+        "https://github.com/LauraMartinus/ukuxhumana/blob/master/data/en_tn/eng_tswane.dev.tar.gz?raw=true",
+        (
+            "entn_parallel.dev.en",
+            "entn_parallel.dev.tn",
+        ),
+    ],
+]
+
+
+@registry.register_problem
+class TranslateEntnRma(translate.TranslateProblem):
+  """English-Setswana translation problem on the RMA Autshumato dataset."""
+
+  @property
+  def approx_vocab_size(self):
+    """Return the approximate vocabulary size for this problem (2**15)."""
+    return 32768  # 2**15
+
+  @property
+  def vocab_filename(self):
+    """Return "vocab.entn.<N>", where N is `approx_vocab_size`."""
+    size = self.approx_vocab_size
+    return "vocab.entn.%d" % size
+
+  def source_data_files(self, dataset_split):
+    """Select the dataset list that belongs to a split.
+
+    Args:
+      dataset_split: split identifier; it is compared against
+        `problem.DatasetSplit.TRAIN`.
+
+    Returns:
+      `_ENTN_TRAIN_DATASETS` when the split is the training split, and
+      `_ENTN_TEST_DATASETS` for every other value.
+    """
+    if dataset_split == problem.DatasetSplit.TRAIN:
+      return _ENTN_TRAIN_DATASETS
+    return _ENTN_TEST_DATASETS