Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit 87d0c7f

Browse files
Jade Abbott and Copybara-Service
authored and committed
internal merge of PR #1178
PiperOrigin-RevId: 219207387
1 parent c4c10ff commit 87d0c7f

File tree

1 file changed

+57
-0
lines changed

1 file changed

+57
-0
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# coding=utf-8
2+
# Copyright 2018 The Tensor2Tensor Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
"""Data generators for translation data-sets."""
17+
18+
19+
from tensor2tensor.data_generators import problem
20+
from tensor2tensor.data_generators import text_encoder
21+
from tensor2tensor.data_generators import translate
22+
from tensor2tensor.utils import registry
23+
24+
25+
# Module-level alias for text_encoder.EOS_ID (presumably the end-of-sequence
# token id -- confirm against text_encoder). Not referenced by the rest of the
# visible code in this module.
EOS = text_encoder.EOS_ID
26+
27+
# Base URL of the English-Setswana (en_tn) data directory in the ukuxhumana
# repository. This must point at github.com itself: the archives below are
# fetched by appending a file name plus "?raw=true", which relies on GitHub
# serving the raw file instead of the HTML "blob" page.
_URL = "https://github.com/LauraMartinus/ukuxhumana/blob/master/data/en_tn"

# Each dataset entry is [archive_url, (first_file, second_file)]. The two file
# names carry ".en" / ".tn" suffixes -- presumably the English and Setswana
# sides of the parallel corpus inside the archive (confirm against the archive
# contents).
_ENTN_TRAIN_DATASETS = [[
    _URL + "/eng_tswane.train.tar.gz?raw=true",
    ("entn_parallel.train.en", "entn_parallel.train.tn"),
]]

# Same layout as the training entry, but built from the "dev" archive.
_ENTN_TEST_DATASETS = [[
    _URL + "/eng_tswane.dev.tar.gz?raw=true",
    ("entn_parallel.dev.en", "entn_parallel.dev.tn"),
]]
38+
39+
40+
@registry.register_problem
class TranslateEntnRma(translate.TranslateProblem):
  """English-Setswana translation problem.

  Built on the RMA Autshumato dataset.
  """

  @property
  def approx_vocab_size(self):
    """Approximate vocabulary size: 2**15 (32768)."""
    return 2**15

  @property
  def vocab_filename(self):
    """Vocabulary file name, suffixed with the approximate vocab size."""
    size = self.approx_vocab_size
    return "vocab.entn.%d" % size

  def source_data_files(self, dataset_split):
    """Pick the dataset list for the requested split.

    Args:
      dataset_split: the split being generated; compared against
        problem.DatasetSplit.TRAIN.

    Returns:
      _ENTN_TRAIN_DATASETS for the training split, and _ENTN_TEST_DATASETS
      (the "dev" archive) for every other split.
    """
    if dataset_split == problem.DatasetSplit.TRAIN:
      return _ENTN_TRAIN_DATASETS
    return _ENTN_TEST_DATASETS

0 commit comments

Comments
 (0)