Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit b929e30

Browse files
authored
Merge pull request #609 from stefan-it/enmk-fix
enmk: use right order of source and target language
2 parents 39ddff9 + 9d1f536 commit b929e30

File tree

1 file changed

+8
-8
lines changed

1 file changed

+8
-8
lines changed

tensor2tensor/data_generators/translate_enmk.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,34 +33,34 @@
3333
# End-of-sentence marker.
3434
EOS = text_encoder.EOS_ID
3535

36-
# For Macedonian-English the SETimes corpus
36+
# For English-Macedonian the SETimes corpus
3737
# from http://nlp.ffzg.hr/resources/corpora/setimes/ is used.
3838
# The original dataset has 207,777 parallel sentences.
3939
# For training the first 205,777 sentences are used.
40-
_MKEN_TRAIN_DATASETS = [[
40+
_ENMK_TRAIN_DATASETS = [[
4141
"https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.train.tgz", # pylint: disable=line-too-long
42-
("train.mk", "train.en")
42+
("train.en", "train.mk")
4343
]]
4444

4545
# For development 1000 parallel sentences are used.
46-
_MKEN_TEST_DATASETS = [[
46+
_ENMK_TEST_DATASETS = [[
4747
"https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.dev.tgz", # pylint: disable=line-too-long
48-
("dev.mk", "dev.en")
48+
("dev.en", "dev.mk")
4949
]]
5050

5151

5252
@registry.register_problem
5353
class TranslateEnmkSetimes32k(translate.TranslateProblem):
54-
"""Problem spec for SETimes Mk-En translation."""
54+
"""Problem spec for SETimes En-Mk translation."""
5555

5656
@property
5757
def approx_vocab_size(self):
5858
return 2**15 # 32768
5959

6060
@property
6161
def vocab_filename(self):
62-
return "vocab.mken.%d" % self.approx_vocab_size
62+
return "vocab.enmk.%d" % self.approx_vocab_size
6363

6464
def source_data_files(self, dataset_split):
6565
train = dataset_split == problem.DatasetSplit.TRAIN
66-
return _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS
66+
return _ENMK_TRAIN_DATASETS if train else _ENMK_TEST_DATASETS

0 commit comments

Comments
 (0)