 # End-of-sentence marker.
 EOS = text_encoder.EOS_ID

-# For Macedonian-English the SETimes corpus
+# For English-Macedonian the SETimes corpus
 # from http://nlp.ffzg.hr/resources/corpora/setimes/ is used.
 # The original dataset has 207,777 parallel sentences.
 # For training the first 205,777 sentences are used.
-_MKEN_TRAIN_DATASETS = [[
+_ENMK_TRAIN_DATASETS = [[
     "https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.train.tgz",  # pylint: disable=line-too-long
-    ("train.mk", "train.en")
+    ("train.en", "train.mk")
 ]]

 # For development 1000 parallel sentences are used.
-_MKEN_TEST_DATASETS = [[
+_ENMK_TEST_DATASETS = [[
     "https://github.com/stefan-it/nmt-mk-en/raw/master/data/setimes.mk-en.dev.tgz",  # pylint: disable=line-too-long
-    ("dev.mk", "dev.en")
+    ("dev.en", "dev.mk")
 ]]


 @registry.register_problem
 class TranslateEnmkSetimes32k(translate.TranslateProblem):
-  """Problem spec for SETimes Mk-En translation."""
+  """Problem spec for SETimes En-Mk translation."""

   @property
   def approx_vocab_size(self):
     return 2**15  # 32768

   @property
   def vocab_filename(self):
-    return "vocab.mken.%d" % self.approx_vocab_size
+    return "vocab.enmk.%d" % self.approx_vocab_size

   def source_data_files(self, dataset_split):
     train = dataset_split == problem.DatasetSplit.TRAIN
-    return _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS
+    return _ENMK_TRAIN_DATASETS if train else _ENMK_TEST_DATASETS
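As a quick sanity check of the rename (a minimal sketch, not part of the diff; it assumes a standard tensor2tensor install where the registry derives the snake_case problem name from the class name):

from tensor2tensor.utils import registry

# "translate_enmk_setimes32k" is the name the registry derives from the
# TranslateEnmkSetimes32k class registered above.
enmk_problem = registry.problem("translate_enmk_setimes32k")

# With approx_vocab_size = 2**15, the renamed vocab file should now be
# "vocab.enmk.32768" instead of "vocab.mken.32768".
print(enmk_problem.vocab_filename)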