@@ -45,7 +45,6 @@ from tensor2tensor.data_generators import generator_utils
45
45
from tensor2tensor .data_generators import image
46
46
from tensor2tensor .data_generators import lm1b
47
47
from tensor2tensor .data_generators import snli
48
- from tensor2tensor .data_generators import wiki
49
48
from tensor2tensor .data_generators import wmt
50
49
from tensor2tensor .data_generators import wsj_parsing
51
50
from tensor2tensor .utils import registry
@@ -82,16 +81,6 @@ _SUPPORTED_PROBLEM_GENERATORS = {
82
81
"algorithmic_algebra_inverse" : (
83
82
lambda : algorithmic_math .algebra_inverse (26 , 0 , 2 , 100000 ),
84
83
lambda : algorithmic_math .algebra_inverse (26 , 3 , 3 , 10000 )),
85
- "ice_parsing_tokens" : (
86
- lambda : wmt .tabbed_parsing_token_generator (
87
- FLAGS .data_dir , FLAGS .tmp_dir , True , "ice" , 2 ** 13 , 2 ** 8 ),
88
- lambda : wmt .tabbed_parsing_token_generator (
89
- FLAGS .data_dir , FLAGS .tmp_dir , False , "ice" , 2 ** 13 , 2 ** 8 )),
90
- "ice_parsing_characters" : (
91
- lambda : wmt .tabbed_parsing_character_generator (
92
- FLAGS .data_dir , FLAGS .tmp_dir , True ),
93
- lambda : wmt .tabbed_parsing_character_generator (
94
- FLAGS .data_dir , FLAGS .tmp_dir , False )),
95
84
"wmt_parsing_tokens_8k" : (
96
85
lambda : wmt .parsing_token_generator (
97
86
FLAGS .data_dir , FLAGS .tmp_dir , True , 2 ** 13 ),
@@ -115,10 +104,6 @@ _SUPPORTED_PROBLEM_GENERATORS = {
115
104
lambda : lm1b .generator (FLAGS .tmp_dir , True , characters = True ),
116
105
lambda : lm1b .generator (FLAGS .tmp_dir , False , characters = True )
117
106
),
118
- "wiki_32k" : (
119
- lambda : wiki .generator (FLAGS .tmp_dir , True ),
120
- 1000
121
- ),
122
107
"image_celeba_tune" : (
123
108
lambda : image .celeba_generator (FLAGS .tmp_dir , 162770 ),
124
109
lambda : image .celeba_generator (FLAGS .tmp_dir , 19867 , 162770 )),
@@ -180,17 +165,14 @@ def main(_):
180
165
# Remove parsing if paths are not given.
181
166
if not FLAGS .parsing_path :
182
167
problems = [p for p in problems if "parsing" not in p ]
183
- # Remove en-de BPE if paths are not given.
184
- if not FLAGS .ende_bpe_path :
185
- problems = [p for p in problems if "ende_bpe" not in p ]
186
168
187
169
if not problems :
188
170
problems_str = "\n * " .join (
189
171
sorted (list (_SUPPORTED_PROBLEM_GENERATORS ) + registry .list_problems ()))
190
172
error_msg = ("You must specify one of the supported problems to "
191
173
"generate data for:\n * " + problems_str + "\n " )
192
- error_msg += ("TIMIT, ende_bpe and parsing need data_sets specified with "
193
- "--timit_paths, --ende_bpe_path and --parsing_path." )
174
+ error_msg += ("TIMIT and parsing need data_sets specified with "
175
+ "--timit_paths and --parsing_path." )
194
176
raise ValueError (error_msg )
195
177
196
178
if not FLAGS .data_dir :
@@ -213,34 +195,17 @@ def generate_data_for_problem(problem):
213
195
"""Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
214
196
training_gen , dev_gen = _SUPPORTED_PROBLEM_GENERATORS [problem ]
215
197
216
- if isinstance (dev_gen , int ):
217
- # The dev set and test sets are generated as extra shards using the
218
- # training generator. The integer specifies the number of training
219
- # shards. FLAGS.num_shards is ignored.
220
- num_training_shards = dev_gen
221
- tf .logging .info ("Generating data for %s." , problem )
222
- all_output_files = generator_utils .combined_data_filenames (
223
- problem + generator_utils .UNSHUFFLED_SUFFIX , FLAGS .data_dir ,
224
- num_training_shards )
225
- generator_utils .generate_files (training_gen (), all_output_files ,
226
- FLAGS .max_cases )
227
- else :
228
- # usual case - train data and dev data are generated using separate
229
- # generators.
230
- num_shards = FLAGS .num_shards or 10
231
- tf .logging .info ("Generating training data for %s." , problem )
232
- train_output_files = generator_utils .train_data_filenames (
233
- problem + generator_utils .UNSHUFFLED_SUFFIX , FLAGS .data_dir , num_shards )
234
- generator_utils .generate_files (training_gen (), train_output_files ,
235
- FLAGS .max_cases )
236
- tf .logging .info ("Generating development data for %s." , problem )
237
- dev_shards = 10 if "coco" in problem else 1
238
- dev_output_files = generator_utils .dev_data_filenames (
239
- problem + generator_utils .UNSHUFFLED_SUFFIX , FLAGS .data_dir , dev_shards )
240
- generator_utils .generate_files (dev_gen (), dev_output_files )
241
- all_output_files = train_output_files + dev_output_files
242
-
243
- tf .logging .info ("Shuffling data..." )
198
+ num_shards = FLAGS .num_shards or 10
199
+ tf .logging .info ("Generating training data for %s." , problem )
200
+ train_output_files = generator_utils .train_data_filenames (
201
+ problem + generator_utils .UNSHUFFLED_SUFFIX , FLAGS .data_dir , num_shards )
202
+ generator_utils .generate_files (training_gen (), train_output_files ,
203
+ FLAGS .max_cases )
204
+ tf .logging .info ("Generating development data for %s." , problem )
205
+ dev_output_files = generator_utils .dev_data_filenames (
206
+ problem + generator_utils .UNSHUFFLED_SUFFIX , FLAGS .data_dir , 1 )
207
+ generator_utils .generate_files (dev_gen (), dev_output_files )
208
+ all_output_files = train_output_files + dev_output_files
244
209
generator_utils .shuffle_dataset (all_output_files )
245
210
246
211
0 commit comments