@@ -45,7 +45,6 @@ from tensor2tensor.data_generators import generator_utils
45
45
from tensor2tensor .data_generators import image
46
46
from tensor2tensor .data_generators import lm1b
47
47
from tensor2tensor .data_generators import snli
48
- from tensor2tensor .data_generators import wiki
49
48
from tensor2tensor .data_generators import wmt
50
49
from tensor2tensor .data_generators import wsj_parsing
51
50
from tensor2tensor .utils import registry
@@ -105,10 +104,6 @@ _SUPPORTED_PROBLEM_GENERATORS = {
105
104
lambda : lm1b .generator (FLAGS .tmp_dir , True , characters = True ),
106
105
lambda : lm1b .generator (FLAGS .tmp_dir , False , characters = True )
107
106
),
108
- "wiki_32k" : (
109
- lambda : wiki .generator (FLAGS .tmp_dir , True ),
110
- 1000
111
- ),
112
107
"image_celeba_tune" : (
113
108
lambda : image .celeba_generator (FLAGS .tmp_dir , 162770 ),
114
109
lambda : image .celeba_generator (FLAGS .tmp_dir , 19867 , 162770 )),
@@ -170,17 +165,14 @@ def main(_):
170
165
# Remove parsing if paths are not given.
171
166
if not FLAGS .parsing_path :
172
167
problems = [p for p in problems if "parsing" not in p ]
173
- # Remove en-de BPE if paths are not given.
174
- if not FLAGS .ende_bpe_path :
175
- problems = [p for p in problems if "ende_bpe" not in p ]
176
168
177
169
if not problems :
178
170
problems_str = "\n * " .join (
179
171
sorted (list (_SUPPORTED_PROBLEM_GENERATORS ) + registry .list_problems ()))
180
172
error_msg = ("You must specify one of the supported problems to "
181
173
"generate data for:\n * " + problems_str + "\n " )
182
- error_msg += ("TIMIT, ende_bpe and parsing need data_sets specified with "
183
- "--timit_paths, --ende_bpe_path and --parsing_path." )
174
+ error_msg += ("TIMIT and parsing need data_sets specified with "
175
+ "--timit_paths and --parsing_path." )
184
176
raise ValueError (error_msg )
185
177
186
178
if not FLAGS .data_dir :
@@ -203,34 +195,17 @@ def generate_data_for_problem(problem):
203
195
"""Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
204
196
training_gen , dev_gen = _SUPPORTED_PROBLEM_GENERATORS [problem ]
205
197
206
- if isinstance (dev_gen , int ):
207
- # The dev set and test sets are generated as extra shards using the
208
- # training generator. The integer specifies the number of training
209
- # shards. FLAGS.num_shards is ignored.
210
- num_training_shards = dev_gen
211
- tf .logging .info ("Generating data for %s." , problem )
212
- all_output_files = generator_utils .combined_data_filenames (
213
- problem + generator_utils .UNSHUFFLED_SUFFIX , FLAGS .data_dir ,
214
- num_training_shards )
215
- generator_utils .generate_files (training_gen (), all_output_files ,
216
- FLAGS .max_cases )
217
- else :
218
- # usual case - train data and dev data are generated using separate
219
- # generators.
220
- num_shards = FLAGS .num_shards or 10
221
- tf .logging .info ("Generating training data for %s." , problem )
222
- train_output_files = generator_utils .train_data_filenames (
223
- problem + generator_utils .UNSHUFFLED_SUFFIX , FLAGS .data_dir , num_shards )
224
- generator_utils .generate_files (training_gen (), train_output_files ,
225
- FLAGS .max_cases )
226
- tf .logging .info ("Generating development data for %s." , problem )
227
- dev_shards = 10 if "coco" in problem else 1
228
- dev_output_files = generator_utils .dev_data_filenames (
229
- problem + generator_utils .UNSHUFFLED_SUFFIX , FLAGS .data_dir , dev_shards )
230
- generator_utils .generate_files (dev_gen (), dev_output_files )
231
- all_output_files = train_output_files + dev_output_files
232
-
233
- tf .logging .info ("Shuffling data..." )
198
+ num_shards = FLAGS .num_shards or 10
199
+ tf .logging .info ("Generating training data for %s." , problem )
200
+ train_output_files = generator_utils .train_data_filenames (
201
+ problem + generator_utils .UNSHUFFLED_SUFFIX , FLAGS .data_dir , num_shards )
202
+ generator_utils .generate_files (training_gen (), train_output_files ,
203
+ FLAGS .max_cases )
204
+ tf .logging .info ("Generating development data for %s." , problem )
205
+ dev_output_files = generator_utils .dev_data_filenames (
206
+ problem + generator_utils .UNSHUFFLED_SUFFIX , FLAGS .data_dir , 1 )
207
+ generator_utils .generate_files (dev_gen (), dev_output_files )
208
+ all_output_files = train_output_files + dev_output_files
234
209
generator_utils .shuffle_dataset (all_output_files )
235
210
236
211
0 commit comments