5
5
6
6
import click
7
7
import coloredlogs
8
+ import humanfriendly
8
9
import numcodecs
9
10
import tabulate
10
11
@@ -39,6 +40,14 @@ def list_commands(self, ctx):
39
40
"zarr_path" , type = click .Path (file_okay = False , dir_okay = True )
40
41
)
41
42
43
+ zarr_path = click .argument (
44
+ "zarr_path" , type = click .Path (exists = True , file_okay = False , dir_okay = True )
45
+ )
46
+
47
+ num_partitions = click .argument ("num_partitions" , type = click .IntRange (min = 1 ))
48
+
49
+ partition = click .argument ("partition" , type = click .IntRange (min = 0 ))
50
+
42
51
verbose = click .option ("-v" , "--verbose" , count = True , help = "Increase verbosity" )
43
52
44
53
force = click .option (
@@ -92,6 +101,27 @@ def list_commands(self, ctx):
92
101
help = "Chunk size in the samples dimension" ,
93
102
)
94
103
104
+ schema = click .option ("-s" , "--schema" , default = None , type = click .Path (exists = True ))
105
+
106
+ max_variant_chunks = click .option (
107
+ "-V" ,
108
+ "--max-variant-chunks" ,
109
+ type = int ,
110
+ default = None ,
111
+ help = (
112
+ "Truncate the output in the variants dimension to have "
113
+ "this number of chunks. Mainly intended to help with "
114
+ "schema tuning."
115
+ ),
116
+ )
117
+
118
+ max_memory = click .option (
119
+ "-M" ,
120
+ "--max-memory" ,
121
+ default = None ,
122
+ help = "An approximate bound on overall memory usage (e.g. 10G)," ,
123
+ )
124
+
95
125
96
126
def setup_logging (verbosity ):
97
127
level = "WARNING"
@@ -158,7 +188,7 @@ def explode(
158
188
@click .command
159
189
@vcfs
160
190
@new_icf_path
161
- @click . argument ( " num_partitions" , type = click . IntRange ( min = 1 ))
191
+ @num_partitions
162
192
@force
163
193
@column_chunk_size
164
194
@compressor
@@ -194,7 +224,7 @@ def dexplode_init(
194
224
195
225
@click .command
196
226
@icf_path
197
- @click . argument ( " partition" , type = click . IntRange ( min = 0 ))
227
+ @partition
198
228
@verbose
199
229
def dexplode_partition (icf_path , partition , verbose ):
200
230
"""
@@ -207,14 +237,14 @@ def dexplode_partition(icf_path, partition, verbose):
207
237
208
238
209
239
@click .command
210
- @click . argument ( "path" , type = click . Path (), required = True )
240
+ @icf_path
211
241
@verbose
212
- def dexplode_finalise (path , verbose ):
242
+ def dexplode_finalise (icf_path , verbose ):
213
243
"""
214
244
Final step for distributed conversion of VCF(s) to intermediate columnar format.
215
245
"""
216
246
setup_logging (verbose )
217
- vcf .explode_finalise (path )
247
+ vcf .explode_finalise (icf_path )
218
248
219
249
220
250
@click .command
@@ -244,26 +274,11 @@ def mkschema(icf_path):
244
274
@new_zarr_path
245
275
@force
246
276
@verbose
247
- @click . option ( "-s" , "-- schema" , default = None , type = click . Path ( exists = True ))
277
+ @schema
248
278
@variants_chunk_size
249
279
@samples_chunk_size
250
- @click .option (
251
- "-V" ,
252
- "--max-variant-chunks" ,
253
- type = int ,
254
- default = None ,
255
- help = (
256
- "Truncate the output in the variants dimension to have "
257
- "this number of chunks. Mainly intended to help with "
258
- "schema tuning."
259
- ),
260
- )
261
- @click .option (
262
- "-M" ,
263
- "--max-memory" ,
264
- default = None ,
265
- help = "An approximate bound on overall memory usage (e.g. 10G)," ,
266
- )
280
+ @max_variant_chunks
281
+ @max_memory
267
282
@worker_processes
268
283
def encode (
269
284
icf_path ,
@@ -288,13 +303,96 @@ def encode(
288
303
schema_path = schema ,
289
304
variants_chunk_size = variants_chunk_size ,
290
305
samples_chunk_size = samples_chunk_size ,
291
- max_v_chunks = max_variant_chunks ,
306
+ max_variant_chunks = max_variant_chunks ,
292
307
worker_processes = worker_processes ,
293
308
max_memory = max_memory ,
294
309
show_progress = True ,
295
310
)
296
311
297
312
313
+ @click .command
314
+ @icf_path
315
+ @new_zarr_path
316
+ @num_partitions
317
+ @force
318
+ @schema
319
+ @variants_chunk_size
320
+ @samples_chunk_size
321
+ @max_variant_chunks
322
+ @verbose
323
+ def dencode_init (
324
+ icf_path ,
325
+ zarr_path ,
326
+ num_partitions ,
327
+ force ,
328
+ schema ,
329
+ variants_chunk_size ,
330
+ samples_chunk_size ,
331
+ max_variant_chunks ,
332
+ verbose ,
333
+ ):
334
+ """
335
+ Initialise conversion of intermediate format to VCF Zarr. This will
336
+ set up the specified ZARR_PATH to perform this conversion over
337
+ NUM_PARTITIONS.
338
+
339
+ The output of this command is the actual number of partitions generated
340
+ (which may be less than the requested number, if there are not sufficient
341
+ chunks in the variants dimension) and a rough lower-bound on the amount
342
+ of memory required to encode a partition.
343
+
344
+ NOTE: the format of this output will likely change in subsequent releases;
345
+ it should not be considered machine-readable for now.
346
+ """
347
+ setup_logging (verbose )
348
+ check_overwrite_dir (zarr_path , force )
349
+ num_partitions , max_memory = vcf .encode_init (
350
+ icf_path ,
351
+ zarr_path ,
352
+ target_num_partitions = num_partitions ,
353
+ schema_path = schema ,
354
+ variants_chunk_size = variants_chunk_size ,
355
+ samples_chunk_size = samples_chunk_size ,
356
+ max_variant_chunks = max_variant_chunks ,
357
+ show_progress = True ,
358
+ )
359
+ formatted_size = humanfriendly .format_size (max_memory , binary = True )
360
+ # NOTE adding the size to the stdout here so that users can parse it
361
+ # and use in their submission scripts. This is a first pass, and
362
+ # will most likely change as we see what works and doesn't.
363
+ # NOTE we probably want to format this as a table, which lists
364
+ # some other properties, line by line
365
+ # NOTE This size number is also not quite enough, you need a bit of
366
+ # headroom with it (probably 10% or so). We should include this.
367
+ click .echo (f"{ num_partitions } \t { formatted_size } " )
368
+
369
+
370
+ @click .command
371
+ @zarr_path
372
+ @partition
373
+ @verbose
374
+ def dencode_partition (zarr_path , partition , verbose ):
375
+ """
376
+ Convert a partition from intermediate columnar format to VCF Zarr.
377
+ Must be called *after* the Zarr path has been initialised with dencode_init.
378
+ Partition indexes must be from 0 (inclusive) to the number of partitions
379
+ returned by dencode_init (exclusive).
380
+ """
381
+ setup_logging (verbose )
382
+ vcf .encode_partition (zarr_path , partition )
383
+
384
+
385
+ @click .command
386
+ @zarr_path
387
+ @verbose
388
+ def dencode_finalise (zarr_path , verbose ):
389
+ """
390
+ Final step for distributed conversion of ICF to VCF Zarr.
391
+ """
392
+ setup_logging (verbose )
393
+ vcf .encode_finalise (zarr_path , show_progress = True )
394
+
395
+
298
396
@click .command (name = "convert" )
299
397
@vcfs
300
398
@new_zarr_path
@@ -382,6 +480,9 @@ def vcf2zarr():
382
480
vcf2zarr .add_command (dexplode_init )
383
481
vcf2zarr .add_command (dexplode_partition )
384
482
vcf2zarr .add_command (dexplode_finalise )
483
+ vcf2zarr .add_command (dencode_init )
484
+ vcf2zarr .add_command (dencode_partition )
485
+ vcf2zarr .add_command (dencode_finalise )
385
486
386
487
387
488
@click .command (name = "convert" )
0 commit comments