@@ -152,6 +152,10 @@ class VcfPartition:
152
152
cname = "zstd" , clevel = 7 , shuffle = numcodecs .Blosc .NOSHUFFLE
153
153
)
154
154
155
+ # TODO refactor this to have embedded Contig dataclass, Filters
156
+ # and Samples dataclasses to allow for more information to be
157
+ # retained and forward compatibility.
158
+
155
159
156
160
@dataclasses .dataclass
157
161
class IcfMetadata :
@@ -1242,6 +1246,50 @@ def new(**kwargs):
1242
1246
spec ._choose_compressor_settings ()
1243
1247
return spec
1244
1248
1249
+ @staticmethod
1250
+ def from_field (
1251
+ vcf_field ,
1252
+ * ,
1253
+ num_variants ,
1254
+ num_samples ,
1255
+ variants_chunk_size ,
1256
+ samples_chunk_size ,
1257
+ variable_name = None ,
1258
+ ):
1259
+ shape = [num_variants ]
1260
+ prefix = "variant_"
1261
+ dimensions = ["variants" ]
1262
+ chunks = [variants_chunk_size ]
1263
+ if vcf_field .category == "FORMAT" :
1264
+ prefix = "call_"
1265
+ shape .append (num_samples )
1266
+ chunks .append (samples_chunk_size )
1267
+ dimensions .append ("samples" )
1268
+ if variable_name is None :
1269
+ variable_name = prefix + vcf_field .name
1270
+ # TODO make an option to add in the empty extra dimension
1271
+ if vcf_field .summary .max_number > 1 :
1272
+ shape .append (vcf_field .summary .max_number )
1273
+ # TODO we should really be checking this to see if the named dimensions
1274
+ # are actually correct.
1275
+ if vcf_field .vcf_number == "R" :
1276
+ dimensions .append ("alleles" )
1277
+ elif vcf_field .vcf_number == "A" :
1278
+ dimensions .append ("alt_alleles" )
1279
+ elif vcf_field .vcf_number == "G" :
1280
+ dimensions .append ("genotypes" )
1281
+ else :
1282
+ dimensions .append (f"{ vcf_field .category } _{ vcf_field .name } _dim" )
1283
+ return ZarrColumnSpec .new (
1284
+ vcf_field = vcf_field .full_name ,
1285
+ name = variable_name ,
1286
+ dtype = vcf_field .smallest_dtype (),
1287
+ shape = shape ,
1288
+ chunks = chunks ,
1289
+ dimensions = dimensions ,
1290
+ description = vcf_field .description ,
1291
+ )
1292
+
1245
1293
def _choose_compressor_settings (self ):
1246
1294
"""
1247
1295
Choose compressor and filter settings based on the size and
@@ -1256,8 +1304,10 @@ def _choose_compressor_settings(self):
1256
1304
# performance in some cases anyway. Turning on shuffle should be a
1257
1305
# deliberate choice.
1258
1306
shuffle = numcodecs .Blosc .NOSHUFFLE
1259
- if dt .itemsize == 1 :
1260
- # Any 1 byte field gets BITSHUFFLE by default
1307
+ if self .name == "call_genotype" and dt .itemsize == 1 :
1308
+ # call_genotype gets BITSHUFFLE by default as it gets
1309
+ # significantly better compression (at a cost of slower
1310
+ # decoding)
1261
1311
shuffle = numcodecs .Blosc .BITSHUFFLE
1262
1312
self .compressor ["shuffle" ] = shuffle
1263
1313
@@ -1313,6 +1363,16 @@ def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
1313
1363
f"Generating schema with chunks={ variants_chunk_size , samples_chunk_size } "
1314
1364
)
1315
1365
1366
+ def spec_from_field (field , variable_name = None ):
1367
+ return ZarrColumnSpec .from_field (
1368
+ field ,
1369
+ num_samples = n ,
1370
+ num_variants = m ,
1371
+ samples_chunk_size = samples_chunk_size ,
1372
+ variants_chunk_size = variants_chunk_size ,
1373
+ variable_name = variable_name ,
1374
+ )
1375
+
1316
1376
def fixed_field_spec (
1317
1377
name , dtype , vcf_field = None , shape = (m ,), dimensions = ("variants" ,)
1318
1378
):
@@ -1349,67 +1409,31 @@ def fixed_field_spec(
1349
1409
dimensions = ["variants" , "alleles" ],
1350
1410
),
1351
1411
fixed_field_spec (
1352
- vcf_field = "POS" ,
1353
- name = "variant_position" ,
1354
- dtype = "i4" ,
1355
- ),
1356
- fixed_field_spec (
1357
- vcf_field = None ,
1358
1412
name = "variant_id" ,
1359
1413
dtype = "str" ,
1360
1414
),
1361
1415
fixed_field_spec (
1362
- vcf_field = None ,
1363
1416
name = "variant_id_mask" ,
1364
1417
dtype = "bool" ,
1365
1418
),
1366
- fixed_field_spec (
1367
- vcf_field = "QUAL" ,
1368
- name = "variant_quality" ,
1369
- dtype = "f4" ,
1370
- ),
1371
1419
]
1420
+ name_map = {field .full_name : field for field in icf .metadata .fields }
1421
+
1422
+ # Only two of the fixed fields have a direct one-to-one mapping.
1423
+ colspecs .extend (
1424
+ [
1425
+ spec_from_field (name_map ["QUAL" ], variable_name = "variant_quality" ),
1426
+ spec_from_field (name_map ["POS" ], variable_name = "variant_position" ),
1427
+ ]
1428
+ )
1429
+ colspecs .extend ([spec_from_field (field ) for field in icf .metadata .info_fields ])
1372
1430
1373
1431
gt_field = None
1374
- for field in icf .metadata .fields :
1375
- if field .category == "fixed" :
1376
- continue
1432
+ for field in icf .metadata .format_fields :
1377
1433
if field .name == "GT" :
1378
1434
gt_field = field
1379
1435
continue
1380
- shape = [m ]
1381
- prefix = "variant_"
1382
- dimensions = ["variants" ]
1383
- chunks = [variants_chunk_size ]
1384
- if field .category == "FORMAT" :
1385
- prefix = "call_"
1386
- shape .append (n )
1387
- chunks .append (samples_chunk_size )
1388
- dimensions .append ("samples" )
1389
- # TODO make an option to add in the empty extra dimension
1390
- if field .summary .max_number > 1 :
1391
- shape .append (field .summary .max_number )
1392
- # TODO we should really be checking this to see if the named dimensions
1393
- # are actually correct.
1394
- if field .vcf_number == "R" :
1395
- dimensions .append ("alleles" )
1396
- elif field .vcf_number == "A" :
1397
- dimensions .append ("alt_alleles" )
1398
- elif field .vcf_number == "G" :
1399
- dimensions .append ("genotypes" )
1400
- else :
1401
- dimensions .append (f"{ field .category } _{ field .name } _dim" )
1402
- variable_name = prefix + field .name
1403
- colspec = ZarrColumnSpec .new (
1404
- vcf_field = field .full_name ,
1405
- name = variable_name ,
1406
- dtype = field .smallest_dtype (),
1407
- shape = shape ,
1408
- chunks = chunks ,
1409
- dimensions = dimensions ,
1410
- description = field .description ,
1411
- )
1412
- colspecs .append (colspec )
1436
+ colspecs .append (spec_from_field (field ))
1413
1437
1414
1438
if gt_field is not None :
1415
1439
ploidy = gt_field .summary .max_number - 1
0 commit comments