@@ -152,6 +152,10 @@ class VcfPartition:
152152 cname = "zstd" , clevel = 7 , shuffle = numcodecs .Blosc .NOSHUFFLE
153153)
154154
155+ # TODO refactor this to have embedded Contig dataclass, Filters
156+ # and Samples dataclasses to allow for more information to be
157+ # retained and forward compatibility.
158+
155159
156160@dataclasses .dataclass
157161class IcfMetadata :
@@ -1242,6 +1246,50 @@ def new(**kwargs):
12421246 spec ._choose_compressor_settings ()
12431247 return spec
12441248
1249+ @staticmethod
1250+ def from_field (
1251+ vcf_field ,
1252+ * ,
1253+ num_variants ,
1254+ num_samples ,
1255+ variants_chunk_size ,
1256+ samples_chunk_size ,
1257+ variable_name = None ,
1258+ ):
1259+ shape = [num_variants ]
1260+ prefix = "variant_"
1261+ dimensions = ["variants" ]
1262+ chunks = [variants_chunk_size ]
1263+ if vcf_field .category == "FORMAT" :
1264+ prefix = "call_"
1265+ shape .append (num_samples )
1266+ chunks .append (samples_chunk_size )
1267+ dimensions .append ("samples" )
1268+ if variable_name is None :
1269+ variable_name = prefix + vcf_field .name
1270+ # TODO make an option to add in the empty extra dimension
1271+ if vcf_field .summary .max_number > 1 :
1272+ shape .append (vcf_field .summary .max_number )
1273+ # TODO we should really be checking this to see if the named dimensions
1274+ # are actually correct.
1275+ if vcf_field .vcf_number == "R" :
1276+ dimensions .append ("alleles" )
1277+ elif vcf_field .vcf_number == "A" :
1278+ dimensions .append ("alt_alleles" )
1279+ elif vcf_field .vcf_number == "G" :
1280+ dimensions .append ("genotypes" )
1281+ else :
1282+ dimensions .append (f"{ vcf_field .category } _{ vcf_field .name } _dim" )
1283+ return ZarrColumnSpec .new (
1284+ vcf_field = vcf_field .full_name ,
1285+ name = variable_name ,
1286+ dtype = vcf_field .smallest_dtype (),
1287+ shape = shape ,
1288+ chunks = chunks ,
1289+ dimensions = dimensions ,
1290+ description = vcf_field .description ,
1291+ )
1292+
12451293 def _choose_compressor_settings (self ):
12461294 """
12471295 Choose compressor and filter settings based on the size and
@@ -1256,8 +1304,10 @@ def _choose_compressor_settings(self):
12561304 # performance in some cases anyway. Turning on shuffle should be a
12571305 # deliberate choice.
12581306 shuffle = numcodecs .Blosc .NOSHUFFLE
1259- if dt .itemsize == 1 :
1260- # Any 1 byte field gets BITSHUFFLE by default
1307+ if self .name == "call_genotype" and dt .itemsize == 1 :
1308+ # call_genotype gets BITSHUFFLE by default as it gets
1309+ # significantly better compression (at a cost of slower
1310+ # decoding)
12611311 shuffle = numcodecs .Blosc .BITSHUFFLE
12621312 self .compressor ["shuffle" ] = shuffle
12631313
@@ -1313,6 +1363,16 @@ def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
13131363 f"Generating schema with chunks={ variants_chunk_size , samples_chunk_size } "
13141364 )
13151365
1366+ def spec_from_field (field , variable_name = None ):
1367+ return ZarrColumnSpec .from_field (
1368+ field ,
1369+ num_samples = n ,
1370+ num_variants = m ,
1371+ samples_chunk_size = samples_chunk_size ,
1372+ variants_chunk_size = variants_chunk_size ,
1373+ variable_name = variable_name ,
1374+ )
1375+
13161376 def fixed_field_spec (
13171377 name , dtype , vcf_field = None , shape = (m ,), dimensions = ("variants" ,)
13181378 ):
@@ -1349,67 +1409,31 @@ def fixed_field_spec(
13491409 dimensions = ["variants" , "alleles" ],
13501410 ),
13511411 fixed_field_spec (
1352- vcf_field = "POS" ,
1353- name = "variant_position" ,
1354- dtype = "i4" ,
1355- ),
1356- fixed_field_spec (
1357- vcf_field = None ,
13581412 name = "variant_id" ,
13591413 dtype = "str" ,
13601414 ),
13611415 fixed_field_spec (
1362- vcf_field = None ,
13631416 name = "variant_id_mask" ,
13641417 dtype = "bool" ,
13651418 ),
1366- fixed_field_spec (
1367- vcf_field = "QUAL" ,
1368- name = "variant_quality" ,
1369- dtype = "f4" ,
1370- ),
13711419 ]
1420+ name_map = {field .full_name : field for field in icf .metadata .fields }
1421+
1422+ # Only two of the fixed fields have a direct one-to-one mapping.
1423+ colspecs .extend (
1424+ [
1425+ spec_from_field (name_map ["QUAL" ], variable_name = "variant_quality" ),
1426+ spec_from_field (name_map ["POS" ], variable_name = "variant_position" ),
1427+ ]
1428+ )
1429+ colspecs .extend ([spec_from_field (field ) for field in icf .metadata .info_fields ])
13721430
13731431 gt_field = None
1374- for field in icf .metadata .fields :
1375- if field .category == "fixed" :
1376- continue
1432+ for field in icf .metadata .format_fields :
13771433 if field .name == "GT" :
13781434 gt_field = field
13791435 continue
1380- shape = [m ]
1381- prefix = "variant_"
1382- dimensions = ["variants" ]
1383- chunks = [variants_chunk_size ]
1384- if field .category == "FORMAT" :
1385- prefix = "call_"
1386- shape .append (n )
1387- chunks .append (samples_chunk_size )
1388- dimensions .append ("samples" )
1389- # TODO make an option to add in the empty extra dimension
1390- if field .summary .max_number > 1 :
1391- shape .append (field .summary .max_number )
1392- # TODO we should really be checking this to see if the named dimensions
1393- # are actually correct.
1394- if field .vcf_number == "R" :
1395- dimensions .append ("alleles" )
1396- elif field .vcf_number == "A" :
1397- dimensions .append ("alt_alleles" )
1398- elif field .vcf_number == "G" :
1399- dimensions .append ("genotypes" )
1400- else :
1401- dimensions .append (f"{ field .category } _{ field .name } _dim" )
1402- variable_name = prefix + field .name
1403- colspec = ZarrColumnSpec .new (
1404- vcf_field = field .full_name ,
1405- name = variable_name ,
1406- dtype = field .smallest_dtype (),
1407- shape = shape ,
1408- chunks = chunks ,
1409- dimensions = dimensions ,
1410- description = field .description ,
1411- )
1412- colspecs .append (colspec )
1436+ colspecs .append (spec_from_field (field ))
14131437
14141438 if gt_field is not None :
14151439 ploidy = gt_field .summary .max_number - 1
0 commit comments