Skip to content

Commit ad03dfc

Browse files
Some refactoring of schema generation
1 parent d655bcf commit ad03dfc

File tree

1 file changed

+74
-50
lines changed

1 file changed

+74
-50
lines changed

bio2zarr/vcf.py

Lines changed: 74 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,10 @@ class VcfPartition:
152152
cname="zstd", clevel=7, shuffle=numcodecs.Blosc.NOSHUFFLE
153153
)
154154

155+
# TODO refactor this to have embedded Contig dataclass, Filters
156+
# and Samples dataclasses to allow for more information to be
157+
# retained and forward compatibility.
158+
155159

156160
@dataclasses.dataclass
157161
class IcfMetadata:
@@ -1242,6 +1246,50 @@ def new(**kwargs):
12421246
spec._choose_compressor_settings()
12431247
return spec
12441248

1249+
@staticmethod
1250+
def from_field(
1251+
vcf_field,
1252+
*,
1253+
num_variants,
1254+
num_samples,
1255+
variants_chunk_size,
1256+
samples_chunk_size,
1257+
variable_name=None,
1258+
):
1259+
shape = [num_variants]
1260+
prefix = "variant_"
1261+
dimensions = ["variants"]
1262+
chunks = [variants_chunk_size]
1263+
if vcf_field.category == "FORMAT":
1264+
prefix = "call_"
1265+
shape.append(num_samples)
1266+
chunks.append(samples_chunk_size)
1267+
dimensions.append("samples")
1268+
if variable_name is None:
1269+
variable_name = prefix + vcf_field.name
1270+
# TODO make an option to add in the empty extra dimension
1271+
if vcf_field.summary.max_number > 1:
1272+
shape.append(vcf_field.summary.max_number)
1273+
# TODO we should really be checking this to see if the named dimensions
1274+
# are actually correct.
1275+
if vcf_field.vcf_number == "R":
1276+
dimensions.append("alleles")
1277+
elif vcf_field.vcf_number == "A":
1278+
dimensions.append("alt_alleles")
1279+
elif vcf_field.vcf_number == "G":
1280+
dimensions.append("genotypes")
1281+
else:
1282+
dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
1283+
return ZarrColumnSpec.new(
1284+
vcf_field=vcf_field.full_name,
1285+
name=variable_name,
1286+
dtype=vcf_field.smallest_dtype(),
1287+
shape=shape,
1288+
chunks=chunks,
1289+
dimensions=dimensions,
1290+
description=vcf_field.description,
1291+
)
1292+
12451293
def _choose_compressor_settings(self):
12461294
"""
12471295
Choose compressor and filter settings based on the size and
@@ -1256,8 +1304,10 @@ def _choose_compressor_settings(self):
12561304
# performance in some cases anyway. Turning on shuffle should be a
12571305
# deliberate choice.
12581306
shuffle = numcodecs.Blosc.NOSHUFFLE
1259-
if dt.itemsize == 1:
1260-
# Any 1 byte field gets BITSHUFFLE by default
1307+
if self.name == "call_genotype" and dt.itemsize == 1:
1308+
# call_genotype gets BITSHUFFLE by default as it gets
1309+
# significantly better compression (at a cost of slower
1310+
# decoding)
12611311
shuffle = numcodecs.Blosc.BITSHUFFLE
12621312
self.compressor["shuffle"] = shuffle
12631313

@@ -1313,6 +1363,16 @@ def generate(icf, variants_chunk_size=None, samples_chunk_size=None):
13131363
f"Generating schema with chunks={variants_chunk_size, samples_chunk_size}"
13141364
)
13151365

1366+
def spec_from_field(field, variable_name=None):
1367+
return ZarrColumnSpec.from_field(
1368+
field,
1369+
num_samples=n,
1370+
num_variants=m,
1371+
samples_chunk_size=samples_chunk_size,
1372+
variants_chunk_size=variants_chunk_size,
1373+
variable_name=variable_name,
1374+
)
1375+
13161376
def fixed_field_spec(
13171377
name, dtype, vcf_field=None, shape=(m,), dimensions=("variants",)
13181378
):
@@ -1349,67 +1409,31 @@ def fixed_field_spec(
13491409
dimensions=["variants", "alleles"],
13501410
),
13511411
fixed_field_spec(
1352-
vcf_field="POS",
1353-
name="variant_position",
1354-
dtype="i4",
1355-
),
1356-
fixed_field_spec(
1357-
vcf_field=None,
13581412
name="variant_id",
13591413
dtype="str",
13601414
),
13611415
fixed_field_spec(
1362-
vcf_field=None,
13631416
name="variant_id_mask",
13641417
dtype="bool",
13651418
),
1366-
fixed_field_spec(
1367-
vcf_field="QUAL",
1368-
name="variant_quality",
1369-
dtype="f4",
1370-
),
13711419
]
1420+
name_map = {field.full_name: field for field in icf.metadata.fields}
1421+
1422+
# Only two of the fixed fields have a direct one-to-one mapping.
1423+
colspecs.extend(
1424+
[
1425+
spec_from_field(name_map["QUAL"], variable_name="variant_quality"),
1426+
spec_from_field(name_map["POS"], variable_name="variant_position"),
1427+
]
1428+
)
1429+
colspecs.extend([spec_from_field(field) for field in icf.metadata.info_fields])
13721430

13731431
gt_field = None
1374-
for field in icf.metadata.fields:
1375-
if field.category == "fixed":
1376-
continue
1432+
for field in icf.metadata.format_fields:
13771433
if field.name == "GT":
13781434
gt_field = field
13791435
continue
1380-
shape = [m]
1381-
prefix = "variant_"
1382-
dimensions = ["variants"]
1383-
chunks = [variants_chunk_size]
1384-
if field.category == "FORMAT":
1385-
prefix = "call_"
1386-
shape.append(n)
1387-
chunks.append(samples_chunk_size)
1388-
dimensions.append("samples")
1389-
# TODO make an option to add in the empty extra dimension
1390-
if field.summary.max_number > 1:
1391-
shape.append(field.summary.max_number)
1392-
# TODO we should really be checking this to see if the named dimensions
1393-
# are actually correct.
1394-
if field.vcf_number == "R":
1395-
dimensions.append("alleles")
1396-
elif field.vcf_number == "A":
1397-
dimensions.append("alt_alleles")
1398-
elif field.vcf_number == "G":
1399-
dimensions.append("genotypes")
1400-
else:
1401-
dimensions.append(f"{field.category}_{field.name}_dim")
1402-
variable_name = prefix + field.name
1403-
colspec = ZarrColumnSpec.new(
1404-
vcf_field=field.full_name,
1405-
name=variable_name,
1406-
dtype=field.smallest_dtype(),
1407-
shape=shape,
1408-
chunks=chunks,
1409-
dimensions=dimensions,
1410-
description=field.description,
1411-
)
1412-
colspecs.append(colspec)
1436+
colspecs.append(spec_from_field(field))
14131437

14141438
if gt_field is not None:
14151439
ploidy = gt_field.summary.max_number - 1

0 commit comments

Comments
 (0)