Skip to content

Commit 8100bfa

Browse files
committed
Add genotypes dimension to schema
1 parent 6dbe396 commit 8100bfa

File tree

2 files changed

+26
-8
lines changed

2 files changed

+26
-8
lines changed

bio2zarr/icf.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,12 +1064,28 @@ def generate_schema(
10641064
"samples": vcz.VcfZarrDimension(
10651065
size=n, chunk_size=samples_chunk_size or vcz.DEFAULT_SAMPLE_CHUNK_SIZE
10661066
),
1067-
# ploidy added conditionally below
1067+
# ploidy and genotypes added conditionally below
10681068
"alleles": vcz.VcfZarrDimension(size=max_alleles),
10691069
"alt_alleles": vcz.VcfZarrDimension(size=max_alleles - 1),
10701070
"filters": vcz.VcfZarrDimension(size=self.metadata.num_filters),
10711071
}
10721072

1073+
# Add ploidy and genotypes dimensions only when needed
1074+
max_genotypes = 0
1075+
for field in self.metadata.format_fields:
1076+
if field.vcf_number == "G":
1077+
max_genotypes = max(max_genotypes, field.summary.max_number)
1078+
if self.gt_field is not None:
1079+
ploidy = max(self.gt_field.summary.max_number - 1, 1)
1080+
dimensions["ploidy"] = vcz.VcfZarrDimension(size=ploidy)
1081+
max_genotypes = math.comb(max_alleles + ploidy - 1, ploidy)
1082+
dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes)
1083+
else:
1084+
if max_genotypes > 0:
1085+
# there is no GT field, but there is at least one Number=G field,
1086+
# so need to define genotypes dimension
1087+
dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes)
1088+
10731089
schema_instance = vcz.VcfZarrSchema(
10741090
format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
10751091
dimensions=dimensions,
@@ -1148,10 +1164,6 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
11481164
array_specs.append(spec_from_field(field))
11491165

11501166
if self.gt_field is not None and n > 0:
1151-
ploidy = max(self.gt_field.summary.max_number - 1, 1)
1152-
# Add ploidy dimension only when needed
1153-
schema_instance.dimensions["ploidy"] = vcz.VcfZarrDimension(size=ploidy)
1154-
11551167
array_specs.append(
11561168
vcz.ZarrArraySpec(
11571169
name="call_genotype_phased",

bio2zarr/vcz.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,9 +188,15 @@ def from_field(
188188
)
189189
if max_alt_alleles > 0:
190190
dimensions.append("alt_alleles")
191-
elif max_number > 0 and vcf_field.vcf_number == "G":
192-
# TODO: need max_genotypes
193-
dimensions.append("genotypes")
191+
elif vcf_field.vcf_number == "G":
192+
max_genotypes = schema.dimensions["genotypes"].size
193+
if max_number > max_genotypes:
194+
raise ValueError(
195+
f"Max number of values {max_number} exceeds max genotypes "
196+
f"{max_genotypes} for {vcf_field.full_name}"
197+
)
198+
if max_genotypes > 0:
199+
dimensions.append("genotypes")
194200
elif max_number > 1 or vcf_field.full_name == "FORMAT/LAA":
195201
dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
196202
if dimensions[-1] not in schema.dimensions:

0 commit comments

Comments
 (0)