Skip to content

Commit 2316e0a

Browse files
committed
Add genotypes dimension to schema
1 parent 635cac1 commit 2316e0a

File tree

2 files changed

+30
-10
lines changed

2 files changed

+30
-10
lines changed

bio2zarr/icf.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,12 +1050,32 @@ def generate_schema(
10501050
"samples": vcz.VcfZarrDimension(
10511051
size=n, chunk_size=samples_chunk_size or 10000
10521052
),
1053-
# ploidy added conditionally below
1053+
# ploidy and genotypes added conditionally below
10541054
"alleles": vcz.VcfZarrDimension(size=max_alleles),
10551055
"alt_alleles": vcz.VcfZarrDimension(size=max_alleles - 1),
10561056
"filters": vcz.VcfZarrDimension(size=self.metadata.num_filters),
10571057
}
10581058

1059+
# Add ploidy and genotypes dimensions only when needed
1060+
gt_field = None
1061+
max_genotypes = 0
1062+
for field in self.metadata.format_fields:
1063+
if field.name == "GT":
1064+
gt_field = field
1065+
continue
1066+
if field.vcf_number == "G":
1067+
max_genotypes = max(max_genotypes, field.summary.max_number)
1068+
if gt_field is not None:
1069+
ploidy = max(gt_field.summary.max_number - 1, 1)
1070+
dimensions["ploidy"] = vcz.VcfZarrDimension(size=ploidy)
1071+
max_genotypes = math.comb(max_alleles + ploidy - 1, ploidy)
1072+
dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes)
1073+
else:
1074+
if max_genotypes > 0:
1075+
# there is no GT field, but there is at least one Number=G field,
1076+
# so need to define genotypes dimension
1077+
dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes)
1078+
10591079
schema_instance = vcz.VcfZarrSchema(
10601080
format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
10611081
dimensions=dimensions,
@@ -1128,18 +1148,12 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
11281148
[spec_from_field(field) for field in self.metadata.info_fields]
11291149
)
11301150

1131-
gt_field = None
11321151
for field in self.metadata.format_fields:
11331152
if field.name == "GT":
1134-
gt_field = field
11351153
continue
11361154
array_specs.append(spec_from_field(field))
11371155

11381156
if gt_field is not None and n > 0:
1139-
ploidy = max(gt_field.summary.max_number - 1, 1)
1140-
# Add ploidy dimension only when needed
1141-
schema_instance.dimensions["ploidy"] = vcz.VcfZarrDimension(size=ploidy)
1142-
11431157
array_specs.append(
11441158
vcz.ZarrArraySpec(
11451159
name="call_genotype_phased",

bio2zarr/vcz.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -190,9 +190,15 @@ def from_field(
190190
)
191191
if max_alt_alleles > 0:
192192
dimensions.append("alt_alleles")
193-
elif max_number > 0 and vcf_field.vcf_number == "G":
194-
# TODO: need max_genotypes
195-
dimensions.append("genotypes")
193+
elif vcf_field.vcf_number == "G":
194+
max_genotypes = schema.dimensions["genotypes"].size
195+
if max_number > max_genotypes:
196+
raise ValueError(
197+
f"Max number of values {max_number} exceeds max genotypes "
198+
f"{max_genotypes} for {vcf_field.full_name}"
199+
)
200+
if max_genotypes > 0:
201+
dimensions.append("genotypes")
196202
elif max_number > 1 or vcf_field.full_name == "FORMAT/LAA":
197203
dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
198204
if dimensions[-1] not in schema.dimensions:

0 commit comments

Comments
 (0)