Skip to content

Check dimension sizes for named VCF Number fields #360

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 20 additions & 8 deletions bio2zarr/icf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1056,20 +1056,36 @@ def generate_schema(
if local_alleles is None:
local_alleles = False

max_alleles = max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2)
dimensions = {
"variants": vcz.VcfZarrDimension(
size=m, chunk_size=variants_chunk_size or vcz.DEFAULT_VARIANT_CHUNK_SIZE
),
"samples": vcz.VcfZarrDimension(
size=n, chunk_size=samples_chunk_size or vcz.DEFAULT_SAMPLE_CHUNK_SIZE
),
# ploidy added conditionally below
"alleles": vcz.VcfZarrDimension(
size=max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2)
),
# ploidy and genotypes added conditionally below
"alleles": vcz.VcfZarrDimension(size=max_alleles),
"alt_alleles": vcz.VcfZarrDimension(size=max_alleles - 1),
"filters": vcz.VcfZarrDimension(size=self.metadata.num_filters),
}

# Add ploidy and genotypes dimensions only when needed
max_genotypes = 0
for field in self.metadata.format_fields:
if field.vcf_number == "G":
max_genotypes = max(max_genotypes, field.summary.max_number)
if self.gt_field is not None:
ploidy = max(self.gt_field.summary.max_number - 1, 1)
dimensions["ploidy"] = vcz.VcfZarrDimension(size=ploidy)
max_genotypes = math.comb(max_alleles + ploidy - 1, ploidy)
dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes)
else:
if max_genotypes > 0:
# there is no GT field, but there is at least one Number=G field,
# so need to define genotypes dimension
dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes)

schema_instance = vcz.VcfZarrSchema(
format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION,
dimensions=dimensions,
Expand Down Expand Up @@ -1148,10 +1164,6 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)):
array_specs.append(spec_from_field(field))

if self.gt_field is not None and n > 0:
ploidy = max(self.gt_field.summary.max_number - 1, 1)
# Add ploidy dimension only when needed
schema_instance.dimensions["ploidy"] = vcz.VcfZarrDimension(size=ploidy)

array_specs.append(
vcz.ZarrArraySpec(
name="call_genotype_phased",
Expand Down
36 changes: 26 additions & 10 deletions bio2zarr/vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,19 +170,35 @@ def from_field(
array_name = prefix + vcf_field.name

max_number = vcf_field.max_number
if (max_number > 0 and vcf_field.vcf_number in ("R", "A", "G")) or (
max_number > 1 or vcf_field.full_name == "FORMAT/LAA"
):
# TODO we should really be checking this to see if the named dimensions
# are actually correct.
if vcf_field.vcf_number == "R":
if vcf_field.vcf_number == "R":
max_alleles = schema.dimensions["alleles"].size
if max_number > max_alleles:
raise ValueError(
f"Max number of values {max_number} exceeds max alleles "
f"{max_alleles} for {vcf_field.full_name}"
)
if max_alleles > 0:
dimensions.append("alleles")
elif vcf_field.vcf_number == "A":
elif vcf_field.vcf_number == "A":
max_alt_alleles = schema.dimensions["alt_alleles"].size
if max_number > max_alt_alleles:
raise ValueError(
f"Max number of values {max_number} exceeds max alt alleles "
f"{max_alt_alleles} for {vcf_field.full_name}"
)
if max_alt_alleles > 0:
dimensions.append("alt_alleles")
elif vcf_field.vcf_number == "G":
elif vcf_field.vcf_number == "G":
max_genotypes = schema.dimensions["genotypes"].size
if max_number > max_genotypes:
raise ValueError(
f"Max number of values {max_number} exceeds max genotypes "
f"{max_genotypes} for {vcf_field.full_name}"
)
if max_genotypes > 0:
dimensions.append("genotypes")
else:
dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
elif max_number > 1 or vcf_field.full_name == "FORMAT/LAA":
dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim")
if dimensions[-1] not in schema.dimensions:
schema.dimensions[dimensions[-1]] = VcfZarrDimension(
size=vcf_field.max_number
Expand Down
39 changes: 39 additions & 0 deletions tests/test_vcz.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,3 +857,42 @@ def test_json_serialization(self, icf_path):

assert isinstance(schema2.dimensions["variants"], vcz.VcfZarrDimension)
assert isinstance(schema2.dimensions["samples"], vcz.VcfZarrDimension)


class TestDimensionSizes:
data_path = "tests/data/vcf/field_type_combos.vcf.gz"

@pytest.fixture(scope="class")
def icf(self, tmp_path_factory):
out = tmp_path_factory.mktemp("data") / "example.exploded"
return icf_mod.explode(out, [self.data_path])

@pytest.fixture(scope="class")
def schema(self, icf):
return icf.generate_schema()

@pytest.mark.parametrize(
("vcf_number", "dimensions", "field"),
[
("A", "alt_alleles", "FORMAT/FIA"),
("R", "alleles", "FORMAT/FIR"),
("G", "genotypes", "FORMAT/FIG"),
],
)
def test_max_number_exceeds_dimension_size(
self, icf, schema, vcf_number, dimensions, field
):
vcf_field = icf.fields[field].vcf_field
assert vcf_field.vcf_number == vcf_number
# this should not fail
vcz.ZarrArraySpec.from_field(vcf_field, schema)

# change max number to be bigger than that allowed by vcf number
max_number = schema.dimensions[dimensions].size + 1
vcf_field.summary.max_number = max_number

# creating an array spec should now fail
with pytest.raises(
ValueError, match=f"Max number of values {max_number} exceeds max"
):
vcz.ZarrArraySpec.from_field(vcf_field, schema)