diff --git a/bio2zarr/icf.py b/bio2zarr/icf.py index f828989d..0f12de64 100644 --- a/bio2zarr/icf.py +++ b/bio2zarr/icf.py @@ -1056,6 +1056,7 @@ def generate_schema( if local_alleles is None: local_alleles = False + max_alleles = max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2) dimensions = { "variants": vcz.VcfZarrDimension( size=m, chunk_size=variants_chunk_size or vcz.DEFAULT_VARIANT_CHUNK_SIZE @@ -1063,13 +1064,28 @@ def generate_schema( "samples": vcz.VcfZarrDimension( size=n, chunk_size=samples_chunk_size or vcz.DEFAULT_SAMPLE_CHUNK_SIZE ), - # ploidy added conditionally below - "alleles": vcz.VcfZarrDimension( - size=max(self.fields["ALT"].vcf_field.summary.max_number + 1, 2) - ), + # ploidy and genotypes added conditionally below + "alleles": vcz.VcfZarrDimension(size=max_alleles), + "alt_alleles": vcz.VcfZarrDimension(size=max_alleles - 1), "filters": vcz.VcfZarrDimension(size=self.metadata.num_filters), } + # Add ploidy and genotypes dimensions only when needed + max_genotypes = 0 + for field in self.metadata.format_fields: + if field.vcf_number == "G": + max_genotypes = max(max_genotypes, field.summary.max_number) + if self.gt_field is not None: + ploidy = max(self.gt_field.summary.max_number - 1, 1) + dimensions["ploidy"] = vcz.VcfZarrDimension(size=ploidy) + max_genotypes = math.comb(max_alleles + ploidy - 1, ploidy) + dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes) + else: + if max_genotypes > 0: + # there is no GT field, but there is at least one Number=G field, + # so need to define genotypes dimension + dimensions["genotypes"] = vcz.VcfZarrDimension(size=max_genotypes) + schema_instance = vcz.VcfZarrSchema( format_version=vcz.ZARR_SCHEMA_FORMAT_VERSION, dimensions=dimensions, @@ -1148,10 +1164,6 @@ def fixed_field_spec(name, dtype, source=None, dimensions=("variants",)): array_specs.append(spec_from_field(field)) if self.gt_field is not None and n > 0: - ploidy = max(self.gt_field.summary.max_number - 1, 1) - # Add ploidy dimension only when needed - schema_instance.dimensions["ploidy"] = vcz.VcfZarrDimension(size=ploidy) - array_specs.append( vcz.ZarrArraySpec( name="call_genotype_phased", diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py index 85cf550c..452ef5e2 100644 --- a/bio2zarr/vcz.py +++ b/bio2zarr/vcz.py @@ -170,19 +170,35 @@ def from_field( array_name = prefix + vcf_field.name max_number = vcf_field.max_number - if (max_number > 0 and vcf_field.vcf_number in ("R", "A", "G")) or ( - max_number > 1 or vcf_field.full_name == "FORMAT/LAA" - ): - # TODO we should really be checking this to see if the named dimensions - # are actually correct. - if vcf_field.vcf_number == "R": + if vcf_field.vcf_number == "R": + max_alleles = schema.dimensions["alleles"].size + if max_number > max_alleles: + raise ValueError( + f"Max number of values {max_number} exceeds max alleles " + f"{max_alleles} for {vcf_field.full_name}" + ) + if max_alleles > 0: dimensions.append("alleles") - elif vcf_field.vcf_number == "A": + elif vcf_field.vcf_number == "A": + max_alt_alleles = schema.dimensions["alt_alleles"].size + if max_number > max_alt_alleles: + raise ValueError( + f"Max number of values {max_number} exceeds max alt alleles " + f"{max_alt_alleles} for {vcf_field.full_name}" + ) + if max_alt_alleles > 0: dimensions.append("alt_alleles") - elif vcf_field.vcf_number == "G": + elif vcf_field.vcf_number == "G": + max_genotypes = schema.dimensions["genotypes"].size + if max_number > max_genotypes: + raise ValueError( + f"Max number of values {max_number} exceeds max genotypes " + f"{max_genotypes} for {vcf_field.full_name}" + ) + if max_genotypes > 0: dimensions.append("genotypes") - else: - dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim") + elif max_number > 1 or vcf_field.full_name == "FORMAT/LAA": + dimensions.append(f"{vcf_field.category}_{vcf_field.name}_dim") if dimensions[-1] not in schema.dimensions: schema.dimensions[dimensions[-1]] = VcfZarrDimension( size=vcf_field.max_number diff --git a/tests/test_vcz.py b/tests/test_vcz.py index 63d6194e..8522d0d3 100644 --- a/tests/test_vcz.py +++ b/tests/test_vcz.py @@ -857,3 +857,42 @@ def test_json_serialization(self, icf_path): assert isinstance(schema2.dimensions["variants"], vcz.VcfZarrDimension) assert isinstance(schema2.dimensions["samples"], vcz.VcfZarrDimension) + + +class TestDimensionSizes: + data_path = "tests/data/vcf/field_type_combos.vcf.gz" + + @pytest.fixture(scope="class") + def icf(self, tmp_path_factory): + out = tmp_path_factory.mktemp("data") / "example.exploded" + return icf_mod.explode(out, [self.data_path]) + + @pytest.fixture(scope="class") + def schema(self, icf): + return icf.generate_schema() + + @pytest.mark.parametrize( + ("vcf_number", "dimensions", "field"), + [ + ("A", "alt_alleles", "FORMAT/FIA"), + ("R", "alleles", "FORMAT/FIR"), + ("G", "genotypes", "FORMAT/FIG"), + ], + ) + def test_max_number_exceeds_dimension_size( + self, icf, schema, vcf_number, dimensions, field + ): + vcf_field = icf.fields[field].vcf_field + assert vcf_field.vcf_number == vcf_number + # this should not fail + vcz.ZarrArraySpec.from_field(vcf_field, schema) + + # change max number to be bigger than that allowed by vcf number + max_number = schema.dimensions[dimensions].size + 1 + vcf_field.summary.max_number = max_number + + # creating an array spec should now fail + with pytest.raises( + ValueError, match=f"Max number of values {max_number} exceeds max" + ): + vcz.ZarrArraySpec.from_field(vcf_field, schema)