Remove explicit sample, contig and filter lists from schemas

benjeffery · benjeffery · commit f3c040dcdf7f · 2025-04-03T14:15:00.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,7 @@
 # 0.1.5 2025-03-xx
 
 - Add support for merging contig IDs across multiple VCFs (#342)
+- Remove explicit sample, contig and filter lists from the schema (#343)
 
 # 0.1.4 2025-03-10
 
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -15,7 +15,7 @@ def __init__(self, path):
         self.path = path
         self.bed = bed_reader.open_bed(path, num_threads=1)
         self.num_records = self.bed.sid_count
-        self.samples = list(self.bed.iid)
+        self.samples = [schema.Sample(id=sample) for sample in self.bed.iid]
         self.num_samples = len(self.samples)
         self.root_attrs = {}
 
@@ -122,9 +122,6 @@ def generate_schema(
         samples_chunk_size=samples_chunk_size,
         variants_chunk_size=variants_chunk_size,
         fields=array_specs,
-        samples=[schema.Sample(id=sample) for sample in bed.iid],
-        contigs=[],
-        filters=[],
     )
 
 
diff --git a/bio2zarr/schema.py b/bio2zarr/schema.py
@@ -9,7 +9,7 @@
 
 logger = logging.getLogger(__name__)
 
-ZARR_SCHEMA_FORMAT_VERSION = "0.4"
+ZARR_SCHEMA_FORMAT_VERSION = "0.5"
 
 DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
 
@@ -179,9 +179,6 @@ class VcfZarrSchema(core.JsonDataclass):
     format_version: str
     samples_chunk_size: int
     variants_chunk_size: int
-    samples: list
-    contigs: list
-    filters: list
     fields: list
 
     def validate(self):
@@ -213,9 +210,6 @@ def fromdict(d):
                 f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
             )
         ret = VcfZarrSchema(**d)
-        ret.samples = [Sample(**sd) for sd in d["samples"]]
-        ret.contigs = [Contig(**sd) for sd in d["contigs"]]
-        ret.filters = [Filter(**sd) for sd in d["filters"]]
         ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
         return ret
 
diff --git a/bio2zarr/vcf2zarr/icf.py b/bio2zarr/vcf2zarr/icf.py
@@ -889,7 +889,15 @@ def num_partitions(self):
 
     @property
     def samples(self):
-        return [sample.id for sample in self.metadata.samples]
+        return self.metadata.samples
+
+    @property
+    def contigs(self):
+        return self.metadata.contigs
+
+    @property
+    def filters(self):
+        return self.metadata.filters
 
     @property
     def num_samples(self):
diff --git a/bio2zarr/vcf2zarr/vcz.py b/bio2zarr/vcf2zarr/vcz.py
@@ -226,9 +226,6 @@ def fixed_field_spec(
         samples_chunk_size=samples_chunk_size,
         variants_chunk_size=variants_chunk_size,
         fields=array_specs,
-        samples=icf.metadata.samples,
-        contigs=icf.metadata.contigs,
-        filters=icf.metadata.filters,
     )
 
 
diff --git a/bio2zarr/writer.py b/bio2zarr/writer.py
@@ -238,9 +238,12 @@ def init(
         root.attrs.update(self.source.root_attrs)
 
         # Doing this synchronously - this is fine surely
-        self.encode_samples(root)
-        self.encode_filter_id(root)
-        self.encode_contig_id(root)
+        if hasattr(self.source, "samples"):
+            self.encode_samples(root)
+        if hasattr(self.source, "filters"):
+            self.encode_filter_id(root)
+        if hasattr(self.source, "contigs"):
+            self.encode_contigs(root)
 
         self.wip_path.mkdir()
         self.arrays_path.mkdir()
@@ -267,33 +270,33 @@ def init(
         )
 
     def encode_samples(self, root):
-        if [s.id for s in self.schema.samples] != self.source.samples:
-            raise ValueError("Subsetting or reordering samples not supported currently")
+        samples = self.source.samples
         array = root.array(
             "sample_id",
-            data=[sample.id for sample in self.schema.samples],
-            shape=len(self.schema.samples),
+            data=[sample.id for sample in samples],
+            shape=len(samples),
             dtype="str",
             compressor=schema.DEFAULT_ZARR_COMPRESSOR,
             chunks=(self.schema.samples_chunk_size,),
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
         logger.debug("Samples done")
 
-    def encode_contig_id(self, root):
+    def encode_contigs(self, root):
+        contigs = self.source.contigs
         array = root.array(
             "contig_id",
-            data=[contig.id for contig in self.schema.contigs],
-            shape=len(self.schema.contigs),
+            data=[contig.id for contig in contigs],
+            shape=len(contigs),
             dtype="str",
             compressor=schema.DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-        if all(contig.length is not None for contig in self.schema.contigs):
+        if all(contig.length is not None for contig in contigs):
             array = root.array(
                 "contig_length",
-                data=[contig.length for contig in self.schema.contigs],
-                shape=len(self.schema.contigs),
+                data=[contig.length for contig in contigs],
+                shape=len(contigs),
                 dtype=np.int64,
                 compressor=schema.DEFAULT_ZARR_COMPRESSOR,
             )
@@ -302,10 +305,11 @@ def encode_contig_id(self, root):
     def encode_filter_id(self, root):
         # TODO need a way to store description also
         # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
+        filters = self.source.filters
         array = root.array(
             "filter_id",
-            data=[filt.id for filt in self.schema.filters],
-            shape=len(self.schema.filters),
+            data=[filt.id for filt in filters],
+            shape=len(filters),
             dtype="str",
             compressor=schema.DEFAULT_ZARR_COMPRESSOR,
         )
diff --git a/tests/test_vcz.py b/tests/test_vcz.py
@@ -210,12 +210,6 @@ def test_generated_no_fields(self, icf_path):
         schema.fields.clear()
         self.assert_json_round_trip(schema)
 
-    def test_generated_no_samples(self, icf_path):
-        icf = vcf2zarr.IntermediateColumnarFormat(icf_path)
-        schema = vcz_mod.generate_schema(icf)
-        schema.samples.clear()
-        self.assert_json_round_trip(schema)
-
     def test_generated_change_dtype(self, icf_path):
         icf = vcf2zarr.IntermediateColumnarFormat(icf_path)
         schema = vcz_mod.generate_schema(icf)
@@ -344,23 +338,6 @@ def test_chunk_size(self, schema):
         assert schema.samples_chunk_size == 10000
         assert schema.variants_chunk_size == 1000
 
-    def test_samples(self, schema):
-        assert schema.asdict()["samples"] == [
-            {"id": s} for s in ["NA00001", "NA00002", "NA00003"]
-        ]
-
-    def test_contigs(self, schema):
-        assert schema.asdict()["contigs"] == [
-            {"id": s, "length": None} for s in ["19", "20", "X"]
-        ]
-
-    def test_filters(self, schema):
-        assert schema.asdict()["filters"] == [
-            {"id": "PASS", "description": "All filters passed"},
-            {"id": "s50", "description": "Less than 50% of samples have data"},
-            {"id": "q10", "description": "Quality below 10"},
-        ]
-
     def test_variant_contig(self, schema):
         assert get_field_dict(schema, "variant_contig") == {
             "name": "variant_contig",
@@ -512,18 +489,6 @@ class TestVcfDescriptions:
     def test_fields(self, schema, field, description):
         assert schema.field_map()[field].description == description
 
-    @pytest.mark.parametrize(
-        ("filt", "description"),
-        [
-            ("PASS", "All filters passed"),
-            ("s50", "Less than 50% of samples have data"),
-            ("q10", "Quality below 10"),
-        ],
-    )
-    def test_filters(self, schema, filt, description):
-        d = {f.id: f.description for f in schema.filters}
-        assert d[filt] == description
-
 
 class TestVcfZarrWriterExample:
     arrays = (
@@ -696,33 +661,6 @@ def test_call_fields(self, tmp_path, field):
             vcf2zarr.explode(tmp_path / "x.icf", [tmp_path / "test.vcf.gz"])
 
 
-class TestBadSchemaChanges:
-    # [{'id': 'NA00001'}, {'id': 'NA00002'}, {'id': 'NA00003'}],
-    @pytest.mark.parametrize(
-        "samples",
-        [
-            [],
-            [{"id": "NA00001"}, {"id": "NA00003"}],
-            [{"id": "NA00001"}, {"id": "NA00002"}, {"id": "NA00004"}],
-            [
-                {"id": "NA00001"},
-                {"id": "NA00002"},
-                {"id": "NA00003"},
-                {"id": "NA00004"},
-            ],
-            [{"id": "NA00001"}, {"id": "NA00003"}, {"id": "NA00002"}],
-        ],
-    )
-    def test_removed_samples(self, tmp_path, schema, icf_path, samples):
-        d = schema.asdict()
-        d["samples"] = samples
-        schema_path = tmp_path / "schema.json"
-        with open(schema_path, "w") as f:
-            json.dump(d, f)
-        with pytest.raises(ValueError, match="Subsetting or reordering samples"):
-            vcf2zarr.encode(icf_path, tmp_path / "z", schema_path=schema_path)
-
-
 class TestInspect:
     def test_icf(self, icf_path):
         df = pd.DataFrame(vcz_mod.inspect(icf_path))

Original file line number	Diff line number	Diff line change
`@@ -226,9 +226,6 @@ def fixed_field_spec(`
`226`	`226`	`samples_chunk_size=samples_chunk_size,`
`227`	`227`	`variants_chunk_size=variants_chunk_size,`
`228`	`228`	`fields=array_specs,`
`229`		`- samples=icf.metadata.samples,`
`230`		`- contigs=icf.metadata.contigs,`
`231`		`- filters=icf.metadata.filters,`
`232`	`229`	`)`
`233`	`230`
`234`	`231`