Remove explicit schema data

benjeffery · benjeffery · commit 5ce3baa208c1 · 2025-04-09T00:44:25.000+01:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,6 +4,8 @@
 
 - Add support for unindexed (and uncompressed) VCFs (#337)
 
+- Remove explicit sample, contig and filter lists from the schema (#343)
+
 # 0.1.4 2025-03-10
 
 - Fix bug in handling all-missing genotypes (#328)
diff --git a/bio2zarr/icf.py b/bio2zarr/icf.py
@@ -944,7 +944,15 @@ def num_partitions(self):
 
     @property
     def samples(self):
-        return [sample.id for sample in self.metadata.samples]
+        return self.metadata.samples
+
+    @property
+    def contigs(self):
+        return self.metadata.contigs
+
+    @property
+    def filters(self):
+        return self.metadata.filters
 
     @property
     def num_samples(self):
@@ -1037,9 +1045,6 @@ def generate_schema(
             samples_chunk_size=samples_chunk_size,
             variants_chunk_size=variants_chunk_size,
             fields=[],
-            samples=self.metadata.samples,
-            contigs=self.metadata.contigs,
-            filters=self.metadata.filters,
         )
 
         logger.info(
diff --git a/bio2zarr/plink.py b/bio2zarr/plink.py
@@ -15,7 +15,7 @@ def __init__(self, path):
         self.path = path
         self.bed = bed_reader.open_bed(path, num_threads=1, count_A1=False)
         self.num_records = self.bed.sid_count
-        self.samples = list(self.bed.iid)
+        self.samples = [vcz.Sample(id=sample) for sample in self.bed.iid]
         self.num_samples = len(self.samples)
         self.root_attrs = {}
 
@@ -62,9 +62,6 @@ def generate_schema(
             samples_chunk_size=samples_chunk_size,
             variants_chunk_size=variants_chunk_size,
             fields=[],
-            samples=[vcz.Sample(id=sample) for sample in self.bed.iid],
-            contigs=[],
-            filters=[],
         )
 
         logger.info(
diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py
@@ -13,7 +13,7 @@
 
 logger = logging.getLogger(__name__)
 
-ZARR_SCHEMA_FORMAT_VERSION = "0.4"
+ZARR_SCHEMA_FORMAT_VERSION = "0.5"
 DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
 
 _fixed_field_descriptions = {
@@ -182,25 +182,16 @@ class VcfZarrSchema(core.JsonDataclass):
     format_version: str
     samples_chunk_size: int
     variants_chunk_size: int
-    samples: list
-    contigs: list
-    filters: list
     fields: list
 
     def __init__(
         self,
         format_version: str,
-        samples: list,
-        contigs: list,
-        filters: list,
         fields: list,
         variants_chunk_size: int = None,
         samples_chunk_size: int = None,
     ):
         self.format_version = format_version
-        self.samples = samples
-        self.contigs = contigs
-        self.filters = filters
         self.fields = fields
         if variants_chunk_size is None:
             variants_chunk_size = 1000
@@ -238,9 +229,6 @@ def fromdict(d):
                 f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
             )
         ret = VcfZarrSchema(**d)
-        ret.samples = [Sample(**sd) for sd in d["samples"]]
-        ret.contigs = [Contig(**sd) for sd in d["contigs"]]
-        ret.filters = [Filter(**sd) for sd in d["filters"]]
         ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
         return ret
 
@@ -473,9 +461,12 @@ def init(
         root.attrs.update(self.source.root_attrs)
 
         # Doing this synchronously - this is fine surely
-        self.encode_samples(root)
-        self.encode_filter_id(root)
-        self.encode_contig_id(root)
+        if hasattr(self.source, "samples"):
+            self.encode_samples(root)
+        if hasattr(self.source, "filters"):
+            self.encode_filter_id(root)
+        if hasattr(self.source, "contigs"):
+            self.encode_contigs(root)
 
         self.wip_path.mkdir()
         self.arrays_path.mkdir()
@@ -502,33 +493,33 @@ def init(
         )
 
     def encode_samples(self, root):
-        if [s.id for s in self.schema.samples] != self.source.samples:
-            raise ValueError("Subsetting or reordering samples not supported currently")
+        samples = self.source.samples
         array = root.array(
             "sample_id",
-            data=[sample.id for sample in self.schema.samples],
-            shape=len(self.schema.samples),
+            data=[sample.id for sample in samples],
+            shape=len(samples),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
             chunks=(self.schema.samples_chunk_size,),
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
         logger.debug("Samples done")
 
-    def encode_contig_id(self, root):
+    def encode_contigs(self, root):
+        contigs = self.source.contigs
         array = root.array(
             "contig_id",
-            data=[contig.id for contig in self.schema.contigs],
-            shape=len(self.schema.contigs),
+            data=[contig.id for contig in contigs],
+            shape=len(contigs),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
         )
         array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-        if all(contig.length is not None for contig in self.schema.contigs):
+        if all(contig.length is not None for contig in contigs):
             array = root.array(
                 "contig_length",
-                data=[contig.length for contig in self.schema.contigs],
-                shape=len(self.schema.contigs),
+                data=[contig.length for contig in contigs],
+                shape=len(contigs),
                 dtype=np.int64,
                 compressor=DEFAULT_ZARR_COMPRESSOR,
             )
@@ -537,10 +528,11 @@ def encode_contig_id(self, root):
     def encode_filter_id(self, root):
         # TODO need a way to store description also
         # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
+        filters = self.source.filters
         array = root.array(
             "filter_id",
-            data=[filt.id for filt in self.schema.filters],
-            shape=len(self.schema.filters),
+            data=[filt.id for filt in filters],
+            shape=len(filters),
             dtype="str",
             compressor=DEFAULT_ZARR_COMPRESSOR,
         )
diff --git a/tests/test_vcz.py b/tests/test_vcz.py
@@ -205,12 +205,6 @@ def test_generated_no_fields(self, icf_path):
         schema.fields.clear()
         self.assert_json_round_trip(schema)
 
-    def test_generated_no_samples(self, icf_path):
-        icf = icf_mod.IntermediateColumnarFormat(icf_path)
-        schema = icf.generate_schema()
-        schema.samples.clear()
-        self.assert_json_round_trip(schema)
-
     def test_generated_change_dtype(self, icf_path):
         icf = icf_mod.IntermediateColumnarFormat(icf_path)
         schema = icf.generate_schema()
@@ -337,23 +331,6 @@ def test_chunk_size(self, schema):
         assert schema.samples_chunk_size == 10000
         assert schema.variants_chunk_size == 1000
 
-    def test_samples(self, schema):
-        assert schema.asdict()["samples"] == [
-            {"id": s} for s in ["NA00001", "NA00002", "NA00003"]
-        ]
-
-    def test_contigs(self, schema):
-        assert schema.asdict()["contigs"] == [
-            {"id": s, "length": None} for s in ["19", "20", "X"]
-        ]
-
-    def test_filters(self, schema):
-        assert schema.asdict()["filters"] == [
-            {"id": "PASS", "description": "All filters passed"},
-            {"id": "s50", "description": "Less than 50% of samples have data"},
-            {"id": "q10", "description": "Quality below 10"},
-        ]
-
     def test_variant_contig(self, schema):
         assert get_field_dict(schema, "variant_contig") == {
             "name": "variant_contig",
@@ -505,18 +482,6 @@ class TestVcfDescriptions:
     def test_fields(self, schema, field, description):
         assert schema.field_map()[field].description == description
 
-    @pytest.mark.parametrize(
-        ("filt", "description"),
-        [
-            ("PASS", "All filters passed"),
-            ("s50", "Less than 50% of samples have data"),
-            ("q10", "Quality below 10"),
-        ],
-    )
-    def test_filters(self, schema, filt, description):
-        d = {f.id: f.description for f in schema.filters}
-        assert d[filt] == description
-
 
 class TestVcfZarrWriterExample:
     arrays = (
@@ -689,33 +654,6 @@ def test_call_fields(self, tmp_path, field):
             icf_mod.explode(tmp_path / "x.icf", [tmp_path / "test.vcf.gz"])
 
 
-class TestBadSchemaChanges:
-    # [{'id': 'NA00001'}, {'id': 'NA00002'}, {'id': 'NA00003'}],
-    @pytest.mark.parametrize(
-        "samples",
-        [
-            [],
-            [{"id": "NA00001"}, {"id": "NA00003"}],
-            [{"id": "NA00001"}, {"id": "NA00002"}, {"id": "NA00004"}],
-            [
-                {"id": "NA00001"},
-                {"id": "NA00002"},
-                {"id": "NA00003"},
-                {"id": "NA00004"},
-            ],
-            [{"id": "NA00001"}, {"id": "NA00003"}, {"id": "NA00002"}],
-        ],
-    )
-    def test_removed_samples(self, tmp_path, schema, icf_path, samples):
-        d = schema.asdict()
-        d["samples"] = samples
-        schema_path = tmp_path / "schema.json"
-        with open(schema_path, "w") as f:
-            json.dump(d, f)
-        with pytest.raises(ValueError, match="Subsetting or reordering samples"):
-            icf_mod.encode(icf_path, tmp_path / "z", schema_path=schema_path)
-
-
 class TestInspect:
     def test_icf(self, icf_path):
         df = pd.DataFrame(icf_mod.inspect(icf_path))