Skip to content

Commit f3c040d

Browse files
committed
Remove explicit data from schemas
1 parent 9655012 commit f3c040d

File tree

7 files changed: 31 additions and 92 deletions.

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
# 0.1.5 2025-03-xx
22

33
- Add support for merging contig IDs across multiple VCFs (#342)
4+
- Remove explicit sample, contig and filter lists from the schema (#343)
45

56
# 0.1.4 2025-03-10
67

bio2zarr/plink.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ def __init__(self, path):
1515
self.path = path
1616
self.bed = bed_reader.open_bed(path, num_threads=1)
1717
self.num_records = self.bed.sid_count
18-
self.samples = list(self.bed.iid)
18+
self.samples = [schema.Sample(id=sample) for sample in self.bed.iid]
1919
self.num_samples = len(self.samples)
2020
self.root_attrs = {}
2121

@@ -122,9 +122,6 @@ def generate_schema(
122122
samples_chunk_size=samples_chunk_size,
123123
variants_chunk_size=variants_chunk_size,
124124
fields=array_specs,
125-
samples=[schema.Sample(id=sample) for sample in bed.iid],
126-
contigs=[],
127-
filters=[],
128125
)
129126

130127

bio2zarr/schema.py

Lines changed: 1 addition & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99

1010
logger = logging.getLogger(__name__)
1111

12-
ZARR_SCHEMA_FORMAT_VERSION = "0.4"
12+
ZARR_SCHEMA_FORMAT_VERSION = "0.5"
1313

1414
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
1515

@@ -179,9 +179,6 @@ class VcfZarrSchema(core.JsonDataclass):
179179
format_version: str
180180
samples_chunk_size: int
181181
variants_chunk_size: int
182-
samples: list
183-
contigs: list
184-
filters: list
185182
fields: list
186183

187184
def validate(self):
@@ -213,9 +210,6 @@ def fromdict(d):
213210
f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
214211
)
215212
ret = VcfZarrSchema(**d)
216-
ret.samples = [Sample(**sd) for sd in d["samples"]]
217-
ret.contigs = [Contig(**sd) for sd in d["contigs"]]
218-
ret.filters = [Filter(**sd) for sd in d["filters"]]
219213
ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
220214
return ret
221215

bio2zarr/vcf2zarr/icf.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -889,7 +889,15 @@ def num_partitions(self):
889889

890890
@property
891891
def samples(self):
892-
return [sample.id for sample in self.metadata.samples]
892+
return self.metadata.samples
893+
894+
@property
895+
def contigs(self):
896+
return self.metadata.contigs
897+
898+
@property
899+
def filters(self):
900+
return self.metadata.filters
893901

894902
@property
895903
def num_samples(self):

bio2zarr/vcf2zarr/vcz.py

Lines changed: 0 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -226,9 +226,6 @@ def fixed_field_spec(
226226
samples_chunk_size=samples_chunk_size,
227227
variants_chunk_size=variants_chunk_size,
228228
fields=array_specs,
229-
samples=icf.metadata.samples,
230-
contigs=icf.metadata.contigs,
231-
filters=icf.metadata.filters,
232229
)
233230

234231

bio2zarr/writer.py

Lines changed: 19 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -238,9 +238,12 @@ def init(
238238
root.attrs.update(self.source.root_attrs)
239239

240240
# Doing this synchronously - this is fine surely
241-
self.encode_samples(root)
242-
self.encode_filter_id(root)
243-
self.encode_contig_id(root)
241+
if hasattr(self.source, "samples"):
242+
self.encode_samples(root)
243+
if hasattr(self.source, "filters"):
244+
self.encode_filter_id(root)
245+
if hasattr(self.source, "contigs"):
246+
self.encode_contigs(root)
244247

245248
self.wip_path.mkdir()
246249
self.arrays_path.mkdir()
@@ -267,33 +270,33 @@ def init(
267270
)
268271

269272
def encode_samples(self, root):
270-
if [s.id for s in self.schema.samples] != self.source.samples:
271-
raise ValueError("Subsetting or reordering samples not supported currently")
273+
samples = self.source.samples
272274
array = root.array(
273275
"sample_id",
274-
data=[sample.id for sample in self.schema.samples],
275-
shape=len(self.schema.samples),
276+
data=[sample.id for sample in samples],
277+
shape=len(samples),
276278
dtype="str",
277279
compressor=schema.DEFAULT_ZARR_COMPRESSOR,
278280
chunks=(self.schema.samples_chunk_size,),
279281
)
280282
array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
281283
logger.debug("Samples done")
282284

283-
def encode_contig_id(self, root):
285+
def encode_contigs(self, root):
286+
contigs = self.source.contigs
284287
array = root.array(
285288
"contig_id",
286-
data=[contig.id for contig in self.schema.contigs],
287-
shape=len(self.schema.contigs),
289+
data=[contig.id for contig in contigs],
290+
shape=len(contigs),
288291
dtype="str",
289292
compressor=schema.DEFAULT_ZARR_COMPRESSOR,
290293
)
291294
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
292-
if all(contig.length is not None for contig in self.schema.contigs):
295+
if all(contig.length is not None for contig in contigs):
293296
array = root.array(
294297
"contig_length",
295-
data=[contig.length for contig in self.schema.contigs],
296-
shape=len(self.schema.contigs),
298+
data=[contig.length for contig in contigs],
299+
shape=len(contigs),
297300
dtype=np.int64,
298301
compressor=schema.DEFAULT_ZARR_COMPRESSOR,
299302
)
@@ -302,10 +305,11 @@ def encode_contig_id(self, root):
302305
def encode_filter_id(self, root):
303306
# TODO need a way to store description also
304307
# https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
308+
filters = self.source.filters
305309
array = root.array(
306310
"filter_id",
307-
data=[filt.id for filt in self.schema.filters],
308-
shape=len(self.schema.filters),
311+
data=[filt.id for filt in filters],
312+
shape=len(filters),
309313
dtype="str",
310314
compressor=schema.DEFAULT_ZARR_COMPRESSOR,
311315
)

tests/test_vcz.py

Lines changed: 0 additions & 62 deletions
Original file line number | Diff line number | Diff line change
@@ -210,12 +210,6 @@ def test_generated_no_fields(self, icf_path):
210210
schema.fields.clear()
211211
self.assert_json_round_trip(schema)
212212

213-
def test_generated_no_samples(self, icf_path):
214-
icf = vcf2zarr.IntermediateColumnarFormat(icf_path)
215-
schema = vcz_mod.generate_schema(icf)
216-
schema.samples.clear()
217-
self.assert_json_round_trip(schema)
218-
219213
def test_generated_change_dtype(self, icf_path):
220214
icf = vcf2zarr.IntermediateColumnarFormat(icf_path)
221215
schema = vcz_mod.generate_schema(icf)
@@ -344,23 +338,6 @@ def test_chunk_size(self, schema):
344338
assert schema.samples_chunk_size == 10000
345339
assert schema.variants_chunk_size == 1000
346340

347-
def test_samples(self, schema):
348-
assert schema.asdict()["samples"] == [
349-
{"id": s} for s in ["NA00001", "NA00002", "NA00003"]
350-
]
351-
352-
def test_contigs(self, schema):
353-
assert schema.asdict()["contigs"] == [
354-
{"id": s, "length": None} for s in ["19", "20", "X"]
355-
]
356-
357-
def test_filters(self, schema):
358-
assert schema.asdict()["filters"] == [
359-
{"id": "PASS", "description": "All filters passed"},
360-
{"id": "s50", "description": "Less than 50% of samples have data"},
361-
{"id": "q10", "description": "Quality below 10"},
362-
]
363-
364341
def test_variant_contig(self, schema):
365342
assert get_field_dict(schema, "variant_contig") == {
366343
"name": "variant_contig",
@@ -512,18 +489,6 @@ class TestVcfDescriptions:
512489
def test_fields(self, schema, field, description):
513490
assert schema.field_map()[field].description == description
514491

515-
@pytest.mark.parametrize(
516-
("filt", "description"),
517-
[
518-
("PASS", "All filters passed"),
519-
("s50", "Less than 50% of samples have data"),
520-
("q10", "Quality below 10"),
521-
],
522-
)
523-
def test_filters(self, schema, filt, description):
524-
d = {f.id: f.description for f in schema.filters}
525-
assert d[filt] == description
526-
527492

528493
class TestVcfZarrWriterExample:
529494
arrays = (
@@ -696,33 +661,6 @@ def test_call_fields(self, tmp_path, field):
696661
vcf2zarr.explode(tmp_path / "x.icf", [tmp_path / "test.vcf.gz"])
697662

698663

699-
class TestBadSchemaChanges:
700-
# [{'id': 'NA00001'}, {'id': 'NA00002'}, {'id': 'NA00003'}],
701-
@pytest.mark.parametrize(
702-
"samples",
703-
[
704-
[],
705-
[{"id": "NA00001"}, {"id": "NA00003"}],
706-
[{"id": "NA00001"}, {"id": "NA00002"}, {"id": "NA00004"}],
707-
[
708-
{"id": "NA00001"},
709-
{"id": "NA00002"},
710-
{"id": "NA00003"},
711-
{"id": "NA00004"},
712-
],
713-
[{"id": "NA00001"}, {"id": "NA00003"}, {"id": "NA00002"}],
714-
],
715-
)
716-
def test_removed_samples(self, tmp_path, schema, icf_path, samples):
717-
d = schema.asdict()
718-
d["samples"] = samples
719-
schema_path = tmp_path / "schema.json"
720-
with open(schema_path, "w") as f:
721-
json.dump(d, f)
722-
with pytest.raises(ValueError, match="Subsetting or reordering samples"):
723-
vcf2zarr.encode(icf_path, tmp_path / "z", schema_path=schema_path)
724-
725-
726664
class TestInspect:
727665
def test_icf(self, icf_path):
728666
df = pd.DataFrame(vcz_mod.inspect(icf_path))

0 commit comments

Comments
 (0)