Skip to content

Commit 5ce3baa

Browse files
committed
Remove explicit schema data
1 parent eed60f0 commit 5ce3baa

File tree

5 files changed

+32
-98
lines changed

5 files changed

+32
-98
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
- Add support for unindexed (and uncompressed) VCFs (#337)
66

7+
- Remove explicit sample, contig and filter lists from the schema (#343)
8+
79
# 0.1.4 2025-03-10
810

911
- Fix bug in handling all-missing genotypes (#328)

bio2zarr/icf.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -944,7 +944,15 @@ def num_partitions(self):
944944

945945
@property
946946
def samples(self):
947-
return [sample.id for sample in self.metadata.samples]
947+
return self.metadata.samples
948+
949+
@property
950+
def contigs(self):
951+
return self.metadata.contigs
952+
953+
@property
954+
def filters(self):
955+
return self.metadata.filters
948956

949957
@property
950958
def num_samples(self):
@@ -1037,9 +1045,6 @@ def generate_schema(
10371045
samples_chunk_size=samples_chunk_size,
10381046
variants_chunk_size=variants_chunk_size,
10391047
fields=[],
1040-
samples=self.metadata.samples,
1041-
contigs=self.metadata.contigs,
1042-
filters=self.metadata.filters,
10431048
)
10441049

10451050
logger.info(

bio2zarr/plink.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def __init__(self, path):
1515
self.path = path
1616
self.bed = bed_reader.open_bed(path, num_threads=1, count_A1=False)
1717
self.num_records = self.bed.sid_count
18-
self.samples = list(self.bed.iid)
18+
self.samples = [vcz.Sample(id=sample) for sample in self.bed.iid]
1919
self.num_samples = len(self.samples)
2020
self.root_attrs = {}
2121

@@ -62,9 +62,6 @@ def generate_schema(
6262
samples_chunk_size=samples_chunk_size,
6363
variants_chunk_size=variants_chunk_size,
6464
fields=[],
65-
samples=[vcz.Sample(id=sample) for sample in self.bed.iid],
66-
contigs=[],
67-
filters=[],
6865
)
6966

7067
logger.info(

bio2zarr/vcz.py

Lines changed: 20 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313

1414
logger = logging.getLogger(__name__)
1515

16-
ZARR_SCHEMA_FORMAT_VERSION = "0.4"
16+
ZARR_SCHEMA_FORMAT_VERSION = "0.5"
1717
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
1818

1919
_fixed_field_descriptions = {
@@ -182,25 +182,16 @@ class VcfZarrSchema(core.JsonDataclass):
182182
format_version: str
183183
samples_chunk_size: int
184184
variants_chunk_size: int
185-
samples: list
186-
contigs: list
187-
filters: list
188185
fields: list
189186

190187
def __init__(
191188
self,
192189
format_version: str,
193-
samples: list,
194-
contigs: list,
195-
filters: list,
196190
fields: list,
197191
variants_chunk_size: int = None,
198192
samples_chunk_size: int = None,
199193
):
200194
self.format_version = format_version
201-
self.samples = samples
202-
self.contigs = contigs
203-
self.filters = filters
204195
self.fields = fields
205196
if variants_chunk_size is None:
206197
variants_chunk_size = 1000
@@ -238,9 +229,6 @@ def fromdict(d):
238229
f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
239230
)
240231
ret = VcfZarrSchema(**d)
241-
ret.samples = [Sample(**sd) for sd in d["samples"]]
242-
ret.contigs = [Contig(**sd) for sd in d["contigs"]]
243-
ret.filters = [Filter(**sd) for sd in d["filters"]]
244232
ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
245233
return ret
246234

@@ -473,9 +461,12 @@ def init(
473461
root.attrs.update(self.source.root_attrs)
474462

475463
# Doing this synchronously - this is fine surely
476-
self.encode_samples(root)
477-
self.encode_filter_id(root)
478-
self.encode_contig_id(root)
464+
if hasattr(self.source, "samples"):
465+
self.encode_samples(root)
466+
if hasattr(self.source, "filters"):
467+
self.encode_filter_id(root)
468+
if hasattr(self.source, "contigs"):
469+
self.encode_contigs(root)
479470

480471
self.wip_path.mkdir()
481472
self.arrays_path.mkdir()
@@ -502,33 +493,33 @@ def init(
502493
)
503494

504495
def encode_samples(self, root):
505-
if [s.id for s in self.schema.samples] != self.source.samples:
506-
raise ValueError("Subsetting or reordering samples not supported currently")
496+
samples = self.source.samples
507497
array = root.array(
508498
"sample_id",
509-
data=[sample.id for sample in self.schema.samples],
510-
shape=len(self.schema.samples),
499+
data=[sample.id for sample in samples],
500+
shape=len(samples),
511501
dtype="str",
512502
compressor=DEFAULT_ZARR_COMPRESSOR,
513503
chunks=(self.schema.samples_chunk_size,),
514504
)
515505
array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
516506
logger.debug("Samples done")
517507

518-
def encode_contig_id(self, root):
508+
def encode_contigs(self, root):
509+
contigs = self.source.contigs
519510
array = root.array(
520511
"contig_id",
521-
data=[contig.id for contig in self.schema.contigs],
522-
shape=len(self.schema.contigs),
512+
data=[contig.id for contig in contigs],
513+
shape=len(contigs),
523514
dtype="str",
524515
compressor=DEFAULT_ZARR_COMPRESSOR,
525516
)
526517
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
527-
if all(contig.length is not None for contig in self.schema.contigs):
518+
if all(contig.length is not None for contig in contigs):
528519
array = root.array(
529520
"contig_length",
530-
data=[contig.length for contig in self.schema.contigs],
531-
shape=len(self.schema.contigs),
521+
data=[contig.length for contig in contigs],
522+
shape=len(contigs),
532523
dtype=np.int64,
533524
compressor=DEFAULT_ZARR_COMPRESSOR,
534525
)
@@ -537,10 +528,11 @@ def encode_contig_id(self, root):
537528
def encode_filter_id(self, root):
538529
# TODO need a way to store description also
539530
# https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
531+
filters = self.source.filters
540532
array = root.array(
541533
"filter_id",
542-
data=[filt.id for filt in self.schema.filters],
543-
shape=len(self.schema.filters),
534+
data=[filt.id for filt in filters],
535+
shape=len(filters),
544536
dtype="str",
545537
compressor=DEFAULT_ZARR_COMPRESSOR,
546538
)

tests/test_vcz.py

Lines changed: 0 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -205,12 +205,6 @@ def test_generated_no_fields(self, icf_path):
205205
schema.fields.clear()
206206
self.assert_json_round_trip(schema)
207207

208-
def test_generated_no_samples(self, icf_path):
209-
icf = icf_mod.IntermediateColumnarFormat(icf_path)
210-
schema = icf.generate_schema()
211-
schema.samples.clear()
212-
self.assert_json_round_trip(schema)
213-
214208
def test_generated_change_dtype(self, icf_path):
215209
icf = icf_mod.IntermediateColumnarFormat(icf_path)
216210
schema = icf.generate_schema()
@@ -337,23 +331,6 @@ def test_chunk_size(self, schema):
337331
assert schema.samples_chunk_size == 10000
338332
assert schema.variants_chunk_size == 1000
339333

340-
def test_samples(self, schema):
341-
assert schema.asdict()["samples"] == [
342-
{"id": s} for s in ["NA00001", "NA00002", "NA00003"]
343-
]
344-
345-
def test_contigs(self, schema):
346-
assert schema.asdict()["contigs"] == [
347-
{"id": s, "length": None} for s in ["19", "20", "X"]
348-
]
349-
350-
def test_filters(self, schema):
351-
assert schema.asdict()["filters"] == [
352-
{"id": "PASS", "description": "All filters passed"},
353-
{"id": "s50", "description": "Less than 50% of samples have data"},
354-
{"id": "q10", "description": "Quality below 10"},
355-
]
356-
357334
def test_variant_contig(self, schema):
358335
assert get_field_dict(schema, "variant_contig") == {
359336
"name": "variant_contig",
@@ -505,18 +482,6 @@ class TestVcfDescriptions:
505482
def test_fields(self, schema, field, description):
506483
assert schema.field_map()[field].description == description
507484

508-
@pytest.mark.parametrize(
509-
("filt", "description"),
510-
[
511-
("PASS", "All filters passed"),
512-
("s50", "Less than 50% of samples have data"),
513-
("q10", "Quality below 10"),
514-
],
515-
)
516-
def test_filters(self, schema, filt, description):
517-
d = {f.id: f.description for f in schema.filters}
518-
assert d[filt] == description
519-
520485

521486
class TestVcfZarrWriterExample:
522487
arrays = (
@@ -689,33 +654,6 @@ def test_call_fields(self, tmp_path, field):
689654
icf_mod.explode(tmp_path / "x.icf", [tmp_path / "test.vcf.gz"])
690655

691656

692-
class TestBadSchemaChanges:
693-
# [{'id': 'NA00001'}, {'id': 'NA00002'}, {'id': 'NA00003'}],
694-
@pytest.mark.parametrize(
695-
"samples",
696-
[
697-
[],
698-
[{"id": "NA00001"}, {"id": "NA00003"}],
699-
[{"id": "NA00001"}, {"id": "NA00002"}, {"id": "NA00004"}],
700-
[
701-
{"id": "NA00001"},
702-
{"id": "NA00002"},
703-
{"id": "NA00003"},
704-
{"id": "NA00004"},
705-
],
706-
[{"id": "NA00001"}, {"id": "NA00003"}, {"id": "NA00002"}],
707-
],
708-
)
709-
def test_removed_samples(self, tmp_path, schema, icf_path, samples):
710-
d = schema.asdict()
711-
d["samples"] = samples
712-
schema_path = tmp_path / "schema.json"
713-
with open(schema_path, "w") as f:
714-
json.dump(d, f)
715-
with pytest.raises(ValueError, match="Subsetting or reordering samples"):
716-
icf_mod.encode(icf_path, tmp_path / "z", schema_path=schema_path)
717-
718-
719657
class TestInspect:
720658
def test_icf(self, icf_path):
721659
df = pd.DataFrame(icf_mod.inspect(icf_path))

0 commit comments

Comments
 (0)