Skip to content

Commit de7b6ce

Browse files
committed
Remove explicit schema data
1 parent eed60f0 commit de7b6ce

File tree

6 files changed

+136
-140
lines changed

6 files changed

+136
-140
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44

55
- Add support for unindexed (and uncompressed) VCFs (#337)
66

7+
Breaking changes
8+
9+
- Remove explicit sample, contig and filter lists from the schema.
10+
Existing ICFs will need to be recreated. (#343)
11+
712
# 0.1.4 2025-03-10
813

914
- Fix bug in handling all-missing genotypes (#328)

bio2zarr/icf.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -877,9 +877,9 @@ def convert_local_allele_field_types(fields):
877877
return [*fields, la]
878878

879879

880-
class IntermediateColumnarFormat(collections.abc.Mapping):
880+
class IntermediateColumnarFormat(vcz.Source):
881881
def __init__(self, path):
882-
self.path = pathlib.Path(path)
882+
self._path = pathlib.Path(path)
883883
# TODO raise a more informative error here telling people this
884884
# directory is either a WIP or the wrong format.
885885
with open(self.path / "metadata.json") as f:
@@ -902,20 +902,11 @@ def __init__(self, path):
902902

903903
def __repr__(self):
904904
return (
905-
f"IntermediateColumnarFormat(fields={len(self)}, "
905+
f"IntermediateColumnarFormat(fields={len(self.fields)}, "
906906
f"partitions={self.num_partitions}, "
907907
f"records={self.num_records}, path={self.path})"
908908
)
909909

910-
def __getitem__(self, key):
911-
return self.fields[key]
912-
913-
def __iter__(self):
914-
return iter(self.fields)
915-
916-
def __len__(self):
917-
return len(self.fields)
918-
919910
def summary_table(self):
920911
data = []
921912
for name, icf_field in self.fields.items():
@@ -934,6 +925,10 @@ def summary_table(self):
934925
data.append(d)
935926
return data
936927

928+
@property
929+
def path(self):
930+
return self._path
931+
937932
@property
938933
def num_records(self):
939934
return self.metadata.num_records
@@ -944,7 +939,15 @@ def num_partitions(self):
944939

945940
@property
946941
def samples(self):
947-
return [sample.id for sample in self.metadata.samples]
942+
return self.metadata.samples
943+
944+
@property
945+
def contigs(self):
946+
return self.metadata.contigs
947+
948+
@property
949+
def filters(self):
950+
return self.metadata.filters
948951

949952
@property
950953
def num_samples(self):
@@ -1037,9 +1040,6 @@ def generate_schema(
10371040
samples_chunk_size=samples_chunk_size,
10381041
variants_chunk_size=variants_chunk_size,
10391042
fields=[],
1040-
samples=self.metadata.samples,
1041-
contigs=self.metadata.contigs,
1042-
filters=self.metadata.filters,
10431043
)
10441044

10451045
logger.info(

bio2zarr/plink.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,26 @@
1010
logger = logging.getLogger(__name__)
1111

1212

13-
class PlinkFormat:
13+
class PlinkFormat(vcz.Source):
1414
def __init__(self, path):
15-
self.path = path
15+
self._path = pathlib.Path(path)
1616
self.bed = bed_reader.open_bed(path, num_threads=1, count_A1=False)
17-
self.num_records = self.bed.sid_count
18-
self.samples = list(self.bed.iid)
19-
self.num_samples = len(self.samples)
20-
self.root_attrs = {}
17+
18+
@property
19+
def path(self):
20+
return self._path
21+
22+
@property
23+
def num_records(self):
24+
return self.bed.sid_count
25+
26+
@property
27+
def samples(self):
28+
return [vcz.Sample(id=sample) for sample in self.bed.iid]
29+
30+
@property
31+
def num_samples(self):
32+
return len(self.samples)
2133

2234
def iter_alleles(self, start, stop, num_alleles):
2335
ref_field = self.bed.allele_1
@@ -62,9 +74,6 @@ def generate_schema(
6274
samples_chunk_size=samples_chunk_size,
6375
variants_chunk_size=variants_chunk_size,
6476
fields=[],
65-
samples=[vcz.Sample(id=sample) for sample in self.bed.iid],
66-
contigs=[],
67-
filters=[],
6877
)
6978

7079
logger.info(

bio2zarr/vcz.py

Lines changed: 76 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import abc
12
import dataclasses
23
import json
34
import logging
@@ -13,7 +14,7 @@
1314

1415
logger = logging.getLogger(__name__)
1516

16-
ZARR_SCHEMA_FORMAT_VERSION = "0.4"
17+
ZARR_SCHEMA_FORMAT_VERSION = "0.5"
1718
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
1819

1920
_fixed_field_descriptions = {
@@ -28,6 +29,63 @@
2829
}
2930

3031

32+
class Source(abc.ABC):
33+
@property
34+
@abc.abstractmethod
35+
def path(self):
36+
pass
37+
38+
@property
39+
@abc.abstractmethod
40+
def num_records(self):
41+
pass
42+
43+
@property
44+
@abc.abstractmethod
45+
def num_samples(self):
46+
pass
47+
48+
@property
49+
@abc.abstractmethod
50+
def samples(self):
51+
pass
52+
53+
@property
54+
def contigs(self):
55+
return None
56+
57+
@property
58+
def filters(self):
59+
return None
60+
61+
@property
62+
def root_attrs(self):
63+
return {}
64+
65+
@abc.abstractmethod
66+
def iter_alleles(self, start, stop, num_alleles):
67+
pass
68+
69+
@abc.abstractmethod
70+
def iter_genotypes(self, start, stop, num_alleles):
71+
pass
72+
73+
def iter_id(self, start, stop):
74+
return
75+
76+
def iter_contig(self, start, stop):
77+
return
78+
79+
@abc.abstractmethod
80+
def iter_field(self, field_name, shape, start, stop):
81+
"""Iterate over values for the specified field from start to stop positions."""
82+
pass
83+
84+
@abc.abstractmethod
85+
def generate_schema(self, variants_chunk_size, samples_chunk_size, local_alleles):
86+
pass
87+
88+
3189
@dataclasses.dataclass
3290
class ZarrArraySpec:
3391
name: str
@@ -182,25 +240,16 @@ class VcfZarrSchema(core.JsonDataclass):
182240
format_version: str
183241
samples_chunk_size: int
184242
variants_chunk_size: int
185-
samples: list
186-
contigs: list
187-
filters: list
188243
fields: list
189244

190245
def __init__(
191246
self,
192247
format_version: str,
193-
samples: list,
194-
contigs: list,
195-
filters: list,
196248
fields: list,
197249
variants_chunk_size: int = None,
198250
samples_chunk_size: int = None,
199251
):
200252
self.format_version = format_version
201-
self.samples = samples
202-
self.contigs = contigs
203-
self.filters = filters
204253
self.fields = fields
205254
if variants_chunk_size is None:
206255
variants_chunk_size = 1000
@@ -238,9 +287,6 @@ def fromdict(d):
238287
f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
239288
)
240289
ret = VcfZarrSchema(**d)
241-
ret.samples = [Sample(**sd) for sd in d["samples"]]
242-
ret.contigs = [Contig(**sd) for sd in d["contigs"]]
243-
ret.filters = [Filter(**sd) for sd in d["filters"]]
244290
ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
245291
return ret
246292

@@ -474,8 +520,10 @@ def init(
474520

475521
# Doing this synchronously - this is fine surely
476522
self.encode_samples(root)
477-
self.encode_filter_id(root)
478-
self.encode_contig_id(root)
523+
if self.source.filters is not None:
524+
self.encode_filter_id(root)
525+
if self.source.contigs is not None:
526+
self.encode_contigs(root)
479527

480528
self.wip_path.mkdir()
481529
self.arrays_path.mkdir()
@@ -502,33 +550,33 @@ def init(
502550
)
503551

504552
def encode_samples(self, root):
505-
if [s.id for s in self.schema.samples] != self.source.samples:
506-
raise ValueError("Subsetting or reordering samples not supported currently")
553+
samples = self.source.samples
507554
array = root.array(
508555
"sample_id",
509-
data=[sample.id for sample in self.schema.samples],
510-
shape=len(self.schema.samples),
556+
data=[sample.id for sample in samples],
557+
shape=len(samples),
511558
dtype="str",
512559
compressor=DEFAULT_ZARR_COMPRESSOR,
513560
chunks=(self.schema.samples_chunk_size,),
514561
)
515562
array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
516563
logger.debug("Samples done")
517564

518-
def encode_contig_id(self, root):
565+
def encode_contigs(self, root):
566+
contigs = self.source.contigs
519567
array = root.array(
520568
"contig_id",
521-
data=[contig.id for contig in self.schema.contigs],
522-
shape=len(self.schema.contigs),
569+
data=[contig.id for contig in contigs],
570+
shape=len(contigs),
523571
dtype="str",
524572
compressor=DEFAULT_ZARR_COMPRESSOR,
525573
)
526574
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
527-
if all(contig.length is not None for contig in self.schema.contigs):
575+
if all(contig.length is not None for contig in contigs):
528576
array = root.array(
529577
"contig_length",
530-
data=[contig.length for contig in self.schema.contigs],
531-
shape=len(self.schema.contigs),
578+
data=[contig.length for contig in contigs],
579+
shape=len(contigs),
532580
dtype=np.int64,
533581
compressor=DEFAULT_ZARR_COMPRESSOR,
534582
)
@@ -537,10 +585,11 @@ def encode_contig_id(self, root):
537585
def encode_filter_id(self, root):
538586
# TODO need a way to store description also
539587
# https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
588+
filters = self.source.filters
540589
array = root.array(
541590
"filter_id",
542-
data=[filt.id for filt in self.schema.filters],
543-
shape=len(self.schema.filters),
591+
data=[filt.id for filt in filters],
592+
shape=len(filters),
544593
dtype="str",
545594
compressor=DEFAULT_ZARR_COMPRESSOR,
546595
)

0 commit comments

Comments
 (0)