Skip to content

Commit 5f1f224

Browse files
committed
Remove explicit schema data
1 parent eed60f0 commit 5f1f224

File tree

6 files changed

+135
-140
lines changed

6 files changed

+135
-140
lines changed

CHANGELOG.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44

55
- Add support for unindexed (and uncompressed) VCFs (#337)
66

7+
Breaking changes
8+
9+
- Remove explicit sample, contig and filter lists from the schema.
10+
Existing ICFs will need to be recreated. (#343)
11+
712
# 0.1.4 2025-03-10
813

914
- Fix bug in handling all-missing genotypes (#328)

bio2zarr/icf.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -877,9 +877,9 @@ def convert_local_allele_field_types(fields):
877877
return [*fields, la]
878878

879879

880-
class IntermediateColumnarFormat(collections.abc.Mapping):
880+
class IntermediateColumnarFormat(vcz.Source):
881881
def __init__(self, path):
882-
self.path = pathlib.Path(path)
882+
self._path = pathlib.Path(path)
883883
# TODO raise a more informative error here telling people this
884884
# directory is either a WIP or the wrong format.
885885
with open(self.path / "metadata.json") as f:
@@ -902,20 +902,11 @@ def __init__(self, path):
902902

903903
def __repr__(self):
904904
return (
905-
f"IntermediateColumnarFormat(fields={len(self)}, "
905+
f"IntermediateColumnarFormat(fields={len(self.fields)}, "
906906
f"partitions={self.num_partitions}, "
907907
f"records={self.num_records}, path={self.path})"
908908
)
909909

910-
def __getitem__(self, key):
911-
return self.fields[key]
912-
913-
def __iter__(self):
914-
return iter(self.fields)
915-
916-
def __len__(self):
917-
return len(self.fields)
918-
919910
def summary_table(self):
920911
data = []
921912
for name, icf_field in self.fields.items():
@@ -934,6 +925,10 @@ def summary_table(self):
934925
data.append(d)
935926
return data
936927

928+
@property
929+
def path(self):
930+
return self._path
931+
937932
@property
938933
def num_records(self):
939934
return self.metadata.num_records
@@ -944,7 +939,15 @@ def num_partitions(self):
944939

945940
@property
946941
def samples(self):
947-
return [sample.id for sample in self.metadata.samples]
942+
return self.metadata.samples
943+
944+
@property
945+
def contigs(self):
946+
return self.metadata.contigs
947+
948+
@property
949+
def filters(self):
950+
return self.metadata.filters
948951

949952
@property
950953
def num_samples(self):
@@ -1037,9 +1040,6 @@ def generate_schema(
10371040
samples_chunk_size=samples_chunk_size,
10381041
variants_chunk_size=variants_chunk_size,
10391042
fields=[],
1040-
samples=self.metadata.samples,
1041-
contigs=self.metadata.contigs,
1042-
filters=self.metadata.filters,
10431043
)
10441044

10451045
logger.info(

bio2zarr/plink.py

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,26 @@
1010
logger = logging.getLogger(__name__)
1111

1212

13-
class PlinkFormat:
13+
class PlinkFormat(vcz.Source):
1414
def __init__(self, path):
15-
self.path = path
15+
self._path = pathlib.Path(path)
1616
self.bed = bed_reader.open_bed(path, num_threads=1, count_A1=False)
17-
self.num_records = self.bed.sid_count
18-
self.samples = list(self.bed.iid)
19-
self.num_samples = len(self.samples)
20-
self.root_attrs = {}
17+
18+
@property
19+
def path(self):
20+
return self._path
21+
22+
@property
23+
def num_records(self):
24+
return self.bed.sid_count
25+
26+
@property
27+
def samples(self):
28+
return [vcz.Sample(id=sample) for sample in self.bed.iid]
29+
30+
@property
31+
def num_samples(self):
32+
return len(self.samples)
2133

2234
def iter_alleles(self, start, stop, num_alleles):
2335
ref_field = self.bed.allele_1
@@ -62,9 +74,6 @@ def generate_schema(
6274
samples_chunk_size=samples_chunk_size,
6375
variants_chunk_size=variants_chunk_size,
6476
fields=[],
65-
samples=[vcz.Sample(id=sample) for sample in self.bed.iid],
66-
contigs=[],
67-
filters=[],
6877
)
6978

7079
logger.info(

bio2zarr/vcz.py

Lines changed: 75 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import abc
12
import dataclasses
23
import json
34
import logging
@@ -13,7 +14,7 @@
1314

1415
logger = logging.getLogger(__name__)
1516

16-
ZARR_SCHEMA_FORMAT_VERSION = "0.4"
17+
ZARR_SCHEMA_FORMAT_VERSION = "0.5"
1718
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
1819

1920
_fixed_field_descriptions = {
@@ -28,6 +29,62 @@
2829
}
2930

3031

32+
class Source(abc.ABC):
33+
@property
34+
@abc.abstractmethod
35+
def path(self):
36+
pass
37+
38+
@property
39+
@abc.abstractmethod
40+
def num_records(self):
41+
pass
42+
43+
@property
44+
@abc.abstractmethod
45+
def num_samples(self):
46+
pass
47+
48+
@property
49+
@abc.abstractmethod
50+
def samples(self):
51+
pass
52+
53+
@property
54+
def contigs(self):
55+
return None
56+
57+
@property
58+
def filters(self):
59+
return None
60+
61+
@property
62+
def root_attrs(self):
63+
return {}
64+
65+
@abc.abstractmethod
66+
def iter_alleles(self, start, stop, num_alleles):
67+
pass
68+
69+
@abc.abstractmethod
70+
def iter_genotypes(self, start, stop, num_alleles):
71+
pass
72+
73+
def iter_id(self, start, stop):
74+
return
75+
76+
def iter_contig(self, start, stop):
77+
return
78+
79+
@abc.abstractmethod
80+
def iter_field(self, field_name, shape, start, stop):
81+
pass
82+
83+
@abc.abstractmethod
84+
def generate_schema(self, variants_chunk_size, samples_chunk_size, local_alleles):
85+
pass
86+
87+
3188
@dataclasses.dataclass
3289
class ZarrArraySpec:
3390
name: str
@@ -182,25 +239,16 @@ class VcfZarrSchema(core.JsonDataclass):
182239
format_version: str
183240
samples_chunk_size: int
184241
variants_chunk_size: int
185-
samples: list
186-
contigs: list
187-
filters: list
188242
fields: list
189243

190244
def __init__(
191245
self,
192246
format_version: str,
193-
samples: list,
194-
contigs: list,
195-
filters: list,
196247
fields: list,
197248
variants_chunk_size: int = None,
198249
samples_chunk_size: int = None,
199250
):
200251
self.format_version = format_version
201-
self.samples = samples
202-
self.contigs = contigs
203-
self.filters = filters
204252
self.fields = fields
205253
if variants_chunk_size is None:
206254
variants_chunk_size = 1000
@@ -238,9 +286,6 @@ def fromdict(d):
238286
f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
239287
)
240288
ret = VcfZarrSchema(**d)
241-
ret.samples = [Sample(**sd) for sd in d["samples"]]
242-
ret.contigs = [Contig(**sd) for sd in d["contigs"]]
243-
ret.filters = [Filter(**sd) for sd in d["filters"]]
244289
ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
245290
return ret
246291

@@ -474,8 +519,10 @@ def init(
474519

475520
# Doing this synchronously - this is fine surely
476521
self.encode_samples(root)
477-
self.encode_filter_id(root)
478-
self.encode_contig_id(root)
522+
if self.source.filters is not None:
523+
self.encode_filter_id(root)
524+
if self.source.contigs is not None:
525+
self.encode_contigs(root)
479526

480527
self.wip_path.mkdir()
481528
self.arrays_path.mkdir()
@@ -502,33 +549,33 @@ def init(
502549
)
503550

504551
def encode_samples(self, root):
505-
if [s.id for s in self.schema.samples] != self.source.samples:
506-
raise ValueError("Subsetting or reordering samples not supported currently")
552+
samples = self.source.samples
507553
array = root.array(
508554
"sample_id",
509-
data=[sample.id for sample in self.schema.samples],
510-
shape=len(self.schema.samples),
555+
data=[sample.id for sample in samples],
556+
shape=len(samples),
511557
dtype="str",
512558
compressor=DEFAULT_ZARR_COMPRESSOR,
513559
chunks=(self.schema.samples_chunk_size,),
514560
)
515561
array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
516562
logger.debug("Samples done")
517563

518-
def encode_contig_id(self, root):
564+
def encode_contigs(self, root):
565+
contigs = self.source.contigs
519566
array = root.array(
520567
"contig_id",
521-
data=[contig.id for contig in self.schema.contigs],
522-
shape=len(self.schema.contigs),
568+
data=[contig.id for contig in contigs],
569+
shape=len(contigs),
523570
dtype="str",
524571
compressor=DEFAULT_ZARR_COMPRESSOR,
525572
)
526573
array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
527-
if all(contig.length is not None for contig in self.schema.contigs):
574+
if all(contig.length is not None for contig in contigs):
528575
array = root.array(
529576
"contig_length",
530-
data=[contig.length for contig in self.schema.contigs],
531-
shape=len(self.schema.contigs),
577+
data=[contig.length for contig in contigs],
578+
shape=len(contigs),
532579
dtype=np.int64,
533580
compressor=DEFAULT_ZARR_COMPRESSOR,
534581
)
@@ -537,10 +584,11 @@ def encode_contig_id(self, root):
537584
def encode_filter_id(self, root):
538585
# TODO need a way to store description also
539586
# https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
587+
filters = self.source.filters
540588
array = root.array(
541589
"filter_id",
542-
data=[filt.id for filt in self.schema.filters],
543-
shape=len(self.schema.filters),
590+
data=[filt.id for filt in filters],
591+
shape=len(filters),
544592
dtype="str",
545593
compressor=DEFAULT_ZARR_COMPRESSOR,
546594
)

0 commit comments

Comments
 (0)