1
+ import abc
1
2
import dataclasses
2
3
import json
3
4
import logging
13
14
14
15
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

# Version of the on-disk schema JSON this module reads/writes; bumped to
# 0.5 when samples/contigs/filters moved from the schema to the Source.
ZARR_SCHEMA_FORMAT_VERSION = "0.5"

# Default compressor applied to Zarr arrays: Blosc wrapping zstd at level 7.
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)
18
19
19
20
_fixed_field_descriptions = {
28
29
}
29
30
30
31
32
class Source(abc.ABC):
    """Abstract interface for a variant-data source feeding the Zarr encoder.

    Concrete subclasses supply record/sample counts, per-field value
    iterators, and a generated storage schema. The ``contigs``, ``filters``,
    ``root_attrs``, ``iter_id`` and ``iter_contig`` members have permissive
    defaults so sources lacking that information need not override them.
    """

    @property
    @abc.abstractmethod
    def path(self):
        """Path of the underlying data source."""

    @property
    @abc.abstractmethod
    def num_records(self):
        """Number of variant records in the source."""

    @property
    @abc.abstractmethod
    def num_samples(self):
        """Number of samples in the source."""

    @property
    @abc.abstractmethod
    def samples(self):
        """Sequence of sample descriptors for the source."""

    @property
    def contigs(self):
        """Contig descriptors, or None when the source has none."""
        return None

    @property
    def filters(self):
        """Filter descriptors, or None when the source has none."""
        return None

    @property
    def root_attrs(self):
        """Extra attributes to set on the root group; empty by default."""
        return {}

    @abc.abstractmethod
    def iter_alleles(self, start, stop, num_alleles):
        """Iterate over allele values for records in [start, stop)."""

    @abc.abstractmethod
    def iter_genotypes(self, start, stop, num_alleles):
        """Iterate over genotype values for records in [start, stop)."""

    def iter_id(self, start, stop):
        """Iterate over record IDs in [start, stop); no-op by default."""
        return None

    def iter_contig(self, start, stop):
        """Iterate over record contigs in [start, stop); no-op by default."""
        return None

    @abc.abstractmethod
    def iter_field(self, field_name, shape, start, stop):
        """Iterate over values for the specified field from start to stop positions."""

    @abc.abstractmethod
    def generate_schema(self, variants_chunk_size, samples_chunk_size, local_alleles):
        """Produce a storage schema using the given chunk sizes and options."""
87
+
88
+
31
89
@dataclasses .dataclass
32
90
class ZarrArraySpec :
33
91
name : str
@@ -182,25 +240,16 @@ class VcfZarrSchema(core.JsonDataclass):
182
240
format_version : str
183
241
samples_chunk_size : int
184
242
variants_chunk_size : int
185
- samples : list
186
- contigs : list
187
- filters : list
188
243
fields : list
189
244
190
245
def __init__ (
191
246
self ,
192
247
format_version : str ,
193
- samples : list ,
194
- contigs : list ,
195
- filters : list ,
196
248
fields : list ,
197
249
variants_chunk_size : int = None ,
198
250
samples_chunk_size : int = None ,
199
251
):
200
252
self .format_version = format_version
201
- self .samples = samples
202
- self .contigs = contigs
203
- self .filters = filters
204
253
self .fields = fields
205
254
if variants_chunk_size is None :
206
255
variants_chunk_size = 1000
@@ -238,9 +287,6 @@ def fromdict(d):
238
287
f"{ d ['format_version' ]} != { ZARR_SCHEMA_FORMAT_VERSION } "
239
288
)
240
289
ret = VcfZarrSchema (** d )
241
- ret .samples = [Sample (** sd ) for sd in d ["samples" ]]
242
- ret .contigs = [Contig (** sd ) for sd in d ["contigs" ]]
243
- ret .filters = [Filter (** sd ) for sd in d ["filters" ]]
244
290
ret .fields = [ZarrArraySpec (** sd ) for sd in d ["fields" ]]
245
291
return ret
246
292
@@ -474,8 +520,10 @@ def init(
474
520
475
521
# Doing this synchronously - this is fine surely
476
522
self .encode_samples (root )
477
- self .encode_filter_id (root )
478
- self .encode_contig_id (root )
523
+ if self .source .filters is not None :
524
+ self .encode_filter_id (root )
525
+ if self .source .contigs is not None :
526
+ self .encode_contigs (root )
479
527
480
528
self .wip_path .mkdir ()
481
529
self .arrays_path .mkdir ()
@@ -502,33 +550,33 @@ def init(
502
550
)
503
551
504
552
def encode_samples(self, root):
    """Write the sample_id array into the root Zarr group.

    Sample identifiers come from the source; chunking follows the
    schema's samples_chunk_size.
    """
    sample_names = [s.id for s in self.source.samples]
    arr = root.array(
        "sample_id",
        data=sample_names,
        shape=len(sample_names),
        dtype="str",
        compressor=DEFAULT_ZARR_COMPRESSOR,
        chunks=(self.schema.samples_chunk_size,),
    )
    # xarray-style dimension labels so downstream tools see a named axis.
    arr.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
    logger.debug("Samples done")
517
564
518
- def encode_contig_id (self , root ):
565
+ def encode_contigs (self , root ):
566
+ contigs = self .source .contigs
519
567
array = root .array (
520
568
"contig_id" ,
521
- data = [contig .id for contig in self . schema . contigs ],
522
- shape = len (self . schema . contigs ),
569
+ data = [contig .id for contig in contigs ],
570
+ shape = len (contigs ),
523
571
dtype = "str" ,
524
572
compressor = DEFAULT_ZARR_COMPRESSOR ,
525
573
)
526
574
array .attrs ["_ARRAY_DIMENSIONS" ] = ["contigs" ]
527
- if all (contig .length is not None for contig in self . schema . contigs ):
575
+ if all (contig .length is not None for contig in contigs ):
528
576
array = root .array (
529
577
"contig_length" ,
530
- data = [contig .length for contig in self . schema . contigs ],
531
- shape = len (self . schema . contigs ),
578
+ data = [contig .length for contig in contigs ],
579
+ shape = len (contigs ),
532
580
dtype = np .int64 ,
533
581
compressor = DEFAULT_ZARR_COMPRESSOR ,
534
582
)
@@ -537,10 +585,11 @@ def encode_contig_id(self, root):
537
585
def encode_filter_id (self , root ):
538
586
# TODO need a way to store description also
539
587
# https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
588
+ filters = self .source .filters
540
589
array = root .array (
541
590
"filter_id" ,
542
- data = [filt .id for filt in self . schema . filters ],
543
- shape = len (self . schema . filters ),
591
+ data = [filt .id for filt in filters ],
592
+ shape = len (filters ),
544
593
dtype = "str" ,
545
594
compressor = DEFAULT_ZARR_COMPRESSOR ,
546
595
)
0 commit comments