@@ -1,3 +1,4 @@
+ import abc
import dataclasses
import json
import logging
@@ -13,7 +14,7 @@

logger = logging.getLogger(__name__)

- ZARR_SCHEMA_FORMAT_VERSION = "0.4"
+ ZARR_SCHEMA_FORMAT_VERSION = "0.5"
DEFAULT_ZARR_COMPRESSOR = numcodecs.Blosc(cname="zstd", clevel=7)

_fixed_field_descriptions = {
@@ -28,6 +29,62 @@
}


+ class Source(abc.ABC):
+     @property
+     @abc.abstractmethod
+     def path(self):
+         pass
+
+     @property
+     @abc.abstractmethod
+     def num_records(self):
+         pass
+
+     @property
+     @abc.abstractmethod
+     def num_samples(self):
+         pass
+
+     @property
+     @abc.abstractmethod
+     def samples(self):
+         pass
+
+     @property
+     def contigs(self):
+         return None
+
+     @property
+     def filters(self):
+         return None
+
+     @property
+     def root_attrs(self):
+         return {}
+
+     @abc.abstractmethod
+     def iter_alleles(self, start, stop, num_alleles):
+         pass
+
+     @abc.abstractmethod
+     def iter_genotypes(self, start, stop, num_alleles):
+         pass
+
+     def iter_id(self, start, stop):
+         return
+
+     def iter_contig(self, start, stop):
+         return
+
+     @abc.abstractmethod
+     def iter_field(self, field_name, shape, start, stop):
+         pass
+
+     @abc.abstractmethod
+     def generate_schema(self, variants_chunk_size, samples_chunk_size, local_alleles):
+         pass
+
+
@dataclasses.dataclass
class ZarrArraySpec:
    name: str
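
To make the new abstraction concrete, the sketch below shows one way a minimal in-memory Source could be written against the abstract interface added above. It is illustrative only: the class name, the record layout and the yielded values are hypothetical, and generate_schema returns an empty field list rather than real ZarrArraySpec entries. Because it does not override contigs or filters, those fall back to the base-class defaults of None, so the writer's init() further down skips the contig and filter arrays for such a source.

class InMemorySource(Source):
    """Hypothetical concrete Source backed by in-memory records."""

    def __init__(self, samples, records):
        self._samples = samples  # sequence of objects with an .id attribute
        self._records = records  # per-variant records; layout is format-specific

    @property
    def path(self):
        return "memory://example"

    @property
    def num_records(self):
        return len(self._records)

    @property
    def num_samples(self):
        return len(self._samples)

    @property
    def samples(self):
        return self._samples

    def iter_alleles(self, start, stop, num_alleles):
        # Yield one alleles value per record in [start, stop).
        for record in self._records[start:stop]:
            yield record.alleles

    def iter_genotypes(self, start, stop, num_alleles):
        # Yield one genotypes value per record in [start, stop).
        for record in self._records[start:stop]:
            yield record.genotypes

    def iter_field(self, field_name, shape, start, stop):
        # Yield the named field for each record in [start, stop).
        for record in self._records[start:stop]:
            yield record.fields[field_name]

    def generate_schema(self, variants_chunk_size, samples_chunk_size, local_alleles):
        return VcfZarrSchema(
            format_version=ZARR_SCHEMA_FORMAT_VERSION,
            fields=[],  # a real implementation would build ZarrArraySpecs here
            variants_chunk_size=variants_chunk_size,
            samples_chunk_size=samples_chunk_size,
        )
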
@@ -182,25 +239,16 @@ class VcfZarrSchema(core.JsonDataclass):
    format_version: str
    samples_chunk_size: int
    variants_chunk_size: int
-     samples: list
-     contigs: list
-     filters: list
    fields: list

    def __init__(
        self,
        format_version: str,
-         samples: list,
-         contigs: list,
-         filters: list,
        fields: list,
        variants_chunk_size: int = None,
        samples_chunk_size: int = None,
    ):
        self.format_version = format_version
-         self.samples = samples
-         self.contigs = contigs
-         self.filters = filters
        self.fields = fields
        if variants_chunk_size is None:
            variants_chunk_size = 1000
@@ -238,9 +286,6 @@ def fromdict(d):
                f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
            )
        ret = VcfZarrSchema(**d)
-         ret.samples = [Sample(**sd) for sd in d["samples"]]
-         ret.contigs = [Contig(**sd) for sd in d["contigs"]]
-         ret.filters = [Filter(**sd) for sd in d["filters"]]
        ret.fields = [ZarrArraySpec(**sd) for sd in d["fields"]]
        return ret

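
With sample, contig and filter metadata now supplied by the Source rather than stored in the schema, a format 0.5 schema dict only needs the format version, chunk sizes and field specs. The sketch below shows a round trip through fromdict; all values are illustrative.

d = {
    "format_version": ZARR_SCHEMA_FORMAT_VERSION,  # "0.5"
    "variants_chunk_size": 1000,
    "samples_chunk_size": 10_000,  # illustrative value
    "fields": [],  # each entry would be a ZarrArraySpec as a dict
}
schema = VcfZarrSchema.fromdict(d)
assert schema.fields == []
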
@@ -474,8 +519,10 @@ def init(

        # Doing this synchronously - this is fine surely
        self.encode_samples(root)
-         self.encode_filter_id(root)
-         self.encode_contig_id(root)
+         if self.source.filters is not None:
+             self.encode_filter_id(root)
+         if self.source.contigs is not None:
+             self.encode_contigs(root)

        self.wip_path.mkdir()
        self.arrays_path.mkdir()
@@ -502,33 +549,33 @@ def init(
        )

    def encode_samples(self, root):
-         if [s.id for s in self.schema.samples] != self.source.samples:
-             raise ValueError("Subsetting or reordering samples not supported currently")
+         samples = self.source.samples
        array = root.array(
            "sample_id",
-             data=[sample.id for sample in self.schema.samples],
-             shape=len(self.schema.samples),
+             data=[sample.id for sample in samples],
+             shape=len(samples),
            dtype="str",
            compressor=DEFAULT_ZARR_COMPRESSOR,
            chunks=(self.schema.samples_chunk_size,),
        )
        array.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
        logger.debug("Samples done")

-     def encode_contig_id(self, root):
+     def encode_contigs(self, root):
+         contigs = self.source.contigs
        array = root.array(
            "contig_id",
-             data=[contig.id for contig in self.schema.contigs],
-             shape=len(self.schema.contigs),
+             data=[contig.id for contig in contigs],
+             shape=len(contigs),
            dtype="str",
            compressor=DEFAULT_ZARR_COMPRESSOR,
        )
        array.attrs["_ARRAY_DIMENSIONS"] = ["contigs"]
-         if all(contig.length is not None for contig in self.schema.contigs):
+         if all(contig.length is not None for contig in contigs):
            array = root.array(
                "contig_length",
-                 data=[contig.length for contig in self.schema.contigs],
-                 shape=len(self.schema.contigs),
+                 data=[contig.length for contig in contigs],
+                 shape=len(contigs),
                dtype=np.int64,
                compressor=DEFAULT_ZARR_COMPRESSOR,
            )
@@ -537,10 +584,11 @@ def encode_contig_id(self, root):
    def encode_filter_id(self, root):
        # TODO need a way to store description also
        # https://github.com/sgkit-dev/vcf-zarr-spec/issues/19
+         filters = self.source.filters
        array = root.array(
            "filter_id",
-             data=[filt.id for filt in self.schema.filters],
-             shape=len(self.schema.filters),
+             data=[filt.id for filt in filters],
+             shape=len(filters),
            dtype="str",
            compressor=DEFAULT_ZARR_COMPRESSOR,
        )
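
For readers less familiar with the zarr calls used by these encode_* methods, the standalone sketch below reproduces the same pattern outside the writer class. It assumes zarr-python 2.x; the store path, the FilterRecord stand-in and the "filters" dimension name are illustrative assumptions, not part of this change.

import dataclasses

import numcodecs
import zarr


@dataclasses.dataclass
class FilterRecord:  # stand-in for the module's filter records
    id: str


filters = [FilterRecord("PASS"), FilterRecord("q10")]
root = zarr.open_group("example.zarr", mode="w")
array = root.array(
    "filter_id",
    data=[filt.id for filt in filters],
    shape=len(filters),
    dtype="str",
    compressor=numcodecs.Blosc(cname="zstd", clevel=7),
)
array.attrs["_ARRAY_DIMENSIONS"] = ["filters"]  # assumed dimension name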