Skip to content

Commit 75d7ee3

Browse files
Don't use shuffle except for bool and GT
Closes #126. Fix loophole on missing-data columns. Update CHANGELOG.
1 parent bd63b95 commit 75d7ee3

File tree

3 files changed

+34
-7
lines changed

3 files changed

+34
-7
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# 0.0.6 2024-04-xx
2+
3+
- Default to NOSHUFFLE for all arrays except ``call_genotype`` (when stored as ``i1``) and bool arrays, which use BITSHUFFLE.
4+
15
# 0.0.5 2024-04-17
26

37
- Fix bug in schema handling (compressor settings ignored)

bio2zarr/vcf.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -111,9 +111,6 @@ def full_name(self):
111111
return self.name
112112
return f"{self.category}/{self.name}"
113113

114-
# TODO add method here to choose a good set compressor and
115-
# filters default here for this field.
116-
117114
def smallest_dtype(self):
118115
"""
119116
Returns the smallest dtype suitable for this field based
@@ -123,7 +120,13 @@ def smallest_dtype(self):
123120
if self.vcf_type == "Float":
124121
ret = "f4"
125122
elif self.vcf_type == "Integer":
126-
ret = core.min_int_dtype(s.min_value, s.max_value)
123+
if not math.isfinite(s.max_value):
124+
# All missing values; use i1. Note we should have some API to
125+
# check more explicitly for missingness:
126+
# https://github.com/sgkit-dev/bio2zarr/issues/131
127+
ret = "i1"
128+
else:
129+
ret = core.min_int_dtype(s.min_value, s.max_value)
127130
elif self.vcf_type == "Flag":
128131
ret = "bool"
129132
elif self.vcf_type == "Character":
@@ -1300,17 +1303,19 @@ def _choose_compressor_settings(self):
13001303
13011304
See https://github.com/pystatgen/bio2zarr/discussions/74
13021305
"""
1303-
dt = np.dtype(self.dtype)
13041306
# Default is to not shuffle, because autoshuffle isn't recognised
13051307
# by many Zarr implementations, and shuffling can lead to worse
13061308
# performance in some cases anyway. Turning on shuffle should be a
13071309
# deliberate choice.
13081310
shuffle = numcodecs.Blosc.NOSHUFFLE
1309-
if self.name == "call_genotype" and dt.itemsize == 1:
1311+
if self.name == "call_genotype" and self.dtype == "i1":
13101312
# call_genotype gets BITSHUFFLE by default as it gets
13111313
# significantly better compression (at a cost of slower
13121314
# decoding)
13131315
shuffle = numcodecs.Blosc.BITSHUFFLE
1316+
elif self.dtype == "bool":
1317+
shuffle = numcodecs.Blosc.BITSHUFFLE
1318+
13141319
self.compressor["shuffle"] = shuffle
13151320

13161321

@@ -1440,7 +1445,6 @@ def fixed_field_spec(
14401445
shape = [m, n]
14411446
chunks = [variants_chunk_size, samples_chunk_size]
14421447
dimensions = ["variants", "samples"]
1443-
14441448
colspecs.append(
14451449
ZarrColumnSpec.new(
14461450
vcf_field=None,

tests/test_vcf.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,25 @@ def test_call_genotype_phased(self, schema):
298298
"filters": [],
299299
}
300300

301+
def test_call_GQ(self, schema):
302+
assert schema["columns"]["call_GQ"] == {
303+
"name": "call_GQ",
304+
"dtype": "i1",
305+
"shape": [9, 3],
306+
"chunks": [10000, 1000],
307+
"dimensions": ["variants", "samples"],
308+
"description": "Genotype Quality",
309+
"vcf_field": "FORMAT/GQ",
310+
"compressor": {
311+
"id": "blosc",
312+
"cname": "zstd",
313+
"clevel": 7,
314+
"shuffle": 0,
315+
"blocksize": 0,
316+
},
317+
"filters": [],
318+
}
319+
301320

302321
@pytest.mark.parametrize(
303322
"regions",

0 commit comments

Comments
 (0)