Don't use shuffle except for bool and GT

jeromekelleher · jeromekelleher · commit 75d7ee3d54ab · 2024-04-19T15:33:24.000+01:00
Closes #126. Fix loophole on missing-data columns. Update CHANGELOG.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# 0.0.6 2024-04-xx
+
+- Use NOSHUFFLE by default; only ``call_genotype`` and bool arrays get BITSHUFFLE.
+
 # 0.0.5 2024-04-17
 
 - Fix bug in schema handling (compressor settings ignored)
diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
@@ -111,9 +111,6 @@ def full_name(self):
             return self.name
         return f"{self.category}/{self.name}"
 
-    # TODO add method here to choose a good set compressor and
-    # filters default here for this field.
-
     def smallest_dtype(self):
         """
         Returns the smallest dtype suitable for this field based
@@ -123,7 +120,13 @@ def smallest_dtype(self):
         if self.vcf_type == "Float":
             ret = "f4"
         elif self.vcf_type == "Integer":
-            ret = core.min_int_dtype(s.min_value, s.max_value)
+            if not math.isfinite(s.max_value):
+                # All missing values; use i1. Note we should have some API to
+                # check more explicitly for missingness:
+                # https://github.com/sgkit-dev/bio2zarr/issues/131
+                ret = "i1"
+            else:
+                ret = core.min_int_dtype(s.min_value, s.max_value)
         elif self.vcf_type == "Flag":
             ret = "bool"
         elif self.vcf_type == "Character":
@@ -1300,17 +1303,19 @@ def _choose_compressor_settings(self):
 
         See https://github.com/pystatgen/bio2zarr/discussions/74
         """
-        dt = np.dtype(self.dtype)
         # Default is to not shuffle, because autoshuffle isn't recognised
         # by many Zarr implementations, and shuffling can lead to worse
         # performance in some cases anyway. Turning on shuffle should be a
         # deliberate choice.
         shuffle = numcodecs.Blosc.NOSHUFFLE
-        if self.name == "call_genotype" and dt.itemsize == 1:
+        if self.name == "call_genotype" and self.dtype == "i1":
             # call_genotype gets BITSHUFFLE by default as it gets
             # significantly better compression (at a cost of slower
             # decoding)
             shuffle = numcodecs.Blosc.BITSHUFFLE
+        elif self.dtype == "bool":
+            shuffle = numcodecs.Blosc.BITSHUFFLE
+
         self.compressor["shuffle"] = shuffle
 
 
@@ -1440,7 +1445,6 @@ def fixed_field_spec(
             shape = [m, n]
             chunks = [variants_chunk_size, samples_chunk_size]
             dimensions = ["variants", "samples"]
-
             colspecs.append(
                 ZarrColumnSpec.new(
                     vcf_field=None,
diff --git a/tests/test_vcf.py b/tests/test_vcf.py
@@ -298,6 +298,25 @@ def test_call_genotype_phased(self, schema):
             "filters": [],
         }
 
+    def test_call_GQ(self, schema):
+        assert schema["columns"]["call_GQ"] == {
+            "name": "call_GQ",
+            "dtype": "i1",
+            "shape": [9, 3],
+            "chunks": [10000, 1000],
+            "dimensions": ["variants", "samples"],
+            "description": "Genotype Quality",
+            "vcf_field": "FORMAT/GQ",
+            "compressor": {
+                "id": "blosc",
+                "cname": "zstd",
+                "clevel": 7,
+                "shuffle": 0,
+                "blocksize": 0,
+            },
+            "filters": [],
+        }
+
 
 @pytest.mark.parametrize(
     "regions",