Merge pull request #25 from jrm5100/master

jrm5100 · web-flow · commit 553ce1085a06 · 2021-07-16T11:23:20.000-04:00
Version 0.10.0
diff --git a/docs/release-history.rst b/docs/release-history.rst
@@ -2,6 +2,11 @@
 Release History
 ===============
 
+v0.10.0 (2021-07-16)
+--------------------
+
+Change genotype scores to be a uint8 (255=missing) instead of float64, to save ~70% of memory usage
+
 v0.9.1 (2021-07-13)
 -------------------
 
diff --git a/pandas_genomics/arrays/genotype_array.py b/pandas_genomics/arrays/genotype_array.py
@@ -78,11 +78,11 @@ def __init__(self, variant: Optional[Variant] = None):
 
         # Data backing the GenotypeArray is stored as a numpy structured array
         # An unsigned integer for each allele in the genotype indexing the list of possible alleles
-        # A float value for the genotype score (nan if missing)
+        # An unsigned integer for the genotype score (255 if missing)
         self._record_type = np.dtype(
             [
                 ("allele_idxs", np.uint8, (self.variant.ploidy,)),
-                ("gt_score", np.float64),
+                ("gt_score", np.uint8),
             ]
         )
 
@@ -665,7 +665,9 @@ def gt_scores(self):
         """
         Return the genotype score for each genotype (as a float)
         """
-        return self._data["gt_score"]
+        scores = self._data["gt_score"].copy().astype("float")
+        scores[scores == MISSING_IDX] = np.nan
+        return scores
 
     # Operations
     # Note: genotypes are compared by first allele then second, using the order of alleles in the variant
diff --git a/pandas_genomics/io/plink/from_plink.py b/pandas_genomics/io/plink/from_plink.py
@@ -187,8 +187,7 @@ def create_gt_array(num_samples, variant_gt_bytes, variant):
     genotypes[het_gt] = (0, 1)
     # Create GenotypeArray representation of the data
     dtype = GenotypeDtype(variant)
-    scores = np.empty(num_samples)
-    scores[:] = np.nan
+    scores = np.ones(num_samples) * MISSING_IDX  # Missing Scores
     data = np.array(list(zip(genotypes, scores)), dtype=dtype._record_type)
     gt_array = GenotypeArray(values=data, dtype=dtype)
     return gt_array
diff --git a/pandas_genomics/io/vcf.py b/pandas_genomics/io/vcf.py
@@ -67,7 +67,11 @@ def from_vcf(
         allele_idxs = np.array(vcf_variant.genotypes)[:, :2]
         allele_idxs = np.where(allele_idxs == -1, MISSING_IDX, allele_idxs)
         gt_scores = vcf_variant.gt_quals
-        gt_scores = np.where(gt_scores == -1, np.nan, gt_scores)
+        # Convert genotype scores from float values to uint8 values
+        gt_scores = np.where(gt_scores > 254, 254, gt_scores)  # Max Score
+        gt_scores = np.where(gt_scores < 0, 255, gt_scores)  # Min Score (<0 is missing)
+        gt_scores = np.where(gt_scores == -1, 255, gt_scores)  # Missing values
+        gt_scores = gt_scores.round().astype("uint8")
         values = np.array(list(zip(allele_idxs, gt_scores)), dtype=dtype._record_type)
 
         # Make the GenotypeArray
diff --git a/pandas_genomics/scalars.py b/pandas_genomics/scalars.py
@@ -12,9 +12,7 @@
 import uuid
 from typing import Optional, List, Tuple, Union
 
-MISSING_IDX = (
-    255  # Integer indicating a missing allele.  Each variant must have 254 alleles max.
-)
+MISSING_IDX = 255  # Integer indicating a missing allele or genotype score.  Each variant must have 254 alleles max and the maximum genotype score is 254.
 
 
 class Variant:
@@ -359,7 +357,7 @@ class Genotype:
     allele_idxs: List[int]
         Alleles encoded as indexes into the variant allele list
     score: int, optional
-        A quality score for the Genotype.  No assumptions are made about the meaning.
+        A quality score for the Genotype between 0 and 254.  255 is treated as missing; a score < 0 or > 255 raises a ValueError.
 
     Examples
     --------
@@ -398,9 +396,13 @@ def __init__(
 
         self.variant = variant
         self.allele_idxs = allele_idxs
-        self.score = None
         if score is not None:
-            self.score = int(score)
+            score = int(score)
+            if score < 0 or score > 255:
+                raise ValueError("The score must be between 0 and 255, inclusive")
+            elif score == 255:
+                score = None
+        self.score = score
 
         # Validate parameters
         for a in self.allele_idxs:
@@ -502,8 +504,8 @@ def is_missing(self) -> bool:
 
     @property
     def _float_score(self):
-        """Convenience method for storing score as a float"""
+        """Convenience method for storing score as a uint8"""
         if self.score is None:
-            return float("NaN")
+            return 255
         else:
-            return float(self.score)
+            return self.score
diff --git a/pandas_genomics/sim/biallelic_model_simulator.py b/pandas_genomics/sim/biallelic_model_simulator.py
@@ -8,7 +8,7 @@
 from numpy.random._generator import default_rng
 
 from pandas_genomics.arrays import GenotypeArray, GenotypeDtype
-from pandas_genomics.scalars import Variant
+from pandas_genomics.scalars import Variant, MISSING_IDX
 
 
 class SNPEffectEncodings(Enum):
@@ -450,15 +450,15 @@ def _get_snp1_gt_array(self, gt_table_idxs):
         """Assemble a GenotypeArray for SNP1 directly from genotype table indices"""
         dtype = GenotypeDtype(self.snp1)
         gt_table_data = (
-            ((0, 0), np.nan),
-            ((0, 1), np.nan),
-            ((1, 1), np.nan),
-            ((0, 0), np.nan),
-            ((0, 1), np.nan),
-            ((1, 1), np.nan),
-            ((0, 0), np.nan),
-            ((0, 1), np.nan),
-            ((1, 1), np.nan),
+            ((0, 0), MISSING_IDX),
+            ((0, 1), MISSING_IDX),
+            ((1, 1), MISSING_IDX),
+            ((0, 0), MISSING_IDX),
+            ((0, 1), MISSING_IDX),
+            ((1, 1), MISSING_IDX),
+            ((0, 0), MISSING_IDX),
+            ((0, 1), MISSING_IDX),
+            ((1, 1), MISSING_IDX),
         )
         data = np.array(
             [gt_table_data[i] for i in gt_table_idxs], dtype=dtype._record_type
@@ -469,15 +469,15 @@ def _get_snp2_gt_array(self, gt_table_idxs):
         """Assemble a GenotypeArray for SNP2 directly from genotype table indices"""
         dtype = GenotypeDtype(self.snp2)
         gt_table_data = (
-            ((0, 0), np.nan),
-            ((0, 0), np.nan),
-            ((0, 0), np.nan),
-            ((0, 1), np.nan),
-            ((0, 1), np.nan),
-            ((0, 1), np.nan),
-            ((1, 1), np.nan),
-            ((1, 1), np.nan),
-            ((1, 1), np.nan),
+            ((0, 0), MISSING_IDX),
+            ((0, 0), MISSING_IDX),
+            ((0, 0), MISSING_IDX),
+            ((0, 1), MISSING_IDX),
+            ((0, 1), MISSING_IDX),
+            ((0, 1), MISSING_IDX),
+            ((1, 1), MISSING_IDX),
+            ((1, 1), MISSING_IDX),
+            ((1, 1), MISSING_IDX),
         )
         data = np.array(
             [gt_table_data[i] for i in gt_table_idxs], dtype=dtype._record_type
diff --git a/pandas_genomics/sim/random_gt.py b/pandas_genomics/sim/random_gt.py
@@ -3,7 +3,7 @@
 import numpy as np
 
 from pandas_genomics.arrays import GenotypeArray, GenotypeDtype
-from pandas_genomics.scalars import Variant
+from pandas_genomics.scalars import Variant, MISSING_IDX
 
 
 def generate_random_gt(
@@ -58,8 +58,7 @@ def generate_random_gt(
 
     # Create GenotypeArray representation of the data
     dtype = GenotypeDtype(variant)
-    scores = np.empty(n)
-    scores[:] = np.nan
+    scores = np.ones(n) * MISSING_IDX
     data = np.array(list(zip(genotypes, scores)), dtype=dtype._record_type)
     gt_array = GenotypeArray(values=data, dtype=dtype)
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pandas-genomics"
-version = "0.9.1"
+version = "0.10.0"
 description = "Pandas ExtensionDtypes and ExtensionArray for working with genomics data"
 license = "BSD-3-Clause"
 authors = ["John McGuigan <jrm5100@psu.edu>"]