Skip to content

Commit 553ce10

Browse files
authored
Merge pull request #25 from jrm5100/master
Version 0.10.0
2 parents f9e71ca + 9c491a3 commit 553ce10

File tree

8 files changed

+49
-38
lines changed

8 files changed

+49
-38
lines changed

docs/release-history.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,11 @@
22
Release History
33
===============
44

5+
v0.10.0 (2021-07-16)
6+
--------------------
7+
8+
Change genotype scores to be a uint8 (255=missing) instead of float64, to save ~70% of memory usage
9+
510
v0.9.1 (2021-07-13)
611
-------------------
712

pandas_genomics/arrays/genotype_array.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,11 +78,11 @@ def __init__(self, variant: Optional[Variant] = None):
7878

7979
# Data backing the GenotypeArray is stored as a numpy structured array
8080
# An unsigned integer for each allele in the genotype indexing the list of possible alleles
81-
# A float value for the genotype score (nan if missing)
81+
# An unsigned integer for the genotype score (255 if missing)
8282
self._record_type = np.dtype(
8383
[
8484
("allele_idxs", np.uint8, (self.variant.ploidy,)),
85-
("gt_score", np.float64),
85+
("gt_score", np.uint8),
8686
]
8787
)
8888

@@ -665,7 +665,9 @@ def gt_scores(self):
665665
"""
666666
Return the genotype score for each genotype (as a float)
667667
"""
668-
return self._data["gt_score"]
668+
scores = self._data["gt_score"].copy().astype("float")
669+
scores[scores == MISSING_IDX] = np.nan
670+
return scores
669671

670672
# Operations
671673
# Note: genotypes are compared by first allele then second, using the order of alleles in the variant

pandas_genomics/io/plink/from_plink.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,7 @@ def create_gt_array(num_samples, variant_gt_bytes, variant):
187187
genotypes[het_gt] = (0, 1)
188188
# Create GenotypeArray representation of the data
189189
dtype = GenotypeDtype(variant)
190-
scores = np.empty(num_samples)
191-
scores[:] = np.nan
190+
scores = np.ones(num_samples) * MISSING_IDX # Missing Scores
192191
data = np.array(list(zip(genotypes, scores)), dtype=dtype._record_type)
193192
gt_array = GenotypeArray(values=data, dtype=dtype)
194193
return gt_array

pandas_genomics/io/vcf.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,11 @@ def from_vcf(
6767
allele_idxs = np.array(vcf_variant.genotypes)[:, :2]
6868
allele_idxs = np.where(allele_idxs == -1, MISSING_IDX, allele_idxs)
6969
gt_scores = vcf_variant.gt_quals
70-
gt_scores = np.where(gt_scores == -1, np.nan, gt_scores)
70+
# Convert genotype scores from float values to uint8 values
71+
gt_scores = np.where(gt_scores > 254, 254, gt_scores) # Max Score
72+
gt_scores = np.where(gt_scores < 0, 255, gt_scores) # Min Score (<0 is missing)
73+
gt_scores = np.where(gt_scores == -1, 255, gt_scores) # Missing values
74+
gt_scores = gt_scores.round().astype("uint8")
7175
values = np.array(list(zip(allele_idxs, gt_scores)), dtype=dtype._record_type)
7276

7377
# Make the GenotypeArray

pandas_genomics/scalars.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,7 @@
1212
import uuid
1313
from typing import Optional, List, Tuple, Union
1414

15-
MISSING_IDX = (
16-
255 # Integer indicating a missing allele. Each variant must have 254 alleles max.
17-
)
15+
MISSING_IDX = 255 # Integer indicating a missing allele or genotype score. Each variant must have 254 alleles max and the maximum genotype score is 254.
1816

1917

2018
class Variant:
@@ -359,7 +357,7 @@ class Genotype:
359357
allele_idxs: List[int]
360358
Alleles encoded as indexes into the variant allele list
361359
score: int, optional
362-
A quality score for the Genotype. No assumptions are made about the meaning.
360+
A quality score for the Genotype between 0 and 254. 255 or < 0 is treated as missing.
363361
364362
Examples
365363
--------
@@ -398,9 +396,13 @@ def __init__(
398396

399397
self.variant = variant
400398
self.allele_idxs = allele_idxs
401-
self.score = None
402399
if score is not None:
403-
self.score = int(score)
400+
score = int(score)
401+
if score < 0 or score > 255:
402+
raise ValueError("The score must be between 0 and 255, inclusive")
403+
elif score == 255:
404+
score = None
405+
self.score = score
404406

405407
# Validate parameters
406408
for a in self.allele_idxs:
@@ -502,8 +504,8 @@ def is_missing(self) -> bool:
502504

503505
@property
504506
def _float_score(self):
505-
"""Convenience method for storing score as a float"""
507+
"""Convenience method for storing score as a uint8"""
506508
if self.score is None:
507-
return float("NaN")
509+
return 255
508510
else:
509-
return float(self.score)
511+
return self.score

pandas_genomics/sim/biallelic_model_simulator.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from numpy.random._generator import default_rng
99

1010
from pandas_genomics.arrays import GenotypeArray, GenotypeDtype
11-
from pandas_genomics.scalars import Variant
11+
from pandas_genomics.scalars import Variant, MISSING_IDX
1212

1313

1414
class SNPEffectEncodings(Enum):
@@ -450,15 +450,15 @@ def _get_snp1_gt_array(self, gt_table_idxs):
450450
"""Assemble a GenotypeArray for SNP1 directly from genotype table indices"""
451451
dtype = GenotypeDtype(self.snp1)
452452
gt_table_data = (
453-
((0, 0), np.nan),
454-
((0, 1), np.nan),
455-
((1, 1), np.nan),
456-
((0, 0), np.nan),
457-
((0, 1), np.nan),
458-
((1, 1), np.nan),
459-
((0, 0), np.nan),
460-
((0, 1), np.nan),
461-
((1, 1), np.nan),
453+
((0, 0), MISSING_IDX),
454+
((0, 1), MISSING_IDX),
455+
((1, 1), MISSING_IDX),
456+
((0, 0), MISSING_IDX),
457+
((0, 1), MISSING_IDX),
458+
((1, 1), MISSING_IDX),
459+
((0, 0), MISSING_IDX),
460+
((0, 1), MISSING_IDX),
461+
((1, 1), MISSING_IDX),
462462
)
463463
data = np.array(
464464
[gt_table_data[i] for i in gt_table_idxs], dtype=dtype._record_type
@@ -469,15 +469,15 @@ def _get_snp2_gt_array(self, gt_table_idxs):
469469
"""Assemble a GenotypeArray for SNP2 directly from genotype table indices"""
470470
dtype = GenotypeDtype(self.snp2)
471471
gt_table_data = (
472-
((0, 0), np.nan),
473-
((0, 0), np.nan),
474-
((0, 0), np.nan),
475-
((0, 1), np.nan),
476-
((0, 1), np.nan),
477-
((0, 1), np.nan),
478-
((1, 1), np.nan),
479-
((1, 1), np.nan),
480-
((1, 1), np.nan),
472+
((0, 0), MISSING_IDX),
473+
((0, 0), MISSING_IDX),
474+
((0, 0), MISSING_IDX),
475+
((0, 1), MISSING_IDX),
476+
((0, 1), MISSING_IDX),
477+
((0, 1), MISSING_IDX),
478+
((1, 1), MISSING_IDX),
479+
((1, 1), MISSING_IDX),
480+
((1, 1), MISSING_IDX),
481481
)
482482
data = np.array(
483483
[gt_table_data[i] for i in gt_table_idxs], dtype=dtype._record_type

pandas_genomics/sim/random_gt.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import numpy as np
44

55
from pandas_genomics.arrays import GenotypeArray, GenotypeDtype
6-
from pandas_genomics.scalars import Variant
6+
from pandas_genomics.scalars import Variant, MISSING_IDX
77

88

99
def generate_random_gt(
@@ -58,8 +58,7 @@ def generate_random_gt(
5858

5959
# Create GenotypeArray representation of the data
6060
dtype = GenotypeDtype(variant)
61-
scores = np.empty(n)
62-
scores[:] = np.nan
61+
scores = np.ones(n) * MISSING_IDX
6362
data = np.array(list(zip(genotypes, scores)), dtype=dtype._record_type)
6463
gt_array = GenotypeArray(values=data, dtype=dtype)
6564

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "pandas-genomics"
3-
version = "0.9.1"
3+
version = "0.10.0"
44
description = "Pandas ExtensionDtypes and ExtensionArray for working with genomics data"
55
license = "BSD-3-Clause"
66
authors = ["John McGuigan <jrm5100@psu.edu>"]

0 commit comments

Comments
 (0)