Skip to content

Commit 6695d35

Browse files
Merge pull request #67 from jeromekelleher/fix-plink-data-mixup
Fix plink data mixup
2 parents 850a405 + 4674e04 commit 6695d35

File tree

3 files changed

+186
-19
lines changed

3 files changed

+186
-19
lines changed

bio2zarr/plink.py

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,11 @@
1212

1313

1414
def encode_genotypes_slice(bed_path, zarr_path, start, stop):
15-
bed = bed_reader.open_bed(bed_path, num_threads=1)
15+
# We need to count the A2 alleles here if we want to keep the
16+
# alleles reported as allele_1, allele_2. It's obvious here what
17+
# the correct approach is, but it is important to note that the
18+
# 0th allele is *not* necessarily the REF for these datasets.
19+
bed = bed_reader.open_bed(bed_path, num_threads=1, count_A1=False)
1620
store = zarr.DirectoryStore(zarr_path)
1721
root = zarr.group(store=store)
1822
gt = core.BufferedArray(root["call_genotype"], start)
@@ -62,7 +66,6 @@ def convert(
6266
bed = bed_reader.open_bed(bed_path, num_threads=1)
6367
n = bed.iid_count
6468
m = bed.sid_count
65-
del bed
6669
logging.info(f"Scanned plink with {n} samples and {m} variants")
6770

6871
# FIXME
@@ -79,6 +82,40 @@ def convert(
7982
chunks = [chunk_length, chunk_width]
8083
dimensions = ["variants", "samples"]
8184

85+
a = root.array(
86+
"sample_id",
87+
bed.iid,
88+
dtype="str",
89+
compressor=core.default_compressor,
90+
chunks=(chunk_width,),
91+
)
92+
a.attrs["_ARRAY_DIMENSIONS"] = ["samples"]
93+
logger.debug(f"Encoded samples")
94+
95+
# TODO encode these in slices - but read them in one go to avoid
96+
# fetching repeatedly from bim file
97+
a = root.array(
98+
"variant_position",
99+
bed.bp_position,
100+
dtype=np.int32,
101+
compressor=core.default_compressor,
102+
chunks=(chunk_length,),
103+
)
104+
a.attrs["_ARRAY_DIMENSIONS"] = ["variants"]
105+
logger.debug(f"encoded variant_position")
106+
107+
alleles = np.stack([bed.allele_1, bed.allele_2], axis=1)
108+
a = root.array(
109+
"variant_allele",
110+
alleles,
111+
dtype="str",
112+
compressor=core.default_compressor,
113+
chunks=(chunk_length,),
114+
)
115+
a.attrs["_ARRAY_DIMENSIONS"] = ["variants", "alleles"]
116+
logger.debug(f"encoded variant_allele")
117+
118+
# TODO remove this?
82119
a = root.empty(
83120
"call_genotype_phased",
84121
dtype="bool",
@@ -108,6 +145,8 @@ def convert(
108145
)
109146
a.attrs["_ARRAY_DIMENSIONS"] = list(dimensions)
110147

148+
del bed
149+
111150
num_slices = max(1, worker_processes * 4)
112151
slices = core.chunk_aligned_slices(a, num_slices)
113152

@@ -132,7 +171,7 @@ def validate(bed_path, zarr_path):
132171
root = zarr.group(store=store)
133172
call_genotype = root["call_genotype"][:]
134173

135-
bed = bed_reader.open_bed(bed_path, num_threads=1)
174+
bed = bed_reader.open_bed(bed_path, count_A1=False, num_threads=1)
136175

137176
assert call_genotype.shape[0] == bed.sid_count
138177
assert call_genotype.shape[1] == bed.iid_count

tests/test_plink.py

Lines changed: 143 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,162 @@
33
import xarray.testing as xt
44
import pytest
55
import sgkit as sg
6+
import sgkit.io.plink
7+
import bed_reader
68
import zarr
79

810
from bio2zarr import plink
911

1012

1113
class TestSmallExample:
14+
@pytest.fixture(scope="class")
15+
def bed_path(self, tmp_path_factory):
16+
tmp_path = tmp_path_factory.mktemp("data")
17+
path = tmp_path / "example.bed"
18+
# 7 sites x 3 samples
19+
# These are counts of allele 1, so we have to flip them
20+
# to get the values we expect
21+
dosages = np.array(
22+
[
23+
[0, 1, 2],
24+
[1, 0, 2],
25+
[0, 0, 0],
26+
[-127, 0, 0],
27+
[2, 0, 0],
28+
[-127, -127, -127],
29+
[2, 2, 2],
30+
],
31+
dtype=np.int8,
32+
)
33+
bed_reader.to_bed(path, dosages.T)
34+
return path
35+
36+
@pytest.fixture(scope="class")
37+
def ds(self, tmp_path_factory, bed_path):
38+
tmp_path = tmp_path_factory.mktemp("data")
39+
zarr_path = tmp_path / "example.plink.zarr"
40+
plink.convert(bed_path, zarr_path)
41+
return sg.load_dataset(zarr_path)
42+
43+
def test_genotypes(self, ds):
44+
call_genotype = ds.call_genotype.values
45+
assert call_genotype.shape == (7, 3, 2)
46+
nt.assert_array_equal(
47+
call_genotype,
48+
[
49+
[[1, 1], [1, 0], [0, 0]],
50+
[[1, 0], [1, 1], [0, 0]],
51+
[[1, 1], [1, 1], [1, 1]],
52+
[[-1, -1], [1, 1], [1, 1]],
53+
[[0, 0], [1, 1], [1, 1]],
54+
[[-1, -1], [-1, -1], [-1, -1]],
55+
[[0, 0], [0, 0], [0, 0]],
56+
],
57+
)
58+
59+
60+
class TestEqualSgkit:
61+
def test_simulated_example(self, tmp_path):
62+
data_path = "tests/data/plink/"
63+
bed_path = data_path + "plink_sim_10s_100v_10pmiss.bed"
64+
fam_path = data_path + "plink_sim_10s_100v_10pmiss.fam"
65+
bim_path = data_path + "plink_sim_10s_100v_10pmiss.bim"
66+
# print(bed_path)
67+
# print(fam_path)
68+
sg_ds = sgkit.io.plink.read_plink(
69+
bed_path=bed_path, fam_path=fam_path, bim_path=bim_path
70+
)
71+
out = tmp_path / "example.plink.zarr"
72+
plink.convert(bed_path, out)
73+
ds = sg.load_dataset(out)
74+
nt.assert_array_equal(ds.call_genotype.values, sg_ds.call_genotype.values)
75+
76+
77+
class TestExample:
78+
"""
79+
.bim file looks like this:
80+
81+
1 1_10 0 10 A G
82+
1 1_20 0 20 T C
83+
84+
Definition: https://www.cog-genomics.org/plink/1.9/formats#bim
85+
Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0'
86+
indicates unknown) or name
87+
Variant identifier
88+
Position in morgans or centimorgans (safe to use dummy value of '0')
89+
Base-pair coordinate (1-based; limited to 231-2)
90+
Allele 1 (corresponding to clear bits in .bed; usually minor)
91+
Allele 2 (corresponding to set bits in .bed; usually major)
92+
"""
93+
94+
@pytest.fixture(scope="class")
95+
def ds(self, tmp_path_factory):
96+
path = "tests/data/plink/example.bed"
97+
out = tmp_path_factory.mktemp("data") / "example.plink.zarr"
98+
plink.convert(path, out)
99+
return sg.load_dataset(out)
100+
101+
def test_sample_ids(self, ds):
102+
nt.assert_array_equal(ds.sample_id, [f"ind{j}" for j in range(10)])
103+
104+
def test_variant_position(self, ds):
105+
nt.assert_array_equal(ds.variant_position, [10, 20])
106+
107+
def test_variant_allele(self, ds):
108+
nt.assert_array_equal(ds.variant_allele, [["A", "G"], ["T", "C"]])
109+
110+
def test_genotypes(self, ds):
111+
call_genotype = ds.call_genotype.values
112+
assert call_genotype.shape == (2, 10, 2)
113+
nt.assert_array_equal(
114+
call_genotype,
115+
[
116+
[
117+
[0, 0],
118+
[0, 0],
119+
[0, 0],
120+
[1, 1],
121+
[1, 1],
122+
[1, 1],
123+
[1, 1],
124+
[1, 1],
125+
[1, 1],
126+
[1, 1],
127+
],
128+
[
129+
[0, 0],
130+
[0, 0],
131+
[0, 0],
132+
[0, 0],
133+
[1, 1],
134+
[1, 1],
135+
[1, 1],
136+
[1, 1],
137+
[1, 1],
138+
[1, 1],
139+
],
140+
],
141+
)
142+
143+
def test_sgkit(self, ds):
144+
path = "tests/data/plink/example"
145+
sg_ds = sgkit.io.plink.read_plink(path=path)
146+
# Can't compare the full dataset yet
147+
nt.assert_array_equal(ds.call_genotype.values, sg_ds.call_genotype.values)
148+
# https://github.com/pystatgen/sgkit/issues/1209
149+
nt.assert_array_equal(ds.variant_allele, sg_ds.variant_allele.astype(str))
150+
nt.assert_array_equal(ds.variant_position, sg_ds.variant_position)
151+
nt.assert_array_equal(ds.sample_id, sg_ds.sample_id)
152+
153+
154+
class TestSimulatedExample:
12155
@pytest.fixture(scope="class")
13156
def ds(self, tmp_path_factory):
14157
path = "tests/data/plink/plink_sim_10s_100v_10pmiss.bed"
15158
out = tmp_path_factory.mktemp("data") / "example.plink.zarr"
16159
plink.convert(path, out)
17160
return sg.load_dataset(out)
18161

19-
@pytest.mark.xfail
20-
# FIXME I'm not sure these are the correct genotypes here, at least
21-
# the test isn't passing and others are
22162
def test_genotypes(self, ds):
23163
# Validate a few randomly selected individual calls
24164
# (spanning all possible states for a call)
@@ -82,13 +222,6 @@ def test_chunk_size(
82222
worker_processes=worker_processes,
83223
)
84224
ds2 = sg.load_dataset(out)
85-
# print()
86-
# print(ds.call_genotype.values[2])
87-
# print(ds2.call_genotype.values[2])
88-
89-
# print(ds2)
90-
# print(ds2.call_genotype.values)
91-
# print(ds.call_genotype.values)
92225
xt.assert_equal(ds, ds2)
93226
# TODO check array chunks
94227

@@ -101,8 +234,6 @@ def test_chunk_size(
101234
(33, 3),
102235
(99, 10),
103236
(3, 10),
104-
# This one doesn't fail as it's the same as defaults
105-
# (100, 10),
106237
],
107238
)
108239
@pytest.mark.parametrize("worker_processes", [0])

tests/test_simulated_data.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import msprime
33
import pysam
44
import sgkit as sg
5+
import numpy as np
56
import numpy.testing as nt
67

78
from bio2zarr import vcf
@@ -41,7 +42,3 @@ def test_ploidy(self, ploidy, tmp_path):
4142
nt.assert_equal(ds.variant_allele[:, 0].values, "A")
4243
nt.assert_equal(ds.variant_allele[:, 1].values, "T")
4344
nt.assert_equal(ds.variant_position, ts.sites_position)
44-
45-
46-
# TODO add a plink equivalant using
47-
# https://fastlmm.github.io/bed-reader/#bed_reader.to_bed

0 commit comments

Comments
 (0)