3
3
import xarray .testing as xt
4
4
import pytest
5
5
import sgkit as sg
6
+ import sgkit .io .plink
7
+ import bed_reader
6
8
import zarr
7
9
8
10
from bio2zarr import plink
9
11
10
12
11
13
class TestSmallExample :
14
+ @pytest .fixture (scope = "class" )
15
+ def bed_path (self , tmp_path_factory ):
16
+ tmp_path = tmp_path_factory .mktemp ("data" )
17
+ path = tmp_path / "example.bed"
18
+ # 7 sites x 3 samples
19
+ # These are counts of allele 1, so we have to flip them
20
+ # to get the values we expect
21
+ dosages = np .array (
22
+ [
23
+ [0 , 1 , 2 ],
24
+ [1 , 0 , 2 ],
25
+ [0 , 0 , 0 ],
26
+ [- 127 , 0 , 0 ],
27
+ [2 , 0 , 0 ],
28
+ [- 127 , - 127 , - 127 ],
29
+ [2 , 2 , 2 ],
30
+ ],
31
+ dtype = np .int8 ,
32
+ )
33
+ bed_reader .to_bed (path , dosages .T )
34
+ return path
35
+
36
+ @pytest .fixture (scope = "class" )
37
+ def ds (self , tmp_path_factory , bed_path ):
38
+ tmp_path = tmp_path_factory .mktemp ("data" )
39
+ zarr_path = tmp_path / "example.plink.zarr"
40
+ plink .convert (bed_path , zarr_path )
41
+ return sg .load_dataset (zarr_path )
42
+
43
+ def test_genotypes (self , ds ):
44
+ call_genotype = ds .call_genotype .values
45
+ assert call_genotype .shape == (7 , 3 , 2 )
46
+ nt .assert_array_equal (
47
+ call_genotype ,
48
+ [
49
+ [[1 , 1 ], [1 , 0 ], [0 , 0 ]],
50
+ [[1 , 0 ], [1 , 1 ], [0 , 0 ]],
51
+ [[1 , 1 ], [1 , 1 ], [1 , 1 ]],
52
+ [[- 1 , - 1 ], [1 , 1 ], [1 , 1 ]],
53
+ [[0 , 0 ], [1 , 1 ], [1 , 1 ]],
54
+ [[- 1 , - 1 ], [- 1 , - 1 ], [- 1 , - 1 ]],
55
+ [[0 , 0 ], [0 , 0 ], [0 , 0 ]],
56
+ ],
57
+ )
58
+
59
+
60
+ class TestEqualSgkit :
61
+ def test_simulated_example (self , tmp_path ):
62
+ data_path = "tests/data/plink/"
63
+ bed_path = data_path + "plink_sim_10s_100v_10pmiss.bed"
64
+ fam_path = data_path + "plink_sim_10s_100v_10pmiss.fam"
65
+ bim_path = data_path + "plink_sim_10s_100v_10pmiss.bim"
66
+ # print(bed_path)
67
+ # print(fam_path)
68
+ sg_ds = sgkit .io .plink .read_plink (
69
+ bed_path = bed_path , fam_path = fam_path , bim_path = bim_path
70
+ )
71
+ out = tmp_path / "example.plink.zarr"
72
+ plink .convert (bed_path , out )
73
+ ds = sg .load_dataset (out )
74
+ nt .assert_array_equal (ds .call_genotype .values , sg_ds .call_genotype .values )
75
+
76
+
77
+ class TestExample :
78
+ """
79
+ .bim file looks like this:
80
+
81
+ 1 1_10 0 10 A G
82
+ 1 1_20 0 20 T C
83
+
84
+ Definition: https://www.cog-genomics.org/plink/1.9/formats#bim
85
+ Chromosome code (either an integer, or 'X'/'Y'/'XY'/'MT'; '0'
86
+ indicates unknown) or name
87
+ Variant identifier
88
+ Position in morgans or centimorgans (safe to use dummy value of '0')
89
+ Base-pair coordinate (1-based; limited to 231-2)
90
+ Allele 1 (corresponding to clear bits in .bed; usually minor)
91
+ Allele 2 (corresponding to set bits in .bed; usually major)
92
+ """
93
+
94
+ @pytest .fixture (scope = "class" )
95
+ def ds (self , tmp_path_factory ):
96
+ path = "tests/data/plink/example.bed"
97
+ out = tmp_path_factory .mktemp ("data" ) / "example.plink.zarr"
98
+ plink .convert (path , out )
99
+ return sg .load_dataset (out )
100
+
101
+ def test_sample_ids (self , ds ):
102
+ nt .assert_array_equal (ds .sample_id , [f"ind{ j } " for j in range (10 )])
103
+
104
+ def test_variant_position (self , ds ):
105
+ nt .assert_array_equal (ds .variant_position , [10 , 20 ])
106
+
107
+ def test_variant_allele (self , ds ):
108
+ nt .assert_array_equal (ds .variant_allele , [["A" , "G" ], ["T" , "C" ]])
109
+
110
+ def test_genotypes (self , ds ):
111
+ call_genotype = ds .call_genotype .values
112
+ assert call_genotype .shape == (2 , 10 , 2 )
113
+ nt .assert_array_equal (
114
+ call_genotype ,
115
+ [
116
+ [
117
+ [0 , 0 ],
118
+ [0 , 0 ],
119
+ [0 , 0 ],
120
+ [1 , 1 ],
121
+ [1 , 1 ],
122
+ [1 , 1 ],
123
+ [1 , 1 ],
124
+ [1 , 1 ],
125
+ [1 , 1 ],
126
+ [1 , 1 ],
127
+ ],
128
+ [
129
+ [0 , 0 ],
130
+ [0 , 0 ],
131
+ [0 , 0 ],
132
+ [0 , 0 ],
133
+ [1 , 1 ],
134
+ [1 , 1 ],
135
+ [1 , 1 ],
136
+ [1 , 1 ],
137
+ [1 , 1 ],
138
+ [1 , 1 ],
139
+ ],
140
+ ],
141
+ )
142
+
143
+ def test_sgkit (self , ds ):
144
+ path = "tests/data/plink/example"
145
+ sg_ds = sgkit .io .plink .read_plink (path = path )
146
+ # Can't compare the full dataset yet
147
+ nt .assert_array_equal (ds .call_genotype .values , sg_ds .call_genotype .values )
148
+ # https://github.com/pystatgen/sgkit/issues/1209
149
+ nt .assert_array_equal (ds .variant_allele , sg_ds .variant_allele .astype (str ))
150
+ nt .assert_array_equal (ds .variant_position , sg_ds .variant_position )
151
+ nt .assert_array_equal (ds .sample_id , sg_ds .sample_id )
152
+
153
+
154
+ class TestSimulatedExample :
12
155
@pytest .fixture (scope = "class" )
13
156
def ds (self , tmp_path_factory ):
14
157
path = "tests/data/plink/plink_sim_10s_100v_10pmiss.bed"
15
158
out = tmp_path_factory .mktemp ("data" ) / "example.plink.zarr"
16
159
plink .convert (path , out )
17
160
return sg .load_dataset (out )
18
161
19
- @pytest .mark .xfail
20
- # FIXME I'm not sure these are the correct genotypes here, at least
21
- # the test isn't passing and others are
22
162
def test_genotypes (self , ds ):
23
163
# Validate a few randomly selected individual calls
24
164
# (spanning all possible states for a call)
@@ -82,13 +222,6 @@ def test_chunk_size(
82
222
worker_processes = worker_processes ,
83
223
)
84
224
ds2 = sg .load_dataset (out )
85
- # print()
86
- # print(ds.call_genotype.values[2])
87
- # print(ds2.call_genotype.values[2])
88
-
89
- # print(ds2)
90
- # print(ds2.call_genotype.values)
91
- # print(ds.call_genotype.values)
92
225
xt .assert_equal (ds , ds2 )
93
226
# TODO check array chunks
94
227
@@ -101,8 +234,6 @@ def test_chunk_size(
101
234
(33 , 3 ),
102
235
(99 , 10 ),
103
236
(3 , 10 ),
104
- # This one doesn't fail as it's the same as defaults
105
- # (100, 10),
106
237
],
107
238
)
108
239
@pytest .mark .parametrize ("worker_processes" , [0 ])
0 commit comments