@@ -106,6 +106,16 @@ def test_variant_position(self, ds):
106
106
def test_variant_allele (self , ds ):
107
107
nt .assert_array_equal (ds .variant_allele , [["A" , "G" ], ["T" , "C" ]])
108
108
109
+ def test_contig_id (self , ds ):
110
+ """Test that contig identifiers are correctly extracted and stored."""
111
+ nt .assert_array_equal (ds .contig_id , ["1" ])
112
+
113
+ def test_variant_contig (self , ds ):
114
+ """Test that variant to contig mapping is correctly stored."""
115
+ nt .assert_array_equal (
116
+ ds .variant_contig , [0 , 0 ]
117
+ ) # Both variants on chromosome 1
118
+
109
119
def test_genotypes (self , ds ):
110
120
call_genotype = ds .call_genotype .values
111
121
assert call_genotype .shape == (2 , 10 , 2 )
@@ -196,6 +206,24 @@ def test_genotypes(self, ds):
196
206
# FIXME not working
197
207
nt .assert_array_equal (actual , expected )
198
208
209
+ def test_contig_arrays (self , ds ):
210
+ """Test that contig arrays are correctly created and filled."""
211
+ # Verify contig_id exists and is a string array
212
+ assert hasattr (ds , "contig_id" )
213
+ assert ds .contig_id .dtype == np .dtype ("O" )
214
+
215
+ # Verify variant_contig exists and contains integer indices
216
+ assert hasattr (ds , "variant_contig" )
217
+ assert ds .variant_contig .dtype == np .dtype ("int8" )
218
+ assert ds .variant_contig .shape [0 ] == 100 # 100 variants
219
+
220
+ # Verify mapping between variant_contig and contig_id
221
+ # For each unique contig index, check at least one corresponding variant exists
222
+ for i , contig in enumerate (ds .contig_id .values ):
223
+ assert np .any (
224
+ ds .variant_contig .values == i
225
+ ), f"No variants found for contig { contig } "
226
+
199
227
# @pytest.mark.xfail
200
228
@pytest .mark .parametrize (
201
229
("variants_chunk_size" , "samples_chunk_size" ),
@@ -249,3 +277,81 @@ def test_by_validating(
249
277
worker_processes = worker_processes ,
250
278
)
251
279
plink .validate (path , out )
280
+
281
+
282
+ class TestMultipleContigs :
283
+ """Test handling of multiple contigs in PLINK files."""
284
+
285
+ @pytest .fixture (scope = "class" )
286
+ def multi_contig_bed_path (self , tmp_path_factory ):
287
+ tmp_path = tmp_path_factory .mktemp ("data" )
288
+ path = tmp_path / "multi_contig.bed"
289
+ # 6 sites x 4 samples
290
+ # - 2 variants on chromosome 1
291
+ # - 1 variant on chromosome 2
292
+ # - 2 variants on chromosome X
293
+ # - 1 variant on chromosome Y
294
+ # values are counts of allele 1, will be flipped by the converter
295
+ dosages = np .array (
296
+ [
297
+ [0 , 1 , 2 , 0 ], # chr1
298
+ [1 , 0 , 2 , 1 ], # chr1
299
+ [0 , 0 , 0 , 2 ], # chr2
300
+ [2 , 0 , 1 , 0 ], # chrX
301
+ [1 , 1 , 1 , 1 ], # chrX
302
+ [0 , 2 , 0 , 2 ], # chrY
303
+ ],
304
+ dtype = np .int8 ,
305
+ )
306
+ bed_reader .to_bed (path , dosages .T )
307
+
308
+ bim_path = path .with_suffix (".bim" )
309
+ with open (bim_path , "w" ) as f :
310
+ # Format: chr, variant_id, genetic_dist, pos, allele1, allele2
311
+ f .write ("1\t 1_10\t 0\t 10\t A\t G\n " )
312
+ f .write ("1\t 1_20\t 0\t 20\t T\t C\n " )
313
+ f .write ("2\t 2_10\t 0\t 10\t A\t G\n " )
314
+ f .write ("X\t X_10\t 0\t 10\t C\t T\n " )
315
+ f .write ("X\t X_20\t 0\t 20\t G\t A\n " )
316
+ f .write ("Y\t Y_10\t 0\t 10\t T\t C\n " )
317
+
318
+ fam_path = path .with_suffix (".fam" )
319
+ with open (fam_path , "w" ) as f :
320
+ for i in range (4 ):
321
+ # Format: fam_id, ind_id, pat_id, mat_id, sex, phenotype
322
+ f .write (f"fam{ i } ind{ i } 0 0 0 -9\n " )
323
+
324
+ return path
325
+
326
+ @pytest .fixture (scope = "class" )
327
+ def ds (self , tmp_path_factory , multi_contig_bed_path ):
328
+ tmp_path = tmp_path_factory .mktemp ("data" )
329
+ zarr_path = tmp_path / "multi_contig.plink.zarr"
330
+ plink .convert (multi_contig_bed_path , zarr_path )
331
+ return sg .load_dataset (zarr_path )
332
+
333
+ def test_contig_ids (self , ds ):
334
+ nt .assert_array_equal (ds .contig_id , ["1" , "2" , "X" , "Y" ])
335
+
336
+ def test_variant_contig (self , ds ):
337
+ nt .assert_array_equal (
338
+ ds .variant_contig , [0 , 0 , 1 , 2 , 2 , 3 ]
339
+ ) # Each variant mapped to its correct contig index
340
+
341
+ def test_genotypes (self , ds ):
342
+ call_genotype = ds .call_genotype .values
343
+ assert call_genotype .shape == (6 , 4 , 2 )
344
+ nt .assert_array_equal (
345
+ call_genotype ,
346
+ [
347
+ [[1 , 1 ], [1 , 0 ], [0 , 0 ], [1 , 1 ]], # chr1
348
+ [[1 , 0 ], [1 , 1 ], [0 , 0 ], [1 , 0 ]], # chr1
349
+ [[1 , 1 ], [1 , 1 ], [1 , 1 ], [0 , 0 ]], # chr2
350
+ [[0 , 0 ], [1 , 1 ], [1 , 0 ], [1 , 1 ]], # chrX
351
+ [[1 , 0 ], [1 , 0 ], [1 , 0 ], [1 , 0 ]], # chrX
352
+ [[1 , 1 ], [0 , 0 ], [1 , 1 ], [0 , 0 ]], # chrY
353
+ ],
354
+ )
355
+
356
+ def test_variant_position (self , ds ):
357
+ nt .assert_array_equal (ds .variant_position , [10 , 20 , 10 , 10 , 20 , 10 ])
0 commit comments