Skip to content

Commit b32d656

Browse files
benjefferyjeromekelleher
authored andcommitted
Add plink contig info
1 parent 3d8ad3e commit b32d656

File tree

3 files changed

+125
-4
lines changed

3 files changed

+125
-4
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# 0.1.6 2025-0X-XX
22

3+
- Add contigs to plink output (#344)
4+
35
Breaking changes
46

57
- Remove explicit sample, contig and filter lists from the schema.

bio2zarr/plink.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import numpy as np
66
import zarr
77

8-
from bio2zarr import constants, vcz
8+
from bio2zarr import constants, core, vcz
99

1010
logger = logging.getLogger(__name__)
1111

@@ -27,10 +27,19 @@ def num_records(self):
2727
def samples(self):
2828
return [vcz.Sample(id=sample) for sample in self.bed.iid]
2929

30+
@property
31+
def contigs(self):
32+
return [vcz.Contig(id=str(chrom)) for chrom in np.unique(self.bed.chromosome)]
33+
3034
@property
3135
def num_samples(self):
3236
return len(self.samples)
3337

38+
def iter_contig(self, start, stop):
39+
chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
40+
for chrom in self.bed.chromosome[start:stop]:
41+
yield chrom_to_contig_index[str(chrom)]
42+
3443
def iter_field(self, field_name, shape, start, stop):
3544
assert field_name == "position" # Only position field is supported from plink
3645
yield from self.bed.bp_position[start:stop]
@@ -101,6 +110,12 @@ def generate_schema(
101110
dimensions=["variants", "alleles"],
102111
description=None,
103112
),
113+
vcz.ZarrArraySpec(
114+
name="variant_contig",
115+
dtype=core.min_int_dtype(0, len(np.unique(self.bed.chromosome))),
116+
dimensions=["variants"],
117+
description="Contig/chromosome index for each variant",
118+
),
104119
vcz.ZarrArraySpec(
105120
name="call_genotype_phased",
106121
dtype="bool",
@@ -155,9 +170,7 @@ def convert(
155170
show_progress=show_progress,
156171
)
157172
vzw.finalise(show_progress)
158-
159-
# TODO - index code needs variant_contig
160-
# vzw.create_index()
173+
vzw.create_index()
161174

162175

163176
# FIXME do this more efficiently - currently reading the whole thing

tests/test_plink.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,16 @@ def test_variant_position(self, ds):
106106
def test_variant_allele(self, ds):
107107
nt.assert_array_equal(ds.variant_allele, [["A", "G"], ["T", "C"]])
108108

109+
def test_contig_id(self, ds):
110+
"""Test that contig identifiers are correctly extracted and stored."""
111+
nt.assert_array_equal(ds.contig_id, ["1"])
112+
113+
def test_variant_contig(self, ds):
114+
"""Test that variant to contig mapping is correctly stored."""
115+
nt.assert_array_equal(
116+
ds.variant_contig, [0, 0]
117+
) # Both variants on chromosome 1
118+
109119
def test_genotypes(self, ds):
110120
call_genotype = ds.call_genotype.values
111121
assert call_genotype.shape == (2, 10, 2)
@@ -196,6 +206,24 @@ def test_genotypes(self, ds):
196206
# FIXME not working
197207
nt.assert_array_equal(actual, expected)
198208

209+
def test_contig_arrays(self, ds):
210+
"""Test that contig arrays are correctly created and filled."""
211+
# Verify contig_id exists and is a string array
212+
assert hasattr(ds, "contig_id")
213+
assert ds.contig_id.dtype == np.dtype("O")
214+
215+
# Verify variant_contig exists and contains integer indices
216+
assert hasattr(ds, "variant_contig")
217+
assert ds.variant_contig.dtype == np.dtype("int8")
218+
assert ds.variant_contig.shape[0] == 100 # 100 variants
219+
220+
# Verify mapping between variant_contig and contig_id
221+
# For each unique contig index, check at least one corresponding variant exists
222+
for i, contig in enumerate(ds.contig_id.values):
223+
assert np.any(
224+
ds.variant_contig.values == i
225+
), f"No variants found for contig {contig}"
226+
199227
# @pytest.mark.xfail
200228
@pytest.mark.parametrize(
201229
("variants_chunk_size", "samples_chunk_size"),
@@ -249,3 +277,81 @@ def test_by_validating(
249277
worker_processes=worker_processes,
250278
)
251279
plink.validate(path, out)
280+
281+
282+
class TestMultipleContigs:
283+
"""Test handling of multiple contigs in PLINK files."""
284+
285+
@pytest.fixture(scope="class")
286+
def multi_contig_bed_path(self, tmp_path_factory):
287+
tmp_path = tmp_path_factory.mktemp("data")
288+
path = tmp_path / "multi_contig.bed"
289+
# 6 sites x 4 samples
290+
# - 2 variants on chromosome 1
291+
# - 1 variant on chromosome 2
292+
# - 2 variants on chromosome X
293+
# - 1 variant on chromosome Y
294+
# values are counts of allele 1, will be flipped by the converter
295+
dosages = np.array(
296+
[
297+
[0, 1, 2, 0], # chr1
298+
[1, 0, 2, 1], # chr1
299+
[0, 0, 0, 2], # chr2
300+
[2, 0, 1, 0], # chrX
301+
[1, 1, 1, 1], # chrX
302+
[0, 2, 0, 2], # chrY
303+
],
304+
dtype=np.int8,
305+
)
306+
bed_reader.to_bed(path, dosages.T)
307+
308+
bim_path = path.with_suffix(".bim")
309+
with open(bim_path, "w") as f:
310+
# Format: chr, variant_id, genetic_dist, pos, allele1, allele2
311+
f.write("1\t1_10\t0\t10\tA\tG\n")
312+
f.write("1\t1_20\t0\t20\tT\tC\n")
313+
f.write("2\t2_10\t0\t10\tA\tG\n")
314+
f.write("X\tX_10\t0\t10\tC\tT\n")
315+
f.write("X\tX_20\t0\t20\tG\tA\n")
316+
f.write("Y\tY_10\t0\t10\tT\tC\n")
317+
318+
fam_path = path.with_suffix(".fam")
319+
with open(fam_path, "w") as f:
320+
for i in range(4):
321+
# Format: fam_id, ind_id, pat_id, mat_id, sex, phenotype
322+
f.write(f"fam{i} ind{i} 0 0 0 -9\n")
323+
324+
return path
325+
326+
@pytest.fixture(scope="class")
327+
def ds(self, tmp_path_factory, multi_contig_bed_path):
328+
tmp_path = tmp_path_factory.mktemp("data")
329+
zarr_path = tmp_path / "multi_contig.plink.zarr"
330+
plink.convert(multi_contig_bed_path, zarr_path)
331+
return sg.load_dataset(zarr_path)
332+
333+
def test_contig_ids(self, ds):
334+
nt.assert_array_equal(ds.contig_id, ["1", "2", "X", "Y"])
335+
336+
def test_variant_contig(self, ds):
337+
nt.assert_array_equal(
338+
ds.variant_contig, [0, 0, 1, 2, 2, 3]
339+
) # Each variant mapped to its correct contig index
340+
341+
def test_genotypes(self, ds):
342+
call_genotype = ds.call_genotype.values
343+
assert call_genotype.shape == (6, 4, 2)
344+
nt.assert_array_equal(
345+
call_genotype,
346+
[
347+
[[1, 1], [1, 0], [0, 0], [1, 1]], # chr1
348+
[[1, 0], [1, 1], [0, 0], [1, 0]], # chr1
349+
[[1, 1], [1, 1], [1, 1], [0, 0]], # chr2
350+
[[0, 0], [1, 1], [1, 0], [1, 1]], # chrX
351+
[[1, 0], [1, 0], [1, 0], [1, 0]], # chrX
352+
[[1, 1], [0, 0], [1, 1], [0, 0]], # chrY
353+
],
354+
)
355+
356+
def test_variant_position(self, ds):
357+
nt.assert_array_equal(ds.variant_position, [10, 20, 10, 10, 20, 10])

0 commit comments

Comments
 (0)