Skip to content

Commit fd3f53d

Browse files
committed
Add plink contig info
1 parent aedf90c commit fd3f53d

File tree

2 files changed

+48
-4
lines changed

2 files changed

+48
-4
lines changed

bio2zarr/plink.py

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import numpy as np
66
import zarr
77

8-
from bio2zarr import constants, vcz
8+
from bio2zarr import constants, core, vcz
99

1010
logger = logging.getLogger(__name__)
1111

@@ -27,6 +27,10 @@ def num_records(self):
2727
def samples(self):
2828
return [vcz.Sample(id=sample) for sample in self.bed.iid]
2929

30+
@property
31+
def contigs(self):
32+
return [vcz.Contig(id=str(chrom)) for chrom in np.unique(self.bed.chromosome)]
33+
3034
@property
3135
def num_samples(self):
3236
return len(self.samples)
@@ -44,6 +48,11 @@ def iter_alleles(self, start, stop, num_alleles):
4448
alleles[1 : 1 + len(alt)] = alt
4549
yield alleles
4650

51+
def iter_contig(self, start, stop):
52+
chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
53+
for chrom in self.bed.chromosome[start:stop]:
54+
yield chrom_to_contig_index[str(chrom)]
55+
4756
def iter_field(self, field_name, shape, start, stop):
4857
assert field_name == "position" # Only position field is supported from plink
4958
yield from self.bed.bp_position[start:stop]
@@ -100,6 +109,15 @@ def generate_schema(
100109
chunks=[schema_instance.variants_chunk_size, 2],
101110
description=None,
102111
),
112+
vcz.ZarrArraySpec.new(
113+
vcf_field=None,
114+
name="variant_contig",
115+
dtype=core.min_int_dtype(0, len(np.unique(self.bed.chromosome))),
116+
shape=[m],
117+
dimensions=["variants"],
118+
chunks=[schema_instance.variants_chunk_size],
119+
description="Contig/chromosome index for each variant",
120+
),
103121
vcz.ZarrArraySpec.new(
104122
vcf_field=None,
105123
name="call_genotype_phased",
@@ -171,9 +189,7 @@ def convert(
171189
show_progress=show_progress,
172190
)
173191
vzw.finalise(show_progress)
174-
175-
# TODO - index code needs variant_contig
176-
# vzw.create_index()
192+
vzw.create_index()
177193

178194

179195
# FIXME do this more efficiently - currently reading the whole thing

tests/test_plink.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,16 @@ def test_variant_position(self, ds):
106106
def test_variant_allele(self, ds):
107107
nt.assert_array_equal(ds.variant_allele, [["A", "G"], ["T", "C"]])
108108

109+
def test_contig_id(self, ds):
110+
"""Test that contig identifiers are correctly extracted and stored."""
111+
nt.assert_array_equal(ds.contig_id, ["1"])
112+
113+
def test_variant_contig(self, ds):
114+
"""Test that variant to contig mapping is correctly stored."""
115+
nt.assert_array_equal(
116+
ds.variant_contig, [0, 0]
117+
) # Both variants on chromosome 1
118+
109119
def test_genotypes(self, ds):
110120
call_genotype = ds.call_genotype.values
111121
assert call_genotype.shape == (2, 10, 2)
@@ -196,6 +206,24 @@ def test_genotypes(self, ds):
196206
# FIXME not working
197207
nt.assert_array_equal(actual, expected)
198208

209+
def test_contig_arrays(self, ds):
210+
"""Test that contig arrays are correctly created and filled."""
211+
# Verify contig_id exists and is a string array
212+
assert hasattr(ds, "contig_id")
213+
assert ds.contig_id.dtype == np.dtype("O")
214+
215+
# Verify variant_contig exists and contains integer indices
216+
assert hasattr(ds, "variant_contig")
217+
assert ds.variant_contig.dtype == np.dtype("int8")
218+
assert ds.variant_contig.shape[0] == 100 # 100 variants
219+
220+
# Verify mapping between variant_contig and contig_id
221+
# For each unique contig index, check at least one corresponding variant exists
222+
for i, contig in enumerate(ds.contig_id.values):
223+
assert np.any(
224+
ds.variant_contig.values == i
225+
), f"No variants found for contig {contig}"
226+
199227
# @pytest.mark.xfail
200228
@pytest.mark.parametrize(
201229
("variants_chunk_size", "samples_chunk_size"),

0 commit comments

Comments
 (0)