Skip to content

Commit faf9944

Browse files
committed
Add plink contig info
1 parent f3c040d commit faf9944

File tree

2 files changed

+47
-3
lines changed

2 files changed

+47
-3
lines changed

bio2zarr/plink.py

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import numpy as np
66
import zarr
77

8-
from bio2zarr import constants, schema, writer
8+
from bio2zarr import constants, core, schema, writer
99

1010
logger = logging.getLogger(__name__)
1111

@@ -18,6 +18,9 @@ def __init__(self, path):
1818
self.samples = [schema.Sample(id=sample) for sample in self.bed.iid]
1919
self.num_samples = len(self.samples)
2020
self.root_attrs = {}
21+
self.contigs = [
22+
schema.Contig(id=str(chrom)) for chrom in np.unique(self.bed.chromosome)
23+
]
2124

2225
def iter_alleles(self, start, stop, num_alleles):
2326
ref_field = self.bed.allele_1
@@ -49,6 +52,11 @@ def iter_genotypes(self, shape, start, stop):
4952
gt[values == 2] = [0, 0] # Homozygous REF (0 in PLINK)
5053
yield gt, phased
5154

55+
def iter_contig(self, start, stop):
56+
chrom_to_contig_index = {contig.id: i for i, contig in enumerate(self.contigs)}
57+
for chrom in self.bed.chromosome[start:stop]:
58+
yield chrom_to_contig_index[str(chrom)]
59+
5260

5361
def generate_schema(
5462
bed,
@@ -88,6 +96,15 @@ def generate_schema(
8896
chunks=[variants_chunk_size, 2],
8997
description=None,
9098
),
99+
schema.ZarrArraySpec.new(
100+
vcf_field=None,
101+
name="variant_contig",
102+
dtype=core.min_int_dtype(0, len(np.unique(bed.chromosome))),
103+
shape=[m],
104+
dimensions=["variants"],
105+
chunks=[variants_chunk_size],
106+
description="Contig/chromosome index for each variant",
107+
),
91108
schema.ZarrArraySpec.new(
92109
vcf_field=None,
93110
name="call_genotype_phased",
@@ -155,8 +172,7 @@ def convert(
155172
)
156173
vzw.finalise(show_progress)
157174

158-
# TODO - index code needs variant_contig
159-
# vzw.create_index()
175+
vzw.create_index()
160176

161177

162178
# FIXME do this more efficiently - currently reading the whole thing

tests/test_plink.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,16 @@ def test_variant_position(self, ds):
106106
def test_variant_allele(self, ds):
107107
nt.assert_array_equal(ds.variant_allele, [["A", "G"], ["T", "C"]])
108108

109+
def test_contig_id(self, ds):
110+
"""Test that contig identifiers are correctly extracted and stored."""
111+
nt.assert_array_equal(ds.contig_id, ["1"])
112+
113+
def test_variant_contig(self, ds):
114+
"""Test that variant to contig mapping is correctly stored."""
115+
nt.assert_array_equal(
116+
ds.variant_contig, [0, 0]
117+
) # Both variants on chromosome 1
118+
109119
def test_genotypes(self, ds):
110120
call_genotype = ds.call_genotype.values
111121
assert call_genotype.shape == (2, 10, 2)
@@ -196,6 +206,24 @@ def test_genotypes(self, ds):
196206
# FIXME not working
197207
nt.assert_array_equal(actual, expected)
198208

209+
def test_contig_arrays(self, ds):
210+
"""Test that contig arrays are correctly created and filled."""
211+
# Verify contig_id exists and is a string array
212+
assert hasattr(ds, "contig_id")
213+
assert ds.contig_id.dtype == np.dtype("O")
214+
215+
# Verify variant_contig exists and contains integer indices
216+
assert hasattr(ds, "variant_contig")
217+
assert ds.variant_contig.dtype == np.dtype("int8")
218+
assert ds.variant_contig.shape[0] == 100 # 100 variants
219+
220+
# Verify mapping between variant_contig and contig_id
221+
# For each unique contig index, check at least one corresponding variant exists
222+
for i, contig in enumerate(ds.contig_id.values):
223+
assert np.any(
224+
ds.variant_contig.values == i
225+
), f"No variants found for contig {contig}"
226+
199227
# @pytest.mark.xfail
200228
@pytest.mark.parametrize(
201229
("variants_chunk_size", "samples_chunk_size"),

0 commit comments

Comments
 (0)