Skip to content

Commit d655bcf

Browse files
Add description from VCF header to arrays
Closes #125
1 parent 9f7492b commit d655bcf

File tree

3 files changed

+66
-2
lines changed

3 files changed

+66
-2
lines changed

bio2zarr/vcf.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1544,8 +1544,13 @@ def init_array(self, variable):
15441544
object_codec=object_codec,
15451545
dimension_separator=self.dimension_separator,
15461546
)
1547-
# Dimension names are part of the spec in Zarr v3
1548-
a.attrs["_ARRAY_DIMENSIONS"] = variable.dimensions
1547+
a.attrs.update(
1548+
{
1549+
"description": variable.description,
1550+
# Dimension names are part of the spec in Zarr v3
1551+
"_ARRAY_DIMENSIONS": variable.dimensions,
1552+
}
1553+
)
15491554

15501555
def get_array(self, name):
15511556
return self.root["wip_" + name]

tests/test_vcf.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,3 +319,39 @@ def test_check_overlap(regions):
319319
]
320320
with pytest.raises(ValueError, match="Multiple VCFs have the region"):
321321
vcf.check_overlap(partitions)
322+
323+
324+
class TestVcfDescriptions:
325+
@pytest.mark.parametrize(
326+
("field", "description"),
327+
[
328+
("variant_NS", "Number of Samples With Data"),
329+
("variant_AN", "Total number of alleles in called genotypes"),
330+
(
331+
"variant_AC",
332+
"Allele count in genotypes, for each ALT allele, "
333+
"in the same order as listed",
334+
),
335+
("variant_DP", "Total Depth"),
336+
("variant_AF", "Allele Frequency"),
337+
("variant_AA", "Ancestral Allele"),
338+
("variant_DB", "dbSNP membership, build 129"),
339+
("variant_H2", "HapMap2 membership"),
340+
("call_GQ", "Genotype Quality"),
341+
("call_DP", "Read Depth"),
342+
("call_HQ", "Haplotype Quality"),
343+
],
344+
)
345+
def test_fields(self, schema, field, description):
346+
assert schema["columns"][field]["description"] == description
347+
348+
# This information is not in the schema yet,
349+
# https://github.com/sgkit-dev/bio2zarr/issues/123
350+
# @pytest.mark.parametrize(
351+
# ("filt", "description"),
352+
# [
353+
# ("s50","Less than 50% of samples have data"),
354+
# ("q10", "Quality below 10"),
355+
# ])
356+
# def test_filters(self, schema, filt, description):
357+
# assert schema["filters"][filt]["description"] == description

tests/test_vcf_examples.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -400,6 +400,29 @@ def test_vcf_dimensions(self, ds):
400400
assert ds.variant_H2.dims == ("variants",)
401401
assert ds.variant_position.dims == ("variants",)
402402

403+
@pytest.mark.parametrize(
404+
("field", "description"),
405+
[
406+
("variant_NS", "Number of Samples With Data"),
407+
("variant_AN", "Total number of alleles in called genotypes"),
408+
(
409+
"variant_AC",
410+
"Allele count in genotypes, for each ALT allele, "
411+
"in the same order as listed",
412+
),
413+
("variant_DP", "Total Depth"),
414+
("variant_AF", "Allele Frequency"),
415+
("variant_AA", "Ancestral Allele"),
416+
("variant_DB", "dbSNP membership, build 129"),
417+
("variant_H2", "HapMap2 membership"),
418+
("call_GQ", "Genotype Quality"),
419+
("call_DP", "Read Depth"),
420+
("call_HQ", "Haplotype Quality"),
421+
],
422+
)
423+
def test_vcf_field_description(self, ds, field, description):
424+
assert ds[field].attrs["description"] == description
425+
403426

404427
class Test1000G2020Example:
405428
data_path = "tests/data/vcf/1kg_2020_chrM.vcf.gz"

0 commit comments

Comments
 (0)