Skip to content

Commit f6a0434

Browse files
ACEnglish authored and jeromekelleher committed
Packaging and field sorting
Adds the `packaging` dependency and allows header INFO fields to appear in different orders across multiple VCFs.
1 parent d851fc3 commit f6a0434

File tree

8 files changed

+69
-3
lines changed

8 files changed

+69
-3
lines changed

bio2zarr/vcf2zarr/icf.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def fromdict(d):
4141
return VcfFieldSummary(**d)
4242

4343

44-
@dataclasses.dataclass
44+
@dataclasses.dataclass(order=True)
4545
class VcfField:
4646
category: str
4747
name: str
@@ -192,6 +192,16 @@ def fromdict(d):
192192
d["contigs"] = [Contig(**cd) for cd in d["contigs"]]
193193
return IcfMetadata(**d)
194194

195+
def __eq__(self, other):
196+
if not isinstance(other, IcfMetadata):
197+
return NotImplemented
198+
return (
199+
self.samples == other.samples
200+
and self.contigs == other.contigs
201+
and self.filters == other.filters
202+
and sorted(self.fields) == sorted(other.fields)
203+
)
204+
195205

196206
def fixed_vcf_field_definitions():
197207
def make_field_def(name, vcf_type, vcf_number):

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ dependencies = [
2323
# colouredlogs pulls in humanfriendly",
2424
"cyvcf2",
2525
"bed_reader",
26+
"packaging",
2627
]
2728
requires-python = ">=3.9"
2829
classifiers = [
563 Bytes
Binary file not shown.
96 Bytes
Binary file not shown.
553 Bytes
Binary file not shown.
97 Bytes
Binary file not shown.

tests/test_core.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -237,8 +237,8 @@ def test_examples(self, chunk_size, size, start, stop):
237237
# It works in CI on Linux, but it'll probably break at some point.
238238
# It's also necessary to update these numbers each time a new data
239239
# file gets added
240-
("tests/data", 4976329),
241-
("tests/data/vcf", 4964192),
240+
("tests/data", 4981734),
241+
("tests/data/vcf", 4969597),
242242
("tests/data/vcf/sample.vcf.gz", 1089),
243243
],
244244
)

tests/test_vcf_examples.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1100,3 +1100,58 @@ def test_missing_filter(tmp_path):
11001100
zarr_path = tmp_path / "zarr"
11011101
with pytest.raises(ValueError, match="Filter 'q10' was not defined in the header"):
11021102
vcf2zarr.convert([path], zarr_path)
1103+
1104+
1105+
class TestOutOfOrderFields:
1106+
# Mixing on purpose
1107+
data_path1 = "tests/data/vcf/out_of_order_fields/input2.bcf"
1108+
data_path2 = "tests/data/vcf/out_of_order_fields/input1.bcf"
1109+
1110+
@pytest.fixture(scope="class")
1111+
def ds(self, tmp_path_factory):
1112+
out = tmp_path_factory.mktemp("data") / "ooo_example.vcf.zarr"
1113+
vcf2zarr.convert([self.data_path1, self.data_path2], out)
1114+
return sg.load_dataset(out)
1115+
1116+
def test_filters(self, ds):
1117+
nt.assert_array_equal(ds["filter_id"], ["PASS", "FAIL"])
1118+
nt.assert_array_equal(
1119+
ds["variant_filter"],
1120+
[
1121+
[True, False],
1122+
[False, True],
1123+
[True, False],
1124+
],
1125+
)
1126+
1127+
def test_source(self, ds):
1128+
assert ds.attrs["source"] == f"bio2zarr-{provenance.__version__}"
1129+
1130+
def test_contigs(self, ds):
1131+
nt.assert_array_equal(ds["contig_id"], ["chr20", "chr21"])
1132+
nt.assert_array_equal(ds["contig_length"], [64444167.0, 46709983.0])
1133+
nt.assert_array_equal(ds["variant_contig"], [0, 1, 1])
1134+
1135+
def test_position(self, ds):
1136+
nt.assert_array_equal(ds["variant_position"], [63971, 64506, 64507])
1137+
1138+
def test_length(self, ds):
1139+
nt.assert_array_equal(ds["variant_length"], [11, 1, 1])
1140+
1141+
def test_info_fields(self, ds):
1142+
nt.assert_array_equal(
1143+
ds["variant_QNAME"],
1144+
["cluster19_000000F", ".", "cluster19_000000F"],
1145+
)
1146+
nt.assert_array_equal(ds["variant_QSTART"], [25698928, 25698928, -1])
1147+
1148+
def test_allele(self, ds):
1149+
nt.assert_array_equal(
1150+
ds["variant_allele"].values.tolist(),
1151+
[["TTCCATTCCAC", "T"], ["C", "CTCCAT"], ["G", "A"]],
1152+
)
1153+
assert ds["variant_allele"].dtype == "O"
1154+
1155+
def test_call_DPs(self, ds):
1156+
nt.assert_array_equal(ds["call_DP"], [[5], [-1], [5]])
1157+
nt.assert_array_equal(ds["call_DP2"], [[1], [1], [-1]])

0 commit comments

Comments
 (0)