sgkit-dev · tomwhite · Apr 15, 2025 · Apr 16, 2025 · Apr 15, 2025 · Apr 16, 2025
diff --git a/tests/test_stats.py b/tests/test_stats.py
@@ -3,7 +3,7 @@
 
 import pytest
 import zarr
-from bio2zarr import vcf2zarr
+from bio2zarr import vcf
 
 from vcztools.stats import nrecords, stats
 
@@ -39,7 +39,7 @@ def test_stats__no_index(tmp_path):
     original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz"
     # don't use cache here since we want to make sure vcz is not indexed
     vcz = tmp_path.joinpath("intermediate.vcz")
-    vcf2zarr.convert([original], vcz, worker_processes=0, local_alleles=False)
+    vcf.convert([original], vcz, worker_processes=0, local_alleles=False)
 
     # delete the index created by vcf2zarr
     root = zarr.open(vcz, mode="a")

diff --git a/tests/test_vcf_writer.py b/tests/test_vcf_writer.py
@@ -6,7 +6,6 @@
 import numpy as np
 import pytest
 import zarr
-from bio2zarr import vcf2zarr
 from cyvcf2 import VCF
 from numpy.testing import assert_array_equal
 
@@ -301,15 +300,9 @@ def test_write_vcf__header_flags(tmp_path):
     assert_vcfs_close(original, output)
 
 
-def test_write_vcf__generate_header(tmp_path):
+def test_write_vcf__generate_header():
     original = pathlib.Path("tests/data/vcf") / "sample.vcf.gz"
-    # don't use cache here since we mutate the vcz
-    vcz = tmp_path.joinpath("intermediate.vcz")
-    vcf2zarr.convert([original], vcz, worker_processes=0, local_alleles=False)
-
-    # remove vcf_header
-    root = zarr.open(vcz, mode="r+")
-    del root.attrs["vcf_header"]
+    vcz = vcz_path_cache(original)
 
     output_header = StringIO()
     write_vcf(vcz, output_header, header_only=True, no_version=True)
@@ -324,20 +317,26 @@ def test_write_vcf__generate_header(tmp_path):
 ##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
 ##INFO=<ID=H2,Number=0,Type=Flag,Description="HapMap2 membership">
 ##INFO=<ID=NS,Number=1,Type=Integer,Description="Number of Samples With Data">
-##FILTER=<ID=PASS,Description="">
-##FILTER=<ID=s50,Description="">
-##FILTER=<ID=q10,Description="">
+##FILTER=<ID=PASS,Description="All filters passed">
+##FILTER=<ID=s50,Description="Less than 50% of samples have data">
+##FILTER=<ID=q10,Description="Quality below 10">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
 ##FORMAT=<ID=HQ,Number=2,Type=Integer,Description="Haplotype Quality">
 ##contig=<ID=19>
 ##contig=<ID=20>
 ##contig=<ID=X>
+##fileDate=20090805
+##reference=1000GenomesPilot-NCBI36
+##phasing=partial
+##ALT=<ID=DEL:ME:ALU,Description="Deletion of ALU element">
+##ALT=<ID=CNV,Description="Copy number variable region">
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NA00001	NA00002	NA00003
 """  # noqa: E501
 
     # substitute value of source
+    root = zarr.open(vcz, mode="r+")
     expected_vcf_header = expected_vcf_header.format(root.attrs["source"])
 
     assert output_header.getvalue() == expected_vcf_header

diff --git a/tests/utils.py b/tests/utils.py
@@ -5,7 +5,7 @@
 
 import cyvcf2
 import numpy as np
-from bio2zarr import vcf2zarr
+from bio2zarr import vcf
 
 
 @contextmanager
@@ -29,6 +29,64 @@ def normalise_info_missingness(info_dict, key):
     return value
 
 
+def _get_header_field_dicts(vcf, header_type):
+    def to_dict(header_field):
+        d = header_field.info(extra=True)
+        del d[b"IDX"]  # remove IDX since we don't care about ordering
+
+        # cyvcf2 duplicates some keys as strings and bytes, so remove the bytes one
+        for k in list(d.keys()):
+            if isinstance(k, bytes) and k.decode("utf-8") in d:
+                del d[k]
+        return d
+
+    return {
+        field["ID"]: to_dict(field)
+        for field in vcf.header_iter()
+        if field["HeaderType"] == header_type
+    }
+
+
+def _assert_header_field_dicts_equivalent(field_dicts1, field_dicts2):
+    assert len(field_dicts1) == len(field_dicts2)
+
+    for id in field_dicts1.keys():
+        assert id in field_dicts2
+        field_dict1 = field_dicts1[id]
+        field_dict2 = field_dicts2[id]
+
+        assert len(field_dict1) == len(field_dict2)
+        # all fields should be the same, except Number="." which can match any value
+        for k in field_dict1.keys():
+            assert k in field_dict2
+            v1 = field_dict1[k]
+            v2 = field_dict2[k]
+            if k == "Number" and (v1 == "." or v2 == "."):
+                continue
+            assert v1 == v2, f"Failed in field {id} with key {k}"
+
+
+def _assert_vcf_headers_equivalent(vcf1, vcf2):
+    # Only compare INFO, FORMAT, FILTER, CONTIG fields, ignoring order
+    # Other fields are ignored
+
+    info1 = _get_header_field_dicts(vcf1, "INFO")
+    info2 = _get_header_field_dicts(vcf2, "INFO")
+    _assert_header_field_dicts_equivalent(info1, info2)
+
+    format1 = _get_header_field_dicts(vcf1, "FORMAT")
+    format2 = _get_header_field_dicts(vcf2, "FORMAT")
+    _assert_header_field_dicts_equivalent(format1, format2)
+
+    filter1 = _get_header_field_dicts(vcf1, "FILTER")
+    filter2 = _get_header_field_dicts(vcf2, "FILTER")
+    _assert_header_field_dicts_equivalent(filter1, filter2)
+
+    contig1 = _get_header_field_dicts(vcf1, "CONTIG")
+    contig2 = _get_header_field_dicts(vcf2, "CONTIG")
+    _assert_header_field_dicts_equivalent(contig1, contig2)
+
+
 def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03, allow_zero_variants=False):
     """Like :py:func:`numpy.testing.assert_allclose()`, but for VCF files.
 
@@ -48,7 +106,7 @@ def assert_vcfs_close(f1, f2, *, rtol=1e-05, atol=1e-03, allow_zero_variants=Fal
         Absolute tolerance.
     """
     with open_vcf(f1) as vcf1, open_vcf(f2) as vcf2:
-        assert vcf1.raw_header == vcf2.raw_header
+        _assert_vcf_headers_equivalent(vcf1, vcf2)
         assert vcf1.samples == vcf2.samples
 
         count = 0
@@ -145,15 +203,15 @@ def vcz_path_cache(vcf_path):
     cached_vcz_path = (cache_path / vcf_path.name).with_suffix(".vcz")
     if not cached_vcz_path.exists():
         if vcf_path.name.startswith("chr22"):
-            vcf2zarr.convert(
+            vcf.convert(
                 [vcf_path],
                 cached_vcz_path,
                 worker_processes=0,
                 variants_chunk_size=10,
                 samples_chunk_size=10,
             )
         else:
-            vcf2zarr.convert(
+            vcf.convert(
                 [vcf_path], cached_vcz_path, worker_processes=0, local_alleles=False
             )
     return cached_vcz_path
diff --git a/vcztools/vcf_writer.py b/vcztools/vcf_writer.py
@@ -1,6 +1,5 @@
 import io
 import logging
-import re
 import sys
 from datetime import datetime
 
@@ -144,11 +143,9 @@ def write_vcf(
         )
 
         if not no_header:
-            original_header = root.attrs.get("vcf_header", None)
             force_ac_an_header = not drop_genotypes and samples_selection is not None
             vcf_header = _generate_header(
                 root,
-                original_header,
                 sample_ids,
                 no_version=no_version,
                 force_ac_an=force_ac_an_header,
@@ -299,7 +296,6 @@ def c_chunk_to_vcf(
 
 def _generate_header(
     ds,
-    original_header,
     sample_ids,
     *,
     no_version: bool = False,
@@ -339,45 +335,12 @@ def _generate_header(
             if key in ("genotype", "genotype_phased"):
                 continue
             format_fields.append(key)
-    if original_header is None:  # generate entire header
-        # [1.4.1 File format]
-        print("##fileformat=VCFv4.3", file=output)
-
-        if "source" in ds.attrs:
-            print(f'##source={ds.attrs["source"]}', file=output)
-
-    else:  # use original header fields where appropriate
-        unstructured_pattern = re.compile("##([^=]+)=([^<].*)")
-        structured_pattern = re.compile("##([^=]+)=(<.*)")
-
-        for line in original_header.split("\n"):
-            if re.fullmatch(unstructured_pattern, line):
-                print(line, file=output)
-            else:
-                match = re.fullmatch(structured_pattern, line)
-                if match:
-                    category = match.group(1)
-                    id_pattern = re.compile("ID=([^,>]+)")
-                    key = id_pattern.findall(line)[0]
-                    if category not in ("contig", "FILTER", "INFO", "FORMAT"):
-                        # output other structured fields
-                        print(line, file=output)
-                    # only output certain categories if in dataset
-                    elif category == "contig" and key in contigs:
-                        contigs.remove(key)
-                        print(line, file=output)
-                    elif category == "FILTER" and key in filters:
-                        filters.remove(key)
-                        print(line, file=output)
-                    elif category == "INFO" and key in info_fields:
-                        info_fields.remove(key)
-                        print(line, file=output)
-                    elif category == "FORMAT" and key in format_fields:
-                        format_fields.remove(key)
-                        print(line, file=output)
-
-    # add all fields that are not in the original header
-    # or all fields if there was no original header
+
+    # [1.4.1 File format]
+    print("##fileformat=VCFv4.3", file=output)
+
+    if "source" in ds.attrs:
+        print(f'##source={ds.attrs["source"]}', file=output)
 
     # [1.4.2 Information field format]
     for key in info_fields:
@@ -405,8 +368,17 @@ def _generate_header(
                 )
 
     # [1.4.3 Filter field format]
-    for filter in filters:
-        print(f'##FILTER=<ID={filter},Description="">', file=output)
+    filter_descriptions = (
+        ds["filter_description"] if "filter_description" in ds else None
+    )
+    for i, filter in enumerate(filters):
+        filter_description = (
+            "" if filter_descriptions is None else filter_descriptions[i]
+        )
+        print(
+            f'##FILTER=<ID={filter},Description="{filter_description}">',
+            file=output,
+        )
 
     # [1.4.4 Individual format field format]
     for key in format_fields:
@@ -429,9 +401,7 @@ def _generate_header(
             )
 
     # [1.4.7 Contig field format]
-    contig_lengths = (
-        ds.attrs["contig_lengths"] if "contig_lengths" in ds.attrs else None
-    )
+    contig_lengths = ds["contig_length"] if "contig_length" in ds else None
     for i, contig in enumerate(contigs):
         if contig_lengths is None:
             print(f"##contig=<ID={contig}>", file=output)
@@ -444,6 +414,12 @@ def _generate_header(
             file=output,
         )
 
+    # Other meta information lines not covered above
+    if "vcf_meta_information" in ds.attrs:
+        for key, value in ds.attrs["vcf_meta_information"]:
+            if key not in ("fileformat", "source"):
+                print(f"##{key}={value}", file=output)
+
     # [1.5 Header line syntax]
     print(
         "#CHROM",