Skip to content

Commit e8512bf

Browse files
authored
Allow calls with missing genotypes to pass the IAF filter (#602)
1 parent 3a16b83 commit e8512bf

File tree

7 files changed

+103
-2
lines changed

7 files changed

+103
-2
lines changed

apis/python/tests/test_tiledbvcf.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1431,3 +1431,64 @@ def test_compression_level(tmp_path, level):
14311431
for filter in attr.filters:
14321432
if "Zstd" in str(filter):
14331433
assert filter.level == level
1434+
1435+
1436+
# Ok to skip is missing bcftools in Windows CI job
1437+
@pytest.mark.skipif(
1438+
os.environ.get("CI") == "true"
1439+
and platform.system() == "Windows"
1440+
and shutil.which("bcftools") is None,
1441+
reason="no bcftools",
1442+
)
1443+
def test_gvcf_export(tmp_path):
1444+
# Compress the input VCFs
1445+
vcf_inputs = glob.glob(os.path.join(TESTS_INPUT_DIR, "gvcf-export", "*.vcf"))
1446+
for vcf_input in vcf_inputs:
1447+
vcf_output = os.path.join(tmp_path, os.path.basename(vcf_input)) + ".gz"
1448+
cmd = f"bcftools view --no-version -Oz -o {vcf_output} {vcf_input}"
1449+
subprocess.run(cmd, shell=True, check=True)
1450+
1451+
# Index the compressed VCFs
1452+
vcf_files = glob.glob(os.path.join(tmp_path, "*.gz"))
1453+
for vcf_file in vcf_files:
1454+
cmd = f"bcftools index {vcf_file}"
1455+
subprocess.run(cmd, shell=True, check=True)
1456+
1457+
# Ingest the VCFs
1458+
uri = os.path.join(tmp_path, "vcf.tdb")
1459+
ds = tiledbvcf.Dataset(uri=uri, mode="w")
1460+
ds.create_dataset()
1461+
ds.ingest_samples(vcf_files)
1462+
ds = tiledbvcf.Dataset(uri=uri, mode="r")
1463+
1464+
# List of tests.
1465+
tests = [
1466+
{"region": "chr1:100-120", "samples": ["s0", "s1", "s2"]},
1467+
{"region": "chr1:110-120", "samples": ["s0", "s1"]},
1468+
{"region": "chr1:149-149", "samples": ["s0", "s1", "s3"]},
1469+
{"region": "chr1:150-150", "samples": ["s0", "s1", "s3", "s4"]},
1470+
]
1471+
1472+
# No IAF filtering or reporting
1473+
for test in tests:
1474+
df = ds.read(regions=test["region"])
1475+
assert set(df["sample_name"].unique()) == set(test["samples"])
1476+
1477+
attrs = [
1478+
"sample_name",
1479+
"contig",
1480+
"pos_start",
1481+
"alleles",
1482+
"fmt_GT",
1483+
"info_TILEDB_IAF",
1484+
]
1485+
1486+
# IAF reporting
1487+
for test in tests:
1488+
df = ds.read(attrs=attrs, regions=test["region"])
1489+
assert set(df["sample_name"].unique()) == set(test["samples"])
1490+
1491+
# IAF filtering and reporting
1492+
for test in tests:
1493+
df = ds.read(attrs=attrs, regions=test["region"], set_af_filter="<=1.0")
1494+
assert set(df["sample_name"].unique()) == set(test["samples"])

libtiledbvcf/src/read/reader.cc

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1283,8 +1283,13 @@ bool Reader::process_query_results_v4() {
12831283

12841284
auto gt = results.buffers()->gt(i);
12851285

1286-
// Check if any of the alleles in GT pass the AF filter
1287-
bool pass = false;
1286+
// If all GT are missing, then pass the record, otherwise check
1287+
// if any of the alleles in GT pass the AF filter.
1288+
// Note: GT == -1 represents a missing value (from htslib).
1289+
bool pass = true;
1290+
for (unsigned int i = 0; i < gt.size(); i++) {
1291+
pass = pass && gt[i] == -1;
1292+
}
12881293

12891294
read_state_.query_results.af_values.clear();
12901295
int allele_index = 0;
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
##fileformat=VCFv4.4
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##INFO=<ID=END,Number=1,Type=Integer,Description="End">
4+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
5+
##contig=<ID=chr1>
6+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s0
7+
chr1 100 . A <NON_REF> 100.0 . END=200 GT ./.
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
##fileformat=VCFv4.4
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##INFO=<ID=END,Number=1,Type=Integer,Description="End">
4+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
5+
##contig=<ID=chr1>
6+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s1
7+
chr1 100 . A <NON_REF> 200.0 . END=200 GT 0/0
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
##fileformat=VCFv4.4
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##INFO=<ID=END,Number=1,Type=Integer,Description="End">
4+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
5+
##contig=<ID=chr1>
6+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s2
7+
chr1 100 . A T 300.0 . END=100 GT 1/1
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
##fileformat=VCFv4.4
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##INFO=<ID=END,Number=1,Type=Integer,Description="End">
4+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
5+
##contig=<ID=chr1>
6+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s3
7+
chr1 149 . TC T 400.0 . END=150 GT 0/1
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
##fileformat=VCFv4.4
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##INFO=<ID=END,Number=1,Type=Integer,Description="End">
4+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
5+
##contig=<ID=chr1>
6+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT s4
7+
chr1 150 . C G 500.0 . END=150 GT 1/1

0 commit comments

Comments
 (0)