2 changes: 1 addition & 1 deletion workflow/rules/commons.smk
@@ -166,7 +166,7 @@ def rule_all_input():
    all_input.extend(
        expand("counts/{sample}_salmon/quant.sf", sample=samples["sample"])
    )
    all_input.append("merged/all_counts.tsv")
    all_input.append("merged/all_counts_gene.tsv")
    all_input.extend(
        [
            expand("de_analysis/{factor}_{prop_a}_vs_{prop_b}_l2fc.tsv", **c)[0]
2 changes: 1 addition & 1 deletion workflow/rules/diffexp.smk
@@ -6,7 +6,7 @@ localrules:
# TODO: add mincount from config to discard loci with fewer counts.
rule deseq2_init:
    input:
        all_counts="merged/all_counts.tsv",
        all_counts="merged/all_counts_gene.tsv",
        samples=samples_file,
    output:
        "de_analysis/all.rds",
15 changes: 15 additions & 0 deletions workflow/rules/quantification.smk
@@ -3,6 +3,7 @@ import os

localrules:
    merge_read_counts,
    transcriptid_to_gene,


rule count_reads:
@@ -41,3 +42,17 @@ rule merge_read_counts:
"../envs/pandas.yml"
script:
"../scripts/merge_count_tsvs.py"


rule transcriptid_to_gene:
    input:
        all_counts="merged/all_counts.tsv",
        annotation="references/standardized_genomic.gff",
    output:
        "merged/all_counts_gene.tsv",
    log:
        "logs/transcriptid_to_gene.log",
    conda:
        "../envs/pandas.yml"
    script:
        "../scripts/transcriptid_to_gene.py"
11 changes: 11 additions & 0 deletions workflow/scripts/get_de_genes.py
@@ -18,6 +18,17 @@
    inplace=True,
)


# Remove the prepended gene name, keeping only the original transcript IDs
# that match the transcriptome entries
def original_id(ref):
    if "::" in ref:
        return ref.split("::", 1)[1]
    else:
        return ref
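# Illustrative example (the gene/transcript pairing below is hypothetical):
# original_id("CG34063::rna-Dmel_CG34063") -> "rna-Dmel_CG34063"
# original_id("rna-Dmel_CG34063")          -> "rna-Dmel_CG34063" (no "::", unchanged)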


df["gene"] = df["gene"].apply(original_id)

# Create diffexp gene IDs
gene_names = set(df["gene"].str.strip())

72 changes: 72 additions & 0 deletions workflow/scripts/transcriptid_to_gene.py
@@ -0,0 +1,72 @@
import re
import sys
import pandas as pd

# Start logging
log_file = open(snakemake.log[0], "w")
sys.stderr = sys.stdout = log_file


counts = snakemake.input.all_counts
annotation = snakemake.input.annotation
output = snakemake.output[0]


# Load count data
counts = pd.read_csv(counts, sep="\t", dtype=str)
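# Assumed layout: a "Reference" column of transcript IDs plus one count column
# per sample (the exact columns depend on the upstream merge step).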


# Parse a GFF attributes string into a {key: value} dictionary
def parse_attributes(attr_str):
    attrs = {}
    for part in attr_str.strip().split(";"):
        if "=" in part:
            key, value = part.split("=", 1)
            attrs[key] = value
    return attrs
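# Illustrative example (attribute values are hypothetical):
# parse_attributes("ID=rna-Dmel_CG34063;gene=Dmel_CG34063")
#   -> {"ID": "rna-Dmel_CG34063", "gene": "Dmel_CG34063"}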


# Build the mapping dictionary {ID: gene}
# Parses the GFF line by line, extracts the attributes field (9th column),
# and keeps entries that contain both an "ID" and a "gene" key
id_to_gene = {}
with open(annotation) as gff:
    for line in gff:
        if line.startswith("#"):
            continue
        parts = line.strip().split("\t")
        if len(parts) < 9:
            continue
        attrs = parse_attributes(parts[8])
        if "ID" in attrs and "gene" in attrs:
            id_to_gene[attrs["ID"]] = attrs["gene"]


# Remove region-specific suffixes from a transcript ID:
# - "gene-Dmel_CR6900" matches the annotation directly
# - "rna-Dmel_CG34063_CDS=1-1024" must be cleaned first;
#   the "_CDS=1-1024" suffix is stripped before matching
def clean_id(raw_id):
    return re.sub(r"_CDS=.*", "", raw_id)
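# e.g. clean_id("rna-Dmel_CG34063_CDS=1-1024") -> "rna-Dmel_CG34063"
#      clean_id("gene-Dmel_CR6900")            -> "gene-Dmel_CR6900" (no suffix, unchanged)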


# Map a transcript ID to its gene name:
# try an exact match on the transcript ID first,
# if not found, try the cleaned ID,
# and if still not found, return the transcript ID as a fallback
def map_id(original_ref):
    clean_ref = clean_id(original_ref)
    if original_ref in id_to_gene:
        return f"{id_to_gene[original_ref]}::{original_ref}"
    if clean_ref in id_to_gene:
        return f"{id_to_gene[clean_ref]}::{original_ref}"
    return original_ref
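# Illustrative results (the gene attribute values are hypothetical):
# map_id("gene-Dmel_CR6900")            -> "Dmel_CR6900::gene-Dmel_CR6900"              (exact match)
# map_id("rna-Dmel_CG34063_CDS=1-1024") -> "Dmel_CG34063::rna-Dmel_CG34063_CDS=1-1024"  (match after cleaning)
# map_id("some-unknown-id")             -> "some-unknown-id"                            (fallback)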


# Replace transcript IDs in "Reference" column with mapping results
counts["Reference"] = counts["Reference"].apply(map_id)

# Write output
counts.to_csv(output, sep="\t", index=False)

log_file.close()