2 changes: 1 addition & 1 deletion workflow/rules/commons.smk
@@ -166,7 +166,7 @@ def rule_all_input():
    all_input.extend(
        expand("counts/{sample}_salmon/quant.sf", sample=samples["sample"])
    )
    all_input.append("merged/all_counts.tsv")
    all_input.append("merged/all_counts_gene.tsv")
    all_input.extend(
        [
            expand("de_analysis/{factor}_{prop_a}_vs_{prop_b}_l2fc.tsv", **c)[0]
2 changes: 1 addition & 1 deletion workflow/rules/diffexp.smk
@@ -6,7 +6,7 @@ localrules:
# TODO: add mincount from config to discard loci with fewer counts.
rule deseq2_init:
    input:
        all_counts="merged/all_counts.tsv",
        all_counts="merged/all_counts_gene.tsv",
        samples=samples_file,
    output:
        "de_analysis/all.rds",
15 changes: 15 additions & 0 deletions workflow/rules/quantification.smk
@@ -3,6 +3,7 @@ import os

localrules:
    merge_read_counts,
    transcriptid_to_gene,


rule count_reads:
@@ -41,3 +42,17 @@ rule merge_read_counts:
"../envs/pandas.yml"
script:
"../scripts/merge_count_tsvs.py"


rule transcriptid_to_gene:
    input:
        all_counts="merged/all_counts.tsv",
        annotation="references/standardized_genomic.gff",
    output:
        "merged/all_counts_gene.tsv",
    log:
        "logs/transcriptid_to_gene.log",
    conda:
        "../envs/pandas.yml"
    script:
        "../scripts/transcriptid_to_gene.py"
11 changes: 11 additions & 0 deletions workflow/scripts/get_de_genes.py
@@ -18,6 +18,17 @@
    inplace=True,
)


# Remove the prepended gene name, keeping only the original transcript IDs
# that match the transcriptome entries
def original_id(ref):
    if "::" in ref:
        return ref.split("::", 1)[1]
    else:
        return ref
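# Illustrative example (the gene/transcript pairing below is hypothetical):
# original_id("CG34063::rna-Dmel_CG34063") -> "rna-Dmel_CG34063"
# original_id("rna-Dmel_CG34063")          -> "rna-Dmel_CG34063" (no "::", unchanged)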


df["gene"] = df["gene"].apply(original_id)

# Create diffexp gene IDs
gene_names = set(df["gene"].str.strip())

72 changes: 72 additions & 0 deletions workflow/scripts/transcriptid_to_gene.py
@@ -0,0 +1,72 @@
import re
import sys
import pandas as pd

# Start logging
log_file = open(snakemake.log[0], "w")
sys.stderr = sys.stdout = log_file


counts = snakemake.input.all_counts
annotation = snakemake.input.annotation
output = snakemake.output[0]


# Load count data
counts = pd.read_csv(counts, sep="\t", dtype=str)
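# Assumed layout: a "Reference" column of transcript IDs plus one count column
# per sample (the exact columns depend on the upstream merge step).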


# Parse a GFF attributes string into a {key: value} dictionary
def parse_attributes(attr_str):
    attrs = {}
    for part in attr_str.strip().split(";"):
        if "=" in part:
            key, value = part.split("=", 1)
            attrs[key] = value
    return attrs
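# Illustrative example (attribute values are hypothetical):
# parse_attributes("ID=rna-Dmel_CG34063;gene=Dmel_CG34063")
#   -> {"ID": "rna-Dmel_CG34063", "gene": "Dmel_CG34063"}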


# Build the mapping dictionary {ID: gene}
# Parses the GFF line by line, extracts the attributes field (9th column),
# and keeps entries that contain both an "ID" and a "gene" key
id_to_gene = {}
with open(annotation) as gff:
    for line in gff:
        if line.startswith("#"):
            continue
        parts = line.strip().split("\t")
        if len(parts) < 9:
            continue
        attrs = parse_attributes(parts[8])
        if "ID" in attrs and "gene" in attrs:
            id_to_gene[attrs["ID"]] = attrs["gene"]


# Remove region-specific suffixes from a transcript ID:
# - "gene-Dmel_CR6900" matches the annotation directly
# - "rna-Dmel_CG34063_CDS=1-1024" must be cleaned first;
#   the "_CDS=1-1024" suffix is stripped before matching
def clean_id(raw_id):
    return re.sub(r"_CDS=.*", "", raw_id)
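# e.g. clean_id("rna-Dmel_CG34063_CDS=1-1024") -> "rna-Dmel_CG34063"
#      clean_id("gene-Dmel_CR6900")            -> "gene-Dmel_CR6900" (no suffix, unchanged)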


# Map a transcript ID to its gene name:
# try an exact match on the transcript ID first,
# if not found, try the cleaned ID,
# and if still not found, return the transcript ID as a fallback
def map_id(original_ref):
    clean_ref = clean_id(original_ref)
    if original_ref in id_to_gene:
        return f"{id_to_gene[original_ref]}::{original_ref}"
    if clean_ref in id_to_gene:
        return f"{id_to_gene[clean_ref]}::{original_ref}"
    return original_ref
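# Illustrative results (the gene attribute values are hypothetical):
# map_id("gene-Dmel_CR6900")            -> "Dmel_CR6900::gene-Dmel_CR6900"              (exact match)
# map_id("rna-Dmel_CG34063_CDS=1-1024") -> "Dmel_CG34063::rna-Dmel_CG34063_CDS=1-1024"  (match after cleaning)
# map_id("some-unknown-id")             -> "some-unknown-id"                            (fallback)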


# Replace transcript IDs in "Reference" column with mapping results
counts["Reference"] = counts["Reference"].apply(map_id)

# Write output
counts.to_csv(output, sep="\t", index=False)

log_file.close()