From 0e90722e5c8c8fd6106b8bd454948ba957073c9a Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Thu, 19 Sep 2024 00:23:21 +0300 Subject: [PATCH 01/40] Bug Fix - Truncated Consensus Sequences Due to incompatable variants - Added script fo filter incompataible variants from VCF file - Modified consensus calling scripts to use this filtering --- src/TRACES_cy_consensus.sh | 11 ++- src/TRACES_filter_incompatible_variants.sh | 88 ++++++++++++++++++++++ src/TRACES_hybrid_consensus.sh | 11 ++- src/TRACES_mt_consensus.sh | 11 ++- src/TRACES_multiorgan_consensus.sh | 9 ++- src/TRACES_viral_consensus.sh | 12 ++- 6 files changed, 123 insertions(+), 19 deletions(-) create mode 100755 src/TRACES_filter_incompatible_variants.sh diff --git a/src/TRACES_cy_consensus.sh b/src/TRACES_cy_consensus.sh index ffa37eb..3cc2a54 100755 --- a/src/TRACES_cy_consensus.sh +++ b/src/TRACES_cy_consensus.sh @@ -25,13 +25,16 @@ bcftools norm -f $Reference calls-cy-$Organ.vcf.gz -Oz -o calls-cy-$Organ.norm.v # # filter adjacent indels within 5bp bcftools filter --IndelGap 5 calls-cy-$Organ.norm.vcf.gz -Oz -o calls-cy-$Organ.norm.flt-indels.vcf.gz +# filter incompatible variants +finalVCF="calls-cy-$Organ.norm.flt-indels-incompat.vcf.gz" +./TRACES_filter_incompatible_variants.sh calls-cy-$Organ.norm.flt-indels.vcf.gz >| $finalVCF; # # create bed file -zcat calls-cy-$Organ.norm.flt-indels.vcf.gz |vcf2bed --snvs > cy-calls-$Organ.bed +zcat "$finalVCF" | vcf2bed --snvs > cy-calls-$Organ.bed # # CONSENSUS -tabix -f calls-cy-$Organ.norm.flt-indels.vcf.gz -bcftools consensus -m cy-zero-coverage-$Organ.bed -f $Reference calls-cy-$Organ.norm.flt-indels.vcf.gz > cy-consensus-$Organ.fa +tabix -f "$finalVCF" +bcftools consensus -m cy-zero-coverage-$Organ.bed -f $Reference $finalVCF > cy-consensus-$Organ.fa # # Give new header name for the consensus sequence tail -n +2 cy-consensus-$Organ.fa > CY_TMP_FILE_$Organ.xki @@ -40,6 +43,6 @@ cat CY_TMP_FILE_$Organ.xki >> cy-consensus-$Organ.fa rm -f CY_TMP_FILE_$Organ.xki; # # -rm -f calls-cy-$Organ.vcf.gz calls-cy-$Organ.vcf.gz.csi calls-cy-$Organ.norm.bcf calls-cy-$Organ.norm.flt-indels.bcf calls-cy-$Organ.norm.flt-indels.vcf.gz calls-cy-$Organ.norm.vcf.gz; +rm -f calls-cy-$Organ.vcf.gz calls-cy-$Organ.vcf.gz.csi calls-cy-$Organ.norm.bcf calls-cy-$Organ.norm.flt-indels.bcf calls-cy-$Organ.norm.flt-indels.vcf.gz calls-cy-$Organ.norm.vcf.gz "$finalVCF" ${finalVCF/.vcf.gz/.bcf}; # # diff --git a/src/TRACES_filter_incompatible_variants.sh b/src/TRACES_filter_incompatible_variants.sh new file mode 100755 index 0000000..ee60384 --- /dev/null +++ b/src/TRACES_filter_incompatible_variants.sh @@ -0,0 +1,88 @@ +#!/bin/bash + +#This script is intended to pass over a vcf file and remove conflicting variants +#The choice is made first by higher quality, then by VCF order +#Confilicting alleles are snps at the same position as an insertion +# and other variants which appear in the region covered by a deletion +#If there are multiple variants overlapping a position, the best one is retained +# variants are processed in an order such that come conflicts may be resolved +# Example two cariant sites covered by a deletion, where the deletion is lower +# quality than either variant, in this way the maximum number of variants are +# retained +#Written by Zachery Dickson, Sepetember 2024 +# Contact: dicksoz@mcmaster.ca +# zachery.dickson@helski.fi + +if [ "$#" -lt 1 ]; then + >&2 echo -e "Usage: $(basename $0) In.vcf[.gz] > Out.vcf.gz\n" \ + "\tFinds conflicting variants, selects the maximum conflict free set\n" \ + "\tAssumes variants are sorted by position"; + exit 1; +fi + +file=$1;shift; +if file $file | grep -q gzip; then + zcat $file; +else + cat $file; +fi | + awk -F '\\t' -v call="$0 $file" ' + #j > i + function testOverlap(i,j) { + if(chrom[j] != chrom[i]){return 0;} + if(left[j] > right[i]){return 0;} + return 1; + } + BEGIN{OFS="\t"} + /^##/{print;next;} + /^#CHROM/{ + print "##incompatFilterCommand="call; + print; + next + } + { + counter++; + fullLine[counter] = $0; + qual[counter] = $6; + chrom[counter] = $1; + left[counter] = $2; + len = length($4) +# if(length($5) > len){len = length($5)} + right[counter] = $2+len-1; + sortableLength[counter] = sprintf("%d%06d",len,counter); + } + END { + #Process Variants in order of length + n=asort(sortableLength); + for(i = n; i >= 1; i--){ + #Iterate from longest to shortest + idx = substr(sortableLength[i],length(sortableLength[i])-5)+0 + print sortableLength[i],idx,left[idx],qual[idx],left[idx],right[idx] > "/dev/stderr"; + if(Filtered[idx]){continue;} + jdx = idx; + bestQual=qual[idx]; + #Check all following entries as long as this variant is best, + # we have not checked all variants, and the next one overlaps + while(bestQual==qual[idx] && jdx < counter && + testOverlap(idx,jdx+1)){ + jdx++; + print "\t",jdx,left[jdx],qual[jdx],left[jdx],right[jdx] > "/dev/stderr"; + if(qual[jdx] > qual[idx]){ + print "\t\tBetter" > "/dev/stderr"; + Filtered[idx] = 1; + bestQual = qual[jdx]; + } else { + print "\t\tWorse" > "/dev/stderr"; + Filtered[jdx] = 1; + } + } + } + for(idx=1;idx<=counter; idx++){ + if(!Filtered[idx]){ + print fullLine[idx]; + } + } + } + ' | + bgzip; + diff --git a/src/TRACES_hybrid_consensus.sh b/src/TRACES_hybrid_consensus.sh index 4562210..333c3c6 100755 --- a/src/TRACES_hybrid_consensus.sh +++ b/src/TRACES_hybrid_consensus.sh @@ -28,14 +28,17 @@ bcftools norm -f $Reference $Label-$Organ-calls.vcf.gz -Oz -o $Label-$Organ-call # # filter adjacent indels within 5bp #bcftools filter --IndelGap 5 $Label-$Organ-calls.norm.vcf.gz -Oz -o $Label-$Organ-calls.norm.flt-indels.vcf.gz +# filter incompatible variants +finalVCF="$Label-$Organ-calls.norm.filt-incompat.vcf.gz" +./TRACES_filter_incompatible_variants.sh $Label-$Organ-calls.norm.vcf.gz >| "$finalVCF"; # # create bed file #zcat $Label-$Organ-calls.norm.flt-indels.vcf.gz |vcf2bed --snvs > $Label-calls-$Organ.bed -zcat $Label-$Organ-calls.norm.vcf.gz |vcf2bed --snvs > $Label-calls-$Organ.bed +zcat "$finalVCF" | vcf2bed --snvs > $Label-calls-$Organ.bed # # CONSENSUS -tabix -f $Label-$Organ-calls.norm.vcf.gz -bcftools consensus -m $Label-zero-coverage-$Organ.bed -f $Reference $Label-$Organ-calls.norm.vcf.gz > $Label-consensus-$Organ.fa +tabix -f "$finalVCF" +bcftools consensus -m $Label-zero-coverage-$Organ.bed -f $Reference "$finalVCF" > $Label-consensus-$Organ.fa # # Give new header name for the consensus sequence tail -n +2 $Label-consensus-$Organ.fa > $Label-$Organ-TMP_FILE.xki @@ -44,6 +47,6 @@ cat $Label-$Organ-TMP_FILE.xki >> $Label-consensus-$Organ.fa rm -f $Label-$Organ-TMP_FILE.xki; # # -rm -f $Label-$Organ-calls.vcf.gz $Label-$Organ-calls.vcf.gz.csi $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.vcf.gz.csi $Label-$Organ-calls.norm.vcf.gz; +rm -f $Label-$Organ-calls.vcf.gz $Label-$Organ-calls.vcf.gz.csi $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.vcf.gz.csi $Label-$Organ-calls.norm.vcf.gz "$finalVCF" "$finalVCF.csi"; # # diff --git a/src/TRACES_mt_consensus.sh b/src/TRACES_mt_consensus.sh index 05012b7..deadddf 100755 --- a/src/TRACES_mt_consensus.sh +++ b/src/TRACES_mt_consensus.sh @@ -30,13 +30,16 @@ bcftools norm -f $Reference $Label-$Organ-calls.vcf.gz -Oz -o $Label-$Organ-call # # filter adjacent indels within 5bp bcftools filter --IndelGap 5 $Label-$Organ-calls.norm.vcf.gz -Oz -o $Label-$Organ-calls.norm.flt-indels.vcf.gz +# filter incompatible variants +finalVCF="$Label-$Organ-calls.norm.flt-indels-incompat.vcf.gz" +./TRACES_filter_incompatible_variants.sh $Label-$Organ-calls.norm.flt-indels.vcf.gz >| "$finalVCF"; # # create bed file -zcat $Label-$Organ-calls.norm.flt-indels.vcf.gz |vcf2bed --snvs > $Label-calls-$Organ.bed +zcat "$finalVCF" |vcf2bed --snvs > $Label-calls-$Organ.bed # # CONSENSUS -tabix -f $Label-$Organ-calls.norm.flt-indels.vcf.gz -bcftools consensus -m $Label-zero-coverage-$Organ.bed -f $Reference $Label-$Organ-calls.norm.flt-indels.vcf.gz > $Label-consensus-$Organ.fa +tabix -f "$finalVCF" +bcftools consensus -m $Label-zero-coverage-$Organ.bed -f $Reference "$finalVCF" > $Label-consensus-$Organ.fa # # Give new header name for the consensus sequence tail -n +2 $Label-consensus-$Organ.fa > $Label-TMP_FILE_$Organ.xki @@ -45,6 +48,6 @@ cat $Label-TMP_FILE_$Organ.xki >> $Label-consensus-$Organ.fa rm -f $Label-TMP_FILE_$Organ.xki; # # -rm -f $Label-$Organ-calls.vcf.gz $Label-$Organ-calls.vcf.gz.csi $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.flt-indels.bcf $Label-$Organ-calls.norm.flt-indels.vcf.gz $Label-$Organ-calls.norm.flt-indels.vcf.gz.csi $Label-$Organ-calls.norm.vcf.gz; +rm -f $Label-$Organ-calls.vcf.gz $Label-$Organ-calls.vcf.gz.csi $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.flt-indels.bcf $Label-$Organ-calls.norm.flt-indels.vcf.gz $Label-$Organ-calls.norm.flt-indels.vcf.gz.csi $Label-$Organ-calls.norm.vcf.gz "$finalVCF" "$finalVCF.csi" "${finalVCF/.vcf.gz/.bcf}"; # # diff --git a/src/TRACES_multiorgan_consensus.sh b/src/TRACES_multiorgan_consensus.sh index d64ac84..cdea9fd 100755 --- a/src/TRACES_multiorgan_consensus.sh +++ b/src/TRACES_multiorgan_consensus.sh @@ -39,9 +39,12 @@ bcftools mpileup -Ou -f $REFERENCE $FIELD-data_aligned_sorted.bam \ | bcftools call --ploidy 1 -mv -Oz -o $FIELD-calls.vcf.gz bcftools index $FIELD-calls.vcf.gz bcftools norm -f $REFERENCE $FIELD-calls.vcf.gz -Oz -o $FIELD-calls.norm.vcf.gz -zcat $FIELD-calls.norm.vcf.gz |vcf2bed --snvs > $FIELD-calls.bed -tabix -f $FIELD-calls.norm.vcf.gz -bcftools consensus -f $REFERENCE $FIELD-calls.norm.vcf.gz > $FIELD-consensus.fa +# filter incompatible variants +finalVCF="$FIELD-calls.norm.filt-incompat.vcf.gz" +./TRACES_filter_incompatible_variants.sh $FIELD-calls.norm.vcf.gz >| "$finalVCF"; +zcat "$finalVCF" | vcf2bed --snvs > $FIELD-calls.bed +tabix -f "$finalVCF" +bcftools consensus -f $REFERENCE "$finalVCF" > $FIELD-consensus.fa # # Give new header name for the consensus sequence tail -n +2 $FIELD-consensus.fa > $FIELD-TMP2_FILE.xki diff --git a/src/TRACES_viral_consensus.sh b/src/TRACES_viral_consensus.sh index 5d725e9..cedb2d4 100755 --- a/src/TRACES_viral_consensus.sh +++ b/src/TRACES_viral_consensus.sh @@ -32,13 +32,17 @@ bcftools norm -f $Reference $Label-$Organ-calls.vcf.gz -Oz -o $Label-$Organ-call # # filter adjacent indels within 5bp bcftools filter --IndelGap 5 $Label-$Organ-calls.norm.vcf.gz -Oz -o $Label-$Organ-calls.norm.flt-indels.vcf.gz +# filter incompatible variants +finalVCF="$Label-$Organ-calls.norm.flt-indels-incompat.vcf.gz" +./TRACES_filter_incompatible_variants.sh $FIELD-calls.norm.filt-incompat.vcf.gz >| "$finalVCF"; + # # create bed file -zcat $Label-$Organ-calls.norm.flt-indels.vcf.gz |vcf2bed --snvs > $Label-calls-$Organ.bed +zcat "$finalVCF" | vcf2bed --snvs > $Label-calls-$Organ.bed # # CONSENSUS -tabix -f $Label-$Organ-calls.norm.flt-indels.vcf.gz -bcftools consensus -m $Label-zero-coverage-$Organ.bed -f $Reference $Label-$Organ-calls.norm.flt-indels.vcf.gz > $Label-consensus-$Organ.fa +tabix -f "$finalVCF" +bcftools consensus -m $Label-zero-coverage-$Organ.bed -f $Reference "$finalVCF" > $Label-consensus-$Organ.fa # # Give new header name for the consensus sequence tail -n +2 $Label-consensus-$Organ.fa > $Label-$Organ-TMP_FILE.xki @@ -47,6 +51,6 @@ cat $Label-$Organ-TMP_FILE.xki >> $Label-consensus-$Organ.fa rm -f $Label-$Organ-TMP_FILE.xki; # # -rm -f $Label-$Organ-calls.vcf.gz $Label-$Organ-calls.vcf.gz.csi $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.flt-indels.bcf $Label-$Organ-calls.norm.flt-indels.vcf.gz.csi; +rm -f $Label-$Organ-calls.vcf.gz $Label-$Organ-calls.vcf.gz.csi $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.flt-indels.bcf $Label-$Organ-calls.norm.flt-indels.vcf.gz.csi "$finalVCF" "${finalVCF/.vcf.gz/.bcf}"; # # From fe36e06bf7aa50ee6706f0f84ef3c8e7210f0494 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Thu, 19 Sep 2024 13:32:03 +0300 Subject: [PATCH 02/40] Remove debug statements --- src/TRACES_filter_incompatible_variants.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/TRACES_filter_incompatible_variants.sh b/src/TRACES_filter_incompatible_variants.sh index ee60384..23de5a9 100755 --- a/src/TRACES_filter_incompatible_variants.sh +++ b/src/TRACES_filter_incompatible_variants.sh @@ -57,7 +57,6 @@ fi | for(i = n; i >= 1; i--){ #Iterate from longest to shortest idx = substr(sortableLength[i],length(sortableLength[i])-5)+0 - print sortableLength[i],idx,left[idx],qual[idx],left[idx],right[idx] > "/dev/stderr"; if(Filtered[idx]){continue;} jdx = idx; bestQual=qual[idx]; @@ -66,13 +65,10 @@ fi | while(bestQual==qual[idx] && jdx < counter && testOverlap(idx,jdx+1)){ jdx++; - print "\t",jdx,left[jdx],qual[jdx],left[jdx],right[jdx] > "/dev/stderr"; if(qual[jdx] > qual[idx]){ - print "\t\tBetter" > "/dev/stderr"; Filtered[idx] = 1; bestQual = qual[jdx]; } else { - print "\t\tWorse" > "/dev/stderr"; Filtered[jdx] = 1; } } From 8e5a95523506448ca819cc58faf6d99c8acd25b8 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Thu, 17 Oct 2024 16:44:58 +0300 Subject: [PATCH 03/40] Fixed Bug caused by variable FIELD --- src/TRACES_viral_consensus.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TRACES_viral_consensus.sh b/src/TRACES_viral_consensus.sh index cedb2d4..6dd2d7a 100755 --- a/src/TRACES_viral_consensus.sh +++ b/src/TRACES_viral_consensus.sh @@ -34,7 +34,7 @@ bcftools norm -f $Reference $Label-$Organ-calls.vcf.gz -Oz -o $Label-$Organ-call bcftools filter --IndelGap 5 $Label-$Organ-calls.norm.vcf.gz -Oz -o $Label-$Organ-calls.norm.flt-indels.vcf.gz # filter incompatible variants finalVCF="$Label-$Organ-calls.norm.flt-indels-incompat.vcf.gz" -./TRACES_filter_incompatible_variants.sh $FIELD-calls.norm.filt-incompat.vcf.gz >| "$finalVCF"; +./TRACES_filter_incompatible_variants.sh $Label-$Organ-calls.norm.filt-incompat.vcf.gz >| "$finalVCF"; # # create bed file From 9246d7500386dd3786b7bee0ac36dc0ab767baa6 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Mon, 6 Jan 2025 18:41:02 +0200 Subject: [PATCH 04/40] Fixed Typo --- src/TRACES_filter_incompatible_variants.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TRACES_filter_incompatible_variants.sh b/src/TRACES_filter_incompatible_variants.sh index 23de5a9..d1d6c50 100755 --- a/src/TRACES_filter_incompatible_variants.sh +++ b/src/TRACES_filter_incompatible_variants.sh @@ -9,7 +9,7 @@ # Example two cariant sites covered by a deletion, where the deletion is lower # quality than either variant, in this way the maximum number of variants are # retained -#Written by Zachery Dickson, Sepetember 2024 +#Written by Zachery Dickson, September 2024 # Contact: dicksoz@mcmaster.ca # zachery.dickson@helski.fi From 321c413e18bf716027f1d9a11b303a42af946b1b Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Mon, 6 Jan 2025 20:19:22 +0200 Subject: [PATCH 05/40] Updated viral consensus to use bcftools Fixed Bug where TRACESPipe tried to save incorrectly named VCF Restructured VCF naming during consensus calling --- src/TRACESPipe.sh | 6 +++--- src/TRACES_cy_consensus.sh | 11 ++++++----- src/TRACES_hybrid_consensus.sh | 11 ++++++----- src/TRACES_mt_consensus.sh | 11 ++++++----- src/TRACES_multiorgan_consensus.sh | 11 ++++++----- src/TRACES_viral_consensus.sh | 11 ++++++----- 6 files changed, 33 insertions(+), 28 deletions(-) diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 6ae2bfc..06e92e5 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -384,7 +384,7 @@ ALIGN_AND_CONSENSUS () { mv $V_TAG-calls-$ORGAN.bed ../output_data/TRACES_viral_bed/ mv $V_TAG-coverage-$ORGAN.bed ../output_data/TRACES_viral_bed/ mv $V_TAG-zero-coverage-$ORGAN.bed ../output_data/TRACES_viral_bed/ - mv $V_TAG-$ORGAN-calls.norm.flt-indels.vcf.gz ../output_data/TRACES_viral_bed/ + mv $V_TAG-$ORGAN-calls.vcf.gz ../output_data/TRACES_viral_bed/ mkdir -p ../output_data/TRACES_viral_statistics; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; echo -e "\e[34m[TRACESPipe]\e[32m Calculating coverage ...\e[0m"; @@ -1767,7 +1767,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; mv $SPECIFIC_ID-calls-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ mv $SPECIFIC_ID-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ mv $SPECIFIC_ID-zero-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ - mv $SPECIFIC_ID-$ORGAN_T-calls.norm.flt-indels.vcf.gz ../output_data/TRACES_specific_bed/ + mv $SPECIFIC_ID-$ORGAN_T-calls.vcf.gz ../output_data/TRACES_specific_bed/ mkdir -p ../output_data/TRACES_specific_statistics; echo -e "\e[34m[TRACESPipe]\e[32m Calculating coverage for alignment-based assembly ...\e[0m"; ./TRACES_overall_specific.sh $SPECIFIC_ID $ORGAN_T @@ -1809,7 +1809,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; mv $SPECIFIC_ID-calls-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ mv $SPECIFIC_ID-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ mv $SPECIFIC_ID-zero-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ - mv $SPECIFIC_ID-$ORGAN_T-calls.norm.flt-indels.vcf.gz ../output_data/TRACES_specific_bed/ + mv $SPECIFIC_ID-$ORGAN_T-calls.vcf.gz ../output_data/TRACES_specific_bed/ mkdir -p ../output_data/TRACES_specific_statistics; echo -e "\e[34m[TRACESPipe]\e[32m Calculating coverage for alignment-based assembly ...\e[0m"; ./TRACES_overall_specific.sh $SPECIFIC_ID $ORGAN_T diff --git a/src/TRACES_cy_consensus.sh b/src/TRACES_cy_consensus.sh index 3cc2a54..82646a1 100755 --- a/src/TRACES_cy_consensus.sh +++ b/src/TRACES_cy_consensus.sh @@ -17,16 +17,17 @@ awk '$4 < 1' cy-coverage-$Organ.bed > cy-zero-coverage-$Organ.bed # # CALLS W CONSENSUS samtools faidx $Reference # -P 9.9e-1 -bcftools mpileup -Ou -f $Reference $Alignments | bcftools call --ploidy 1 -P 9.9e-1 -mv -Oz -o calls-cy-$Organ.vcf.gz -bcftools index calls-cy-$Organ.vcf.gz +rawCalls="raw-cy-$Organ.vcf.gz" +bcftools mpileup -Ov -f $Reference $Alignments | bcftools call --ploidy 1 -P 9.9e-1 -mv -Oz -o "$rawCalls" +bcftools index "$rawCalls" # # normalize indels -bcftools norm -f $Reference calls-cy-$Organ.vcf.gz -Oz -o calls-cy-$Organ.norm.vcf.gz +bcftools norm -f $Reference "$rawCalls" -Oz -o calls-cy-$Organ.norm.vcf.gz # # filter adjacent indels within 5bp bcftools filter --IndelGap 5 calls-cy-$Organ.norm.vcf.gz -Oz -o calls-cy-$Organ.norm.flt-indels.vcf.gz # filter incompatible variants -finalVCF="calls-cy-$Organ.norm.flt-indels-incompat.vcf.gz" +finalVCF="calls-cy-$Organ.vcf.gz" ./TRACES_filter_incompatible_variants.sh calls-cy-$Organ.norm.flt-indels.vcf.gz >| $finalVCF; # # create bed file @@ -43,6 +44,6 @@ cat CY_TMP_FILE_$Organ.xki >> cy-consensus-$Organ.fa rm -f CY_TMP_FILE_$Organ.xki; # # -rm -f calls-cy-$Organ.vcf.gz calls-cy-$Organ.vcf.gz.csi calls-cy-$Organ.norm.bcf calls-cy-$Organ.norm.flt-indels.bcf calls-cy-$Organ.norm.flt-indels.vcf.gz calls-cy-$Organ.norm.vcf.gz "$finalVCF" ${finalVCF/.vcf.gz/.bcf}; +rm -f "$rawCalls" "$rawCalls.csi" calls-cy-$Organ.norm.bcf calls-cy-$Organ.norm.flt-indels.bcf calls-cy-$Organ.norm.flt-indels.vcf.gz calls-cy-$Organ.norm.vcf.gz "$finalVCF" ${finalVCF/.vcf.gz/.bcf}; "$finalVCF.tbi" # # diff --git a/src/TRACES_hybrid_consensus.sh b/src/TRACES_hybrid_consensus.sh index 333c3c6..f0fc593 100755 --- a/src/TRACES_hybrid_consensus.sh +++ b/src/TRACES_hybrid_consensus.sh @@ -20,16 +20,17 @@ awk '$4 < 1' $Label-coverage-$Organ.bed > $Label-zero-coverage-$Organ.bed # # CALLS samtools faidx $Reference # -P 9.9e-1 # uses the new variations for consensus -bcftools mpileup -Ou -f $Reference $Alignments | bcftools call --ploidy 1 -M -mv -Oz -o $Label-$Organ-calls.vcf.gz -bcftools index $Label-$Organ-calls.vcf.gz +rawCalls="$Label-$Organ-raw.vcf.gz" +bcftools mpileup -Ov -f $Reference $Alignments | bcftools call --ploidy 1 -M -mv -Oz -o "$rawCalls" +bcftools index "$rawCalls" # # normalize indels -bcftools norm -f $Reference $Label-$Organ-calls.vcf.gz -Oz -o $Label-$Organ-calls.norm.vcf.gz +bcftools norm -f $Reference "$rawCalls" -Oz -o $Label-$Organ-calls.norm.vcf.gz # # filter adjacent indels within 5bp #bcftools filter --IndelGap 5 $Label-$Organ-calls.norm.vcf.gz -Oz -o $Label-$Organ-calls.norm.flt-indels.vcf.gz # filter incompatible variants -finalVCF="$Label-$Organ-calls.norm.filt-incompat.vcf.gz" +finalVCF="$Label-$Organ-calls.vcf.gz" ./TRACES_filter_incompatible_variants.sh $Label-$Organ-calls.norm.vcf.gz >| "$finalVCF"; # # create bed file @@ -47,6 +48,6 @@ cat $Label-$Organ-TMP_FILE.xki >> $Label-consensus-$Organ.fa rm -f $Label-$Organ-TMP_FILE.xki; # # -rm -f $Label-$Organ-calls.vcf.gz $Label-$Organ-calls.vcf.gz.csi $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.vcf.gz.csi $Label-$Organ-calls.norm.vcf.gz "$finalVCF" "$finalVCF.csi"; +rm -f "$rawCalls" "$rawCalls.csi" $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.vcf.gz.csi $Label-$Organ-calls.norm.vcf.gz "$finalVCF" "$finalVCF.tbi"; # # diff --git a/src/TRACES_mt_consensus.sh b/src/TRACES_mt_consensus.sh index deadddf..978ade2 100755 --- a/src/TRACES_mt_consensus.sh +++ b/src/TRACES_mt_consensus.sh @@ -22,16 +22,17 @@ awk '$4 < 1' $Label-coverage-$Organ.bed > $Label-zero-coverage-$Organ.bed # CHAN # # CALLS samtools faidx $Reference # -P 9.9e-1 -bcftools mpileup -Ou -f $Reference $Alignments | bcftools call --ploidy 1 -P 9.9e-1 -mv -Oz -o $Label-$Organ-calls.vcf.gz -bcftools index $Label-$Organ-calls.vcf.gz +rawCalls="$Label-$Organ-raw.vcf.gz" +bcftools mpileup -Ov -f $Reference $Alignments | bcftools call --ploidy 1 -P 9.9e-1 -mv -Oz -o "$rawCalls" +bcftools index "$rawCalls" # # normalize indels -bcftools norm -f $Reference $Label-$Organ-calls.vcf.gz -Oz -o $Label-$Organ-calls.norm.vcf.gz +bcftools norm -f $Reference "$rawCalls" -Oz -o $Label-$Organ-calls.norm.vcf.gz # # filter adjacent indels within 5bp bcftools filter --IndelGap 5 $Label-$Organ-calls.norm.vcf.gz -Oz -o $Label-$Organ-calls.norm.flt-indels.vcf.gz # filter incompatible variants -finalVCF="$Label-$Organ-calls.norm.flt-indels-incompat.vcf.gz" +finalVCF="$Label-$Organ-calls.vcf.gz" ./TRACES_filter_incompatible_variants.sh $Label-$Organ-calls.norm.flt-indels.vcf.gz >| "$finalVCF"; # # create bed file @@ -48,6 +49,6 @@ cat $Label-TMP_FILE_$Organ.xki >> $Label-consensus-$Organ.fa rm -f $Label-TMP_FILE_$Organ.xki; # # -rm -f $Label-$Organ-calls.vcf.gz $Label-$Organ-calls.vcf.gz.csi $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.flt-indels.bcf $Label-$Organ-calls.norm.flt-indels.vcf.gz $Label-$Organ-calls.norm.flt-indels.vcf.gz.csi $Label-$Organ-calls.norm.vcf.gz "$finalVCF" "$finalVCF.csi" "${finalVCF/.vcf.gz/.bcf}"; +rm -f "$rawCalls" "$rawCalls.csi" $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.vcf.gz $Label-$Organ-calls.norm.flt-indels.bcf $Label-$Organ-calls.norm.flt-indels.vcf.gz $Label-$Organ-calls.norm.flt-indels.vcf.gz.csi "$finalVCF" "$finalVCF.tbi" "${finalVCF/.vcf.gz/.bcf}"; # # diff --git a/src/TRACES_multiorgan_consensus.sh b/src/TRACES_multiorgan_consensus.sh index cdea9fd..01a62ca 100755 --- a/src/TRACES_multiorgan_consensus.sh +++ b/src/TRACES_multiorgan_consensus.sh @@ -35,12 +35,13 @@ samtools index $FIELD-data_aligned_sorted.bam $FIELD-data_aligned_sorted.bam.bai bedtools genomecov -ibam $FIELD-data_aligned_sorted.bam -bga > $FIELD-coverage.bed awk '$4 < 1' $FIELD-coverage.bed > $FIELD-zero-coverage.bed samtools faidx $REFERENCE -bcftools mpileup -Ou -f $REFERENCE $FIELD-data_aligned_sorted.bam \ -| bcftools call --ploidy 1 -mv -Oz -o $FIELD-calls.vcf.gz -bcftools index $FIELD-calls.vcf.gz -bcftools norm -f $REFERENCE $FIELD-calls.vcf.gz -Oz -o $FIELD-calls.norm.vcf.gz +rawCalls="$FIELD-raw.vcf.gz" +bcftools mpileup -Ov -f $REFERENCE $FIELD-data_aligned_sorted.bam \ +| bcftools call --ploidy 1 -mv -Oz -o "$rawCalls" +bcftools index "$rawCalls" +bcftools norm -f $REFERENCE "$rawCalls" -Oz -o $FIELD-calls.norm.vcf.gz # filter incompatible variants -finalVCF="$FIELD-calls.norm.filt-incompat.vcf.gz" +finalVCF="$FIELD-calls.vcf.gz" ./TRACES_filter_incompatible_variants.sh $FIELD-calls.norm.vcf.gz >| "$finalVCF"; zcat "$finalVCF" | vcf2bed --snvs > $FIELD-calls.bed tabix -f "$finalVCF" diff --git a/src/TRACES_viral_consensus.sh b/src/TRACES_viral_consensus.sh index 6dd2d7a..be81442 100755 --- a/src/TRACES_viral_consensus.sh +++ b/src/TRACES_viral_consensus.sh @@ -24,16 +24,17 @@ awk '$4 < 1' $Label-coverage-$Organ.bed > $Label-zero-coverage-$Organ.bed # # CALLS samtools faidx $Reference # -P 9.9e-1 #here! -samtools mpileup -Ou -f $Reference $Alignments | bcftools call --ploidy 1 -P 9.9e-1 -mv -Oz -o $Label-$Organ-calls.vcf.gz -bcftools index $Label-$Organ-calls.vcf.gz +rawCalls="$Label-$Organ-raw.vcf.gz" +bcftools mpileup -Ov -f $Reference $Alignments | bcftools call --ploidy 1 -P 9.9e-1 -mv -Oz -o "$rawCalls" +bcftools index "$rawCalls" # # normalize indels -bcftools norm -f $Reference $Label-$Organ-calls.vcf.gz -Oz -o $Label-$Organ-calls.norm.vcf.gz +bcftools norm -f $Reference "$rawCalls" -Oz -o $Label-$Organ-calls.norm.vcf.gz # # filter adjacent indels within 5bp bcftools filter --IndelGap 5 $Label-$Organ-calls.norm.vcf.gz -Oz -o $Label-$Organ-calls.norm.flt-indels.vcf.gz # filter incompatible variants -finalVCF="$Label-$Organ-calls.norm.flt-indels-incompat.vcf.gz" +finalVCF="$Label-$Organ-calls.vcf.gz" ./TRACES_filter_incompatible_variants.sh $Label-$Organ-calls.norm.filt-incompat.vcf.gz >| "$finalVCF"; # @@ -51,6 +52,6 @@ cat $Label-$Organ-TMP_FILE.xki >> $Label-consensus-$Organ.fa rm -f $Label-$Organ-TMP_FILE.xki; # # -rm -f $Label-$Organ-calls.vcf.gz $Label-$Organ-calls.vcf.gz.csi $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.flt-indels.bcf $Label-$Organ-calls.norm.flt-indels.vcf.gz.csi "$finalVCF" "${finalVCF/.vcf.gz/.bcf}"; +rm -f "$rawCalls" "$rawCalls.csi" $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.vcf.gz $Label-$Organ-calls.norm.flt-indels.bcf $Label-$Organ-calls.norm.flt-indels.vcf.gz "$finalVCL.csi"; # # From aad7282713cdb5ec676341190b158111eadd2101 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Mon, 6 Jan 2025 20:34:54 +0200 Subject: [PATCH 06/40] Fixed bug in viral consensus leading to no final vcf being produced --- src/TRACES_viral_consensus.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TRACES_viral_consensus.sh b/src/TRACES_viral_consensus.sh index be81442..8bc0432 100755 --- a/src/TRACES_viral_consensus.sh +++ b/src/TRACES_viral_consensus.sh @@ -35,7 +35,7 @@ bcftools norm -f $Reference "$rawCalls" -Oz -o $Label-$Organ-calls.norm.vcf.gz bcftools filter --IndelGap 5 $Label-$Organ-calls.norm.vcf.gz -Oz -o $Label-$Organ-calls.norm.flt-indels.vcf.gz # filter incompatible variants finalVCF="$Label-$Organ-calls.vcf.gz" -./TRACES_filter_incompatible_variants.sh $Label-$Organ-calls.norm.filt-incompat.vcf.gz >| "$finalVCF"; +./TRACES_filter_incompatible_variants.sh $Label-$Organ-calls.norm.flt-indels.vcf.gz >| "$finalVCF"; # # create bed file From e90feb566d9d9722ec5c0db933fc4a0275c0a2af Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Mon, 6 Jan 2025 21:53:30 +0200 Subject: [PATCH 07/40] Updated how depth of coverage is calculated to ignore secondary/supp alignments and cut off peaks at 1000 --- src/TRACES_overall_virus.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/TRACES_overall_virus.sh b/src/TRACES_overall_virus.sh index aee418a..a6f1c92 100755 --- a/src/TRACES_overall_virus.sh +++ b/src/TRACES_overall_virus.sh @@ -2,9 +2,13 @@ # VIRUS="$1"; ORGAN="$2"; +MAX_DEPTH=1000; # -TOTAL_COVERAGE=`awk '{sum += (($3-$2)*$4)} END {print sum}' ../output_data/TRACES_viral_bed/$VIRUS-coverage-$ORGAN.bed `; +#TOTAL_COVERAGE=`awk '{sum += (($3-$2)*$4)} END {print sum}' ../output_data/TRACES_viral_bed/$VIRUS-coverage-$ORGAN.bed `; TOTAL_SIZE=`tail -n 1 ../output_data/TRACES_viral_bed/$VIRUS-coverage-$ORGAN.bed | awk '{ print $3}'`; +acc=$(head -1 ../output_data/TRACES_viral_bed/$VIRUS-coverage-$ORGAN.bed | cut -f1); +#Updated Depth calculation which ignores peaks, but more importantly ignores secondary/supplementary alignments +TOTAL_COVERAGE=$(samtools bedcov --max-depth $MAX_DEPTH <(echo -e "$acc\t0\t$TOTAL_SIZE") ../output_data/TRACES_viral_alignments/viral_aligned_sorted-$ORGAN-$VIRUS.bam | awk '{print $NF}') NORMALIZED_COVERAGE=`echo "scale=4; $TOTAL_COVERAGE / $TOTAL_SIZE" | bc -l`; printf "$VIRUS\t$ORGAN\t$NORMALIZED_COVERAGE\n" > ../output_data/TRACES_viral_statistics/$VIRUS-total-depth-coverage-$ORGAN.txt; # From d3b56dec348f112f4c44335c2d98833b0dda1557 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Mon, 6 Jan 2025 22:09:01 +0200 Subject: [PATCH 08/40] Updated formatting on new depth calculation format --- src/.TRACES_hybrid.sh.swp | Bin 0 -> 12288 bytes src/TRACES_overall_virus.sh | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 src/.TRACES_hybrid.sh.swp diff --git a/src/.TRACES_hybrid.sh.swp b/src/.TRACES_hybrid.sh.swp new file mode 100644 index 0000000000000000000000000000000000000000..a0a88b56661b5f956411ee3fc7df53430f866f36 GIT binary patch literal 12288 zcmeI2&u`pB6vwBi7ibFzt{mR}KowH$%_bm)tO{B+*%ArTXcOR2P&&59*-`AV>zPfu z9N=F-NQeu62dEX|58yB0f`kwU&Im4?_}cNNiM61mw~9B?SN1&5d-LYw8B42u!O92Q z@6o#77icdE@oV>9{OG64;=X=Xd?F(q9+eONy&dZ!OO>ykKT(O*=1BXoUf}ZSX~3S$ z^F)Wz>CD)kD`co_u5;D1McCUP4{vSl9ZY6Hp(Ec;7KC9F6JP>QMBtiOTkWruuAO#^ zUViEQCyK&OOn?b60Vco%m;e)C0!)AjJe>qweo1_W)Ss!-@7C8#jceWI3lm@hOn?b6 z0Vco%m;e)C0!)AjFaaj;6cUh;5LccP;#;Wv|9}4b|Bn}h_#OHU`Vsm8`X2fYdIWt9 zeF=@Azn&N3D`*GWgx-d3LLKN8=;v!fd;$5;YtVJ*Rp=`8@G9np#?UVG=M^D7hX&9u z$mwC7+c#x?+?W6pU;<2l2`~XBzy$uU1UjNf$&1N`GLEw(vIjEJhekyQ-SO7Q>)sjP z9`5*7rUMb?3r5+jaE-x%O#MJ=^8H>mb@|jCMAFHN1*xaAq{=wk`!9i)4~mpsmL-<7 ziPVFoGhcdOxkPa`Nr_>BYl9)ZwP4I*f@1z+PTzo6M)G6n%QT>-$(W`oIB`u=F2knD z$^oVcP9&uYn*c8R?~J#Gn|qCc)ZM!^9F6Y0izka#&|GB|@)Y~UR54pe5TT}-NQIM( zJVot~)ZMl9!5f4Ao9pS&WNxAgDwMRDp*SnZO|+#GSr}~&DU~w{6h+E2*>OhFT9pRL zj8vMt8NBQ?aTWnrQolW)qrJbqy8$l*Q&qGZ2y>*!PU}7uQ)4&CuXpG!DyUH}O`Zq4 zS5I{o?R>_oE9>cp!|~4c&h3r!jIao#J~dIRN~QG0s*KPv-KAD}+S-qaw&~;YEzGGr z(2n|pv)o4@Afrf^CEnP{SVLNsd*2|d%GD?stt+NsRRd1aRkjlqP%aPEV+iNdEMFF+j1sZ|urNCsA!P*E0B{bg95GXVY**Hk%KBbG_x9eZW)B`8I*3diK`slBsk8_B-KJKuYTfIP=ULa9k F|2N57<)8on literal 0 HcmV?d00001 diff --git a/src/TRACES_overall_virus.sh b/src/TRACES_overall_virus.sh index a6f1c92..85e904a 100755 --- a/src/TRACES_overall_virus.sh +++ b/src/TRACES_overall_virus.sh @@ -10,7 +10,7 @@ acc=$(head -1 ../output_data/TRACES_viral_bed/$VIRUS-coverage-$ORGAN.bed | cut - #Updated Depth calculation which ignores peaks, but more importantly ignores secondary/supplementary alignments TOTAL_COVERAGE=$(samtools bedcov --max-depth $MAX_DEPTH <(echo -e "$acc\t0\t$TOTAL_SIZE") ../output_data/TRACES_viral_alignments/viral_aligned_sorted-$ORGAN-$VIRUS.bam | awk '{print $NF}') NORMALIZED_COVERAGE=`echo "scale=4; $TOTAL_COVERAGE / $TOTAL_SIZE" | bc -l`; -printf "$VIRUS\t$ORGAN\t$NORMALIZED_COVERAGE\n" > ../output_data/TRACES_viral_statistics/$VIRUS-total-depth-coverage-$ORGAN.txt; +printf "$VIRUS\t$ORGAN\t%0.4f\n" "$NORMALIZED_COVERAGE" > ../output_data/TRACES_viral_statistics/$VIRUS-total-depth-coverage-$ORGAN.txt; # ZERO_COVERAGE=`awk '{sum += ($3-$2)} END {print sum}' ../output_data/TRACES_viral_bed/$VIRUS-zero-coverage-$ORGAN.bed `; if [ -z $ZERO_COVERAGE ] From 3cfd4bdf362ec1e26024edcffa8717aedd2faedb Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Mon, 17 Feb 2025 10:53:11 +0200 Subject: [PATCH 09/40] Added threads to all subcommands Also modified handling of sensitivity to reduce duplication of code Added git ignore for swp files --- .gitignore | 1 + src/TRACES_viral_align_reads.sh | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 11 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1377554 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +*.swp diff --git a/src/TRACES_viral_align_reads.sh b/src/TRACES_viral_align_reads.sh index e1546f4..c6368f2 100755 --- a/src/TRACES_viral_align_reads.sh +++ b/src/TRACES_viral_align_reads.sh @@ -15,14 +15,14 @@ # BUILD THE INDEX rm -f index-$2-$3-file* viral_aligned_sorted-$2-$3.bam.bai bowtie2-build $1 index-$2-$3-file -# +# HANDLE OPTIONAL SENSITIVITY +sensitivityArg="" +if [[ "$6" == "1" ]]; then + sensitivityArg="--very-sensitive" +fi # ALIGN -if [[ "$6" == "1" ]]; - then - bowtie2 -a --threads $4 --very-sensitive -x index-$2-$3-file -1 o_fw_pr.fq -2 o_rv_pr.fq -U o_fw_unpr.fq,o_rv_unpr.fq > aligned-$2-$3.sam - else - bowtie2 -a --threads $4 -x index-$2-$3-file -1 o_fw_pr.fq -2 o_rv_pr.fq -U o_fw_unpr.fq,o_rv_unpr.fq > aligned-$2-$3.sam - fi +bowtie2 -a --threads $4 "$sensitivityArg" -x index-$2-$3-file \ + -1 o_fw_pr.fq -2 o_rv_pr.fq -U o_fw_unpr.fq,o_rv_unpr.fq > aligned-$2-$3.sam # # SORT & BIN samtools sort --threads $4 aligned-$2-$3.sam > viral_aligned_sorted-$2-$3.bam @@ -32,18 +32,18 @@ if [[ "$5" -eq "1" ]]; then echo "Removing Duplications ..."; # SORT BY NAME - samtools sort -n viral_aligned_sorted-$2-$3.bam > viral_aligned_sorted_sorted-$2-$3.bam + samtools sort --thread $4 -n viral_aligned_sorted-$2-$3.bam > viral_aligned_sorted_sorted-$2-$3.bam # # ADD ms AND MC FOR MARKDUP - samtools fixmate -m viral_aligned_sorted_sorted-$2-$3.bam viral_aligned_sorted_sorted-$2-$3-fixmate.bam + samtools fixmate --threads $4 -m viral_aligned_sorted_sorted-$2-$3.bam viral_aligned_sorted_sorted-$2-$3-fixmate.bam rm -f viral_aligned_sorted_sorted-$2-$3.bam # # SORTING POSITION ORDER - samtools sort -o viral_aligned_sorted_sorted-$2-$3-fixmate-sort.bam viral_aligned_sorted_sorted-$2-$3-fixmate.bam + samtools sort --threads $4 -o viral_aligned_sorted_sorted-$2-$3-fixmate-sort.bam viral_aligned_sorted_sorted-$2-$3-fixmate.bam rm -f viral_aligned_sorted_sorted-$2-$3-fixmate.bam # # REMOVE DUPLICATES - samtools markdup -r viral_aligned_sorted_sorted-$2-$3-fixmate-sort.bam viral_aligned_sorted-$2-$3.bam + samtools markdup --threads $4 -r viral_aligned_sorted_sorted-$2-$3-fixmate-sort.bam viral_aligned_sorted-$2-$3.bam rm -f viral_aligned_sorted_sorted-$2-$3-fixmate-sort.bam # fi From 8d962be03cf4de542c70bff3fee15251991af358 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Mon, 17 Feb 2025 11:14:53 +0200 Subject: [PATCH 10/40] --very-sensitive now affects --run-specific --- src/TRACESPipe.sh | 54 +++++++---------------------------------------- 1 file changed, 8 insertions(+), 46 deletions(-) diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 06e92e5..da3e5a0 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -523,6 +523,7 @@ while [[ $# -gt 0 ]] ;; -vhs|--very-sensitive) HIGH_SENSITIVITY=1; + RUN_SPECIFIC_SENSITIVE=1; SHOW_HELP=0; shift ;; @@ -977,7 +978,9 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " " echo " -rsx , --run-extreme " echo " Run specific reference align/consensus " - echo " using extreme sensitivity, " + echo " using extreme sensitivity; " + echo " Retained for backwards compatibility; " + echo " Now an alias for -vhs -rsr , " echo " " echo " -rmt, --run-mito Run Mito align and consensus seq, " echo " -rmtd, --run-mito-dam Run Mito damage only, " @@ -1047,7 +1050,7 @@ if [ "$SHOW_VERSION" -eq "1" ]; echo " "; echo " TRACESPipe "; echo " "; - echo " Version: 1.0.2 "; + echo " Version: 1.1.3 "; echo " "; echo " Department of Virology and "; echo " Department of Forensic Medicine, "; @@ -1057,6 +1060,7 @@ if [ "$SHOW_VERSION" -eq "1" ]; echo " University of Aveiro, Portugal. "; echo " "; echo " diogo.pratas@helsinki.fi "; + echo " zachery.dickson@helsinki.fi "; echo " "; exit 0; fi @@ -1750,47 +1754,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_ID\" from VDB.fa ...\e[0m"; gto_fasta_extract_read_by_pattern -p "$SPECIFIC_ID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_ID.fa echo -e "\e[34m[TRACESPipe]\e[32m Aligning ... \e[0m"; - ./TRACES_viral_align_reads.sh SPECIFIC-$SPECIFIC_ID.fa $ORGAN_T $SPECIFIC_ID $THREADS $REMOVE_DUPLICATIONS 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; - echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; - # - echo -e "\e[34m[TRACESPipe]\e[32m Generate a consensus sequence with bcftools ...\e[0m"; - ./TRACES_viral_consensus.sh SPECIFIC-$SPECIFIC_ID.fa viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam $ORGAN_T $SPECIFIC_ID 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; - mkdir -p ../output_data/TRACES_specific_alignments; - #rm -f ../output_data/TRACES_specific_alignments/* - cp SPECIFIC-$SPECIFIC_ID.fa ../output_data/TRACES_specific_alignments/ - cp SPECIFIC-$SPECIFIC_ID.fa.fai ../output_data/TRACES_specific_alignments/ - mv viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam ../output_data/TRACES_specific_alignments/ - mv viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam.bai ../output_data/TRACES_specific_alignments/ - mkdir -p ../output_data/TRACES_specific_consensus; - mv $SPECIFIC_ID-consensus-$ORGAN_T.fa ../output_data/TRACES_specific_consensus/ - mkdir -p ../output_data/TRACES_specific_bed; - mv $SPECIFIC_ID-calls-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ - mv $SPECIFIC_ID-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ - mv $SPECIFIC_ID-zero-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ - mv $SPECIFIC_ID-$ORGAN_T-calls.vcf.gz ../output_data/TRACES_specific_bed/ - mkdir -p ../output_data/TRACES_specific_statistics; - echo -e "\e[34m[TRACESPipe]\e[32m Calculating coverage for alignment-based assembly ...\e[0m"; - ./TRACES_overall_specific.sh $SPECIFIC_ID $ORGAN_T - C_BREADTH=`cat ../output_data/TRACES_specific_statistics/$SPECIFIC_ID-total-horizontal-coverage-$ORGAN_T.txt`; - C_DEPTH=`cat ../output_data/TRACES_specific_statistics/$SPECIFIC_ID-total-depth-coverage-$ORGAN_T.txt`; - echo -e "\e[34m[TRACESPipe]\e[1m Breadth (H) coverage: $C_BREADTH \e[0m"; - echo -e "\e[34m[TRACESPipe]\e[1m Depth-x (V) coverage: $C_DEPTH \e[0m"; - echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; - fi - # - # ========================================================================== - # RUN SPECIFIC SPECIFIC ALIGN/CONSENSUS WITH EXTREME HIGH SENSITIVITY - # - if [[ "$RUN_SPECIFIC_SENSITIVE" -eq "1" ]]; - then - echo -e "\e[34m[TRACESPipe]\e[32m Aligning reads to specific viral ref(s) with pattern \"$SPECIFIC_ID\" using bowtie2 with EXTREME sensitivity ...\e[0m"; - # - CHECK_VDB; - # - echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_ID\" from VDB.fa ...\e[0m"; - gto_fasta_extract_read_by_pattern -p "$SPECIFIC_ID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_ID.fa - echo -e "\e[34m[TRACESPipe]\e[32m Aligning ...\e[0m"; - ./TRACES_viral_sensitive_align_reads.sh SPECIFIC-$SPECIFIC_ID.fa $ORGAN_T $SPECIFIC_ID $THREADS $REMOVE_DUPLICATIONS 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; + ./TRACES_viral_align_reads.sh SPECIFIC-$SPECIFIC_ID.fa $ORGAN_T $SPECIFIC_ID $THREADS $REMOVE_DUPLICATIONS 1>> ../logs/Log-stdout-$ORGAN_T.txt $RUN_SPECIFIC_SENSITIVE 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # echo -e "\e[34m[TRACESPipe]\e[32m Generate a consensus sequence with bcftools ...\e[0m"; @@ -1802,10 +1766,8 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; mv viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam ../output_data/TRACES_specific_alignments/ mv viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam.bai ../output_data/TRACES_specific_alignments/ mkdir -p ../output_data/TRACES_specific_consensus; - #rm -f ../output_data/TRACES_specific_consensus/* mv $SPECIFIC_ID-consensus-$ORGAN_T.fa ../output_data/TRACES_specific_consensus/ mkdir -p ../output_data/TRACES_specific_bed; - #rm -f ../output_data/TRACES_specific_bed/* mv $SPECIFIC_ID-calls-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ mv $SPECIFIC_ID-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ mv $SPECIFIC_ID-zero-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ @@ -1819,7 +1781,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; echo -e "\e[34m[TRACESPipe]\e[1m Depth-x (V) coverage: $C_DEPTH \e[0m"; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; fi - # + # # ========================================================================== # DETAILED VIRAL ALIGN/CONSENSUS # From 2ca657036d925759593f30fbe7db4d85fadbdd2c Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Mon, 17 Feb 2025 11:41:50 +0200 Subject: [PATCH 11/40] Revert "--very-sensitive now affects --run-specific" These changes moved to a branch This reverts commit 8d962be03cf4de542c70bff3fee15251991af358. --- src/TRACESPipe.sh | 54 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 8 deletions(-) diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index da3e5a0..06e92e5 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -523,7 +523,6 @@ while [[ $# -gt 0 ]] ;; -vhs|--very-sensitive) HIGH_SENSITIVITY=1; - RUN_SPECIFIC_SENSITIVE=1; SHOW_HELP=0; shift ;; @@ -978,9 +977,7 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " " echo " -rsx , --run-extreme " echo " Run specific reference align/consensus " - echo " using extreme sensitivity; " - echo " Retained for backwards compatibility; " - echo " Now an alias for -vhs -rsr , " + echo " using extreme sensitivity, " echo " " echo " -rmt, --run-mito Run Mito align and consensus seq, " echo " -rmtd, --run-mito-dam Run Mito damage only, " @@ -1050,7 +1047,7 @@ if [ "$SHOW_VERSION" -eq "1" ]; echo " "; echo " TRACESPipe "; echo " "; - echo " Version: 1.1.3 "; + echo " Version: 1.0.2 "; echo " "; echo " Department of Virology and "; echo " Department of Forensic Medicine, "; @@ -1060,7 +1057,6 @@ if [ "$SHOW_VERSION" -eq "1" ]; echo " University of Aveiro, Portugal. "; echo " "; echo " diogo.pratas@helsinki.fi "; - echo " zachery.dickson@helsinki.fi "; echo " "; exit 0; fi @@ -1754,7 +1750,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_ID\" from VDB.fa ...\e[0m"; gto_fasta_extract_read_by_pattern -p "$SPECIFIC_ID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_ID.fa echo -e "\e[34m[TRACESPipe]\e[32m Aligning ... \e[0m"; - ./TRACES_viral_align_reads.sh SPECIFIC-$SPECIFIC_ID.fa $ORGAN_T $SPECIFIC_ID $THREADS $REMOVE_DUPLICATIONS 1>> ../logs/Log-stdout-$ORGAN_T.txt $RUN_SPECIFIC_SENSITIVE 2>> ../logs/Log-stderr-$ORGAN_T.txt; + ./TRACES_viral_align_reads.sh SPECIFIC-$SPECIFIC_ID.fa $ORGAN_T $SPECIFIC_ID $THREADS $REMOVE_DUPLICATIONS 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # echo -e "\e[34m[TRACESPipe]\e[32m Generate a consensus sequence with bcftools ...\e[0m"; @@ -1781,7 +1777,49 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; echo -e "\e[34m[TRACESPipe]\e[1m Depth-x (V) coverage: $C_DEPTH \e[0m"; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; fi - # + # + # ========================================================================== + # RUN SPECIFIC SPECIFIC ALIGN/CONSENSUS WITH EXTREME HIGH SENSITIVITY + # + if [[ "$RUN_SPECIFIC_SENSITIVE" -eq "1" ]]; + then + echo -e "\e[34m[TRACESPipe]\e[32m Aligning reads to specific viral ref(s) with pattern \"$SPECIFIC_ID\" using bowtie2 with EXTREME sensitivity ...\e[0m"; + # + CHECK_VDB; + # + echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_ID\" from VDB.fa ...\e[0m"; + gto_fasta_extract_read_by_pattern -p "$SPECIFIC_ID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_ID.fa + echo -e "\e[34m[TRACESPipe]\e[32m Aligning ...\e[0m"; + ./TRACES_viral_sensitive_align_reads.sh SPECIFIC-$SPECIFIC_ID.fa $ORGAN_T $SPECIFIC_ID $THREADS $REMOVE_DUPLICATIONS 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; + echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; + # + echo -e "\e[34m[TRACESPipe]\e[32m Generate a consensus sequence with bcftools ...\e[0m"; + ./TRACES_viral_consensus.sh SPECIFIC-$SPECIFIC_ID.fa viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam $ORGAN_T $SPECIFIC_ID 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; + mkdir -p ../output_data/TRACES_specific_alignments; + #rm -f ../output_data/TRACES_specific_alignments/* + cp SPECIFIC-$SPECIFIC_ID.fa ../output_data/TRACES_specific_alignments/ + cp SPECIFIC-$SPECIFIC_ID.fa.fai ../output_data/TRACES_specific_alignments/ + mv viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam ../output_data/TRACES_specific_alignments/ + mv viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam.bai ../output_data/TRACES_specific_alignments/ + mkdir -p ../output_data/TRACES_specific_consensus; + #rm -f ../output_data/TRACES_specific_consensus/* + mv $SPECIFIC_ID-consensus-$ORGAN_T.fa ../output_data/TRACES_specific_consensus/ + mkdir -p ../output_data/TRACES_specific_bed; + #rm -f ../output_data/TRACES_specific_bed/* + mv $SPECIFIC_ID-calls-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ + mv $SPECIFIC_ID-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ + mv $SPECIFIC_ID-zero-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ + mv $SPECIFIC_ID-$ORGAN_T-calls.vcf.gz ../output_data/TRACES_specific_bed/ + mkdir -p ../output_data/TRACES_specific_statistics; + echo -e "\e[34m[TRACESPipe]\e[32m Calculating coverage for alignment-based assembly ...\e[0m"; + ./TRACES_overall_specific.sh $SPECIFIC_ID $ORGAN_T + C_BREADTH=`cat ../output_data/TRACES_specific_statistics/$SPECIFIC_ID-total-horizontal-coverage-$ORGAN_T.txt`; + C_DEPTH=`cat ../output_data/TRACES_specific_statistics/$SPECIFIC_ID-total-depth-coverage-$ORGAN_T.txt`; + echo -e "\e[34m[TRACESPipe]\e[1m Breadth (H) coverage: $C_BREADTH \e[0m"; + echo -e "\e[34m[TRACESPipe]\e[1m Depth-x (V) coverage: $C_DEPTH \e[0m"; + echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; + fi + # # ========================================================================== # DETAILED VIRAL ALIGN/CONSENSUS # From c3adc8ba674185d59bffd6533293e05f74cd2c25 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Wed, 5 Mar 2025 16:10:02 +0200 Subject: [PATCH 12/40] Changed default top-size behaviour to use whole database --- src/.TRACES_hybrid.sh.swp | Bin 12288 -> 0 bytes src/TRACESPipe.sh | 60 +++++++++++++++++++++----------------- 2 files changed, 33 insertions(+), 27 deletions(-) delete mode 100644 src/.TRACES_hybrid.sh.swp diff --git a/src/.TRACES_hybrid.sh.swp b/src/.TRACES_hybrid.sh.swp deleted file mode 100644 index a0a88b56661b5f956411ee3fc7df53430f866f36..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeI2&u`pB6vwBi7ibFzt{mR}KowH$%_bm)tO{B+*%ArTXcOR2P&&59*-`AV>zPfu z9N=F-NQeu62dEX|58yB0f`kwU&Im4?_}cNNiM61mw~9B?SN1&5d-LYw8B42u!O92Q z@6o#77icdE@oV>9{OG64;=X=Xd?F(q9+eONy&dZ!OO>ykKT(O*=1BXoUf}ZSX~3S$ z^F)Wz>CD)kD`co_u5;D1McCUP4{vSl9ZY6Hp(Ec;7KC9F6JP>QMBtiOTkWruuAO#^ zUViEQCyK&OOn?b60Vco%m;e)C0!)AjJe>qweo1_W)Ss!-@7C8#jceWI3lm@hOn?b6 z0Vco%m;e)C0!)AjFaaj;6cUh;5LccP;#;Wv|9}4b|Bn}h_#OHU`Vsm8`X2fYdIWt9 zeF=@Azn&N3D`*GWgx-d3LLKN8=;v!fd;$5;YtVJ*Rp=`8@G9np#?UVG=M^D7hX&9u z$mwC7+c#x?+?W6pU;<2l2`~XBzy$uU1UjNf$&1N`GLEw(vIjEJhekyQ-SO7Q>)sjP z9`5*7rUMb?3r5+jaE-x%O#MJ=^8H>mb@|jCMAFHN1*xaAq{=wk`!9i)4~mpsmL-<7 ziPVFoGhcdOxkPa`Nr_>BYl9)ZwP4I*f@1z+PTzo6M)G6n%QT>-$(W`oIB`u=F2knD z$^oVcP9&uYn*c8R?~J#Gn|qCc)ZM!^9F6Y0izka#&|GB|@)Y~UR54pe5TT}-NQIM( zJVot~)ZMl9!5f4Ao9pS&WNxAgDwMRDp*SnZO|+#GSr}~&DU~w{6h+E2*>OhFT9pRL zj8vMt8NBQ?aTWnrQolW)qrJbqy8$l*Q&qGZ2y>*!PU}7uQ)4&CuXpG!DyUH}O`Zq4 zS5I{o?R>_oE9>cp!|~4c&h3r!jIao#J~dIRN~QG0s*KPv-KAD}+S-qaw&~;YEzGGr z(2n|pv)o4@Afrf^CEnP{SVLNsd*2|d%GD?stt+NsRRd1aRkjlqP%aPEV+iNdEMFF+j1sZ|urNCsA!P*E0B{bg95GXVY**Hk%KBbG_x9eZW)B`8I*3diK`slBsk8_B-KJKuYTfIP=ULa9k F|2N57<)8on diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index da3e5a0..1e4ea98 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -117,9 +117,10 @@ RUN_BLAST_RECONSTRUCTED=0; # MINIMAL_SIMILARITY_VALUE=1.0; TSIZE=2; +TOP_SIZE=0; +TOP_SIZE_VIR=0; CACHE=70; -TOP_SIZE_VIR=8000; -TOP_SIZE=12000; +VIRAL_DATABASE_FILE="VDB.fa" # # ============================================================================== # THESE ARE THE CURRENT FLAGGED VIRUSES OR VIRUSES GROUPS FOR ENHANCED ASSEMBLY: @@ -179,9 +180,9 @@ CHECK_GZIP_FILES () { # # CHECK_VDB () { - if [ ! -f VDB.fa ]; + if [ ! -f "$VIRAL_DATABASE_FILE" ]; then - echo -e "\e[31mERROR: viral database (VDB.fa) not found!\e[0m" + echo -e "\e[31mERROR: viral database ("$VIRAL_DATABASE_FILE") not found!\e[0m" echo "TIP: before this, run: ./TRACESPipe.sh --build-viral" echo "For addition information, see the instructions at the web page." exit 1; @@ -360,9 +361,9 @@ ALIGN_AND_CONSENSUS () { fi # echo -e "\e[34m[TRACESPipe]\e[96m Similarity best match: $V_INFO\e[0m"; - echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence from VDB.fa\e[0m"; + echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence from "$VIRAL_DATABASE_FILE"\e[0m"; CHECK_VDB; - gto_fasta_extract_read_by_pattern -p "$V_GID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > $ORGAN-$V_TAG.fa 2>> ../logs/Log-$ORGAN.txt; + gto_fasta_extract_read_by_pattern -p "$V_GID" < "$VIRAL_DATABASE_FILE" | awk "/^>/ {n++} n>1 {exit} 1" > $ORGAN-$V_TAG.fa 2>> ../logs/Log-$ORGAN.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; echo -e "\e[34m[TRACESPipe]\e[32m Aligning reads to $V_TAG best reference with bowtie2 ...\e[0m"; ./TRACES_viral_align_reads.sh $ORGAN-$V_TAG.fa $ORGAN $V_TAG $THREADS $DUPL $HIGH 1>> ../logs/Log-stdout-$ORGAN.txt 2>> ../logs/Log-stderr-$ORGAN.txt; @@ -826,16 +827,16 @@ while [[ $# -gt 0 ]] SHOW_HELP=0; shift 2; ;; - -tsv|--top-size-virus) - TOP_SIZE_VIR="$2"; - SHOW_HELP=0; - shift 2; - ;; -ts|--top-size) TOP_SIZE="$2"; SHOW_HELP=0; shift 2; ;; + -tsv|--top-size-virus) + TOP_SIZE_VIR="$2"; + SHOW_HELP=0; + shift 2; + ;; -*) # unknown option with small echo "Invalid arg ($1)!"; echo "For help, try: ./TRACESPipe.sh -h" @@ -890,9 +891,9 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " -udb, --build-unviral Build non viral database (control), " echo " " echo " -afs , --add-fasta " - echo " Add a FASTA sequence to the VDB.fa, " + echo " Add a FASTA sequence to the "$VIRAL_DATABASE_FILE", " echo " -aes , --add-extra-seq " - echo " Add extra sequence to the VDB.fa, " + echo " Add extra sequence to the "$VIRAL_DATABASE_FILE", " echo " -gx, --get-extra-vir Downloads/appends (VDB) extra viral seq, " echo " " echo " -gad, --gen-adapters Generate FASTA file with adapters, " @@ -962,10 +963,12 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " Cache to be used by FALCON-meta, " echo " -tsv , --top-size-virus " echo " Top size to be used by FALCON-meta when " - echo " using TRACES_metagenomic_viral.sh, " + echo " using TRACES_metagenomic_viral.sh; " + echo " default:0 -> seq count in viral db " echo " -ts , --top-size " echo " Top size to be used by FALCON-meta when " - echo " using TRACES_metagenomic.sh, " + echo " using TRACES_metagenomic.sh; " + echo " default:0 -> seq count in non-viral db " echo " " echo " -rava, --run-all-v-alig Run all viral align/sort/consensus seqs " echo " from a specific list, " @@ -1408,7 +1411,7 @@ if [[ "$GET_CY" -eq "1" ]]; if [[ "$ADD_FASTA" -eq "1" ]]; then CHECK_VDB; - ./TRACES_add_fasta_to_database.sh VDB.fa $NEW_FASTA + ./TRACES_add_fasta_to_database.sh "$VIRAL_DATABASE_FILE" $NEW_FASTA fi # # ============================================================================== @@ -1416,7 +1419,7 @@ if [[ "$ADD_FASTA" -eq "1" ]]; if [[ "$GET_EXTRA" -eq "1" ]]; then CHECK_ENRICH; - ./TRACES_get_enriched_sequences.sh VDB.fa + ./TRACES_get_enriched_sequences.sh "$VIRAL_DATABASE_FILE" fi # # ============================================================================== @@ -1424,7 +1427,7 @@ if [[ "$GET_EXTRA" -eq "1" ]]; if [[ "$ADD_EXTRA_SEQ" -eq "1" ]]; then CHECK_VDB; - ./TRACES_get_extra_seq.sh VDB.fa $NEW_SEQ_ID + ./TRACES_get_extra_seq.sh "$VIRAL_DATABASE_FILE" $NEW_SEQ_ID fi # # @@ -1561,6 +1564,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # CHECK_VDB; CHECK_PHIX; + [ $TOP_SIZE_VIR -eq 0 ] && TOP_SIZE_VIR=$(grep -c '^>' "$VIRAL_DATABASE_FILE"); # mv o_fw_pr.fq NP-o_fw_pr.fq; mv o_fw_unpr.fq NP-o_fw_unpr.fq; @@ -1569,7 +1573,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # echo -e "\e[34m[TRACESPipe]\e[32m Running viral metagenomic analysis with FALCON-meta ...\e[0m"; mkdir -p ../output_data/TRACES_results - ./TRACES_metagenomics_viral.sh $ORGAN_T VDB.fa $TOP_SIZE_VIR $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; + ./TRACES_metagenomics_viral.sh $ORGAN_T "$VIRAL_DATABASE_FILE" $TOP_SIZE_VIR $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # echo -e "\e[34m[TRACESPipe]\e[32m Finding the best references ...\e[0m"; @@ -1657,6 +1661,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; then CHECK_VDB; CHECK_PHIX; + [ $TOP_SIZE_VIR -eq 0 ] && TOP_SIZE_VIR=$(grep -c '^>' "$VIRAL_DATABASE_FILE"); # echo -e "\e[34m[TRACESPipe]\e[32m Removing PhiX from the samples with MAGNET ...\e[0m"; #./TRACES_remove_phix.sh $THREADS @@ -1675,7 +1680,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # echo -e "\e[34m[TRACESPipe]\e[32m Running viral metagenomic analysis with FALCON-meta ...\e[0m"; mkdir -p ../output_data/TRACES_results - ./TRACES_metagenomics_viral.sh $ORGAN_T VDB.fa $TOP_SIZE_VIR $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; + ./TRACES_metagenomics_viral.sh $ORGAN_T "$VIRAL_DATABASE_FILE" $TOP_SIZE_VIR $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # echo -e "\e[34m[TRACESPipe]\e[32m Finding the best references ...\e[0m"; @@ -1704,7 +1709,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # echo -e "\e[34m[TRACESPipe]\e[32m Building complexity profiles with gto ...\e[0m"; cat NP-o_fw_pr.fq NP-o_fw_unpr.fq NP-o_rv_pr.fq NP-o_rv_unpr.fq > P_TRACES_sample_reads.fq - ./TRACES_profiles.sh GIS-$ORGAN_T VDB.fa P_TRACES_sample_reads.fq $ORGAN_T 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; + ./TRACES_profiles.sh GIS-$ORGAN_T "$VIRAL_DATABASE_FILE" P_TRACES_sample_reads.fq $ORGAN_T 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; fi # @@ -1715,6 +1720,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; then # CHECK_DB; + [ $TOP_SIZE -eq 0 ] && TOP_SIZE=$(grep -c '^>' DB.fa); # echo -e "\e[34m[TRACESPipe]\e[32m Running NON viral metagenomic analysis with FALCON ...\e[0m"; ./TRACES_metagenomics.sh $ORGAN_T DB.fa $TOP_SIZE $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; @@ -1751,8 +1757,8 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # CHECK_VDB; # - echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_ID\" from VDB.fa ...\e[0m"; - gto_fasta_extract_read_by_pattern -p "$SPECIFIC_ID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_ID.fa + echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_ID\" from "$VIRAL_DATABASE_FILE" ...\e[0m"; + gto_fasta_extract_read_by_pattern -p "$SPECIFIC_ID" < "$VIRAL_DATABASE_FILE" | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_ID.fa echo -e "\e[34m[TRACESPipe]\e[32m Aligning ... \e[0m"; ./TRACES_viral_align_reads.sh SPECIFIC-$SPECIFIC_ID.fa $ORGAN_T $SPECIFIC_ID $THREADS $REMOVE_DUPLICATIONS 1>> ../logs/Log-stdout-$ORGAN_T.txt $RUN_SPECIFIC_SENSITIVE 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; @@ -2161,8 +2167,8 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # CHECK_VDB; # - echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_DENOVO_ID\" from VDB.fa ...\e[0m"; - gto_fasta_extract_read_by_pattern -p "$SPECIFIC_DENOVO_ID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_DENOVO_ID.fa + echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_DENOVO_ID\" from "$VIRAL_DATABASE_FILE" ...\e[0m"; + gto_fasta_extract_read_by_pattern -p "$SPECIFIC_DENOVO_ID" < "$VIRAL_DATABASE_FILE" | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_DENOVO_ID.fa echo -e "\e[34m[TRACESPipe]\e[32m Aligning and creating consensus ... \e[0m"; SCAFFOLDS_PATH="../output_data/TRACES_denovo_$ORGAN_T/scaffolds.fasta"; ./TRACES_denovo_specific.sh SPECIFIC-$SPECIFIC_DENOVO_ID.fa $SCAFFOLDS_PATH $SPECIFIC_DENOVO_ID $ORGAN_T $THREADS 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; @@ -2299,9 +2305,9 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; then IDBLAST=`head -n 1 $BLASTS_OUTPUT/SINGLE-LIST-$VIRUS-$ORGAN.txt | awk '{ print $2}' | tr '|' '\t' | awk '{ print $4;}'`; # - # Try to find genome in VDB.fa + # Try to find genome in "$VIRAL_DATABASE_FILE" CHECK_VDB; - gto_fasta_extract_read_by_pattern -p "$IDBLAST" < VDB.fa \ + gto_fasta_extract_read_by_pattern -p "$IDBLAST" < "$VIRAL_DATABASE_FILE" \ | awk "/^>/ {n++} n>1 {exit} 1" > TMP-BEST-BLAST.fa NVALSEQ=`wc -l TMP-BEST-BLAST.fa | awk '{ print $1;}'`; if [[ "$NVALSEQ" -eq "0" ]]; From 58a54ed09f9fbc2436609120a3a40236077b850e Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Wed, 5 Mar 2025 16:10:02 +0200 Subject: [PATCH 13/40] Changed default top-size behaviour to use whole database Changed VDB.fa in code to a variable for future maintainability --- src/.TRACES_hybrid.sh.swp | Bin 12288 -> 0 bytes src/TRACESPipe.sh | 60 +++++++++++++++++++++----------------- 2 files changed, 33 insertions(+), 27 deletions(-) delete mode 100644 src/.TRACES_hybrid.sh.swp diff --git a/src/.TRACES_hybrid.sh.swp b/src/.TRACES_hybrid.sh.swp deleted file mode 100644 index a0a88b56661b5f956411ee3fc7df53430f866f36..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeI2&u`pB6vwBi7ibFzt{mR}KowH$%_bm)tO{B+*%ArTXcOR2P&&59*-`AV>zPfu z9N=F-NQeu62dEX|58yB0f`kwU&Im4?_}cNNiM61mw~9B?SN1&5d-LYw8B42u!O92Q z@6o#77icdE@oV>9{OG64;=X=Xd?F(q9+eONy&dZ!OO>ykKT(O*=1BXoUf}ZSX~3S$ z^F)Wz>CD)kD`co_u5;D1McCUP4{vSl9ZY6Hp(Ec;7KC9F6JP>QMBtiOTkWruuAO#^ zUViEQCyK&OOn?b60Vco%m;e)C0!)AjJe>qweo1_W)Ss!-@7C8#jceWI3lm@hOn?b6 z0Vco%m;e)C0!)AjFaaj;6cUh;5LccP;#;Wv|9}4b|Bn}h_#OHU`Vsm8`X2fYdIWt9 zeF=@Azn&N3D`*GWgx-d3LLKN8=;v!fd;$5;YtVJ*Rp=`8@G9np#?UVG=M^D7hX&9u z$mwC7+c#x?+?W6pU;<2l2`~XBzy$uU1UjNf$&1N`GLEw(vIjEJhekyQ-SO7Q>)sjP z9`5*7rUMb?3r5+jaE-x%O#MJ=^8H>mb@|jCMAFHN1*xaAq{=wk`!9i)4~mpsmL-<7 ziPVFoGhcdOxkPa`Nr_>BYl9)ZwP4I*f@1z+PTzo6M)G6n%QT>-$(W`oIB`u=F2knD z$^oVcP9&uYn*c8R?~J#Gn|qCc)ZM!^9F6Y0izka#&|GB|@)Y~UR54pe5TT}-NQIM( zJVot~)ZMl9!5f4Ao9pS&WNxAgDwMRDp*SnZO|+#GSr}~&DU~w{6h+E2*>OhFT9pRL zj8vMt8NBQ?aTWnrQolW)qrJbqy8$l*Q&qGZ2y>*!PU}7uQ)4&CuXpG!DyUH}O`Zq4 zS5I{o?R>_oE9>cp!|~4c&h3r!jIao#J~dIRN~QG0s*KPv-KAD}+S-qaw&~;YEzGGr z(2n|pv)o4@Afrf^CEnP{SVLNsd*2|d%GD?stt+NsRRd1aRkjlqP%aPEV+iNdEMFF+j1sZ|urNCsA!P*E0B{bg95GXVY**Hk%KBbG_x9eZW)B`8I*3diK`slBsk8_B-KJKuYTfIP=ULa9k F|2N57<)8on diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index da3e5a0..1e4ea98 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -117,9 +117,10 @@ RUN_BLAST_RECONSTRUCTED=0; # MINIMAL_SIMILARITY_VALUE=1.0; TSIZE=2; +TOP_SIZE=0; +TOP_SIZE_VIR=0; CACHE=70; -TOP_SIZE_VIR=8000; -TOP_SIZE=12000; +VIRAL_DATABASE_FILE="VDB.fa" # # ============================================================================== # THESE ARE THE CURRENT FLAGGED VIRUSES OR VIRUSES GROUPS FOR ENHANCED ASSEMBLY: @@ -179,9 +180,9 @@ CHECK_GZIP_FILES () { # # CHECK_VDB () { - if [ ! -f VDB.fa ]; + if [ ! -f "$VIRAL_DATABASE_FILE" ]; then - echo -e "\e[31mERROR: viral database (VDB.fa) not found!\e[0m" + echo -e "\e[31mERROR: viral database ("$VIRAL_DATABASE_FILE") not found!\e[0m" echo "TIP: before this, run: ./TRACESPipe.sh --build-viral" echo "For addition information, see the instructions at the web page." exit 1; @@ -360,9 +361,9 @@ ALIGN_AND_CONSENSUS () { fi # echo -e "\e[34m[TRACESPipe]\e[96m Similarity best match: $V_INFO\e[0m"; - echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence from VDB.fa\e[0m"; + echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence from "$VIRAL_DATABASE_FILE"\e[0m"; CHECK_VDB; - gto_fasta_extract_read_by_pattern -p "$V_GID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > $ORGAN-$V_TAG.fa 2>> ../logs/Log-$ORGAN.txt; + gto_fasta_extract_read_by_pattern -p "$V_GID" < "$VIRAL_DATABASE_FILE" | awk "/^>/ {n++} n>1 {exit} 1" > $ORGAN-$V_TAG.fa 2>> ../logs/Log-$ORGAN.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; echo -e "\e[34m[TRACESPipe]\e[32m Aligning reads to $V_TAG best reference with bowtie2 ...\e[0m"; ./TRACES_viral_align_reads.sh $ORGAN-$V_TAG.fa $ORGAN $V_TAG $THREADS $DUPL $HIGH 1>> ../logs/Log-stdout-$ORGAN.txt 2>> ../logs/Log-stderr-$ORGAN.txt; @@ -826,16 +827,16 @@ while [[ $# -gt 0 ]] SHOW_HELP=0; shift 2; ;; - -tsv|--top-size-virus) - TOP_SIZE_VIR="$2"; - SHOW_HELP=0; - shift 2; - ;; -ts|--top-size) TOP_SIZE="$2"; SHOW_HELP=0; shift 2; ;; + -tsv|--top-size-virus) + TOP_SIZE_VIR="$2"; + SHOW_HELP=0; + shift 2; + ;; -*) # unknown option with small echo "Invalid arg ($1)!"; echo "For help, try: ./TRACESPipe.sh -h" @@ -890,9 +891,9 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " -udb, --build-unviral Build non viral database (control), " echo " " echo " -afs , --add-fasta " - echo " Add a FASTA sequence to the VDB.fa, " + echo " Add a FASTA sequence to the "$VIRAL_DATABASE_FILE", " echo " -aes , --add-extra-seq " - echo " Add extra sequence to the VDB.fa, " + echo " Add extra sequence to the "$VIRAL_DATABASE_FILE", " echo " -gx, --get-extra-vir Downloads/appends (VDB) extra viral seq, " echo " " echo " -gad, --gen-adapters Generate FASTA file with adapters, " @@ -962,10 +963,12 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " Cache to be used by FALCON-meta, " echo " -tsv , --top-size-virus " echo " Top size to be used by FALCON-meta when " - echo " using TRACES_metagenomic_viral.sh, " + echo " using TRACES_metagenomic_viral.sh; " + echo " default:0 -> seq count in viral db " echo " -ts , --top-size " echo " Top size to be used by FALCON-meta when " - echo " using TRACES_metagenomic.sh, " + echo " using TRACES_metagenomic.sh; " + echo " default:0 -> seq count in non-viral db " echo " " echo " -rava, --run-all-v-alig Run all viral align/sort/consensus seqs " echo " from a specific list, " @@ -1408,7 +1411,7 @@ if [[ "$GET_CY" -eq "1" ]]; if [[ "$ADD_FASTA" -eq "1" ]]; then CHECK_VDB; - ./TRACES_add_fasta_to_database.sh VDB.fa $NEW_FASTA + ./TRACES_add_fasta_to_database.sh "$VIRAL_DATABASE_FILE" $NEW_FASTA fi # # ============================================================================== @@ -1416,7 +1419,7 @@ if [[ "$ADD_FASTA" -eq "1" ]]; if [[ "$GET_EXTRA" -eq "1" ]]; then CHECK_ENRICH; - ./TRACES_get_enriched_sequences.sh VDB.fa + ./TRACES_get_enriched_sequences.sh "$VIRAL_DATABASE_FILE" fi # # ============================================================================== @@ -1424,7 +1427,7 @@ if [[ "$GET_EXTRA" -eq "1" ]]; if [[ "$ADD_EXTRA_SEQ" -eq "1" ]]; then CHECK_VDB; - ./TRACES_get_extra_seq.sh VDB.fa $NEW_SEQ_ID + ./TRACES_get_extra_seq.sh "$VIRAL_DATABASE_FILE" $NEW_SEQ_ID fi # # @@ -1561,6 +1564,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # CHECK_VDB; CHECK_PHIX; + [ $TOP_SIZE_VIR -eq 0 ] && TOP_SIZE_VIR=$(grep -c '^>' "$VIRAL_DATABASE_FILE"); # mv o_fw_pr.fq NP-o_fw_pr.fq; mv o_fw_unpr.fq NP-o_fw_unpr.fq; @@ -1569,7 +1573,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # echo -e "\e[34m[TRACESPipe]\e[32m Running viral metagenomic analysis with FALCON-meta ...\e[0m"; mkdir -p ../output_data/TRACES_results - ./TRACES_metagenomics_viral.sh $ORGAN_T VDB.fa $TOP_SIZE_VIR $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; + ./TRACES_metagenomics_viral.sh $ORGAN_T "$VIRAL_DATABASE_FILE" $TOP_SIZE_VIR $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # echo -e "\e[34m[TRACESPipe]\e[32m Finding the best references ...\e[0m"; @@ -1657,6 +1661,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; then CHECK_VDB; CHECK_PHIX; + [ $TOP_SIZE_VIR -eq 0 ] && TOP_SIZE_VIR=$(grep -c '^>' "$VIRAL_DATABASE_FILE"); # echo -e "\e[34m[TRACESPipe]\e[32m Removing PhiX from the samples with MAGNET ...\e[0m"; #./TRACES_remove_phix.sh $THREADS @@ -1675,7 +1680,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # echo -e "\e[34m[TRACESPipe]\e[32m Running viral metagenomic analysis with FALCON-meta ...\e[0m"; mkdir -p ../output_data/TRACES_results - ./TRACES_metagenomics_viral.sh $ORGAN_T VDB.fa $TOP_SIZE_VIR $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; + ./TRACES_metagenomics_viral.sh $ORGAN_T "$VIRAL_DATABASE_FILE" $TOP_SIZE_VIR $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # echo -e "\e[34m[TRACESPipe]\e[32m Finding the best references ...\e[0m"; @@ -1704,7 +1709,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # echo -e "\e[34m[TRACESPipe]\e[32m Building complexity profiles with gto ...\e[0m"; cat NP-o_fw_pr.fq NP-o_fw_unpr.fq NP-o_rv_pr.fq NP-o_rv_unpr.fq > P_TRACES_sample_reads.fq - ./TRACES_profiles.sh GIS-$ORGAN_T VDB.fa P_TRACES_sample_reads.fq $ORGAN_T 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; + ./TRACES_profiles.sh GIS-$ORGAN_T "$VIRAL_DATABASE_FILE" P_TRACES_sample_reads.fq $ORGAN_T 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; fi # @@ -1715,6 +1720,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; then # CHECK_DB; + [ $TOP_SIZE -eq 0 ] && TOP_SIZE=$(grep -c '^>' DB.fa); # echo -e "\e[34m[TRACESPipe]\e[32m Running NON viral metagenomic analysis with FALCON ...\e[0m"; ./TRACES_metagenomics.sh $ORGAN_T DB.fa $TOP_SIZE $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; @@ -1751,8 +1757,8 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # CHECK_VDB; # - echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_ID\" from VDB.fa ...\e[0m"; - gto_fasta_extract_read_by_pattern -p "$SPECIFIC_ID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_ID.fa + echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_ID\" from "$VIRAL_DATABASE_FILE" ...\e[0m"; + gto_fasta_extract_read_by_pattern -p "$SPECIFIC_ID" < "$VIRAL_DATABASE_FILE" | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_ID.fa echo -e "\e[34m[TRACESPipe]\e[32m Aligning ... \e[0m"; ./TRACES_viral_align_reads.sh SPECIFIC-$SPECIFIC_ID.fa $ORGAN_T $SPECIFIC_ID $THREADS $REMOVE_DUPLICATIONS 1>> ../logs/Log-stdout-$ORGAN_T.txt $RUN_SPECIFIC_SENSITIVE 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; @@ -2161,8 +2167,8 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # CHECK_VDB; # - echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_DENOVO_ID\" from VDB.fa ...\e[0m"; - gto_fasta_extract_read_by_pattern -p "$SPECIFIC_DENOVO_ID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_DENOVO_ID.fa + echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_DENOVO_ID\" from "$VIRAL_DATABASE_FILE" ...\e[0m"; + gto_fasta_extract_read_by_pattern -p "$SPECIFIC_DENOVO_ID" < "$VIRAL_DATABASE_FILE" | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_DENOVO_ID.fa echo -e "\e[34m[TRACESPipe]\e[32m Aligning and creating consensus ... \e[0m"; SCAFFOLDS_PATH="../output_data/TRACES_denovo_$ORGAN_T/scaffolds.fasta"; ./TRACES_denovo_specific.sh SPECIFIC-$SPECIFIC_DENOVO_ID.fa $SCAFFOLDS_PATH $SPECIFIC_DENOVO_ID $ORGAN_T $THREADS 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; @@ -2299,9 +2305,9 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; then IDBLAST=`head -n 1 $BLASTS_OUTPUT/SINGLE-LIST-$VIRUS-$ORGAN.txt | awk '{ print $2}' | tr '|' '\t' | awk '{ print $4;}'`; # - # Try to find genome in VDB.fa + # Try to find genome in "$VIRAL_DATABASE_FILE" CHECK_VDB; - gto_fasta_extract_read_by_pattern -p "$IDBLAST" < VDB.fa \ + gto_fasta_extract_read_by_pattern -p "$IDBLAST" < "$VIRAL_DATABASE_FILE" \ | awk "/^>/ {n++} n>1 {exit} 1" > TMP-BEST-BLAST.fa NVALSEQ=`wc -l TMP-BEST-BLAST.fa | awk '{ print $1;}'`; if [[ "$NVALSEQ" -eq "0" ]]; From 7209ff7985e86d0aca15de83ee77f35ff35ecc02 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Tue, 8 Apr 2025 15:58:37 +0300 Subject: [PATCH 14/40] Created a single script to handle breadth and depth calculations across all branches of TRACESPipe (will use updated depth calcs) --- src/TRACES_overall.sh | 90 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 src/TRACES_overall.sh diff --git a/src/TRACES_overall.sh b/src/TRACES_overall.sh new file mode 100644 index 0000000..3ebb7ca --- /dev/null +++ b/src/TRACES_overall.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# +#USAGE: TRACES_overall.sh reftype pattern organ [maxdepth=1000] +# RefType is one of virus, specific, cy, mtdna and defines the type of reference we are calculating stats for +# Pattern defines the specific reference used +# for cy and mtdna this should be cy and mt respectively +# for specific it tis the pattern used to find the reference in the DB +# for virus it is the virus Name +# Organ is the identifier library being processed +# MaxDepth is the maximum depth to consider at any given position +# +# +#Define Global Variables +declare -A ValidRefTypes; +for type in virus specific cy mtdna; do + ValidRefTypes[$type]=1 +done +declare -A RequiredPattern; +RequiredPattern["cy"]="cy" +RequiredPattern["mtdna"]="mt" +DEFAULT_MAX_DEPTH=1000; +#Pull the Ref Type, validate it, and set the directories containing relevant bed and bam, and output files +REFTYPE="$1";shift; +if [[ -z "${ValidRefTypes[$REFTYPE]}" ]]; then + >&2 echo -e "[BUG] Attempt to call TRACES_overall.sh on an unrecognized type ($REFTYPE)\n" \ + "\t Valid types are: " "${!ValidRefTypes[@]}" \ + ; + exit 1; +fi +BedDir="../output_data/TRACES_${REFTYPE}_bed" +BamDir="../output_data/TRACES_${REFTYPE}_alignments" +StatsDir="../output_data/TRACES_${REFTYPE}_statistics" +#Pull the pattern and validate it against the reftype +PATTERN="$1";shift; +if [[ -n "${RequiredPattern[$REFTYPE]}" && "$PATTERN" != "${RequiredPattern[$REFTYPE]}" ]]; then + rq="${RequiredPattern[$REFTYPE]}" + >&2 echo "[BUG WARNING] Call to TRACES_overall.sh $REFTYPE with pattern!='$rq'"; + PATTERN=$rq +fi +#Set relevant pref- and suffixes +BedPrefix="$PATTERN" #post-delimiter is constant across all ref types +AlnBamPrefix="$PATTERN" #post-delimiter is constant across all ref types +AlnBamSuffix="-$PATTERN" #pre-delimiter included as the suffix isn't present for cy and mtdna +StatsPrefix="$PATTERN" #post-delimiter is constant across all ref types +#Handle cases where the -fixes aren't pattern +case $REFTYPE in + virus) + AlnBamPrefix="virus" + ;; + specific) + AlnBamPrefix="virus" + ;; + cy) + AlnBamSuffix="" + ;; + mtdna) + AlnBamSuffix="" + ;; +esac +#Pull the organ and then set the relevant bed, bam, and output files +ORGAN="$1"; shift +CovBedFile="$BedDir/$BedPrefix-coverage-$ORGAN.bed" +AlnBamFile="$BamDir/${AlnBamPrefix}_aligned_sorted-${ORGAN}${AlnBamSuffix}.bam" +DepthFile="$StatsDir/${StatsPrefix}-total-depth-coverage-${ORGAN}.txt" +ZeroBedFile="$BedDir/${BedPrefix}-zero-coverage-$ORGAN.bed" +BreadthFile="$StatsDir/${StatsPrefix}-total-horizontal-coverage-${ORGAN}.txt" +#Pull the Max Depth from the comand line options +MAX_DEPTH=$1; shift +if [ -z "$MAX_DEPTH" ]; then + MAX_DEPTH=$DEFAULT_MAX_DEPTH; +fi +# +#Get the total size of the reference sequence, and the sequence ID for the first contig +# Note: we assume all references are single contigs at this point +TOTAL_SIZE=$(tail -n 1 "$CovBedFile" | awk '{ print $3}'); +acc=$(head -1 "$CovBedFile" | cut -f1); +bedEntry="$acc\t0\t$TOTAL_SIZE" +#Updated Depth calculation which ignores peaks, but more importantly ignores secondary/supplementary alignments +#Column 3 is the length of the genome (constructed in the bedEntry above +#Column 4 is the total bases covered +NORMALIZED_COVERAGE=$(samtools bedcov --max-depth "$MAX_DEPTH" <(echo -e "$bedEntry") "$AlnBamFile" | awk '{print $3/$4}') +printf "$PATTERN\t$ORGAN\t%0.4f\n" "$NORMALIZED_COVERAGE" > "$DepthFile" +# +ZERO_COVERAGE=$(awk '{sum += ($3-$2)} END {print sum}' "$ZeroBedFile"); +if [ -z "$ZERO_COVERAGE" ]; then + ZERO_COVERAGE=0; +fi +NORMALIZED_ZERO_COVERAGE=$(echo "scale=4; (100-(($ZERO_COVERAGE / $TOTAL_SIZE)*100))" | bc -l); +printf "$PATTERN\t$ORGAN\t%0.4f\n" "$NORMALIZED_ZERO_COVERAGE" > "$BreadthFile" +# From 3cb22987b14237bb52ebd34dac81856c5d4f8591 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Wed, 9 Apr 2025 13:55:33 +0300 Subject: [PATCH 15/40] Add Exec Permissions to TRACES_overall --- .gitignore | 1 + src/TRACES_overall.sh | 0 2 files changed, 1 insertion(+) mode change 100644 => 100755 src/TRACES_overall.sh diff --git a/.gitignore b/.gitignore index 1377554..9c053c6 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ *.swp +tests diff --git a/src/TRACES_overall.sh b/src/TRACES_overall.sh old mode 100644 new mode 100755 From 22b62c6b3929b3b5c643bedd6d8897b29d8dd0c2 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Wed, 9 Apr 2025 16:36:07 +0300 Subject: [PATCH 16/40] Fix bugs in TRACES_overall.sh virus REFTYPE swapped with viral depth calculation no longer inverted --- src/TRACES_overall.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/TRACES_overall.sh b/src/TRACES_overall.sh index 3ebb7ca..876fbfb 100755 --- a/src/TRACES_overall.sh +++ b/src/TRACES_overall.sh @@ -12,7 +12,7 @@ # #Define Global Variables declare -A ValidRefTypes; -for type in virus specific cy mtdna; do +for type in cy mtdna specific viral; do ValidRefTypes[$type]=1 done declare -A RequiredPattern; @@ -44,11 +44,11 @@ AlnBamSuffix="-$PATTERN" #pre-delimiter included as the suffix isn't present for StatsPrefix="$PATTERN" #post-delimiter is constant across all ref types #Handle cases where the -fixes aren't pattern case $REFTYPE in - virus) - AlnBamPrefix="virus" + viral) + AlnBamPrefix="viral" ;; specific) - AlnBamPrefix="virus" + AlnBamPrefix="viral" ;; cy) AlnBamSuffix="" @@ -78,7 +78,7 @@ bedEntry="$acc\t0\t$TOTAL_SIZE" #Updated Depth calculation which ignores peaks, but more importantly ignores secondary/supplementary alignments #Column 3 is the length of the genome (constructed in the bedEntry above #Column 4 is the total bases covered -NORMALIZED_COVERAGE=$(samtools bedcov --max-depth "$MAX_DEPTH" <(echo -e "$bedEntry") "$AlnBamFile" | awk '{print $3/$4}') +NORMALIZED_COVERAGE=$(samtools bedcov --max-depth "$MAX_DEPTH" <(echo -e "$bedEntry") "$AlnBamFile" | awk '{print $4/$3}') printf "$PATTERN\t$ORGAN\t%0.4f\n" "$NORMALIZED_COVERAGE" > "$DepthFile" # ZERO_COVERAGE=$(awk '{sum += ($3-$2)} END {print sum}' "$ZeroBedFile"); From 21cbbd09d2d53032e1d683f72ac894056d7be52c Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Wed, 9 Apr 2025 16:40:49 +0300 Subject: [PATCH 17/40] Replaced use of TRACES_overall_X.sh with TRACES_overall.sh This effectively converts all depth and breadth calculations to the non-supp/sec+maxDepth method --- src/TRACESPipe.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 06e92e5..46df75f 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -388,7 +388,7 @@ ALIGN_AND_CONSENSUS () { mkdir -p ../output_data/TRACES_viral_statistics; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; echo -e "\e[34m[TRACESPipe]\e[32m Calculating coverage ...\e[0m"; - ./TRACES_overall_virus.sh $V_TAG $ORGAN + ./TRACES_overall.sh viral $V_TAG $ORGAN C_BREADTH=`cat ../output_data/TRACES_viral_statistics/$V_TAG-total-horizontal-coverage-$ORGAN.txt`; C_DEPTH=`cat ../output_data/TRACES_viral_statistics/$V_TAG-total-depth-coverage-$ORGAN.txt`; echo -e "\e[34m[TRACESPipe]\e[1m Breadth (H) coverage: $C_BREADTH \e[0m"; @@ -1770,7 +1770,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; mv $SPECIFIC_ID-$ORGAN_T-calls.vcf.gz ../output_data/TRACES_specific_bed/ mkdir -p ../output_data/TRACES_specific_statistics; echo -e "\e[34m[TRACESPipe]\e[32m Calculating coverage for alignment-based assembly ...\e[0m"; - ./TRACES_overall_specific.sh $SPECIFIC_ID $ORGAN_T + ./TRACES_overall.sh specific $SPECIFIC_ID $ORGAN_T C_BREADTH=`cat ../output_data/TRACES_specific_statistics/$SPECIFIC_ID-total-horizontal-coverage-$ORGAN_T.txt`; C_DEPTH=`cat ../output_data/TRACES_specific_statistics/$SPECIFIC_ID-total-depth-coverage-$ORGAN_T.txt`; echo -e "\e[34m[TRACESPipe]\e[1m Breadth (H) coverage: $C_BREADTH \e[0m"; @@ -1812,7 +1812,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; mv $SPECIFIC_ID-$ORGAN_T-calls.vcf.gz ../output_data/TRACES_specific_bed/ mkdir -p ../output_data/TRACES_specific_statistics; echo -e "\e[34m[TRACESPipe]\e[32m Calculating coverage for alignment-based assembly ...\e[0m"; - ./TRACES_overall_specific.sh $SPECIFIC_ID $ORGAN_T + ./TRACES_overall.sh specific $SPECIFIC_ID $ORGAN_T C_BREADTH=`cat ../output_data/TRACES_specific_statistics/$SPECIFIC_ID-total-horizontal-coverage-$ORGAN_T.txt`; C_DEPTH=`cat ../output_data/TRACES_specific_statistics/$SPECIFIC_ID-total-depth-coverage-$ORGAN_T.txt`; echo -e "\e[34m[TRACESPipe]\e[1m Breadth (H) coverage: $C_BREADTH \e[0m"; @@ -1866,7 +1866,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; mv mt-zero-coverage-$ORGAN_T.bed ../output_data/TRACES_mtdna_bed/ mkdir -p ../output_data/TRACES_mtdna_statistics; echo -e "\e[34m[TRACESPipe]\e[32m Calculating coverage for alignment-based assembly ...\e[0m"; - ./TRACES_overall_mtdna.sh $ORGAN_T + ./TRACES_overall.sh mtdna mt $ORGAN_T C_BREADTH=`cat ../output_data/TRACES_mtdna_statistics/mt-total-horizontal-coverage-$ORGAN_T.txt`; C_DEPTH=`cat ../output_data/TRACES_mtdna_statistics/mt-total-depth-coverage-$ORGAN_T.txt`; echo -e "\e[34m[TRACESPipe]\e[35m Breadth (H) coverage: $C_BREADTH \e[0m"; @@ -1929,7 +1929,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; mv cy-coverage-$ORGAN_T.bed ../output_data/TRACES_cy_bed/ mv cy-zero-coverage-$ORGAN_T.bed ../output_data/TRACES_cy_bed/ mkdir -p ../output_data/TRACES_cy_statistics; - ./TRACES_overall_cy.sh $ORGAN_T + ./TRACES_overall.sh cy cy $ORGAN_T echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m" fi # From 8bbaf96be95b049eb9efbeaf42f9ea10704795a8 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Thu, 12 Jun 2025 09:40:22 +0300 Subject: [PATCH 18/40] Updated some dependency versions in INSTALL Fixed typos/language in some comments --- src/TRACES_filter_incompatible_variants.sh | 11 ++++++----- src/TRACES_install.sh | 6 +++--- src/TRACES_overall.sh | 4 ++-- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/src/TRACES_filter_incompatible_variants.sh b/src/TRACES_filter_incompatible_variants.sh index d1d6c50..9961439 100755 --- a/src/TRACES_filter_incompatible_variants.sh +++ b/src/TRACES_filter_incompatible_variants.sh @@ -4,11 +4,12 @@ #The choice is made first by higher quality, then by VCF order #Confilicting alleles are snps at the same position as an insertion # and other variants which appear in the region covered by a deletion -#If there are multiple variants overlapping a position, the best one is retained -# variants are processed in an order such that come conflicts may be resolved -# Example two cariant sites covered by a deletion, where the deletion is lower -# quality than either variant, in this way the maximum number of variants are -# retained +#If there are multiple variants overlapping a position, the best one is retained. +#Variants are processed in an order such that some conflicts may be resolved +# Example two variant sites covered by a deletion, where the deletion is lower +# quality than either variant: the deletion is filtered and the two variants are +# retained. In this way the maximum number of variants are +# #Written by Zachery Dickson, September 2024 # Contact: dicksoz@mcmaster.ca # zachery.dickson@helski.fi diff --git a/src/TRACES_install.sh b/src/TRACES_install.sh index 696182c..b014723 100755 --- a/src/TRACES_install.sh +++ b/src/TRACES_install.sh @@ -45,10 +45,10 @@ if [[ "$INSTALL_PIPELINE" -eq "1" ]]; conda install -c bioconda spades --yes conda install -c bioconda igv --yes conda install -c bioconda bowtie2 --yes - conda install -c bioconda samtools=1.9 --yes - conda install -c bioconda bcftools=1.9 --yes + conda install -c bioconda samtools=1.21 --yes + conda install -c bioconda bcftools=1.21 --yes conda install -c bioconda bedops --yes - conda install -c bioconda bedtools --yes + conda install -c bioconda bedtools=2.31.1 --yes conda install -c bioconda fastq-pair --yes conda install -c bioconda entrez-direct --yes conda install -c bioconda/label/cf201901 entrez-direct --yes diff --git a/src/TRACES_overall.sh b/src/TRACES_overall.sh index 876fbfb..03919f7 100755 --- a/src/TRACES_overall.sh +++ b/src/TRACES_overall.sh @@ -1,10 +1,10 @@ #!/bin/bash # #USAGE: TRACES_overall.sh reftype pattern organ [maxdepth=1000] -# RefType is one of virus, specific, cy, mtdna and defines the type of reference we are calculating stats for +# RefType is one of viral, specific, cy, mtdna and defines the type of reference we are calculating stats for # Pattern defines the specific reference used # for cy and mtdna this should be cy and mt respectively -# for specific it tis the pattern used to find the reference in the DB +# for specific it is the pattern used to find the reference in the DB # for virus it is the virus Name # Organ is the identifier library being processed # MaxDepth is the maximum depth to consider at any given position From 256bcf52dd4cee34e0da88770f89d3da62f18682 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Mon, 17 Feb 2025 11:14:53 +0200 Subject: [PATCH 19/40] --very-sensitive now affects --run-specific --- src/TRACESPipe.sh | 54 +++++++---------------------------------------- 1 file changed, 8 insertions(+), 46 deletions(-) diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 46df75f..576fb39 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -523,6 +523,7 @@ while [[ $# -gt 0 ]] ;; -vhs|--very-sensitive) HIGH_SENSITIVITY=1; + RUN_SPECIFIC_SENSITIVE=1; SHOW_HELP=0; shift ;; @@ -977,7 +978,9 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " " echo " -rsx , --run-extreme " echo " Run specific reference align/consensus " - echo " using extreme sensitivity, " + echo " using extreme sensitivity; " + echo " Retained for backwards compatibility; " + echo " Now an alias for -vhs -rsr , " echo " " echo " -rmt, --run-mito Run Mito align and consensus seq, " echo " -rmtd, --run-mito-dam Run Mito damage only, " @@ -1047,7 +1050,7 @@ if [ "$SHOW_VERSION" -eq "1" ]; echo " "; echo " TRACESPipe "; echo " "; - echo " Version: 1.0.2 "; + echo " Version: 1.1.3 "; echo " "; echo " Department of Virology and "; echo " Department of Forensic Medicine, "; @@ -1057,6 +1060,7 @@ if [ "$SHOW_VERSION" -eq "1" ]; echo " University of Aveiro, Portugal. "; echo " "; echo " diogo.pratas@helsinki.fi "; + echo " zachery.dickson@helsinki.fi "; echo " "; exit 0; fi @@ -1750,47 +1754,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_ID\" from VDB.fa ...\e[0m"; gto_fasta_extract_read_by_pattern -p "$SPECIFIC_ID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_ID.fa echo -e "\e[34m[TRACESPipe]\e[32m Aligning ... \e[0m"; - ./TRACES_viral_align_reads.sh SPECIFIC-$SPECIFIC_ID.fa $ORGAN_T $SPECIFIC_ID $THREADS $REMOVE_DUPLICATIONS 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; - echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; - # - echo -e "\e[34m[TRACESPipe]\e[32m Generate a consensus sequence with bcftools ...\e[0m"; - ./TRACES_viral_consensus.sh SPECIFIC-$SPECIFIC_ID.fa viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam $ORGAN_T $SPECIFIC_ID 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; - mkdir -p ../output_data/TRACES_specific_alignments; - #rm -f ../output_data/TRACES_specific_alignments/* - cp SPECIFIC-$SPECIFIC_ID.fa ../output_data/TRACES_specific_alignments/ - cp SPECIFIC-$SPECIFIC_ID.fa.fai ../output_data/TRACES_specific_alignments/ - mv viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam ../output_data/TRACES_specific_alignments/ - mv viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam.bai ../output_data/TRACES_specific_alignments/ - mkdir -p ../output_data/TRACES_specific_consensus; - mv $SPECIFIC_ID-consensus-$ORGAN_T.fa ../output_data/TRACES_specific_consensus/ - mkdir -p ../output_data/TRACES_specific_bed; - mv $SPECIFIC_ID-calls-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ - mv $SPECIFIC_ID-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ - mv $SPECIFIC_ID-zero-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ - mv $SPECIFIC_ID-$ORGAN_T-calls.vcf.gz ../output_data/TRACES_specific_bed/ - mkdir -p ../output_data/TRACES_specific_statistics; - echo -e "\e[34m[TRACESPipe]\e[32m Calculating coverage for alignment-based assembly ...\e[0m"; - ./TRACES_overall.sh specific $SPECIFIC_ID $ORGAN_T - C_BREADTH=`cat ../output_data/TRACES_specific_statistics/$SPECIFIC_ID-total-horizontal-coverage-$ORGAN_T.txt`; - C_DEPTH=`cat ../output_data/TRACES_specific_statistics/$SPECIFIC_ID-total-depth-coverage-$ORGAN_T.txt`; - echo -e "\e[34m[TRACESPipe]\e[1m Breadth (H) coverage: $C_BREADTH \e[0m"; - echo -e "\e[34m[TRACESPipe]\e[1m Depth-x (V) coverage: $C_DEPTH \e[0m"; - echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; - fi - # - # ========================================================================== - # RUN SPECIFIC SPECIFIC ALIGN/CONSENSUS WITH EXTREME HIGH SENSITIVITY - # - if [[ "$RUN_SPECIFIC_SENSITIVE" -eq "1" ]]; - then - echo -e "\e[34m[TRACESPipe]\e[32m Aligning reads to specific viral ref(s) with pattern \"$SPECIFIC_ID\" using bowtie2 with EXTREME sensitivity ...\e[0m"; - # - CHECK_VDB; - # - echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_ID\" from VDB.fa ...\e[0m"; - gto_fasta_extract_read_by_pattern -p "$SPECIFIC_ID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_ID.fa - echo -e "\e[34m[TRACESPipe]\e[32m Aligning ...\e[0m"; - ./TRACES_viral_sensitive_align_reads.sh SPECIFIC-$SPECIFIC_ID.fa $ORGAN_T $SPECIFIC_ID $THREADS $REMOVE_DUPLICATIONS 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; + ./TRACES_viral_align_reads.sh SPECIFIC-$SPECIFIC_ID.fa $ORGAN_T $SPECIFIC_ID $THREADS $REMOVE_DUPLICATIONS $RUN_SPECIFIC_SENSITIVE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # echo -e "\e[34m[TRACESPipe]\e[32m Generate a consensus sequence with bcftools ...\e[0m"; @@ -1802,10 +1766,8 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; mv viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam ../output_data/TRACES_specific_alignments/ mv viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam.bai ../output_data/TRACES_specific_alignments/ mkdir -p ../output_data/TRACES_specific_consensus; - #rm -f ../output_data/TRACES_specific_consensus/* mv $SPECIFIC_ID-consensus-$ORGAN_T.fa ../output_data/TRACES_specific_consensus/ mkdir -p ../output_data/TRACES_specific_bed; - #rm -f ../output_data/TRACES_specific_bed/* mv $SPECIFIC_ID-calls-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ mv $SPECIFIC_ID-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ mv $SPECIFIC_ID-zero-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ @@ -1819,7 +1781,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; echo -e "\e[34m[TRACESPipe]\e[1m Depth-x (V) coverage: $C_DEPTH \e[0m"; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; fi - # + # # ========================================================================== # DETAILED VIRAL ALIGN/CONSENSUS # From 84bf307da9839796adf698f6eeaff5cb92c37eac Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Thu, 12 Jun 2025 11:11:52 +0300 Subject: [PATCH 20/40] GlobalSensitive and DynamicTop merged in; Added Versioning Files --- Version.txt | 1 + VersionNotes.txt | 2 ++ src/TRACESPipe.sh | 10 +++++++--- 3 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 Version.txt create mode 100644 VersionNotes.txt diff --git a/Version.txt b/Version.txt new file mode 100644 index 0000000..65087b4 --- /dev/null +++ b/Version.txt @@ -0,0 +1 @@ +1.1.4 diff --git a/VersionNotes.txt b/VersionNotes.txt new file mode 100644 index 0000000..354bc92 --- /dev/null +++ b/VersionNotes.txt @@ -0,0 +1,2 @@ +Version 1.1.4 + GlobalSensitive and DynamicTop merged in; Added Versioning Files diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 9db2b62..1b292e3 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -11,6 +11,8 @@ # ============================================================================== # ################################################################################## # +SOURCE_DIR="$(dirname "$(readlink -f "$0")")" +# SHOW_HELP=0; SHOW_VERSION=0; FORCE=0; @@ -1048,12 +1050,14 @@ if [ "$SHOW_HELP" -eq "1" ]; # ============================================================================== # VERSION # -if [ "$SHOW_VERSION" -eq "1" ]; - then +if [ "$SHOW_VERSION" -eq "1" ]; then + version="1.1.3"; + versionFile="$SOURCE_DIR/../Version.txt" + [ -f "$versionFile" ] && version=$(cat "$versionFile") echo " "; echo " TRACESPipe "; echo " "; - echo " Version: 1.1.3 "; + echo " Version: $version "; echo " "; echo " Department of Virology and "; echo " Department of Forensic Medicine, "; From 1bd07e726a9268c57bd015890452fd2abd5fc3e6 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Thu, 12 Jun 2025 14:23:46 +0300 Subject: [PATCH 21/40] Inital branch commit with option setup --- Version.txt | 2 +- VersionNotes.txt | 4 ++ src/TRACESPipe.sh | 152 +++++++++++++++++++++++++++++++--------------- 3 files changed, 108 insertions(+), 50 deletions(-) diff --git a/Version.txt b/Version.txt index 65087b4..f68815f 100644 --- a/Version.txt +++ b/Version.txt @@ -1 +1 @@ -1.1.4 +1.2.0dev diff --git a/VersionNotes.txt b/VersionNotes.txt index 354bc92..ce0f85d 100644 --- a/VersionNotes.txt +++ b/VersionNotes.txt @@ -1,2 +1,6 @@ Version 1.1.4 GlobalSensitive and DynamicTop merged in; Added Versioning Files +Version 1.2.0dev + Reorganized Usage Message into blocks for easier reading + Added infrastructure for altVDB and VDBmetadata options + Handled interactions between altVDB and internal database commands diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 1b292e3..80c04fd 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -113,6 +113,10 @@ RUN_DIFF_ID=""; # RUN_BLAST_RECONSTRUCTED=0; # +B_ALT_VIRAL_DB=0; +VIRAL_DATABASE_METADATA=""; +VIRAL_DATABASE_FILE="VDB.fa" +# # ============================================================================== # # DEFAULT VALUES: @@ -122,7 +126,6 @@ TSIZE=2; TOP_SIZE=0; TOP_SIZE_VIR=0; CACHE=70; -VIRAL_DATABASE_FILE="VDB.fa" # # ============================================================================== # THESE ARE THE CURRENT FLAGGED VIRUSES OR VIRUSES GROUPS FOR ENHANCED ASSEMBLY: @@ -185,8 +188,12 @@ CHECK_VDB () { if [ ! -f "$VIRAL_DATABASE_FILE" ]; then echo -e "\e[31mERROR: viral database ("$VIRAL_DATABASE_FILE") not found!\e[0m" - echo "TIP: before this, run: ./TRACESPipe.sh --build-viral" - echo "For addition information, see the instructions at the web page." + if [ "$B_ALT_VIRAL_DB" -eq 0 ]; then + echo "TIP: before this, run: ./TRACESPipe.sh --build-viral" + echo " OR" + echo "Use the --alt-viral-db option to specify an alternative VDB location" + echo "For addition information, see the instructions at the web page." + fi exit 1; fi } @@ -425,6 +432,11 @@ while [[ $# -gt 0 ]] FORCE=1; shift ;; + -avdb|--alt-viral-db) + B_ALT_VIRAL_DB=1; + VIRAL_DATABASE_FILE="$2" + shift 2; + ;; -flog|--flush-logs) FLUSH_LOGS=1; shift @@ -466,6 +478,10 @@ while [[ $# -gt 0 ]] SHOW_HELP=0; shift ;; + -vdbm|--viral-db-metadata) + VIRAL_DATABASE_METADATA="$2"; + shift 2 + ;; -vdbr|--build-viral-r) BUILD_VDB_REF=1; SHOW_HELP=0; @@ -870,11 +886,17 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " " echo -e "\e[93m Usage: ./TRACESPipe.sh [options] \e[0m" echo " " + echo -e "\e[34m =========== GENERAL OPTIONS ==========\e[0m" + echo " " echo " -h, --help Show this help message and exit, " echo " -v, --version Show the version and some information, " echo " -f, --force Force running and overwrite of files, " echo " -flog, --flush-logs Flush logs (delete logs), " echo " -fout, --flush-output Flush output data (delete all output_data), " + echo " -t , --threads Number of threads to use, " + echo " " + echo -e "\e[34m =========== SETUP COMMANDS ==========\e[0m" + echo " " echo " -i, --install Installation of all the tools, " echo " -up, --update Update all the tools in TRACESPipe, " echo " -spv, --show-prog-ver Show included programs versions, " @@ -882,8 +904,6 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " -st, --sample Creates human ref. VDB and sample organ, " echo " " echo " -gmt, --get-max-threads Get the number of maximum machine threads," - echo " -t , --threads " - echo " Number of threads to use, " echo " " echo " -dec, --decrypt Decrypt (all files in ../encrypted_data), " echo " -enc, --encrypt Encrypt (all files in ../to_encrypt_data)," @@ -925,53 +945,26 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " -cbn, --create-blast-db It creates a nucleotide blast database, " echo " -ubn, --update-blast-db It updates a nucleotide blast database, " echo " " + echo -e "\e[34m =========== ANALYSIS COMMANDS ==========\e[0m" + echo " " + echo " -ra, --run-analysis Run data analysis (core), " + echo " -all, --run-all Run all the options (excluding the specific). " + echo " " echo " -sfs , --search-blast-db " echo " It blasts the nucleotide (nt) blast DB, " - echo " " echo " -sfrs , --search-blast-remote-db " echo " It blasts remotly thenucleotide (nt) blast " echo " database (it requires internet connection), " echo " " - echo " -rdup, --remove-dup Remove duplications (e.g. PCR dup), " - echo " -vhs, --very-sensitive Aligns with very high sensitivity (slower), " - echo " " echo " -gbb, --best-of-bests Identifies the best of bests references " echo " between multiple organs [similar reference], " echo " " - echo " -iss , --inter-sim-size " - echo " Inter-genome similarity top size (control), " - echo " " echo " -rpro, --run-profiles Run complexity and relative profiles (control), " echo " " echo " -rpgi , --run-gid-complexity-profile " echo " Run complexity profiles by GID, " - echo " -cpwi , --complexity-profile-window " - echo " Complexity profile window size, " - echo " -cple , --complexity-profile-level " - echo " Complexity profile compression level [1;10], " - echo " " echo " -rm, --run-meta Run viral metagenomic identification, " echo " -ro, --run-meta-nv Run NON-viral metagenomic identification," - echo " " - echo " -mis , --min-similarity " - echo " Minimum similarity value to consider the " - echo " sequence for alignment-consensus (filter), " - echo " " - echo " -top , --view-top " - echo " Display the top with the highest " - echo " similarity (by descending order), " - echo " " - echo " -c , --cache " - echo " Cache to be used by FALCON-meta, " - echo " -tsv , --top-size-virus " - echo " Top size to be used by FALCON-meta when " - echo " using TRACES_metagenomic_viral.sh; " - echo " default:0 -> seq count in viral db " - echo " -ts , --top-size " - echo " Top size to be used by FALCON-meta when " - echo " using TRACES_metagenomic.sh; " - echo " default:0 -> seq count in non-viral db " - echo " " echo " -rava, --run-all-v-alig Run all viral align/sort/consensus seqs " echo " from a specific list, " echo " " @@ -1010,16 +1003,6 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " " echo " -covp , --coverage-profile " echo " Run coverage profile for specific BED file, " - echo " -cmax , --max-coverage " - echo " Maximum depth coverage (depth normalization), " - echo " -clog , --coverage-log-scale " - echo " Coverage profile logarithmic scale VALUE=Base, " - echo " -cwis , --coverage-window-size " - echo " Coverage window size for low-pass filter, " - echo " -cdro , --coverage-drop " - echo " Coverage drop size (sampling), " - echo " -covm , --coverage-min-x " - echo " Coverage minimum value for x-axis " echo " " echo " -diff, --run-diff Run diff -> reference and hybrid (ident/SNPs), " echo " " @@ -1030,8 +1013,67 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " -brec, --blast-reconstructed " echo " Run local blast over reconstructed genomes, " echo " " - echo " -ra, --run-analysis Run data analysis (core), " - echo " -all, --run-all Run all the options (excluding the specific). " + echo " " + echo -e "\e[34m =========== ANALYSIS OPTIONS ==========\e[0m" + echo " " + echo " -avdb , --alt-viral-db " + echo " Specify a path to fasta file containing " + echo " viral sequences " + echo " -vdbm , --viral-db-metadata " + echo " Specify a path to a tab del file which " + echo " has sequence GID/ACC in the first column" + echo " and a Name representing a virus, or group" + echo " of viruses in the second " + echo " This changes the meta-analysis behaviour:" + echo " rather than using internal virus names " + echo " and inclusion criteria, the relationships" + echo " defined in the provided file are used. " + echo " This allows the user to define groupings " + echo " of interest in the packaged, or user " + echo " provided viral database. " + echo " " + echo " -rdup, --remove-dup Remove duplications (e.g. PCR dup), " + echo " -vhs, --very-sensitive Aligns with very high sensitivity (slower), " + echo " " + echo " -iss , --inter-sim-size " + echo " Inter-genome similarity top size (control), " + echo " " + echo " -cpwi , --complexity-profile-window " + echo " Complexity profile window size, " + echo " -cple , --complexity-profile-level " + echo " Complexity profile compression level [1;10], " + echo " " + echo " -mis , --min-similarity " + echo " Minimum similarity value to consider the " + echo " sequence for alignment-consensus (filter), " + echo " " + echo " -top , --view-top " + echo " Display the top with the highest " + echo " similarity (by descending order), " + echo " " + echo " -c , --cache " + echo " Cache to be used by FALCON-meta, " + echo " -tsv , --top-size-virus " + echo " Top size to be used by FALCON-meta when " + echo " using TRACES_metagenomic_viral.sh; " + echo " default:0 -> seq count in viral db " + echo " -ts , --top-size " + echo " Top size to be used by FALCON-meta when " + echo " using TRACES_metagenomic.sh; " + echo " default:0 -> seq count in non-viral db " + echo " " + echo " -cmax , --max-coverage " + echo " Maximum depth coverage (depth normalization), " + echo " -clog , --coverage-log-scale " + echo " Coverage profile logarithmic scale VALUE=Base, " + echo " -cwis , --coverage-window-size " + echo " Coverage window size for low-pass filter, " + echo " -cdro , --coverage-drop " + echo " Coverage drop size (sampling), " + echo " -covm , --coverage-min-x " + echo " Coverage minimum value for x-axis " + echo " " + echo -e "\e[34m =========== EXAMPLES ==========\e[0m" echo " " echo -e "\e[93m Ex: ./TRACESPipe.sh --flush-output --flush-logs --run-mito --run-meta \e[0m" echo -e "\e[93m --remove-dup --run-de-novo --run-hybrid --min-similarity 1 --run-diff \e[0m" @@ -1414,6 +1456,10 @@ if [[ "$GET_CY" -eq "1" ]]; # if [[ "$ADD_FASTA" -eq "1" ]]; then + if [ "$B_ALT_VIRAL_DB" -eq 1 ]; then + >&2 echo -e "\e[31mERROR: Cannot add fasta when using an alternative viral DB\e[0m" + exit 1 + fi CHECK_VDB; ./TRACES_add_fasta_to_database.sh "$VIRAL_DATABASE_FILE" $NEW_FASTA fi @@ -1422,6 +1468,10 @@ if [[ "$ADD_FASTA" -eq "1" ]]; # if [[ "$GET_EXTRA" -eq "1" ]]; then + if [ "$B_ALT_VIRAL_DB" -eq 1 ]; then + >&2 echo -e "\e[31mERROR: Cannot add extra viral seq when using an alternative viral DB\e[0m" + exit 1 + fi CHECK_ENRICH; ./TRACES_get_enriched_sequences.sh "$VIRAL_DATABASE_FILE" fi @@ -1430,6 +1480,10 @@ if [[ "$GET_EXTRA" -eq "1" ]]; # if [[ "$ADD_EXTRA_SEQ" -eq "1" ]]; then + if [ "$B_ALT_VIRAL_DB" -eq 1 ]; then + >&2 echo -e "\e[31mERROR: Cannot add extra seq when using an alternative viral DB\e[0m" + exit 1 + fi CHECK_VDB; ./TRACES_get_extra_seq.sh "$VIRAL_DATABASE_FILE" $NEW_SEQ_ID fi From a6e6c495f32ba0bdce319144a30cc233bda69023 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Thu, 12 Jun 2025 15:22:11 +0300 Subject: [PATCH 22/40] Added TRACES_get_best_by_meta.sh --- VersionNotes.txt | 2 ++ src/TRACES_get_best_by_meta.sh | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100755 src/TRACES_get_best_by_meta.sh diff --git a/VersionNotes.txt b/VersionNotes.txt index ce0f85d..4c6f2d9 100644 --- a/VersionNotes.txt +++ b/VersionNotes.txt @@ -4,3 +4,5 @@ Version 1.2.0dev Reorganized Usage Message into blocks for easier reading Added infrastructure for altVDB and VDBmetadata options Handled interactions between altVDB and internal database commands +Version 1.2.0dev + diff --git a/src/TRACES_get_best_by_meta.sh b/src/TRACES_get_best_by_meta.sh new file mode 100755 index 0000000..80864d5 --- /dev/null +++ b/src/TRACES_get_best_by_meta.sh @@ -0,0 +1,18 @@ +#!/bin/bash +ORGAN=$1; shift +Virus=$1; shift +MetaFile=$1; shift +# +awk -v Virus="$Virus" -F '\\t' ' + BEGIN{OFS="\t";bestSim="-";bestAcc="-"} + (ARGIND == 1){if($2==Virus){InSet[$1]=1};next} + { + sub("NC_","NC-",$4); + split($4,a,"_"); + acc = a[1]; + sub("NC-","NC_",acc); + sim=$3; + } + (InSet[acc] && sim > bestSim+0){ bestSim=sim; bestAcc=acc;} + END {print bestSim,bestAcc} +' "$MetaFile" "top-$ORGAN.csv" From 1c6488785f6e3575ffde94f9574cf84d466bf470 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Thu, 12 Jun 2025 15:55:32 +0300 Subject: [PATCH 23/40] Get Best By Meta Usage implemented --- VersionNotes.txt | 2 ++ src/TRACESPipe.sh | 70 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 58 insertions(+), 14 deletions(-) diff --git a/VersionNotes.txt b/VersionNotes.txt index 4c6f2d9..af72715 100644 --- a/VersionNotes.txt +++ b/VersionNotes.txt @@ -4,5 +4,7 @@ Version 1.2.0dev Reorganized Usage Message into blocks for easier reading Added infrastructure for altVDB and VDBmetadata options Handled interactions between altVDB and internal database commands + Added General script for getting best result from top when virus group metadata is available + Version 1.2.0dev diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 80c04fd..12934e7 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -145,16 +145,23 @@ declare -a VIRUSES=("B19" "HV1" "HV2" "HV3" "HV4" "HV5" "HV6" "HV6A" "HV6B" # CHECK INTERNAL SYSTEM FILES # CHECK_FILTERING_SYSTEM_FILES () { - for VIRUS in "${VIRUSES[@]}" - do - if [ ! -f TRACES_get_best_$VIRUS.sh ]; - then - echo -e "\e[31mERROR: TRACES_get_best_$VIRUS.sh file not found!\e[0m" - echo "This file may have been deleted acidentally or VIRAL array changed." - echo "System is corrupted!" - exit 1; + if [ -n "$VIRAL_DATABASE_METADATA" ]; then + if [ ! -f TRACES_get_best_by_meta.sh ]; then + echo -e "\e[31mERROR: TRACES_get_best_by_meta.sh file not found!\e[0m" + echo "This file may have been deleted acidentally." + echo "System is corrupted!" + exit 1; + fi + else + for VIRUS in "${VIRUSES[@]}"; do + if [ ! -f TRACES_get_best_$VIRUS.sh ]; then + echo -e "\e[31mERROR: TRACES_get_best_$VIRUS.sh file not found!\e[0m" + echo "This file may have been deleted acidentally or VIRAL array changed." + echo "System is corrupted!" + exit 1; + fi + done fi - done } # # ============================================================================== @@ -353,7 +360,11 @@ ALIGN_AND_CONSENSUS () { # CHECK_TOP "$ORGAN"; # - V_INFO=`./TRACES_get_best_$V_TAG.sh $ORGAN`; + if [ -n "$VIRAL_DATABASE_METADATA" ]; then + V_INFO="$(./TRACES_get_best_by_meta.sh "$ORGAN" "$V_TAG" "$VIRAL_DATABASE_METADATA")"; + else + V_INFO=`./TRACES_get_best_$V_TAG.sh $ORGAN`; + fi V_GID=`echo "$V_INFO" | awk '{ print $2; }'`; V_VAL=`echo "$V_INFO" | awk '{ print $1; }'`; # @@ -480,6 +491,21 @@ while [[ $# -gt 0 ]] ;; -vdbm|--viral-db-metadata) VIRAL_DATABASE_METADATA="$2"; + if [ -s "$VIRAL_DATABASE_METADATA" ]; then + #Parse the unique virus groups in the metadata (ignore lines without at least two columns + readarray -t VIRUSES < <(awk -f '\\t' ' + (FNR > 1 && $1 && $2){VirusSet[$2]=1;} + END {n=asorti(VirusSet,vl); for(i=1;i<=n;i++){print vl[i]}} + ' "$VIRAL_DATABASE_METADATA") + #Ensure at least one virus group is present + if [ "${#VIRUSES[@]}" -lt 1 ]; then + >&2 echo -e "\e[31mERROR: Provided Viral database Metadata file did not contain any valid accession-virus pairs\e[0m" + exit 1; + fi + else + >&2 echo -e "\e[31mERROR: Provided Viral database Metadata file is non-existant or empty\e[0m" + exit 1; + fi shift 2 ;; -vdbr|--build-viral-r) @@ -1641,7 +1667,11 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; rm -f ../output_data/TRACES_results/REPORT_META_VIRAL_$ORGAN_T.txt; for VIRUS in "${VIRUSES[@]}" do - ./TRACES_get_best_$VIRUS.sh $ORGAN_T >> ../output_data/TRACES_results/REPORT_META_VIRAL_$ORGAN_T.txt + if [ -n "$VIRAL_DATABASE_METADATA" ]; then + ./TRACES_get_best_by_meta.sh "$ORGAN_T" "$VIRUS" "$VIRAL_DATABASE_METADATA" + else + ./TRACES_get_best_$VIRUS.sh $ORGAN_T + fi >> ../output_data/TRACES_results/REPORT_META_VIRAL_$ORGAN_T.txt done echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # @@ -1748,7 +1778,11 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; rm -f ../output_data/TRACES_results/REPORT_META_VIRAL_$ORGAN_T.txt; for VIRUS in "${VIRUSES[@]}" do - ./TRACES_get_best_$VIRUS.sh $ORGAN_T >> ../output_data/TRACES_results/REPORT_META_VIRAL_$ORGAN_T.txt + if [ -n "$VIRAL_DATABASE_METADATA" ]; then + ./TRACES_get_best_by_meta.sh "$ORGAN_T" "$VIRUS" "$VIRAL_DATABASE_METADATA" + else + ./TRACES_get_best_$VIRUS.sh $ORGAN_T + fi >> ../output_data/TRACES_results/REPORT_META_VIRAL_$ORGAN_T.txt done echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # @@ -2173,7 +2207,11 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; SNPS=`cat out.report | grep TotalSNPs | awk '{ print $2;}'`; XBREADTH=`awk '{ print $3;}' ../output_data/TRACES_viral_statistics/$VIRUS-total-horizontal-coverage-$ORGAN_T.txt`; XDEPTH=`awk '{ print $3;}' ../output_data/TRACES_viral_statistics/$VIRUS-total-depth-coverage-$ORGAN_T.txt`; - V_INFO=`./TRACES_get_best_$VIRUS.sh $ORGAN_T`; + if [ -n "$VIRAL_DATABASE_METADATA" ]; then + V_INFO="$(./TRACES_get_best_by_meta.sh "$ORGAN" "$VIRUS" "$VIRAL_DATABASE_METADATA")"; + else + V_INFO=`./TRACES_get_best_$VIRUS.sh $ORGAN`; + fi V_GID=`echo "$V_INFO" | awk '{ print $2; }'`; V_VAL=`echo "$V_INFO" | awk '{ print $1; }'`; # @@ -2386,7 +2424,11 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; SNPS=`cat out.report | grep TotalSNPs | awk '{ print $2;}'`; XBREADTH=`awk '{ print $3;}' ../output_data/TRACES_viral_statistics/$VIRUS-total-horizontal-coverage-$ORGAN.txt`; XDEPTH=`awk '{ print $3;}' ../output_data/TRACES_viral_statistics/$VIRUS-total-depth-coverage-$ORGAN.txt`; - V_INFO=`./TRACES_get_best_$VIRUS.sh $ORGAN`; + if [ -n "$VIRAL_DATABASE_METADATA" ]; then + V_INFO="$(./TRACES_get_best_by_meta.sh "$ORGAN" "$VIRUS" "$VIRAL_DATABASE_METADATA")"; + else + V_INFO=`./TRACES_get_best_$VIRUS.sh $ORGAN`; + fi V_GID=`echo "$V_INFO" | awk '{ print $2; }'`; V_VAL=`echo "$V_INFO" | awk '{ print $1; }'`; # From a1f17acc536588254c01246ab6be22b240fa3a7f Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Thu, 12 Jun 2025 16:02:36 +0300 Subject: [PATCH 24/40] Updates to Usage --- VersionNotes.txt | 5 ++--- src/TRACESPipe.sh | 9 ++++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/VersionNotes.txt b/VersionNotes.txt index af72715..04f6e5e 100644 --- a/VersionNotes.txt +++ b/VersionNotes.txt @@ -5,6 +5,5 @@ Version 1.2.0dev Added infrastructure for altVDB and VDBmetadata options Handled interactions between altVDB and internal database commands Added General script for getting best result from top when virus group metadata is available - -Version 1.2.0dev - + If Metadata for the database is provided, virus groups from the database are used + and mappings between accessions and viruses are used diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 12934e7..5379f63 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -1045,17 +1045,20 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " -avdb , --alt-viral-db " echo " Specify a path to fasta file containing " echo " viral sequences " + echo " Sequence names must include the accession" + echo " as the first field, either whitespace or" + echo " underscore (_) delimited (NC_ handled) " echo " -vdbm , --viral-db-metadata " echo " Specify a path to a tab del file which " echo " has sequence GID/ACC in the first column" - echo " and a Name representing a virus, or group" - echo " of viruses in the second " + echo " and a Name representing a virus label " + echo " in the second " echo " This changes the meta-analysis behaviour:" echo " rather than using internal virus names " echo " and inclusion criteria, the relationships" echo " defined in the provided file are used. " echo " This allows the user to define groupings " - echo " of interest in the packaged, or user " + echo " of interest in the default, or user " echo " provided viral database. " echo " " echo " -rdup, --remove-dup Remove duplications (e.g. PCR dup), " From 399c6cd8d337da26dcd464f4572a9d868282223e Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Thu, 12 Jun 2025 16:46:44 +0300 Subject: [PATCH 25/40] Fixed the long lived Copping typo --- src/TRACESPipe.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 1b292e3..9abf887 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -1505,7 +1505,7 @@ if [[ "$RUN_GID_DAMAGE_ANALYSIS" -eq "1" ]]; echo -e "\e[34m[TRACESPipe]\e[93m Running organ=$ORGAN_T\e[0m"; # rm -f FW_READS.fq.gz RV_READS.fq.gz - echo -e "\e[34m[TRACESPipe]\e[32m Copping an instance of the files ...\e[0m"; + echo -e "\e[34m[TRACESPipe]\e[32m Copying an instance of the files ...\e[0m"; cp ../input_data/$SPL_Forward FW_READS.fq.gz; cp ../input_data/$SPL_Reverse RV_READS.fq.gz; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; @@ -1545,7 +1545,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; echo -e "\e[34m[TRACESPipe]\e[93m Running organ=$ORGAN_T\e[0m"; # rm -f FW_READS.fq.gz RV_READS.fq.gz - echo -e "\e[34m[TRACESPipe]\e[32m Copping an instance of the files ...\e[0m"; + echo -e "\e[34m[TRACESPipe]\e[32m Copying an instance of the files ...\e[0m"; cp ../input_data/$SPL_Forward FW_READS.fq.gz; cp ../input_data/$SPL_Reverse RV_READS.fq.gz; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; @@ -1640,7 +1640,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; echo -e "\e[34m[TRACESPipe]\e[93m Organ=$ORGAN_T Forward=$SPL_Forward Reverse=$SPL_Reverse\e[0m"; # rm -f FW_READS.fq.gz RV_READS.fq.gz - echo -e "\e[34m[TRACESPipe]\e[32m Copping an instance of the files ...\e[0m"; + echo -e "\e[34m[TRACESPipe]\e[32m Copying an instance of the files ...\e[0m"; cp ../input_data/$SPL_Forward FW_READS.fq.gz; cp ../input_data/$SPL_Reverse RV_READS.fq.gz; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; From 6242b5f164197a8c44374d7c470b62a02b721da8 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Thu, 12 Jun 2025 16:50:03 +0300 Subject: [PATCH 26/40] Fixed fatal typo in awk delim specification --- src/TRACESPipe.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index ad411f5..d9549a9 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -493,7 +493,7 @@ while [[ $# -gt 0 ]] VIRAL_DATABASE_METADATA="$2"; if [ -s "$VIRAL_DATABASE_METADATA" ]; then #Parse the unique virus groups in the metadata (ignore lines without at least two columns - readarray -t VIRUSES < <(awk -f '\\t' ' + readarray -t VIRUSES < <(awk -F '\\t' ' (FNR > 1 && $1 && $2){VirusSet[$2]=1;} END {n=asorti(VirusSet,vl); for(i=1;i<=n;i++){print vl[i]}} ' "$VIRAL_DATABASE_METADATA") From 8f9ea729f01b3fa654ce8d538a0f1d9146420d7b Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Sat, 14 Jun 2025 00:10:49 +0300 Subject: [PATCH 27/40] Updates to viral name tracking Internal Viral Names System file created to track default supported viruses Viral Names in src dir now contains the set of viruses used in the analysis Removed all hardcoded viral lists --- VersionNotes.txt | 2 + src/TRACESPipe.sh | 57 +++++++++++++++++---------- src/TRACES_coverage_table.sh | 3 +- system_files/internal_viral_names.txt | 35 ++++++++++++++++ 4 files changed, 76 insertions(+), 21 deletions(-) create mode 100644 system_files/internal_viral_names.txt diff --git a/VersionNotes.txt b/VersionNotes.txt index 04f6e5e..33222e9 100644 --- a/VersionNotes.txt +++ b/VersionNotes.txt @@ -7,3 +7,5 @@ Version 1.2.0dev Added General script for getting best result from top when virus group metadata is available If Metadata for the database is provided, virus groups from the database are used and mappings between accessions and viruses are used +Version 1.2.0dev + diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index d9549a9..32a1e87 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -12,6 +12,7 @@ ################################################################################## # SOURCE_DIR="$(dirname "$(readlink -f "$0")")" +DEFAULT_VIRAL_NAMES_FILE="../system_file/internal_viral_names.txt" # SHOW_HELP=0; SHOW_VERSION=0; @@ -135,11 +136,20 @@ CACHE=70; # HERV IS CURRENTLY BEING ADDRESSED AS HAPLOID IN ALIGNMENTS -> THIS WILL # REQUIRE ADAPTATION IN THE FUTURE. # -declare -a VIRUSES=("B19" "HV1" "HV2" "HV3" "HV4" "HV5" "HV6" "HV6A" "HV6B" - "HV7" "HV8" "POLY1" "POLY2" "POLY3" "POLY4" "POLY5" - "POLY6" "POLY7" "POLY8" "POLY9" "POLY10" "POLY11" "POLY12" - "POLY13" "POLY14" "HBV" "HPV" "TTV" "HBOV1" "HBOVNOT1" - "VARV" "SV40" "CUTA" "HERV"); +#declare -a VIRUSES=("B19" "HV1" "HV2" "HV3" "HV4" "HV5" "HV6" "HV6A" "HV6B" +# "HV7" "HV8" "POLY1" "POLY2" "POLY3" "POLY4" "POLY5" +# "POLY6" "POLY7" "POLY8" "POLY9" "POLY10" "POLY11" "POLY12" +# "POLY13" "POLY14" "HBV" "HPV" "TTV" "HBOV1" "HBOVNOT1" +# "VARV" "SV40" "CUTA" "HERV"); +if [ -s "$DEFAULT_VIRAL_NAMES_FILE" ]; then + readarray -t VIRUSES < "$DEFAULT_VIRAL_NAMES_FILE"; +else + >&2 echo -e "\e[31mERROR: $DEFAULT_VIRAL_NAMES_FILE Not found!\e[0m" + >&2 echo "This file may have been deleted acidentally." + >&2 echo "System is corrupted!" + exit 1; +fi + # # ============================================================================== # CHECK INTERNAL SYSTEM FILES @@ -491,21 +501,6 @@ while [[ $# -gt 0 ]] ;; -vdbm|--viral-db-metadata) VIRAL_DATABASE_METADATA="$2"; - if [ -s "$VIRAL_DATABASE_METADATA" ]; then - #Parse the unique virus groups in the metadata (ignore lines without at least two columns - readarray -t VIRUSES < <(awk -F '\\t' ' - (FNR > 1 && $1 && $2){VirusSet[$2]=1;} - END {n=asorti(VirusSet,vl); for(i=1;i<=n;i++){print vl[i]}} - ' "$VIRAL_DATABASE_METADATA") - #Ensure at least one virus group is present - if [ "${#VIRUSES[@]}" -lt 1 ]; then - >&2 echo -e "\e[31mERROR: Provided Viral database Metadata file did not contain any valid accession-virus pairs\e[0m" - exit 1; - fi - else - >&2 echo -e "\e[31mERROR: Provided Viral database Metadata file is non-existant or empty\e[0m" - exit 1; - fi shift 2 ;; -vdbr|--build-viral-r) @@ -1154,6 +1149,28 @@ mkdir -p ../output_data/ # CHECK_PROGRAMS; # +# If viral database metadata was provided, parse it for the virus groups to analyze +if [ -n "$VIRAL_DATABASE_METADATA" ]; then + if [ -s "$VIRAL_DATABASE_METADATA" ]; then + #Parse the unique virus groups in the metadata (ignore lines without at least two columns + readarray -t VIRUSES < <(awk -F '\\t' ' + (FNR > 1 && $1 && $2){VirusSet[$2]=1;} + END {n=asorti(VirusSet,vl); for(i=1;i<=n;i++){print vl[i]}} + ' "$VIRAL_DATABASE_METADATA") + #Ensure at least one virus group is present + if [ "${#VIRUSES[@]}" -lt 1 ]; then + >&2 echo -e "\e[31mERROR: Provided Viral database Metadata file did not contain any valid accession-virus pairs\e[0m" + exit 1; + fi + else + >&2 echo -e "\e[31mERROR: Provided Viral database Metadata file is non-existant or empty\e[0m" + exit 1; + fi +fi +# Save the virus groups to be analyzed +rm -f viral_names.txt; +{ printf "%s\n" "${VIRUSES[@]}"; echo ""; } > viral_names.txt +# # DELETE OLD RUN FILES # if [[ "$RUN_DIFF" -eq "1" ]]; diff --git a/src/TRACES_coverage_table.sh b/src/TRACES_coverage_table.sh index 66be5a1..1ddc2b8 100755 --- a/src/TRACES_coverage_table.sh +++ b/src/TRACES_coverage_table.sh @@ -1,6 +1,7 @@ #!/bin/bash # -declare -a VIRUSES=('B19' 'HV1' 'HV2' 'HV3' 'HV4' 'HV5' 'HV6' 'HV6A' 'HV6B' 'HV7' 'HV8' 'POLY1' 'POLY2' 'POLY3' 'POLY4' 'POLY5' 'POLY6' 'POLY7' 'POLY8' 'POLY9' 'POLY10' 'POLY11' 'POLY12' 'POLY13' 'POLY14' 'TTV' 'HBOV1' 'HBOVNOT1' 'HBV' 'HPV' 'VARV' 'SV40' 'CUTA' 'HERV'); +readarray -t VIRUSES < viral_names.txt +#declare -a VIRUSES=('B19' 'HV1' 'HV2' 'HV3' 'HV4' 'HV5' 'HV6' 'HV6A' 'HV6B' 'HV7' 'HV8' 'POLY1' 'POLY2' 'POLY3' 'POLY4' 'POLY5' 'POLY6' 'POLY7' 'POLY8' 'POLY9' 'POLY10' 'POLY11' 'POLY12' 'POLY13' 'POLY14' 'TTV' 'HBOV1' 'HBOVNOT1' 'HBV' 'HPV' 'VARV' 'SV40' 'CUTA' 'HERV'); declare -a ORGANS=( $(cat ../meta_data/meta_info.txt | tr ':' '\t' | awk '{ print $1;}') ); # printf "; "; diff --git a/system_files/internal_viral_names.txt b/system_files/internal_viral_names.txt new file mode 100644 index 0000000..8b26263 --- /dev/null +++ b/system_files/internal_viral_names.txt @@ -0,0 +1,35 @@ +B19 +HV1 +HV2 +HV3 +HV4 +HV5 +HV6 +HV6A +HV6B +HV7 +HV8 +POLY1 +POLY2 +POLY3 +POLY4 +POLY5 +POLY6 +POLY7 +POLY8 +POLY9 +POLY10 +POLY11 +POLY12 +POLY13 +POLY14 +HBV +HPV +TTV +HBOV1 +HBOVNOT1 +VARV +SV40 +CUTA +HERV + From d59c6bc104aae9b10d31fcf9efd4f35ca84807aa Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Sat, 14 Jun 2025 00:16:52 +0300 Subject: [PATCH 28/40] Fixes to Version Notes --- VersionNotes.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/VersionNotes.txt b/VersionNotes.txt index 33222e9..76c9646 100644 --- a/VersionNotes.txt +++ b/VersionNotes.txt @@ -7,5 +7,7 @@ Version 1.2.0dev Added General script for getting best result from top when virus group metadata is available If Metadata for the database is provided, virus groups from the database are used and mappings between accessions and viruses are used -Version 1.2.0dev + System file for default supported viruses created + virus_names in src directory now contains the viruses used in the analysis + From 48dcf5f6949a4400f06f88242df22a304646f5d9 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Sat, 14 Jun 2025 00:17:16 +0300 Subject: [PATCH 29/40] Fixes to version notes --- VersionNotes.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/VersionNotes.txt b/VersionNotes.txt index 76c9646..f1130ce 100644 --- a/VersionNotes.txt +++ b/VersionNotes.txt @@ -9,5 +9,3 @@ Version 1.2.0dev and mappings between accessions and viruses are used System file for default supported viruses created virus_names in src directory now contains the viruses used in the analysis - - From cfddf68a3fbd7183b50945e2f2b21f6d5f673b39 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Sat, 14 Jun 2025 00:18:51 +0300 Subject: [PATCH 30/40] Removed src/viral_names --- src/viral_names.txt | 35 ----------------------------------- 1 file changed, 35 deletions(-) delete mode 100755 src/viral_names.txt diff --git a/src/viral_names.txt b/src/viral_names.txt deleted file mode 100755 index 8b26263..0000000 --- a/src/viral_names.txt +++ /dev/null @@ -1,35 +0,0 @@ -B19 -HV1 -HV2 -HV3 -HV4 -HV5 -HV6 -HV6A -HV6B -HV7 -HV8 -POLY1 -POLY2 -POLY3 -POLY4 -POLY5 -POLY6 -POLY7 -POLY8 -POLY9 -POLY10 -POLY11 -POLY12 -POLY13 -POLY14 -HBV -HPV -TTV -HBOV1 -HBOVNOT1 -VARV -SV40 -CUTA -HERV - From e7b05521214dc3273c99804221dc41686b884c43 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Mon, 16 Jun 2025 16:35:35 +0300 Subject: [PATCH 31/40] Fixed fatal type in path to internal_viral_names Removed blank line from internal viral names which leads to a fatal check for empty virus filtering scripts --- src/TRACESPipe.sh | 2 +- system_files/internal_viral_names.txt | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 32a1e87..ca90489 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -12,7 +12,7 @@ ################################################################################## # SOURCE_DIR="$(dirname "$(readlink -f "$0")")" -DEFAULT_VIRAL_NAMES_FILE="../system_file/internal_viral_names.txt" +DEFAULT_VIRAL_NAMES_FILE="../system_files/internal_viral_names.txt" # SHOW_HELP=0; SHOW_VERSION=0; diff --git a/system_files/internal_viral_names.txt b/system_files/internal_viral_names.txt index 8b26263..bf2f98d 100644 --- a/system_files/internal_viral_names.txt +++ b/system_files/internal_viral_names.txt @@ -32,4 +32,3 @@ VARV SV40 CUTA HERV - From 0078c4cafb061aa96a25be4d7dbc7cad66c443f5 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Mon, 16 Jun 2025 22:56:18 +0300 Subject: [PATCH 32/40] Updated version notes --- Version.txt | 2 +- VersionNotes.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Version.txt b/Version.txt index f68815f..26aaba0 100644 --- a/Version.txt +++ b/Version.txt @@ -1 +1 @@ -1.2.0dev +1.2.0 diff --git a/VersionNotes.txt b/VersionNotes.txt index f1130ce..4a2f8f1 100644 --- a/VersionNotes.txt +++ b/VersionNotes.txt @@ -1,6 +1,6 @@ Version 1.1.4 GlobalSensitive and DynamicTop merged in; Added Versioning Files -Version 1.2.0dev +Version 1.2.0 Reorganized Usage Message into blocks for easier reading Added infrastructure for altVDB and VDBmetadata options Handled interactions between altVDB and internal database commands From 66ed3c728358bbb81d8f3df078d34c42dc0a9fcc Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Mon, 23 Jun 2025 08:42:14 +0300 Subject: [PATCH 33/40] Made internal_virus_names path relative to executable path rather than pwd --- src/TRACESPipe.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index ca90489..5097039 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -12,7 +12,7 @@ ################################################################################## # SOURCE_DIR="$(dirname "$(readlink -f "$0")")" -DEFAULT_VIRAL_NAMES_FILE="../system_files/internal_viral_names.txt" +DEFAULT_VIRAL_NAMES_FILE="$SOURCE_DIR/../system_files/internal_viral_names.txt" # SHOW_HELP=0; SHOW_VERSION=0; From 04588c215272d9def6d406c3abe3f040b40973f9 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Mon, 23 Jun 2025 14:52:19 +0300 Subject: [PATCH 34/40] Fixed Bug where RUN_SPECIFIC wasn't turned on by RUN_EXTREME --- Version.txt | 2 +- VersionNotes.txt | 2 ++ src/TRACESPipe.sh | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/Version.txt b/Version.txt index 26aaba0..6085e94 100644 --- a/Version.txt +++ b/Version.txt @@ -1 +1 @@ -1.2.0 +1.2.1 diff --git a/VersionNotes.txt b/VersionNotes.txt index 4a2f8f1..b5dc514 100644 --- a/VersionNotes.txt +++ b/VersionNotes.txt @@ -9,3 +9,5 @@ Version 1.2.0 and mappings between accessions and viruses are used System file for default supported viruses created virus_names in src directory now contains the viruses used in the analysis +Version 1.2.1 + diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 5097039..9aebd42 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -717,6 +717,7 @@ while [[ $# -gt 0 ]] ;; -rsx|--run-extreme) RUN_ANALYSIS=1; + RUN_SPECIFIC=1; RUN_SPECIFIC_SENSITIVE=1; SPECIFIC_ID="$2"; SHOW_HELP=0; From db72213fed7181ec64aa7f120919b7f2316579f9 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Mon, 23 Jun 2025 14:53:57 +0300 Subject: [PATCH 35/40] Fix version Notes --- VersionNotes.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/VersionNotes.txt b/VersionNotes.txt index b5dc514..087680f 100644 --- a/VersionNotes.txt +++ b/VersionNotes.txt @@ -10,4 +10,5 @@ Version 1.2.0 System file for default supported viruses created virus_names in src directory now contains the viruses used in the analysis Version 1.2.1 + Bug Fix - Run Extreme when specified alone now performs alignments From 698214d3fc703600dbeb0d5b235b9eb87292d642 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Wed, 25 Jun 2025 12:47:14 +0300 Subject: [PATCH 36/40] Added Min Sim length for metagenomics threshold --- Version.txt | 2 +- VersionNotes.txt | 2 ++ src/TRACESPipe.sh | 38 ++++++++++++++++++++++++++++------ src/TRACES_get_best_by_meta.sh | 7 ++++--- 4 files changed, 39 insertions(+), 10 deletions(-) diff --git a/Version.txt b/Version.txt index 6085e94..3e6e9a9 100644 --- a/Version.txt +++ b/Version.txt @@ -1 +1 @@ -1.2.1 +1.3.0dev diff --git a/VersionNotes.txt b/VersionNotes.txt index 087680f..e4fea94 100644 --- a/VersionNotes.txt +++ b/VersionNotes.txt @@ -12,3 +12,5 @@ Version 1.2.0 Version 1.2.1 Bug Fix - Run Extreme when specified alone now performs alignments +Version 1.3.0dev + diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 9aebd42..9ff85cc 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -123,6 +123,7 @@ VIRAL_DATABASE_FILE="VDB.fa" # DEFAULT VALUES: # MINIMAL_SIMILARITY_VALUE=1.0; +MINIMAL_SIMILARITY_LENGTH=0; TSIZE=2; TOP_SIZE=0; TOP_SIZE_VIR=0; @@ -357,14 +358,17 @@ CHECK_E_FILE () { # ALIGN_AND_CONSENSUS () { # + B_O_B="$3"; ORGAN="$8"; HIGH="$7"; DUPL="$6"; THREADS="$5"; V_TAG="$1"; IDX_TAG="$4"; + MIN_SIM="$2" + MIN_SIM_LEN="$9"; # - V_GID="X"; + V_GID="-"; # echo -e "\e[34m[TRACESPipe]\e[32m Assessing $V_TAG best reference ...\e[0m"; # @@ -375,15 +379,17 @@ ALIGN_AND_CONSENSUS () { else V_INFO=`./TRACES_get_best_$V_TAG.sh $ORGAN`; fi - V_GID=`echo "$V_INFO" | awk '{ print $2; }'`; - V_VAL=`echo "$V_INFO" | awk '{ print $1; }'`; + read -r V_GID V_VAL V_LEN <<< "$V_INFO"; + [ -z "$V_LEN" ] && V_LEN=1; # echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # - if [[ "$V_GID" != "-" ]] && [[ $(bc <<< "$V_VAL > $2") -eq 1 ]]; + if [[ "$V_GID" != "-" ]] && \ + [[ $(bc <<< "$V_VAL > $MIN_SIM") -eq 1 ]] && \ + [[ $(bc <<< "0.01 * $V_VAL * $V_LEN >= $MIN_SIM_LEN") -eq 1 ]]; then # - if [[ "$3" -eq "1" ]] + if [[ "$B_O_B" -eq "1" ]] then echo -e "\e[34m[TRACESPipe]\e[96m Best reference: $V_GID\e[0m"; V_GID=`sed "${IDX_TAG}q;d" ../output_data/TRACES_results/REPORT_META_VIRAL_BESTS_$ORGAN.txt | awk '{ print $2; }'`; @@ -395,6 +401,16 @@ ALIGN_AND_CONSENSUS () { CHECK_VDB; gto_fasta_extract_read_by_pattern -p "$V_GID" < "$VIRAL_DATABASE_FILE" | awk "/^>/ {n++} n>1 {exit} 1" > $ORGAN-$V_TAG.fa 2>> ../logs/Log-$ORGAN.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; + #In the non-metadata case the length check happens here + if [ -z "$V_LEN" ]; then + V_LEN=$(gto_fasta_info < "$ORGAN-$V_TAG.fa" | awk '(FNR ==2){print $NF}') + fi + #If the length is too short the rest of the function can be skipped after cleaning up the extracted reference + if [[ $(bc <<< "0.01 * $V_VAL * $V_LEN < $MIN_SIM_LEN") -eq 1 ]]; then + echo -e "\e[34m[TRACESPipe]\e[96m Similarity Length too short for best match: $V_INFO\t$V_LEN\e[0m"; + rm -f $ORGAN-$V_TAG.fa; + return 0; + fi echo -e "\e[34m[TRACESPipe]\e[32m Aligning reads to $V_TAG best reference with bowtie2 ...\e[0m"; ./TRACES_viral_align_reads.sh $ORGAN-$V_TAG.fa $ORGAN $V_TAG $THREADS $DUPL $HIGH 1>> ../logs/Log-stdout-$ORGAN.txt 2>> ../logs/Log-stderr-$ORGAN.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; @@ -577,6 +593,11 @@ while [[ $# -gt 0 ]] SHOW_HELP=0; shift 2 ;; + -misl|--min-similarity-len) + MINIMAL_SIMILARITY_LENGTH="$2"; + SHOW_HELP=0; + shift 2 + ;; -dwms|--download-mito-species) DOWNLOAD_MITO_SPECIES=1; SHOW_HELP=0; @@ -1072,6 +1093,11 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " Minimum similarity value to consider the " echo " sequence for alignment-consensus (filter), " echo " " + echo " -misl , --min-similarity-len " + echo " Minimum product of similarity value and " + echo " best hit sequence length for " + echo " alignment-consensus (filter), " + echo " " echo " -top , --view-top " echo " Display the top with the highest " echo " similarity (by descending order), " @@ -1909,7 +1935,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; IDX=1; for VIRUS in "${VIRUSES[@]}" do - ALIGN_AND_CONSENSUS "$VIRUS" "$MINIMAL_SIMILARITY_VALUE" "$RUN_BEST_OF_BESTS" "$IDX" "$THREADS" "$REMOVE_DUPLICATIONS" "$HIGH_SENSITIVITY" "$ORGAN_T" + ALIGN_AND_CONSENSUS "$VIRUS" "$MINIMAL_SIMILARITY_VALUE" "$RUN_BEST_OF_BESTS" "$IDX" "$THREADS" "$REMOVE_DUPLICATIONS" "$HIGH_SENSITIVITY" "$ORGAN_T" "$MINIMAL_SIMILARITY_LENGTH" ((++IDX)); done fi diff --git a/src/TRACES_get_best_by_meta.sh b/src/TRACES_get_best_by_meta.sh index 80864d5..c5f7fd6 100755 --- a/src/TRACES_get_best_by_meta.sh +++ b/src/TRACES_get_best_by_meta.sh @@ -4,7 +4,7 @@ Virus=$1; shift MetaFile=$1; shift # awk -v Virus="$Virus" -F '\\t' ' - BEGIN{OFS="\t";bestSim="-";bestAcc="-"} + BEGIN{OFS="\t";bestSim="-";bestAcc="-";bestLen="-"} (ARGIND == 1){if($2==Virus){InSet[$1]=1};next} { sub("NC_","NC-",$4); @@ -12,7 +12,8 @@ awk -v Virus="$Virus" -F '\\t' ' acc = a[1]; sub("NC-","NC_",acc); sim=$3; + len=$2; } - (InSet[acc] && sim > bestSim+0){ bestSim=sim; bestAcc=acc;} - END {print bestSim,bestAcc} + (InSet[acc] && sim > bestSim+0){ bestSim=sim; bestAcc=acc; bestLen=len;} + END {print bestSim,bestAcc,bestLen} ' "$MetaFile" "top-$ORGAN.csv" From a5995f7d7bb1d6f9f0fa1d6fb4d9656cec139f1e Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Wed, 25 Jun 2025 12:48:46 +0300 Subject: [PATCH 37/40] Fix Version Notes --- VersionNotes.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/VersionNotes.txt b/VersionNotes.txt index e4fea94..d766c47 100644 --- a/VersionNotes.txt +++ b/VersionNotes.txt @@ -11,6 +11,5 @@ Version 1.2.0 virus_names in src directory now contains the viruses used in the analysis Version 1.2.1 Bug Fix - Run Extreme when specified alone now performs alignments - Version 1.3.0dev - + Added Minimum Similarity Length threshold From 59d34c7492824da39e8e42ea97b1b64d8c174a10 Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Fri, 27 Jun 2025 11:45:06 +0300 Subject: [PATCH 38/40] Fixed bug in variable order leading to empty reference files before mapping --- src/TRACESPipe.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 9ff85cc..1e25d54 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -379,7 +379,7 @@ ALIGN_AND_CONSENSUS () { else V_INFO=`./TRACES_get_best_$V_TAG.sh $ORGAN`; fi - read -r V_GID V_VAL V_LEN <<< "$V_INFO"; + read -r V_VAL V_GID V_LEN <<< "$V_INFO"; [ -z "$V_LEN" ] && V_LEN=1; # echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; From c2317efa2006674564aa01adf0f57cfa845c164b Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Fri, 27 Jun 2025 11:47:13 +0300 Subject: [PATCH 39/40] Reference files no longer remain in the src directory after ALIGN_AND_CONSENSE --- src/TRACESPipe.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 1e25d54..dab1ddd 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -419,8 +419,8 @@ ALIGN_AND_CONSENSUS () { ./TRACES_viral_consensus.sh $ORGAN-$V_TAG.fa viral_aligned_sorted-$ORGAN-$V_TAG.bam $ORGAN $V_TAG 1>> ../logs/Log-stdout-$ORGAN.txt 2>> ../logs/Log-stderr-$ORGAN.txt; # mkdir -p ../output_data/TRACES_viral_alignments; - cp $ORGAN-$V_TAG.fa ../output_data/TRACES_viral_alignments/ - cp $ORGAN-$V_TAG.fa.fai ../output_data/TRACES_viral_alignments/ + mv $ORGAN-$V_TAG.fa ../output_data/TRACES_viral_alignments/ + mv $ORGAN-$V_TAG.fa.fai ../output_data/TRACES_viral_alignments/ mv viral_aligned_sorted-$ORGAN-$V_TAG.bam ../output_data/TRACES_viral_alignments/ mv viral_aligned_sorted-$ORGAN-$V_TAG.bam.bai ../output_data/TRACES_viral_alignments/ mkdir -p ../output_data/TRACES_viral_consensus; From ae986097ca66270b988d0aabf9f19ce111c9bcbc Mon Sep 17 00:00:00 2001 From: Zachery Dickson Date: Thu, 3 Jul 2025 16:11:28 +0300 Subject: [PATCH 40/40] Update VErsioning to incorporate MinSimLen Feature --- Version.txt | 2 +- VersionNotes.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Version.txt b/Version.txt index 3e6e9a9..f0bb29e 100644 --- a/Version.txt +++ b/Version.txt @@ -1 +1 @@ -1.3.0dev +1.3.0 diff --git a/VersionNotes.txt b/VersionNotes.txt index d766c47..1a37f81 100644 --- a/VersionNotes.txt +++ b/VersionNotes.txt @@ -11,5 +11,5 @@ Version 1.2.0 virus_names in src directory now contains the viruses used in the analysis Version 1.2.1 Bug Fix - Run Extreme when specified alone now performs alignments -Version 1.3.0dev +Version 1.3.0 Added Minimum Similarity Length threshold