diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9c053c6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.swp +tests diff --git a/Version.txt b/Version.txt new file mode 100644 index 0000000..f0bb29e --- /dev/null +++ b/Version.txt @@ -0,0 +1 @@ +1.3.0 diff --git a/VersionNotes.txt b/VersionNotes.txt new file mode 100644 index 0000000..1a37f81 --- /dev/null +++ b/VersionNotes.txt @@ -0,0 +1,15 @@ +Version 1.1.4 + GlobalSensitive and DynamicTop merged in; Added Versioning Files +Version 1.2.0 + Reorganized Usage Message into blocks for easier reading + Added infrastructure for altVDB and VDBmetadata options + Handled interactions between altVDB and internal database commands + Added General script for getting best result from top when virus group metadata is available + If Metadata for the database is provided, virus groups from the database are used + and mappings between accessions and viruses are used + System file for default supported viruses created + virus_names in src directory now contains the viruses used in the analysis +Version 1.2.1 + Bug Fix - Run Extreme when specified alone now performs alignments +Version 1.3.0 + Added Minimum Similarity Length threshold diff --git a/src/TRACESPipe.sh b/src/TRACESPipe.sh index 6ae2bfc..dab1ddd 100755 --- a/src/TRACESPipe.sh +++ b/src/TRACESPipe.sh @@ -11,6 +11,9 @@ # ============================================================================== # ################################################################################## # +SOURCE_DIR="$(dirname "$(readlink -f "$0")")" +DEFAULT_VIRAL_NAMES_FILE="$SOURCE_DIR/../system_files/internal_viral_names.txt" +# SHOW_HELP=0; SHOW_VERSION=0; FORCE=0; @@ -111,15 +114,20 @@ RUN_DIFF_ID=""; # RUN_BLAST_RECONSTRUCTED=0; # +B_ALT_VIRAL_DB=0; +VIRAL_DATABASE_METADATA=""; +VIRAL_DATABASE_FILE="VDB.fa" +# # ============================================================================== # # DEFAULT VALUES: # MINIMAL_SIMILARITY_VALUE=1.0; +MINIMAL_SIMILARITY_LENGTH=0; TSIZE=2; +TOP_SIZE=0; +TOP_SIZE_VIR=0; CACHE=70; -TOP_SIZE_VIR=8000; -TOP_SIZE=12000; # # ============================================================================== # THESE ARE THE CURRENT FLAGGED VIRUSES OR VIRUSES GROUPS FOR ENHANCED ASSEMBLY: @@ -129,26 +137,42 @@ TOP_SIZE=12000; # HERV IS CURRENTLY BEING ADDRESSED AS HAPLOID IN ALIGNMENTS -> THIS WILL # REQUIRE ADAPTATION IN THE FUTURE. # -declare -a VIRUSES=("B19" "HV1" "HV2" "HV3" "HV4" "HV5" "HV6" "HV6A" "HV6B" - "HV7" "HV8" "POLY1" "POLY2" "POLY3" "POLY4" "POLY5" - "POLY6" "POLY7" "POLY8" "POLY9" "POLY10" "POLY11" "POLY12" - "POLY13" "POLY14" "HBV" "HPV" "TTV" "HBOV1" "HBOVNOT1" - "VARV" "SV40" "CUTA" "HERV"); +#declare -a VIRUSES=("B19" "HV1" "HV2" "HV3" "HV4" "HV5" "HV6" "HV6A" "HV6B" +# "HV7" "HV8" "POLY1" "POLY2" "POLY3" "POLY4" "POLY5" +# "POLY6" "POLY7" "POLY8" "POLY9" "POLY10" "POLY11" "POLY12" +# "POLY13" "POLY14" "HBV" "HPV" "TTV" "HBOV1" "HBOVNOT1" +# "VARV" "SV40" "CUTA" "HERV"); +if [ -s "$DEFAULT_VIRAL_NAMES_FILE" ]; then + readarray -t VIRUSES < "$DEFAULT_VIRAL_NAMES_FILE"; +else + >&2 echo -e "\e[31mERROR: $DEFAULT_VIRAL_NAMES_FILE Not found!\e[0m" + >&2 echo "This file may have been deleted acidentally." + >&2 echo "System is corrupted!" + exit 1; +fi + # # ============================================================================== # CHECK INTERNAL SYSTEM FILES # CHECK_FILTERING_SYSTEM_FILES () { - for VIRUS in "${VIRUSES[@]}" - do - if [ ! -f TRACES_get_best_$VIRUS.sh ]; - then - echo -e "\e[31mERROR: TRACES_get_best_$VIRUS.sh file not found!\e[0m" - echo "This file may have been deleted acidentally or VIRAL array changed." - echo "System is corrupted!" - exit 1; + if [ -n "$VIRAL_DATABASE_METADATA" ]; then + if [ ! -f TRACES_get_best_by_meta.sh ]; then + echo -e "\e[31mERROR: TRACES_get_best_by_meta.sh file not found!\e[0m" + echo "This file may have been deleted acidentally." + echo "System is corrupted!" + exit 1; + fi + else + for VIRUS in "${VIRUSES[@]}"; do + if [ ! -f TRACES_get_best_$VIRUS.sh ]; then + echo -e "\e[31mERROR: TRACES_get_best_$VIRUS.sh file not found!\e[0m" + echo "This file may have been deleted acidentally or VIRAL array changed." + echo "System is corrupted!" + exit 1; + fi + done fi - done } # # ============================================================================== @@ -179,11 +203,15 @@ CHECK_GZIP_FILES () { # # CHECK_VDB () { - if [ ! -f VDB.fa ]; + if [ ! -f "$VIRAL_DATABASE_FILE" ]; then - echo -e "\e[31mERROR: viral database (VDB.fa) not found!\e[0m" - echo "TIP: before this, run: ./TRACESPipe.sh --build-viral" - echo "For addition information, see the instructions at the web page." + echo -e "\e[31mERROR: viral database ("$VIRAL_DATABASE_FILE") not found!\e[0m" + if [ "$B_ALT_VIRAL_DB" -eq 0 ]; then + echo "TIP: before this, run: ./TRACESPipe.sh --build-viral" + echo " OR" + echo "Use the --alt-viral-db option to specify an alternative VDB location" + echo "For addition information, see the instructions at the web page." + fi exit 1; fi } @@ -330,29 +358,38 @@ CHECK_E_FILE () { # ALIGN_AND_CONSENSUS () { # + B_O_B="$3"; ORGAN="$8"; HIGH="$7"; DUPL="$6"; THREADS="$5"; V_TAG="$1"; IDX_TAG="$4"; + MIN_SIM="$2" + MIN_SIM_LEN="$9"; # - V_GID="X"; + V_GID="-"; # echo -e "\e[34m[TRACESPipe]\e[32m Assessing $V_TAG best reference ...\e[0m"; # CHECK_TOP "$ORGAN"; # - V_INFO=`./TRACES_get_best_$V_TAG.sh $ORGAN`; - V_GID=`echo "$V_INFO" | awk '{ print $2; }'`; - V_VAL=`echo "$V_INFO" | awk '{ print $1; }'`; + if [ -n "$VIRAL_DATABASE_METADATA" ]; then + V_INFO="$(./TRACES_get_best_by_meta.sh "$ORGAN" "$V_TAG" "$VIRAL_DATABASE_METADATA")"; + else + V_INFO=`./TRACES_get_best_$V_TAG.sh $ORGAN`; + fi + read -r V_VAL V_GID V_LEN <<< "$V_INFO"; + [ -z "$V_LEN" ] && V_LEN=1; # echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # - if [[ "$V_GID" != "-" ]] && [[ $(bc <<< "$V_VAL > $2") -eq 1 ]]; + if [[ "$V_GID" != "-" ]] && \ + [[ $(bc <<< "$V_VAL > $MIN_SIM") -eq 1 ]] && \ + [[ $(bc <<< "0.01 * $V_VAL * $V_LEN >= $MIN_SIM_LEN") -eq 1 ]]; then # - if [[ "$3" -eq "1" ]] + if [[ "$B_O_B" -eq "1" ]] then echo -e "\e[34m[TRACESPipe]\e[96m Best reference: $V_GID\e[0m"; V_GID=`sed "${IDX_TAG}q;d" ../output_data/TRACES_results/REPORT_META_VIRAL_BESTS_$ORGAN.txt | awk '{ print $2; }'`; @@ -360,10 +397,20 @@ ALIGN_AND_CONSENSUS () { fi # echo -e "\e[34m[TRACESPipe]\e[96m Similarity best match: $V_INFO\e[0m"; - echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence from VDB.fa\e[0m"; + echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence from "$VIRAL_DATABASE_FILE"\e[0m"; CHECK_VDB; - gto_fasta_extract_read_by_pattern -p "$V_GID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > $ORGAN-$V_TAG.fa 2>> ../logs/Log-$ORGAN.txt; + gto_fasta_extract_read_by_pattern -p "$V_GID" < "$VIRAL_DATABASE_FILE" | awk "/^>/ {n++} n>1 {exit} 1" > $ORGAN-$V_TAG.fa 2>> ../logs/Log-$ORGAN.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; + #In the non-metadata case the length check happens here + if [ -z "$V_LEN" ]; then + V_LEN=$(gto_fasta_info < "$ORGAN-$V_TAG.fa" | awk '(FNR ==2){print $NF}') + fi + #If the length is too short the rest of the function can be skipped after cleaning up the extracted reference + if [[ $(bc <<< "0.01 * $V_VAL * $V_LEN < $MIN_SIM_LEN") -eq 1 ]]; then + echo -e "\e[34m[TRACESPipe]\e[96m Similarity Length too short for best match: $V_INFO\t$V_LEN\e[0m"; + rm -f $ORGAN-$V_TAG.fa; + return 0; + fi echo -e "\e[34m[TRACESPipe]\e[32m Aligning reads to $V_TAG best reference with bowtie2 ...\e[0m"; ./TRACES_viral_align_reads.sh $ORGAN-$V_TAG.fa $ORGAN $V_TAG $THREADS $DUPL $HIGH 1>> ../logs/Log-stdout-$ORGAN.txt 2>> ../logs/Log-stderr-$ORGAN.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; @@ -372,8 +419,8 @@ ALIGN_AND_CONSENSUS () { ./TRACES_viral_consensus.sh $ORGAN-$V_TAG.fa viral_aligned_sorted-$ORGAN-$V_TAG.bam $ORGAN $V_TAG 1>> ../logs/Log-stdout-$ORGAN.txt 2>> ../logs/Log-stderr-$ORGAN.txt; # mkdir -p ../output_data/TRACES_viral_alignments; - cp $ORGAN-$V_TAG.fa ../output_data/TRACES_viral_alignments/ - cp $ORGAN-$V_TAG.fa.fai ../output_data/TRACES_viral_alignments/ + mv $ORGAN-$V_TAG.fa ../output_data/TRACES_viral_alignments/ + mv $ORGAN-$V_TAG.fa.fai ../output_data/TRACES_viral_alignments/ mv viral_aligned_sorted-$ORGAN-$V_TAG.bam ../output_data/TRACES_viral_alignments/ mv viral_aligned_sorted-$ORGAN-$V_TAG.bam.bai ../output_data/TRACES_viral_alignments/ mkdir -p ../output_data/TRACES_viral_consensus; @@ -384,11 +431,11 @@ ALIGN_AND_CONSENSUS () { mv $V_TAG-calls-$ORGAN.bed ../output_data/TRACES_viral_bed/ mv $V_TAG-coverage-$ORGAN.bed ../output_data/TRACES_viral_bed/ mv $V_TAG-zero-coverage-$ORGAN.bed ../output_data/TRACES_viral_bed/ - mv $V_TAG-$ORGAN-calls.norm.flt-indels.vcf.gz ../output_data/TRACES_viral_bed/ + mv $V_TAG-$ORGAN-calls.vcf.gz ../output_data/TRACES_viral_bed/ mkdir -p ../output_data/TRACES_viral_statistics; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; echo -e "\e[34m[TRACESPipe]\e[32m Calculating coverage ...\e[0m"; - ./TRACES_overall_virus.sh $V_TAG $ORGAN + ./TRACES_overall.sh viral $V_TAG $ORGAN C_BREADTH=`cat ../output_data/TRACES_viral_statistics/$V_TAG-total-horizontal-coverage-$ORGAN.txt`; C_DEPTH=`cat ../output_data/TRACES_viral_statistics/$V_TAG-total-depth-coverage-$ORGAN.txt`; echo -e "\e[34m[TRACESPipe]\e[1m Breadth (H) coverage: $C_BREADTH \e[0m"; @@ -422,6 +469,11 @@ while [[ $# -gt 0 ]] FORCE=1; shift ;; + -avdb|--alt-viral-db) + B_ALT_VIRAL_DB=1; + VIRAL_DATABASE_FILE="$2" + shift 2; + ;; -flog|--flush-logs) FLUSH_LOGS=1; shift @@ -463,6 +515,10 @@ while [[ $# -gt 0 ]] SHOW_HELP=0; shift ;; + -vdbm|--viral-db-metadata) + VIRAL_DATABASE_METADATA="$2"; + shift 2 + ;; -vdbr|--build-viral-r) BUILD_VDB_REF=1; SHOW_HELP=0; @@ -523,6 +579,7 @@ while [[ $# -gt 0 ]] ;; -vhs|--very-sensitive) HIGH_SENSITIVITY=1; + RUN_SPECIFIC_SENSITIVE=1; SHOW_HELP=0; shift ;; @@ -536,6 +593,11 @@ while [[ $# -gt 0 ]] SHOW_HELP=0; shift 2 ;; + -misl|--min-similarity-len) + MINIMAL_SIMILARITY_LENGTH="$2"; + SHOW_HELP=0; + shift 2 + ;; -dwms|--download-mito-species) DOWNLOAD_MITO_SPECIES=1; SHOW_HELP=0; @@ -676,6 +738,7 @@ while [[ $# -gt 0 ]] ;; -rsx|--run-extreme) RUN_ANALYSIS=1; + RUN_SPECIFIC=1; RUN_SPECIFIC_SENSITIVE=1; SPECIFIC_ID="$2"; SHOW_HELP=0; @@ -825,16 +888,16 @@ while [[ $# -gt 0 ]] SHOW_HELP=0; shift 2; ;; - -tsv|--top-size-virus) - TOP_SIZE_VIR="$2"; - SHOW_HELP=0; - shift 2; - ;; -ts|--top-size) TOP_SIZE="$2"; SHOW_HELP=0; shift 2; ;; + -tsv|--top-size-virus) + TOP_SIZE_VIR="$2"; + SHOW_HELP=0; + shift 2; + ;; -*) # unknown option with small echo "Invalid arg ($1)!"; echo "For help, try: ./TRACESPipe.sh -h" @@ -866,11 +929,17 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " " echo -e "\e[93m Usage: ./TRACESPipe.sh [options] \e[0m" echo " " + echo -e "\e[34m =========== GENERAL OPTIONS ==========\e[0m" + echo " " echo " -h, --help Show this help message and exit, " echo " -v, --version Show the version and some information, " echo " -f, --force Force running and overwrite of files, " echo " -flog, --flush-logs Flush logs (delete logs), " echo " -fout, --flush-output Flush output data (delete all output_data), " + echo " -t , --threads Number of threads to use, " + echo " " + echo -e "\e[34m =========== SETUP COMMANDS ==========\e[0m" + echo " " echo " -i, --install Installation of all the tools, " echo " -up, --update Update all the tools in TRACESPipe, " echo " -spv, --show-prog-ver Show included programs versions, " @@ -878,8 +947,6 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " -st, --sample Creates human ref. VDB and sample organ, " echo " " echo " -gmt, --get-max-threads Get the number of maximum machine threads," - echo " -t , --threads " - echo " Number of threads to use, " echo " " echo " -dec, --decrypt Decrypt (all files in ../encrypted_data), " echo " -enc, --encrypt Encrypt (all files in ../to_encrypt_data)," @@ -889,9 +956,9 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " -udb, --build-unviral Build non viral database (control), " echo " " echo " -afs , --add-fasta " - echo " Add a FASTA sequence to the VDB.fa, " + echo " Add a FASTA sequence to the "$VIRAL_DATABASE_FILE", " echo " -aes , --add-extra-seq " - echo " Add extra sequence to the VDB.fa, " + echo " Add extra sequence to the "$VIRAL_DATABASE_FILE", " echo " -gx, --get-extra-vir Downloads/appends (VDB) extra viral seq, " echo " " echo " -gad, --gen-adapters Generate FASTA file with adapters, " @@ -921,51 +988,26 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " -cbn, --create-blast-db It creates a nucleotide blast database, " echo " -ubn, --update-blast-db It updates a nucleotide blast database, " echo " " + echo -e "\e[34m =========== ANALYSIS COMMANDS ==========\e[0m" + echo " " + echo " -ra, --run-analysis Run data analysis (core), " + echo " -all, --run-all Run all the options (excluding the specific). " + echo " " echo " -sfs , --search-blast-db " echo " It blasts the nucleotide (nt) blast DB, " - echo " " echo " -sfrs , --search-blast-remote-db " echo " It blasts remotly thenucleotide (nt) blast " echo " database (it requires internet connection), " echo " " - echo " -rdup, --remove-dup Remove duplications (e.g. PCR dup), " - echo " -vhs, --very-sensitive Aligns with very high sensitivity (slower), " - echo " " echo " -gbb, --best-of-bests Identifies the best of bests references " echo " between multiple organs [similar reference], " echo " " - echo " -iss , --inter-sim-size " - echo " Inter-genome similarity top size (control), " - echo " " echo " -rpro, --run-profiles Run complexity and relative profiles (control), " echo " " echo " -rpgi , --run-gid-complexity-profile " echo " Run complexity profiles by GID, " - echo " -cpwi , --complexity-profile-window " - echo " Complexity profile window size, " - echo " -cple , --complexity-profile-level " - echo " Complexity profile compression level [1;10], " - echo " " echo " -rm, --run-meta Run viral metagenomic identification, " echo " -ro, --run-meta-nv Run NON-viral metagenomic identification," - echo " " - echo " -mis , --min-similarity " - echo " Minimum similarity value to consider the " - echo " sequence for alignment-consensus (filter), " - echo " " - echo " -top , --view-top " - echo " Display the top with the highest " - echo " similarity (by descending order), " - echo " " - echo " -c , --cache " - echo " Cache to be used by FALCON-meta, " - echo " -tsv , --top-size-virus " - echo " Top size to be used by FALCON-meta when " - echo " using TRACES_metagenomic_viral.sh, " - echo " -ts , --top-size " - echo " Top size to be used by FALCON-meta when " - echo " using TRACES_metagenomic.sh, " - echo " " echo " -rava, --run-all-v-alig Run all viral align/sort/consensus seqs " echo " from a specific list, " echo " " @@ -977,7 +1019,9 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " " echo " -rsx , --run-extreme " echo " Run specific reference align/consensus " - echo " using extreme sensitivity, " + echo " using extreme sensitivity; " + echo " Retained for backwards compatibility; " + echo " Now an alias for -vhs -rsr , " echo " " echo " -rmt, --run-mito Run Mito align and consensus seq, " echo " -rmtd, --run-mito-dam Run Mito damage only, " @@ -1002,16 +1046,6 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " " echo " -covp , --coverage-profile " echo " Run coverage profile for specific BED file, " - echo " -cmax , --max-coverage " - echo " Maximum depth coverage (depth normalization), " - echo " -clog , --coverage-log-scale " - echo " Coverage profile logarithmic scale VALUE=Base, " - echo " -cwis , --coverage-window-size " - echo " Coverage window size for low-pass filter, " - echo " -cdro , --coverage-drop " - echo " Coverage drop size (sampling), " - echo " -covm , --coverage-min-x " - echo " Coverage minimum value for x-axis " echo " " echo " -diff, --run-diff Run diff -> reference and hybrid (ident/SNPs), " echo " " @@ -1022,8 +1056,75 @@ if [ "$SHOW_HELP" -eq "1" ]; echo " -brec, --blast-reconstructed " echo " Run local blast over reconstructed genomes, " echo " " - echo " -ra, --run-analysis Run data analysis (core), " - echo " -all, --run-all Run all the options (excluding the specific). " + echo " " + echo -e "\e[34m =========== ANALYSIS OPTIONS ==========\e[0m" + echo " " + echo " -avdb , --alt-viral-db " + echo " Specify a path to fasta file containing " + echo " viral sequences " + echo " Sequence names must include the accession" + echo " as the first field, either whitespace or" + echo " underscore (_) delimited (NC_ handled) " + echo " -vdbm , --viral-db-metadata " + echo " Specify a path to a tab del file which " + echo " has sequence GID/ACC in the first column" + echo " and a Name representing a virus label " + echo " in the second " + echo " This changes the meta-analysis behaviour:" + echo " rather than using internal virus names " + echo " and inclusion criteria, the relationships" + echo " defined in the provided file are used. " + echo " This allows the user to define groupings " + echo " of interest in the default, or user " + echo " provided viral database. " + echo " " + echo " -rdup, --remove-dup Remove duplications (e.g. PCR dup), " + echo " -vhs, --very-sensitive Aligns with very high sensitivity (slower), " + echo " " + echo " -iss , --inter-sim-size " + echo " Inter-genome similarity top size (control), " + echo " " + echo " -cpwi , --complexity-profile-window " + echo " Complexity profile window size, " + echo " -cple , --complexity-profile-level " + echo " Complexity profile compression level [1;10], " + echo " " + echo " -mis , --min-similarity " + echo " Minimum similarity value to consider the " + echo " sequence for alignment-consensus (filter), " + echo " " + echo " -misl , --min-similarity-len " + echo " Minimum product of similarity value and " + echo " best hit sequence length for " + echo " alignment-consensus (filter), " + echo " " + echo " -top , --view-top " + echo " Display the top with the highest " + echo " similarity (by descending order), " + echo " " + echo " -c , --cache " + echo " Cache to be used by FALCON-meta, " + echo " -tsv , --top-size-virus " + echo " Top size to be used by FALCON-meta when " + echo " using TRACES_metagenomic_viral.sh; " + echo " default:0 -> seq count in viral db " + echo " -ts , --top-size " + echo " Top size to be used by FALCON-meta when " + echo " using TRACES_metagenomic.sh; " + echo " default:0 -> seq count in non-viral db " + echo " " + echo " -cmax , --max-coverage " + echo " Maximum depth coverage (depth normalization), " + echo " -clog , --coverage-log-scale " + echo " Coverage profile logarithmic scale VALUE=Base, " + echo " -cwis , --coverage-window-size " + echo " Coverage window size for low-pass filter, " + echo " -cdro , --coverage-drop " + echo " Coverage drop size (sampling), " + echo " -covm , --coverage-min-x " + echo " Coverage minimum value for x-axis " + echo " " + echo -e "\e[34m =========== EXAMPLES ==========\e[0m" echo " " echo -e "\e[93m Ex: ./TRACESPipe.sh --flush-output --flush-logs --run-mito --run-meta \e[0m" echo -e "\e[93m --remove-dup --run-de-novo --run-hybrid --min-similarity 1 --run-diff \e[0m" @@ -1042,12 +1143,14 @@ if [ "$SHOW_HELP" -eq "1" ]; # ============================================================================== # VERSION # -if [ "$SHOW_VERSION" -eq "1" ]; - then +if [ "$SHOW_VERSION" -eq "1" ]; then + version="1.1.3"; + versionFile="$SOURCE_DIR/../Version.txt" + [ -f "$versionFile" ] && version=$(cat "$versionFile") echo " "; echo " TRACESPipe "; echo " "; - echo " Version: 1.0.2 "; + echo " Version: $version "; echo " "; echo " Department of Virology and "; echo " Department of Forensic Medicine, "; @@ -1057,6 +1160,7 @@ if [ "$SHOW_VERSION" -eq "1" ]; echo " University of Aveiro, Portugal. "; echo " "; echo " diogo.pratas@helsinki.fi "; + echo " zachery.dickson@helsinki.fi "; echo " "; exit 0; fi @@ -1072,6 +1176,28 @@ mkdir -p ../output_data/ # CHECK_PROGRAMS; # +# If viral database metadata was provided, parse it for the virus groups to analyze +if [ -n "$VIRAL_DATABASE_METADATA" ]; then + if [ -s "$VIRAL_DATABASE_METADATA" ]; then + #Parse the unique virus groups in the metadata (ignore lines without at least two columns + readarray -t VIRUSES < <(awk -F '\\t' ' + (FNR > 1 && $1 && $2){VirusSet[$2]=1;} + END {n=asorti(VirusSet,vl); for(i=1;i<=n;i++){print vl[i]}} + ' "$VIRAL_DATABASE_METADATA") + #Ensure at least one virus group is present + if [ "${#VIRUSES[@]}" -lt 1 ]; then + >&2 echo -e "\e[31mERROR: Provided Viral database Metadata file did not contain any valid accession-virus pairs\e[0m" + exit 1; + fi + else + >&2 echo -e "\e[31mERROR: Provided Viral database Metadata file is non-existant or empty\e[0m" + exit 1; + fi +fi +# Save the virus groups to be analyzed +rm -f viral_names.txt; +{ printf "%s\n" "${VIRUSES[@]}"; echo ""; } > viral_names.txt +# # DELETE OLD RUN FILES # if [[ "$RUN_DIFF" -eq "1" ]]; @@ -1403,24 +1529,36 @@ if [[ "$GET_CY" -eq "1" ]]; # if [[ "$ADD_FASTA" -eq "1" ]]; then + if [ "$B_ALT_VIRAL_DB" -eq 1 ]; then + >&2 echo -e "\e[31mERROR: Cannot add fasta when using an alternative viral DB\e[0m" + exit 1 + fi CHECK_VDB; - ./TRACES_add_fasta_to_database.sh VDB.fa $NEW_FASTA + ./TRACES_add_fasta_to_database.sh "$VIRAL_DATABASE_FILE" $NEW_FASTA fi # # ============================================================================== # if [[ "$GET_EXTRA" -eq "1" ]]; then + if [ "$B_ALT_VIRAL_DB" -eq 1 ]; then + >&2 echo -e "\e[31mERROR: Cannot add extra viral seq when using an alternative viral DB\e[0m" + exit 1 + fi CHECK_ENRICH; - ./TRACES_get_enriched_sequences.sh VDB.fa + ./TRACES_get_enriched_sequences.sh "$VIRAL_DATABASE_FILE" fi # # ============================================================================== # if [[ "$ADD_EXTRA_SEQ" -eq "1" ]]; then + if [ "$B_ALT_VIRAL_DB" -eq 1 ]; then + >&2 echo -e "\e[31mERROR: Cannot add extra seq when using an alternative viral DB\e[0m" + exit 1 + fi CHECK_VDB; - ./TRACES_get_extra_seq.sh VDB.fa $NEW_SEQ_ID + ./TRACES_get_extra_seq.sh "$VIRAL_DATABASE_FILE" $NEW_SEQ_ID fi # # @@ -1494,7 +1632,7 @@ if [[ "$RUN_GID_DAMAGE_ANALYSIS" -eq "1" ]]; echo -e "\e[34m[TRACESPipe]\e[93m Running organ=$ORGAN_T\e[0m"; # rm -f FW_READS.fq.gz RV_READS.fq.gz - echo -e "\e[34m[TRACESPipe]\e[32m Copping an instance of the files ...\e[0m"; + echo -e "\e[34m[TRACESPipe]\e[32m Copying an instance of the files ...\e[0m"; cp ../input_data/$SPL_Forward FW_READS.fq.gz; cp ../input_data/$SPL_Reverse RV_READS.fq.gz; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; @@ -1534,7 +1672,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; echo -e "\e[34m[TRACESPipe]\e[93m Running organ=$ORGAN_T\e[0m"; # rm -f FW_READS.fq.gz RV_READS.fq.gz - echo -e "\e[34m[TRACESPipe]\e[32m Copping an instance of the files ...\e[0m"; + echo -e "\e[34m[TRACESPipe]\e[32m Copying an instance of the files ...\e[0m"; cp ../input_data/$SPL_Forward FW_READS.fq.gz; cp ../input_data/$SPL_Reverse RV_READS.fq.gz; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; @@ -1557,6 +1695,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # CHECK_VDB; CHECK_PHIX; + [ $TOP_SIZE_VIR -eq 0 ] && TOP_SIZE_VIR=$(grep -c '^>' "$VIRAL_DATABASE_FILE"); # mv o_fw_pr.fq NP-o_fw_pr.fq; mv o_fw_unpr.fq NP-o_fw_unpr.fq; @@ -1565,7 +1704,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # echo -e "\e[34m[TRACESPipe]\e[32m Running viral metagenomic analysis with FALCON-meta ...\e[0m"; mkdir -p ../output_data/TRACES_results - ./TRACES_metagenomics_viral.sh $ORGAN_T VDB.fa $TOP_SIZE_VIR $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; + ./TRACES_metagenomics_viral.sh $ORGAN_T "$VIRAL_DATABASE_FILE" $TOP_SIZE_VIR $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # echo -e "\e[34m[TRACESPipe]\e[32m Finding the best references ...\e[0m"; @@ -1575,7 +1714,11 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; rm -f ../output_data/TRACES_results/REPORT_META_VIRAL_$ORGAN_T.txt; for VIRUS in "${VIRUSES[@]}" do - ./TRACES_get_best_$VIRUS.sh $ORGAN_T >> ../output_data/TRACES_results/REPORT_META_VIRAL_$ORGAN_T.txt + if [ -n "$VIRAL_DATABASE_METADATA" ]; then + ./TRACES_get_best_by_meta.sh "$ORGAN_T" "$VIRUS" "$VIRAL_DATABASE_METADATA" + else + ./TRACES_get_best_$VIRUS.sh $ORGAN_T + fi >> ../output_data/TRACES_results/REPORT_META_VIRAL_$ORGAN_T.txt done echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # @@ -1628,7 +1771,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; echo -e "\e[34m[TRACESPipe]\e[93m Organ=$ORGAN_T Forward=$SPL_Forward Reverse=$SPL_Reverse\e[0m"; # rm -f FW_READS.fq.gz RV_READS.fq.gz - echo -e "\e[34m[TRACESPipe]\e[32m Copping an instance of the files ...\e[0m"; + echo -e "\e[34m[TRACESPipe]\e[32m Copying an instance of the files ...\e[0m"; cp ../input_data/$SPL_Forward FW_READS.fq.gz; cp ../input_data/$SPL_Reverse RV_READS.fq.gz; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; @@ -1653,6 +1796,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; then CHECK_VDB; CHECK_PHIX; + [ $TOP_SIZE_VIR -eq 0 ] && TOP_SIZE_VIR=$(grep -c '^>' "$VIRAL_DATABASE_FILE"); # echo -e "\e[34m[TRACESPipe]\e[32m Removing PhiX from the samples with MAGNET ...\e[0m"; #./TRACES_remove_phix.sh $THREADS @@ -1671,7 +1815,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # echo -e "\e[34m[TRACESPipe]\e[32m Running viral metagenomic analysis with FALCON-meta ...\e[0m"; mkdir -p ../output_data/TRACES_results - ./TRACES_metagenomics_viral.sh $ORGAN_T VDB.fa $TOP_SIZE_VIR $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; + ./TRACES_metagenomics_viral.sh $ORGAN_T "$VIRAL_DATABASE_FILE" $TOP_SIZE_VIR $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # echo -e "\e[34m[TRACESPipe]\e[32m Finding the best references ...\e[0m"; @@ -1681,7 +1825,11 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; rm -f ../output_data/TRACES_results/REPORT_META_VIRAL_$ORGAN_T.txt; for VIRUS in "${VIRUSES[@]}" do - ./TRACES_get_best_$VIRUS.sh $ORGAN_T >> ../output_data/TRACES_results/REPORT_META_VIRAL_$ORGAN_T.txt + if [ -n "$VIRAL_DATABASE_METADATA" ]; then + ./TRACES_get_best_by_meta.sh "$ORGAN_T" "$VIRUS" "$VIRAL_DATABASE_METADATA" + else + ./TRACES_get_best_$VIRUS.sh $ORGAN_T + fi >> ../output_data/TRACES_results/REPORT_META_VIRAL_$ORGAN_T.txt done echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # @@ -1700,7 +1848,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # echo -e "\e[34m[TRACESPipe]\e[32m Building complexity profiles with gto ...\e[0m"; cat NP-o_fw_pr.fq NP-o_fw_unpr.fq NP-o_rv_pr.fq NP-o_rv_unpr.fq > P_TRACES_sample_reads.fq - ./TRACES_profiles.sh GIS-$ORGAN_T VDB.fa P_TRACES_sample_reads.fq $ORGAN_T 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; + ./TRACES_profiles.sh GIS-$ORGAN_T "$VIRAL_DATABASE_FILE" P_TRACES_sample_reads.fq $ORGAN_T 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; fi # @@ -1711,6 +1859,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; then # CHECK_DB; + [ $TOP_SIZE -eq 0 ] && TOP_SIZE=$(grep -c '^>' DB.fa); # echo -e "\e[34m[TRACESPipe]\e[32m Running NON viral metagenomic analysis with FALCON ...\e[0m"; ./TRACES_metagenomics.sh $ORGAN_T DB.fa $TOP_SIZE $THREADS $TSIZE $CACHE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; @@ -1747,50 +1896,10 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # CHECK_VDB; # - echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_ID\" from VDB.fa ...\e[0m"; - gto_fasta_extract_read_by_pattern -p "$SPECIFIC_ID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_ID.fa + echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_ID\" from "$VIRAL_DATABASE_FILE" ...\e[0m"; + gto_fasta_extract_read_by_pattern -p "$SPECIFIC_ID" < "$VIRAL_DATABASE_FILE" | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_ID.fa echo -e "\e[34m[TRACESPipe]\e[32m Aligning ... \e[0m"; - ./TRACES_viral_align_reads.sh SPECIFIC-$SPECIFIC_ID.fa $ORGAN_T $SPECIFIC_ID $THREADS $REMOVE_DUPLICATIONS 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; - echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; - # - echo -e "\e[34m[TRACESPipe]\e[32m Generate a consensus sequence with bcftools ...\e[0m"; - ./TRACES_viral_consensus.sh SPECIFIC-$SPECIFIC_ID.fa viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam $ORGAN_T $SPECIFIC_ID 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; - mkdir -p ../output_data/TRACES_specific_alignments; - #rm -f ../output_data/TRACES_specific_alignments/* - cp SPECIFIC-$SPECIFIC_ID.fa ../output_data/TRACES_specific_alignments/ - cp SPECIFIC-$SPECIFIC_ID.fa.fai ../output_data/TRACES_specific_alignments/ - mv viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam ../output_data/TRACES_specific_alignments/ - mv viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam.bai ../output_data/TRACES_specific_alignments/ - mkdir -p ../output_data/TRACES_specific_consensus; - mv $SPECIFIC_ID-consensus-$ORGAN_T.fa ../output_data/TRACES_specific_consensus/ - mkdir -p ../output_data/TRACES_specific_bed; - mv $SPECIFIC_ID-calls-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ - mv $SPECIFIC_ID-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ - mv $SPECIFIC_ID-zero-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ - mv $SPECIFIC_ID-$ORGAN_T-calls.norm.flt-indels.vcf.gz ../output_data/TRACES_specific_bed/ - mkdir -p ../output_data/TRACES_specific_statistics; - echo -e "\e[34m[TRACESPipe]\e[32m Calculating coverage for alignment-based assembly ...\e[0m"; - ./TRACES_overall_specific.sh $SPECIFIC_ID $ORGAN_T - C_BREADTH=`cat ../output_data/TRACES_specific_statistics/$SPECIFIC_ID-total-horizontal-coverage-$ORGAN_T.txt`; - C_DEPTH=`cat ../output_data/TRACES_specific_statistics/$SPECIFIC_ID-total-depth-coverage-$ORGAN_T.txt`; - echo -e "\e[34m[TRACESPipe]\e[1m Breadth (H) coverage: $C_BREADTH \e[0m"; - echo -e "\e[34m[TRACESPipe]\e[1m Depth-x (V) coverage: $C_DEPTH \e[0m"; - echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; - fi - # - # ========================================================================== - # RUN SPECIFIC SPECIFIC ALIGN/CONSENSUS WITH EXTREME HIGH SENSITIVITY - # - if [[ "$RUN_SPECIFIC_SENSITIVE" -eq "1" ]]; - then - echo -e "\e[34m[TRACESPipe]\e[32m Aligning reads to specific viral ref(s) with pattern \"$SPECIFIC_ID\" using bowtie2 with EXTREME sensitivity ...\e[0m"; - # - CHECK_VDB; - # - echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_ID\" from VDB.fa ...\e[0m"; - gto_fasta_extract_read_by_pattern -p "$SPECIFIC_ID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_ID.fa - echo -e "\e[34m[TRACESPipe]\e[32m Aligning ...\e[0m"; - ./TRACES_viral_sensitive_align_reads.sh SPECIFIC-$SPECIFIC_ID.fa $ORGAN_T $SPECIFIC_ID $THREADS $REMOVE_DUPLICATIONS 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; + ./TRACES_viral_align_reads.sh SPECIFIC-$SPECIFIC_ID.fa $ORGAN_T $SPECIFIC_ID $THREADS $REMOVE_DUPLICATIONS $RUN_SPECIFIC_SENSITIVE 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; # echo -e "\e[34m[TRACESPipe]\e[32m Generate a consensus sequence with bcftools ...\e[0m"; @@ -1802,24 +1911,22 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; mv viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam ../output_data/TRACES_specific_alignments/ mv viral_aligned_sorted-$ORGAN_T-$SPECIFIC_ID.bam.bai ../output_data/TRACES_specific_alignments/ mkdir -p ../output_data/TRACES_specific_consensus; - #rm -f ../output_data/TRACES_specific_consensus/* mv $SPECIFIC_ID-consensus-$ORGAN_T.fa ../output_data/TRACES_specific_consensus/ mkdir -p ../output_data/TRACES_specific_bed; - #rm -f ../output_data/TRACES_specific_bed/* mv $SPECIFIC_ID-calls-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ mv $SPECIFIC_ID-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ mv $SPECIFIC_ID-zero-coverage-$ORGAN_T.bed ../output_data/TRACES_specific_bed/ - mv $SPECIFIC_ID-$ORGAN_T-calls.norm.flt-indels.vcf.gz ../output_data/TRACES_specific_bed/ + mv $SPECIFIC_ID-$ORGAN_T-calls.vcf.gz ../output_data/TRACES_specific_bed/ mkdir -p ../output_data/TRACES_specific_statistics; echo -e "\e[34m[TRACESPipe]\e[32m Calculating coverage for alignment-based assembly ...\e[0m"; - ./TRACES_overall_specific.sh $SPECIFIC_ID $ORGAN_T + ./TRACES_overall.sh specific $SPECIFIC_ID $ORGAN_T C_BREADTH=`cat ../output_data/TRACES_specific_statistics/$SPECIFIC_ID-total-horizontal-coverage-$ORGAN_T.txt`; C_DEPTH=`cat ../output_data/TRACES_specific_statistics/$SPECIFIC_ID-total-depth-coverage-$ORGAN_T.txt`; echo -e "\e[34m[TRACESPipe]\e[1m Breadth (H) coverage: $C_BREADTH \e[0m"; echo -e "\e[34m[TRACESPipe]\e[1m Depth-x (V) coverage: $C_DEPTH \e[0m"; echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m"; fi - # + # # ========================================================================== # DETAILED VIRAL ALIGN/CONSENSUS # @@ -1828,7 +1935,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; IDX=1; for VIRUS in "${VIRUSES[@]}" do - ALIGN_AND_CONSENSUS "$VIRUS" "$MINIMAL_SIMILARITY_VALUE" "$RUN_BEST_OF_BESTS" "$IDX" "$THREADS" "$REMOVE_DUPLICATIONS" "$HIGH_SENSITIVITY" "$ORGAN_T" + ALIGN_AND_CONSENSUS "$VIRUS" "$MINIMAL_SIMILARITY_VALUE" "$RUN_BEST_OF_BESTS" "$IDX" "$THREADS" "$REMOVE_DUPLICATIONS" "$HIGH_SENSITIVITY" "$ORGAN_T" "$MINIMAL_SIMILARITY_LENGTH" ((++IDX)); done fi @@ -1866,7 +1973,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; mv mt-zero-coverage-$ORGAN_T.bed ../output_data/TRACES_mtdna_bed/ mkdir -p ../output_data/TRACES_mtdna_statistics; echo -e "\e[34m[TRACESPipe]\e[32m Calculating coverage for alignment-based assembly ...\e[0m"; - ./TRACES_overall_mtdna.sh $ORGAN_T + ./TRACES_overall.sh mtdna mt $ORGAN_T C_BREADTH=`cat ../output_data/TRACES_mtdna_statistics/mt-total-horizontal-coverage-$ORGAN_T.txt`; C_DEPTH=`cat ../output_data/TRACES_mtdna_statistics/mt-total-depth-coverage-$ORGAN_T.txt`; echo -e "\e[34m[TRACESPipe]\e[35m Breadth (H) coverage: $C_BREADTH \e[0m"; @@ -1929,7 +2036,7 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; mv cy-coverage-$ORGAN_T.bed ../output_data/TRACES_cy_bed/ mv cy-zero-coverage-$ORGAN_T.bed ../output_data/TRACES_cy_bed/ mkdir -p ../output_data/TRACES_cy_statistics; - ./TRACES_overall_cy.sh $ORGAN_T + ./TRACES_overall.sh cy cy $ORGAN_T echo -e "\e[34m[TRACESPipe]\e[32m Done!\e[0m" fi # @@ -2147,7 +2254,11 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; SNPS=`cat out.report | grep TotalSNPs | awk '{ print $2;}'`; XBREADTH=`awk '{ print $3;}' ../output_data/TRACES_viral_statistics/$VIRUS-total-horizontal-coverage-$ORGAN_T.txt`; XDEPTH=`awk '{ print $3;}' ../output_data/TRACES_viral_statistics/$VIRUS-total-depth-coverage-$ORGAN_T.txt`; - V_INFO=`./TRACES_get_best_$VIRUS.sh $ORGAN_T`; + if [ -n "$VIRAL_DATABASE_METADATA" ]; then + V_INFO="$(./TRACES_get_best_by_meta.sh "$ORGAN" "$VIRUS" "$VIRAL_DATABASE_METADATA")"; + else + V_INFO=`./TRACES_get_best_$VIRUS.sh $ORGAN`; + fi V_GID=`echo "$V_INFO" | awk '{ print $2; }'`; V_VAL=`echo "$V_INFO" | awk '{ print $1; }'`; # @@ -2199,8 +2310,8 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; # CHECK_VDB; # - echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_DENOVO_ID\" from VDB.fa ...\e[0m"; - gto_fasta_extract_read_by_pattern -p "$SPECIFIC_DENOVO_ID" < VDB.fa | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_DENOVO_ID.fa + echo -e "\e[34m[TRACESPipe]\e[32m Extracting sequence with pattern \"$SPECIFIC_DENOVO_ID\" from "$VIRAL_DATABASE_FILE" ...\e[0m"; + gto_fasta_extract_read_by_pattern -p "$SPECIFIC_DENOVO_ID" < "$VIRAL_DATABASE_FILE" | awk "/^>/ {n++} n>1 {exit} 1" > SPECIFIC-$SPECIFIC_DENOVO_ID.fa echo -e "\e[34m[TRACESPipe]\e[32m Aligning and creating consensus ... \e[0m"; SCAFFOLDS_PATH="../output_data/TRACES_denovo_$ORGAN_T/scaffolds.fasta"; ./TRACES_denovo_specific.sh SPECIFIC-$SPECIFIC_DENOVO_ID.fa $SCAFFOLDS_PATH $SPECIFIC_DENOVO_ID $ORGAN_T $THREADS 1>> ../logs/Log-stdout-$ORGAN_T.txt 2>> ../logs/Log-stderr-$ORGAN_T.txt; @@ -2337,9 +2448,9 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; then IDBLAST=`head -n 1 $BLASTS_OUTPUT/SINGLE-LIST-$VIRUS-$ORGAN.txt | awk '{ print $2}' | tr '|' '\t' | awk '{ print $4;}'`; # - # Try to find genome in VDB.fa + # Try to find genome in "$VIRAL_DATABASE_FILE" CHECK_VDB; - gto_fasta_extract_read_by_pattern -p "$IDBLAST" < VDB.fa \ + gto_fasta_extract_read_by_pattern -p "$IDBLAST" < "$VIRAL_DATABASE_FILE" \ | awk "/^>/ {n++} n>1 {exit} 1" > TMP-BEST-BLAST.fa NVALSEQ=`wc -l TMP-BEST-BLAST.fa | awk '{ print $1;}'`; if [[ "$NVALSEQ" -eq "0" ]]; @@ -2360,7 +2471,11 @@ if [[ "$RUN_ANALYSIS" -eq "1" ]]; SNPS=`cat out.report | grep TotalSNPs | awk '{ print $2;}'`; XBREADTH=`awk '{ print $3;}' ../output_data/TRACES_viral_statistics/$VIRUS-total-horizontal-coverage-$ORGAN.txt`; XDEPTH=`awk '{ print $3;}' ../output_data/TRACES_viral_statistics/$VIRUS-total-depth-coverage-$ORGAN.txt`; - V_INFO=`./TRACES_get_best_$VIRUS.sh $ORGAN`; + if [ -n "$VIRAL_DATABASE_METADATA" ]; then + V_INFO="$(./TRACES_get_best_by_meta.sh "$ORGAN" "$VIRUS" "$VIRAL_DATABASE_METADATA")"; + else + V_INFO=`./TRACES_get_best_$VIRUS.sh $ORGAN`; + fi V_GID=`echo "$V_INFO" | awk '{ print $2; }'`; V_VAL=`echo "$V_INFO" | awk '{ print $1; }'`; # diff --git a/src/TRACES_coverage_table.sh b/src/TRACES_coverage_table.sh index 66be5a1..1ddc2b8 100755 --- a/src/TRACES_coverage_table.sh +++ b/src/TRACES_coverage_table.sh @@ -1,6 +1,7 @@ #!/bin/bash # -declare -a VIRUSES=('B19' 'HV1' 'HV2' 'HV3' 'HV4' 'HV5' 'HV6' 'HV6A' 'HV6B' 'HV7' 'HV8' 'POLY1' 'POLY2' 'POLY3' 'POLY4' 'POLY5' 'POLY6' 'POLY7' 'POLY8' 'POLY9' 'POLY10' 'POLY11' 'POLY12' 'POLY13' 'POLY14' 'TTV' 'HBOV1' 'HBOVNOT1' 'HBV' 'HPV' 'VARV' 'SV40' 'CUTA' 'HERV'); +readarray -t VIRUSES < viral_names.txt +#declare -a VIRUSES=('B19' 'HV1' 'HV2' 'HV3' 'HV4' 'HV5' 'HV6' 'HV6A' 'HV6B' 'HV7' 'HV8' 'POLY1' 'POLY2' 'POLY3' 'POLY4' 'POLY5' 'POLY6' 'POLY7' 'POLY8' 'POLY9' 'POLY10' 'POLY11' 'POLY12' 'POLY13' 'POLY14' 'TTV' 'HBOV1' 'HBOVNOT1' 'HBV' 'HPV' 'VARV' 'SV40' 'CUTA' 'HERV'); declare -a ORGANS=( $(cat ../meta_data/meta_info.txt | tr ':' '\t' | awk '{ print $1;}') ); # printf "; "; diff --git a/src/TRACES_cy_consensus.sh b/src/TRACES_cy_consensus.sh index ffa37eb..82646a1 100755 --- a/src/TRACES_cy_consensus.sh +++ b/src/TRACES_cy_consensus.sh @@ -17,21 +17,25 @@ awk '$4 < 1' cy-coverage-$Organ.bed > cy-zero-coverage-$Organ.bed # # CALLS W CONSENSUS samtools faidx $Reference # -P 9.9e-1 -bcftools mpileup -Ou -f $Reference $Alignments | bcftools call --ploidy 1 -P 9.9e-1 -mv -Oz -o calls-cy-$Organ.vcf.gz -bcftools index calls-cy-$Organ.vcf.gz +rawCalls="raw-cy-$Organ.vcf.gz" +bcftools mpileup -Ov -f $Reference $Alignments | bcftools call --ploidy 1 -P 9.9e-1 -mv -Oz -o "$rawCalls" +bcftools index "$rawCalls" # # normalize indels -bcftools norm -f $Reference calls-cy-$Organ.vcf.gz -Oz -o calls-cy-$Organ.norm.vcf.gz +bcftools norm -f $Reference "$rawCalls" -Oz -o calls-cy-$Organ.norm.vcf.gz # # filter adjacent indels within 5bp bcftools filter --IndelGap 5 calls-cy-$Organ.norm.vcf.gz -Oz -o calls-cy-$Organ.norm.flt-indels.vcf.gz +# filter incompatible variants +finalVCF="calls-cy-$Organ.vcf.gz" +./TRACES_filter_incompatible_variants.sh calls-cy-$Organ.norm.flt-indels.vcf.gz >| $finalVCF; # # create bed file -zcat calls-cy-$Organ.norm.flt-indels.vcf.gz |vcf2bed --snvs > cy-calls-$Organ.bed +zcat "$finalVCF" | vcf2bed --snvs > cy-calls-$Organ.bed # # CONSENSUS -tabix -f calls-cy-$Organ.norm.flt-indels.vcf.gz -bcftools consensus -m cy-zero-coverage-$Organ.bed -f $Reference calls-cy-$Organ.norm.flt-indels.vcf.gz > cy-consensus-$Organ.fa +tabix -f "$finalVCF" +bcftools consensus -m cy-zero-coverage-$Organ.bed -f $Reference $finalVCF > cy-consensus-$Organ.fa # # Give new header name for the consensus sequence tail -n +2 cy-consensus-$Organ.fa > CY_TMP_FILE_$Organ.xki @@ -40,6 +44,6 @@ cat CY_TMP_FILE_$Organ.xki >> cy-consensus-$Organ.fa rm -f CY_TMP_FILE_$Organ.xki; # # -rm -f calls-cy-$Organ.vcf.gz calls-cy-$Organ.vcf.gz.csi calls-cy-$Organ.norm.bcf calls-cy-$Organ.norm.flt-indels.bcf calls-cy-$Organ.norm.flt-indels.vcf.gz calls-cy-$Organ.norm.vcf.gz; +rm -f "$rawCalls" "$rawCalls.csi" calls-cy-$Organ.norm.bcf calls-cy-$Organ.norm.flt-indels.bcf calls-cy-$Organ.norm.flt-indels.vcf.gz calls-cy-$Organ.norm.vcf.gz "$finalVCF" ${finalVCF/.vcf.gz/.bcf}; "$finalVCF.tbi" # # diff --git a/src/TRACES_filter_incompatible_variants.sh b/src/TRACES_filter_incompatible_variants.sh new file mode 100755 index 0000000..9961439 --- /dev/null +++ b/src/TRACES_filter_incompatible_variants.sh @@ -0,0 +1,85 @@ +#!/bin/bash + +#This script is intended to pass over a vcf file and remove conflicting variants +#The choice is made first by higher quality, then by VCF order +#Confilicting alleles are snps at the same position as an insertion +# and other variants which appear in the region covered by a deletion +#If there are multiple variants overlapping a position, the best one is retained. +#Variants are processed in an order such that some conflicts may be resolved +# Example two variant sites covered by a deletion, where the deletion is lower +# quality than either variant: the deletion is filtered and the two variants are +# retained. In this way the maximum number of variants are +# +#Written by Zachery Dickson, September 2024 +# Contact: dicksoz@mcmaster.ca +# zachery.dickson@helski.fi + +if [ "$#" -lt 1 ]; then + >&2 echo -e "Usage: $(basename $0) In.vcf[.gz] > Out.vcf.gz\n" \ + "\tFinds conflicting variants, selects the maximum conflict free set\n" \ + "\tAssumes variants are sorted by position"; + exit 1; +fi + +file=$1;shift; +if file $file | grep -q gzip; then + zcat $file; +else + cat $file; +fi | + awk -F '\\t' -v call="$0 $file" ' + #j > i + function testOverlap(i,j) { + if(chrom[j] != chrom[i]){return 0;} + if(left[j] > right[i]){return 0;} + return 1; + } + BEGIN{OFS="\t"} + /^##/{print;next;} + /^#CHROM/{ + print "##incompatFilterCommand="call; + print; + next + } + { + counter++; + fullLine[counter] = $0; + qual[counter] = $6; + chrom[counter] = $1; + left[counter] = $2; + len = length($4) +# if(length($5) > len){len = length($5)} + right[counter] = $2+len-1; + sortableLength[counter] = sprintf("%d%06d",len,counter); + } + END { + #Process Variants in order of length + n=asort(sortableLength); + for(i = n; i >= 1; i--){ + #Iterate from longest to shortest + idx = substr(sortableLength[i],length(sortableLength[i])-5)+0 + if(Filtered[idx]){continue;} + jdx = idx; + bestQual=qual[idx]; + #Check all following entries as long as this variant is best, + # we have not checked all variants, and the next one overlaps + while(bestQual==qual[idx] && jdx < counter && + testOverlap(idx,jdx+1)){ + jdx++; + if(qual[jdx] > qual[idx]){ + Filtered[idx] = 1; + bestQual = qual[jdx]; + } else { + Filtered[jdx] = 1; + } + } + } + for(idx=1;idx<=counter; idx++){ + if(!Filtered[idx]){ + print fullLine[idx]; + } + } + } + ' | + bgzip; + diff --git a/src/TRACES_get_best_by_meta.sh b/src/TRACES_get_best_by_meta.sh new file mode 100755 index 0000000..c5f7fd6 --- /dev/null +++ b/src/TRACES_get_best_by_meta.sh @@ -0,0 +1,19 @@ +#!/bin/bash +ORGAN=$1; shift +Virus=$1; shift +MetaFile=$1; shift +# +awk -v Virus="$Virus" -F '\\t' ' + BEGIN{OFS="\t";bestSim="-";bestAcc="-";bestLen="-"} + (ARGIND == 1){if($2==Virus){InSet[$1]=1};next} + { + sub("NC_","NC-",$4); + split($4,a,"_"); + acc = a[1]; + sub("NC-","NC_",acc); + sim=$3; + len=$2; + } + (InSet[acc] && sim > bestSim+0){ bestSim=sim; bestAcc=acc; bestLen=len;} + END {print bestSim,bestAcc,bestLen} +' "$MetaFile" "top-$ORGAN.csv" diff --git a/src/TRACES_hybrid_consensus.sh b/src/TRACES_hybrid_consensus.sh index 4562210..f0fc593 100755 --- a/src/TRACES_hybrid_consensus.sh +++ b/src/TRACES_hybrid_consensus.sh @@ -20,22 +20,26 @@ awk '$4 < 1' $Label-coverage-$Organ.bed > $Label-zero-coverage-$Organ.bed # # CALLS samtools faidx $Reference # -P 9.9e-1 # uses the new variations for consensus -bcftools mpileup -Ou -f $Reference $Alignments | bcftools call --ploidy 1 -M -mv -Oz -o $Label-$Organ-calls.vcf.gz -bcftools index $Label-$Organ-calls.vcf.gz +rawCalls="$Label-$Organ-raw.vcf.gz" +bcftools mpileup -Ov -f $Reference $Alignments | bcftools call --ploidy 1 -M -mv -Oz -o "$rawCalls" +bcftools index "$rawCalls" # # normalize indels -bcftools norm -f $Reference $Label-$Organ-calls.vcf.gz -Oz -o $Label-$Organ-calls.norm.vcf.gz +bcftools norm -f $Reference "$rawCalls" -Oz -o $Label-$Organ-calls.norm.vcf.gz # # filter adjacent indels within 5bp #bcftools filter --IndelGap 5 $Label-$Organ-calls.norm.vcf.gz -Oz -o $Label-$Organ-calls.norm.flt-indels.vcf.gz +# filter incompatible variants +finalVCF="$Label-$Organ-calls.vcf.gz" +./TRACES_filter_incompatible_variants.sh $Label-$Organ-calls.norm.vcf.gz >| "$finalVCF"; # # create bed file #zcat $Label-$Organ-calls.norm.flt-indels.vcf.gz |vcf2bed --snvs > $Label-calls-$Organ.bed -zcat $Label-$Organ-calls.norm.vcf.gz |vcf2bed --snvs > $Label-calls-$Organ.bed +zcat "$finalVCF" | vcf2bed --snvs > $Label-calls-$Organ.bed # # CONSENSUS -tabix -f $Label-$Organ-calls.norm.vcf.gz -bcftools consensus -m $Label-zero-coverage-$Organ.bed -f $Reference $Label-$Organ-calls.norm.vcf.gz > $Label-consensus-$Organ.fa +tabix -f "$finalVCF" +bcftools consensus -m $Label-zero-coverage-$Organ.bed -f $Reference "$finalVCF" > $Label-consensus-$Organ.fa # # Give new header name for the consensus sequence tail -n +2 $Label-consensus-$Organ.fa > $Label-$Organ-TMP_FILE.xki @@ -44,6 +48,6 @@ cat $Label-$Organ-TMP_FILE.xki >> $Label-consensus-$Organ.fa rm -f $Label-$Organ-TMP_FILE.xki; # # -rm -f $Label-$Organ-calls.vcf.gz $Label-$Organ-calls.vcf.gz.csi $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.vcf.gz.csi $Label-$Organ-calls.norm.vcf.gz; +rm -f "$rawCalls" "$rawCalls.csi" $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.vcf.gz.csi $Label-$Organ-calls.norm.vcf.gz "$finalVCF" "$finalVCF.tbi"; # # diff --git a/src/TRACES_install.sh b/src/TRACES_install.sh index 696182c..b014723 100755 --- a/src/TRACES_install.sh +++ b/src/TRACES_install.sh @@ -45,10 +45,10 @@ if [[ "$INSTALL_PIPELINE" -eq "1" ]]; conda install -c bioconda spades --yes conda install -c bioconda igv --yes conda install -c bioconda bowtie2 --yes - conda install -c bioconda samtools=1.9 --yes - conda install -c bioconda bcftools=1.9 --yes + conda install -c bioconda samtools=1.21 --yes + conda install -c bioconda bcftools=1.21 --yes conda install -c bioconda bedops --yes - conda install -c bioconda bedtools --yes + conda install -c bioconda bedtools=2.31.1 --yes conda install -c bioconda fastq-pair --yes conda install -c bioconda entrez-direct --yes conda install -c bioconda/label/cf201901 entrez-direct --yes diff --git a/src/TRACES_mt_consensus.sh b/src/TRACES_mt_consensus.sh index 05012b7..978ade2 100755 --- a/src/TRACES_mt_consensus.sh +++ b/src/TRACES_mt_consensus.sh @@ -22,21 +22,25 @@ awk '$4 < 1' $Label-coverage-$Organ.bed > $Label-zero-coverage-$Organ.bed # CHAN # # CALLS samtools faidx $Reference # -P 9.9e-1 -bcftools mpileup -Ou -f $Reference $Alignments | bcftools call --ploidy 1 -P 9.9e-1 -mv -Oz -o $Label-$Organ-calls.vcf.gz -bcftools index $Label-$Organ-calls.vcf.gz +rawCalls="$Label-$Organ-raw.vcf.gz" +bcftools mpileup -Ov -f $Reference $Alignments | bcftools call --ploidy 1 -P 9.9e-1 -mv -Oz -o "$rawCalls" +bcftools index "$rawCalls" # # normalize indels -bcftools norm -f $Reference $Label-$Organ-calls.vcf.gz -Oz -o $Label-$Organ-calls.norm.vcf.gz +bcftools norm -f $Reference "$rawCalls" -Oz -o $Label-$Organ-calls.norm.vcf.gz # # filter adjacent indels within 5bp bcftools filter --IndelGap 5 $Label-$Organ-calls.norm.vcf.gz -Oz -o $Label-$Organ-calls.norm.flt-indels.vcf.gz +# filter incompatible variants +finalVCF="$Label-$Organ-calls.vcf.gz" +./TRACES_filter_incompatible_variants.sh $Label-$Organ-calls.norm.flt-indels.vcf.gz >| "$finalVCF"; # # create bed file -zcat $Label-$Organ-calls.norm.flt-indels.vcf.gz |vcf2bed --snvs > $Label-calls-$Organ.bed +zcat "$finalVCF" |vcf2bed --snvs > $Label-calls-$Organ.bed # # CONSENSUS -tabix -f $Label-$Organ-calls.norm.flt-indels.vcf.gz -bcftools consensus -m $Label-zero-coverage-$Organ.bed -f $Reference $Label-$Organ-calls.norm.flt-indels.vcf.gz > $Label-consensus-$Organ.fa +tabix -f "$finalVCF" +bcftools consensus -m $Label-zero-coverage-$Organ.bed -f $Reference "$finalVCF" > $Label-consensus-$Organ.fa # # Give new header name for the consensus sequence tail -n +2 $Label-consensus-$Organ.fa > $Label-TMP_FILE_$Organ.xki @@ -45,6 +49,6 @@ cat $Label-TMP_FILE_$Organ.xki >> $Label-consensus-$Organ.fa rm -f $Label-TMP_FILE_$Organ.xki; # # -rm -f $Label-$Organ-calls.vcf.gz $Label-$Organ-calls.vcf.gz.csi $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.flt-indels.bcf $Label-$Organ-calls.norm.flt-indels.vcf.gz $Label-$Organ-calls.norm.flt-indels.vcf.gz.csi $Label-$Organ-calls.norm.vcf.gz; +rm -f "$rawCalls" "$rawCalls.csi" $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.vcf.gz $Label-$Organ-calls.norm.flt-indels.bcf $Label-$Organ-calls.norm.flt-indels.vcf.gz $Label-$Organ-calls.norm.flt-indels.vcf.gz.csi "$finalVCF" "$finalVCF.tbi" "${finalVCF/.vcf.gz/.bcf}"; # # diff --git a/src/TRACES_multiorgan_consensus.sh b/src/TRACES_multiorgan_consensus.sh index d64ac84..01a62ca 100755 --- a/src/TRACES_multiorgan_consensus.sh +++ b/src/TRACES_multiorgan_consensus.sh @@ -35,13 +35,17 @@ samtools index $FIELD-data_aligned_sorted.bam $FIELD-data_aligned_sorted.bam.bai bedtools genomecov -ibam $FIELD-data_aligned_sorted.bam -bga > $FIELD-coverage.bed awk '$4 < 1' $FIELD-coverage.bed > $FIELD-zero-coverage.bed samtools faidx $REFERENCE -bcftools mpileup -Ou -f $REFERENCE $FIELD-data_aligned_sorted.bam \ -| bcftools call --ploidy 1 -mv -Oz -o $FIELD-calls.vcf.gz -bcftools index $FIELD-calls.vcf.gz -bcftools norm -f $REFERENCE $FIELD-calls.vcf.gz -Oz -o $FIELD-calls.norm.vcf.gz -zcat $FIELD-calls.norm.vcf.gz |vcf2bed --snvs > $FIELD-calls.bed -tabix -f $FIELD-calls.norm.vcf.gz -bcftools consensus -f $REFERENCE $FIELD-calls.norm.vcf.gz > $FIELD-consensus.fa +rawCalls="$FIELD-raw.vcf.gz" +bcftools mpileup -Ov -f $REFERENCE $FIELD-data_aligned_sorted.bam \ +| bcftools call --ploidy 1 -mv -Oz -o "$rawCalls" +bcftools index "$rawCalls" +bcftools norm -f $REFERENCE "$rawCalls" -Oz -o $FIELD-calls.norm.vcf.gz +# filter incompatible variants +finalVCF="$FIELD-calls.vcf.gz" +./TRACES_filter_incompatible_variants.sh $FIELD-calls.norm.vcf.gz >| "$finalVCF"; +zcat "$finalVCF" | vcf2bed --snvs > $FIELD-calls.bed +tabix -f "$finalVCF" +bcftools consensus -f $REFERENCE "$finalVCF" > $FIELD-consensus.fa # # Give new header name for the consensus sequence tail -n +2 $FIELD-consensus.fa > $FIELD-TMP2_FILE.xki diff --git a/src/TRACES_overall.sh b/src/TRACES_overall.sh new file mode 100755 index 0000000..03919f7 --- /dev/null +++ b/src/TRACES_overall.sh @@ -0,0 +1,90 @@ +#!/bin/bash +# +#USAGE: TRACES_overall.sh reftype pattern organ [maxdepth=1000] +# RefType is one of viral, specific, cy, mtdna and defines the type of reference we are calculating stats for +# Pattern defines the specific reference used +# for cy and mtdna this should be cy and mt respectively +# for specific it is the pattern used to find the reference in the DB +# for virus it is the virus Name +# Organ is the identifier library being processed +# MaxDepth is the maximum depth to consider at any given position +# +# +#Define Global Variables +declare -A ValidRefTypes; +for type in cy mtdna specific viral; do + ValidRefTypes[$type]=1 +done +declare -A RequiredPattern; +RequiredPattern["cy"]="cy" +RequiredPattern["mtdna"]="mt" +DEFAULT_MAX_DEPTH=1000; +#Pull the Ref Type, validate it, and set the directories containing relevant bed and bam, and output files +REFTYPE="$1";shift; +if [[ -z "${ValidRefTypes[$REFTYPE]}" ]]; then + >&2 echo -e "[BUG] Attempt to call TRACES_overall.sh on an unrecognized type ($REFTYPE)\n" \ + "\t Valid types are: " "${!ValidRefTypes[@]}" \ + ; + exit 1; +fi +BedDir="../output_data/TRACES_${REFTYPE}_bed" +BamDir="../output_data/TRACES_${REFTYPE}_alignments" +StatsDir="../output_data/TRACES_${REFTYPE}_statistics" +#Pull the pattern and validate it against the reftype +PATTERN="$1";shift; +if [[ -n "${RequiredPattern[$REFTYPE]}" && "$PATTERN" != "${RequiredPattern[$REFTYPE]}" ]]; then + rq="${RequiredPattern[$REFTYPE]}" + >&2 echo "[BUG WARNING] Call to TRACES_overall.sh $REFTYPE with pattern!='$rq'"; + PATTERN=$rq +fi +#Set relevant pref- and suffixes +BedPrefix="$PATTERN" #post-delimiter is constant across all ref types +AlnBamPrefix="$PATTERN" #post-delimiter is constant across all ref types +AlnBamSuffix="-$PATTERN" #pre-delimiter included as the suffix isn't present for cy and mtdna +StatsPrefix="$PATTERN" #post-delimiter is constant across all ref types +#Handle cases where the -fixes aren't pattern +case $REFTYPE in + viral) + AlnBamPrefix="viral" + ;; + specific) + AlnBamPrefix="viral" + ;; + cy) + AlnBamSuffix="" + ;; + mtdna) + AlnBamSuffix="" + ;; +esac +#Pull the organ and then set the relevant bed, bam, and output files +ORGAN="$1"; shift +CovBedFile="$BedDir/$BedPrefix-coverage-$ORGAN.bed" +AlnBamFile="$BamDir/${AlnBamPrefix}_aligned_sorted-${ORGAN}${AlnBamSuffix}.bam" +DepthFile="$StatsDir/${StatsPrefix}-total-depth-coverage-${ORGAN}.txt" +ZeroBedFile="$BedDir/${BedPrefix}-zero-coverage-$ORGAN.bed" +BreadthFile="$StatsDir/${StatsPrefix}-total-horizontal-coverage-${ORGAN}.txt" +#Pull the Max Depth from the comand line options +MAX_DEPTH=$1; shift +if [ -z "$MAX_DEPTH" ]; then + MAX_DEPTH=$DEFAULT_MAX_DEPTH; +fi +# +#Get the total size of the reference sequence, and the sequence ID for the first contig +# Note: we assume all references are single contigs at this point +TOTAL_SIZE=$(tail -n 1 "$CovBedFile" | awk '{ print $3}'); +acc=$(head -1 "$CovBedFile" | cut -f1); +bedEntry="$acc\t0\t$TOTAL_SIZE" +#Updated Depth calculation which ignores peaks, but more importantly ignores secondary/supplementary alignments +#Column 3 is the length of the genome (constructed in the bedEntry above +#Column 4 is the total bases covered +NORMALIZED_COVERAGE=$(samtools bedcov --max-depth "$MAX_DEPTH" <(echo -e "$bedEntry") "$AlnBamFile" | awk '{print $4/$3}') +printf "$PATTERN\t$ORGAN\t%0.4f\n" "$NORMALIZED_COVERAGE" > "$DepthFile" +# +ZERO_COVERAGE=$(awk '{sum += ($3-$2)} END {print sum}' "$ZeroBedFile"); +if [ -z "$ZERO_COVERAGE" ]; then + ZERO_COVERAGE=0; +fi +NORMALIZED_ZERO_COVERAGE=$(echo "scale=4; (100-(($ZERO_COVERAGE / $TOTAL_SIZE)*100))" | bc -l); +printf "$PATTERN\t$ORGAN\t%0.4f\n" "$NORMALIZED_ZERO_COVERAGE" > "$BreadthFile" +# diff --git a/src/TRACES_overall_virus.sh b/src/TRACES_overall_virus.sh index aee418a..85e904a 100755 --- a/src/TRACES_overall_virus.sh +++ b/src/TRACES_overall_virus.sh @@ -2,11 +2,15 @@ # VIRUS="$1"; ORGAN="$2"; +MAX_DEPTH=1000; # -TOTAL_COVERAGE=`awk '{sum += (($3-$2)*$4)} END {print sum}' ../output_data/TRACES_viral_bed/$VIRUS-coverage-$ORGAN.bed `; +#TOTAL_COVERAGE=`awk '{sum += (($3-$2)*$4)} END {print sum}' ../output_data/TRACES_viral_bed/$VIRUS-coverage-$ORGAN.bed `; TOTAL_SIZE=`tail -n 1 ../output_data/TRACES_viral_bed/$VIRUS-coverage-$ORGAN.bed | awk '{ print $3}'`; +acc=$(head -1 ../output_data/TRACES_viral_bed/$VIRUS-coverage-$ORGAN.bed | cut -f1); +#Updated Depth calculation which ignores peaks, but more importantly ignores secondary/supplementary alignments +TOTAL_COVERAGE=$(samtools bedcov --max-depth $MAX_DEPTH <(echo -e "$acc\t0\t$TOTAL_SIZE") ../output_data/TRACES_viral_alignments/viral_aligned_sorted-$ORGAN-$VIRUS.bam | awk '{print $NF}') NORMALIZED_COVERAGE=`echo "scale=4; $TOTAL_COVERAGE / $TOTAL_SIZE" | bc -l`; -printf "$VIRUS\t$ORGAN\t$NORMALIZED_COVERAGE\n" > ../output_data/TRACES_viral_statistics/$VIRUS-total-depth-coverage-$ORGAN.txt; +printf "$VIRUS\t$ORGAN\t%0.4f\n" "$NORMALIZED_COVERAGE" > ../output_data/TRACES_viral_statistics/$VIRUS-total-depth-coverage-$ORGAN.txt; # ZERO_COVERAGE=`awk '{sum += ($3-$2)} END {print sum}' ../output_data/TRACES_viral_bed/$VIRUS-zero-coverage-$ORGAN.bed `; if [ -z $ZERO_COVERAGE ] diff --git a/src/TRACES_viral_align_reads.sh b/src/TRACES_viral_align_reads.sh index e1546f4..c6368f2 100755 --- a/src/TRACES_viral_align_reads.sh +++ b/src/TRACES_viral_align_reads.sh @@ -15,14 +15,14 @@ # BUILD THE INDEX rm -f index-$2-$3-file* viral_aligned_sorted-$2-$3.bam.bai bowtie2-build $1 index-$2-$3-file -# +# HANDLE OPTIONAL SENSITIVITY +sensitivityArg="" +if [[ "$6" == "1" ]]; then + sensitivityArg="--very-sensitive" +fi # ALIGN -if [[ "$6" == "1" ]]; - then - bowtie2 -a --threads $4 --very-sensitive -x index-$2-$3-file -1 o_fw_pr.fq -2 o_rv_pr.fq -U o_fw_unpr.fq,o_rv_unpr.fq > aligned-$2-$3.sam - else - bowtie2 -a --threads $4 -x index-$2-$3-file -1 o_fw_pr.fq -2 o_rv_pr.fq -U o_fw_unpr.fq,o_rv_unpr.fq > aligned-$2-$3.sam - fi +bowtie2 -a --threads $4 "$sensitivityArg" -x index-$2-$3-file \ + -1 o_fw_pr.fq -2 o_rv_pr.fq -U o_fw_unpr.fq,o_rv_unpr.fq > aligned-$2-$3.sam # # SORT & BIN samtools sort --threads $4 aligned-$2-$3.sam > viral_aligned_sorted-$2-$3.bam @@ -32,18 +32,18 @@ if [[ "$5" -eq "1" ]]; then echo "Removing Duplications ..."; # SORT BY NAME - samtools sort -n viral_aligned_sorted-$2-$3.bam > viral_aligned_sorted_sorted-$2-$3.bam + samtools sort --thread $4 -n viral_aligned_sorted-$2-$3.bam > viral_aligned_sorted_sorted-$2-$3.bam # # ADD ms AND MC FOR MARKDUP - samtools fixmate -m viral_aligned_sorted_sorted-$2-$3.bam viral_aligned_sorted_sorted-$2-$3-fixmate.bam + samtools fixmate --threads $4 -m viral_aligned_sorted_sorted-$2-$3.bam viral_aligned_sorted_sorted-$2-$3-fixmate.bam rm -f viral_aligned_sorted_sorted-$2-$3.bam # # SORTING POSITION ORDER - samtools sort -o viral_aligned_sorted_sorted-$2-$3-fixmate-sort.bam viral_aligned_sorted_sorted-$2-$3-fixmate.bam + samtools sort --threads $4 -o viral_aligned_sorted_sorted-$2-$3-fixmate-sort.bam viral_aligned_sorted_sorted-$2-$3-fixmate.bam rm -f viral_aligned_sorted_sorted-$2-$3-fixmate.bam # # REMOVE DUPLICATES - samtools markdup -r viral_aligned_sorted_sorted-$2-$3-fixmate-sort.bam viral_aligned_sorted-$2-$3.bam + samtools markdup --threads $4 -r viral_aligned_sorted_sorted-$2-$3-fixmate-sort.bam viral_aligned_sorted-$2-$3.bam rm -f viral_aligned_sorted_sorted-$2-$3-fixmate-sort.bam # fi diff --git a/src/TRACES_viral_consensus.sh b/src/TRACES_viral_consensus.sh index 5d725e9..8bc0432 100755 --- a/src/TRACES_viral_consensus.sh +++ b/src/TRACES_viral_consensus.sh @@ -24,21 +24,26 @@ awk '$4 < 1' $Label-coverage-$Organ.bed > $Label-zero-coverage-$Organ.bed # # CALLS samtools faidx $Reference # -P 9.9e-1 #here! -samtools mpileup -Ou -f $Reference $Alignments | bcftools call --ploidy 1 -P 9.9e-1 -mv -Oz -o $Label-$Organ-calls.vcf.gz -bcftools index $Label-$Organ-calls.vcf.gz +rawCalls="$Label-$Organ-raw.vcf.gz" +bcftools mpileup -Ov -f $Reference $Alignments | bcftools call --ploidy 1 -P 9.9e-1 -mv -Oz -o "$rawCalls" +bcftools index "$rawCalls" # # normalize indels -bcftools norm -f $Reference $Label-$Organ-calls.vcf.gz -Oz -o $Label-$Organ-calls.norm.vcf.gz +bcftools norm -f $Reference "$rawCalls" -Oz -o $Label-$Organ-calls.norm.vcf.gz # # filter adjacent indels within 5bp bcftools filter --IndelGap 5 $Label-$Organ-calls.norm.vcf.gz -Oz -o $Label-$Organ-calls.norm.flt-indels.vcf.gz +# filter incompatible variants +finalVCF="$Label-$Organ-calls.vcf.gz" +./TRACES_filter_incompatible_variants.sh $Label-$Organ-calls.norm.flt-indels.vcf.gz >| "$finalVCF"; + # # create bed file -zcat $Label-$Organ-calls.norm.flt-indels.vcf.gz |vcf2bed --snvs > $Label-calls-$Organ.bed +zcat "$finalVCF" | vcf2bed --snvs > $Label-calls-$Organ.bed # # CONSENSUS -tabix -f $Label-$Organ-calls.norm.flt-indels.vcf.gz -bcftools consensus -m $Label-zero-coverage-$Organ.bed -f $Reference $Label-$Organ-calls.norm.flt-indels.vcf.gz > $Label-consensus-$Organ.fa +tabix -f "$finalVCF" +bcftools consensus -m $Label-zero-coverage-$Organ.bed -f $Reference "$finalVCF" > $Label-consensus-$Organ.fa # # Give new header name for the consensus sequence tail -n +2 $Label-consensus-$Organ.fa > $Label-$Organ-TMP_FILE.xki @@ -47,6 +52,6 @@ cat $Label-$Organ-TMP_FILE.xki >> $Label-consensus-$Organ.fa rm -f $Label-$Organ-TMP_FILE.xki; # # -rm -f $Label-$Organ-calls.vcf.gz $Label-$Organ-calls.vcf.gz.csi $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.flt-indels.bcf $Label-$Organ-calls.norm.flt-indels.vcf.gz.csi; +rm -f "$rawCalls" "$rawCalls.csi" $Label-$Organ-calls.norm.bcf $Label-$Organ-calls.norm.vcf.gz $Label-$Organ-calls.norm.flt-indels.bcf $Label-$Organ-calls.norm.flt-indels.vcf.gz "$finalVCL.csi"; # # diff --git a/src/viral_names.txt b/system_files/internal_viral_names.txt old mode 100755 new mode 100644 similarity index 99% rename from src/viral_names.txt rename to system_files/internal_viral_names.txt index 8b26263..bf2f98d --- a/src/viral_names.txt +++ b/system_files/internal_viral_names.txt @@ -32,4 +32,3 @@ VARV SV40 CUTA HERV -