@@ -70,41 +70,55 @@ sub phast{
70
70
my ($start ,$stop )=split (/ \- / ,$coordinates );
71
71
$stop ||=$start ;
72
72
73
- my $file =" $tempdir /$i .fna" ;
73
+ # The file name will have the start coordinate as its name
74
+ my $file =" $tempdir /$start .fna" ;
74
75
open (SEQOUT," >" ,$file ) or die " ERROR: could not write seq to temp file $file : $! " ;
75
76
print SEQOUT " >" .$contig ." \n " .$seq {$contig }-> subseq($start ,$stop )." \n " ;
76
77
close SEQOUT;
77
78
$i ++;
78
79
}
79
80
my $threadsPerBlast =int ($$settings {numcpus }/$i );
80
81
$threadsPerBlast =1 if ($threadsPerBlast <1);
81
-
82
+
82
83
# Better parallelization: one fasta entry per cpu.
83
84
# Split the query into multiple files and then figure out
84
85
# how many cpus per blast job we need.
85
86
86
87
# Perform blast on these split files.
87
88
logmsg " Created blast input query files under $tempdir /*.fna" ;
88
- system (" ls $tempdir /*.fna | xargs -I {} -P $$settings {numcpus} -n 1 blastx -query {} -db $db -evalue 0.05 -outfmt 6 -num_threads $threadsPerBlast -out {}.bls" );
89
+ my $xargsCommand =qq( ls $tempdir /*.fna | xargs -P $$settings {numcpus} -n 1 sh -c '
90
+ offset=\$ (basename \$ 0 .fna); # Find the coordinate offset from the filename
91
+ blastx -query \$ 0 -db $db -evalue 0.05 -outfmt 6 -num_threads $threadsPerBlast |\\
92
+ perl -lane \"
93
+ \\\$ F[6]+=\$ offset; # query coordinate offset
94
+ \\\$ F[7]+=\$ offset; # query coordinate offset
95
+ print join(\\\"\\\t\\\" ,\@ F); # lots of backslashes because of language-inception
96
+ \" > \$ 0.bls
97
+ ') ;
98
+ # die $xargsCommand;
99
+ system ($xargsCommand );
89
100
die " ERROR with blastx: $! " if $? ;
90
101
# my $allResults=`blastx -query '$fasta' -db $db -evalue 0.05 -outfmt 6 -num_threads $$settings{numcpus}`;
91
- my $ allResults =` cat $tempdir /*.fna.bls` ;
102
+ my @ allResults =` cat $tempdir /*.fna.bls` ;
92
103
die " ERROR with cat on $tempdir /*.fna.bls" if ($? );
93
- die " No results were returned by blastx" if (!$ allResults );
104
+ warn " No results were returned by blastx" if (!@ allResults );
94
105
95
106
my $flanking =$$settings {flanking }; # bp
96
107
logmsg " Parsing results with a soft flanking distance of $flanking " ;
97
- my (%range );
98
- for my $result (split ( / \n / , $ allResults) ){
108
+ my (%range , %seenRange );
109
+ for my $result (@ allResults ){
99
110
$result =~s / ^\s +|\s +$// g ; # trim
100
- my ($contig ,$hit ,$identity ,$length ,$gaps ,$mismatches ,$sstart , $send , $qstart , $qend ,$e ,$score )=split /\t/, $result ;
111
+ my ($contig ,$hit ,$identity ,$length ,$gaps ,$mismatches ,$qstart , $qend , $sstart , $send ,$e ,$score )=split /\t/, $result ;
101
112
next if ($score < 50 || $length < 20);
102
-
113
+ # Don't bother the Range object if this is a range we've already seen.
114
+ # This will speed things up since Number::Range is slow, and
115
+ # it will happen because each genome region can hit multiple phages.
116
+ next if ($seenRange {$contig }{$qstart }{$qend }++);
117
+
103
118
# Make sure there is a range object for this contig.
104
- # Come up with
105
119
$range {$contig }||=Number::Range-> new;
106
- my $lo =min($sstart , $send );
107
- my $hi =max($sstart , $send );
120
+ my $lo =min($qstart , $qend );
121
+ my $hi =max($qstart , $qend );
108
122
109
123
# Add some coordinates between close hits based on
110
124
# the flanking distance. Start from high to low
@@ -133,13 +147,15 @@ sub phast{
133
147
$range {$contig }-> addrange($loSoftFlank ..$hiSoftFlank );
134
148
}
135
149
150
+ logmsg " Finished adding coordinate ranges. Now creating a bed format" ;
151
+
136
152
# Translate the ranges found in the Range objects into
137
153
# an array of [contig,start,stop]
138
154
my @range ;
139
155
while (my ($contig ,$rangeObj )=each (%range )){
140
- my $rangeStr =$rangeObj -> range ;
141
- while ( $rangeStr =~ / ( \d +) \.\. ( \d +),? / g ){
142
- push (@range ,[$contig ,$1 , $2 ]);
156
+ my @rangeList =$rangeObj -> rangeList ;
157
+ for ( @rangeList ){
158
+ push (@range ,[$contig ,@$_ ]);
143
159
}
144
160
}
145
161
@@ -150,7 +166,7 @@ sub usage{
150
166
" Finds phages in a fasta file using phast
151
167
Usage: $0 file.fasta
152
168
--numcpus 1
153
- --tempdir tmp/
169
+ --tempdir /tmp
154
170
--flanking 0 Give 'soft' edges to ranges. If blast hits are this many
155
171
nt away from another blast hit, then join the ranges and
156
172
include any intermediate positions. If ranges cannot be
0 commit comments