Skip to content

Commit f771cbd

Browse files
committed
Fixed blastx query-coordinate offsets when the query is split into multiple files for multithreading
1 parent c5509d0 commit f771cbd

File tree

1 file changed

+32
-16
lines changed

1 file changed

+32
-16
lines changed

scripts/set_findPhages.pl

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -70,41 +70,55 @@ sub phast{
7070
my($start,$stop)=split(/\-/,$coordinates);
7171
$stop||=$start;
7272

73-
my $file="$tempdir/$i.fna";
73+
# The file name will have the start coordinate as its name
74+
my $file="$tempdir/$start.fna";
7475
open(SEQOUT,">",$file) or die "ERROR: could not write seq to temp file $file: $!";
7576
print SEQOUT ">".$contig."\n".$seq{$contig}->subseq($start,$stop)."\n";
7677
close SEQOUT;
7778
$i++;
7879
}
7980
my $threadsPerBlast=int($$settings{numcpus}/$i);
8081
$threadsPerBlast=1 if($threadsPerBlast<1);
81-
82+
8283
# Better parallelization: one fasta entry per cpu.
8384
# Split the query into multiple files and then figure out
8485
# how many cpus per blast job we need.
8586

8687
# Perform blast on these split files.
8788
logmsg "Created blast input query files under $tempdir/*.fna";
88-
system("ls $tempdir/*.fna | xargs -I {} -P $$settings{numcpus} -n 1 blastx -query {} -db $db -evalue 0.05 -outfmt 6 -num_threads $threadsPerBlast -out {}.bls");
89+
my $xargsCommand=qq(ls $tempdir/*.fna | xargs -P $$settings{numcpus} -n 1 sh -c '
90+
offset=\$(basename \$0 .fna); # Find the coordinate offset from the filename
91+
blastx -query \$0 -db $db -evalue 0.05 -outfmt 6 -num_threads $threadsPerBlast |\\
92+
perl -lane \"
93+
\\\$F[6]+=\$offset; # query coordinate offset
94+
\\\$F[7]+=\$offset; # query coordinate offset
95+
print join(\\\"\\\t\\\",\@F); # lots of backslashes because of language-inception
96+
\" > \$0.bls
97+
');
98+
#die $xargsCommand;
99+
system($xargsCommand);
89100
die "ERROR with blastx: $!" if $?;
90101
#my $allResults=`blastx -query '$fasta' -db $db -evalue 0.05 -outfmt 6 -num_threads $$settings{numcpus}`;
91-
my $allResults=`cat $tempdir/*.fna.bls`;
102+
my @allResults=`cat $tempdir/*.fna.bls`;
92103
die "ERROR with cat on $tempdir/*.fna.bls" if($?);
93-
die "No results were returned by blastx" if(!$allResults);
104+
warn "No results were returned by blastx" if(!@allResults);
94105

95106
my $flanking=$$settings{flanking}; #bp
96107
logmsg "Parsing results with a soft flanking distance of $flanking";
97-
my(%range);
98-
for my $result(split(/\n/,$allResults)){
108+
my(%range, %seenRange);
109+
for my $result(@allResults){
99110
$result=~s/^\s+|\s+$//g; # trim
100-
my ($contig,$hit,$identity,$length,$gaps,$mismatches,$sstart,$send,$qstart,$qend,$e,$score)=split /\t/, $result;
111+
my ($contig,$hit,$identity,$length,$gaps,$mismatches,$qstart,$qend,$sstart,$send,$e,$score)=split /\t/, $result;
101112
next if($score < 50 || $length < 20);
102-
113+
# Don't bother the Range object if this is a range we've already seen.
114+
# This will speed things up since Number::Range is slow, and
115+
# it will happen because each genome region can hit multiple phages.
116+
next if($seenRange{$contig}{$qstart}{$qend}++);
117+
103118
# Make sure there is a range object for this contig.
104-
# Come up with
105119
$range{$contig}||=Number::Range->new;
106-
my $lo=min($sstart,$send);
107-
my $hi=max($sstart,$send);
120+
my $lo=min($qstart,$qend);
121+
my $hi=max($qstart,$qend);
108122

109123
# Add some coordinates between close hits based on
110124
# the flanking distance. Start from high to low
@@ -133,13 +147,15 @@ sub phast{
133147
$range{$contig}->addrange($loSoftFlank..$hiSoftFlank);
134148
}
135149

150+
logmsg "Finished adding coordinate ranges. Now creating a bed format";
151+
136152
# Translate the ranges found in the Range objects into
137153
# an array of [contig,start,stop]
138154
my @range;
139155
while(my($contig,$rangeObj)=each(%range)){
140-
my $rangeStr=$rangeObj->range;
141-
while($rangeStr=~/(\d+)\.\.(\d+),?/g){
142-
push(@range,[$contig,$1,$2]);
156+
my @rangeList=$rangeObj->rangeList;
157+
for(@rangeList){
158+
push(@range,[$contig,@$_]);
143159
}
144160
}
145161

@@ -150,7 +166,7 @@ sub usage{
150166
"Finds phages in a fasta file using phast
151167
Usage: $0 file.fasta
152168
--numcpus 1
153-
--tempdir tmp/
169+
--tempdir /tmp
154170
--flanking 0 Give 'soft' edges to ranges. If blast hits are this many
155171
nt away from another blast hit, then join the ranges and
156172
include any intermediate positions. If ranges cannot be

0 commit comments

Comments
 (0)