3 # Copyright (C) 2007-2009 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines to run BLAST and parse results.
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
33 use Storable qw( dclone );
35 use vars qw( @ISA @EXPORT );
37 @ISA = qw( Exporter );
40 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
43 # The blast report structured like this:
45 # for the first entry:
47 # 1 - blast program name
49 # 3 - query sequence name and length
50 # 4 - subject database
51 # 5 - sequences producing significant alignments
52 # 6 - one or more HSP for each subject sequence
53 # 7 - blast statistics
55 # for subsequent entries:
57 # 3 - query sequence name and length
58 # 5 - sequences producing significant alignments
59 # 6 - one or more HSP for each subject sequence
60 # 7 - blast statistics
62 # ________________________
78 # ________________________
81 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
86 # Martin A. Hansen, March 2007.
88 # determines if the results is from ncbi blast or blastcl3
89 # and parses the results in accordance.
96 my ( $results, $line,$doctype );
98 while ( $line = <$fh> )
102 if ( $line =~ /^<!DOCTYPE/ )
109 if ( $doctype eq '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">' )
111 print STDERR qq(Parsing blastcl3 results ...\n);
112 $results = xml_parse_blast_blastcl3( $fh );
114 elsif ( $doctype eq '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">' )
116 print STDERR qq(Parsing NCBI blast results ...\n);
117 $results = xml_parse_blast_ncbi( $fh );
121 die qq(ERROR: Could not determine doctype\n);
124 return wantarray ? @{ $results } : $results;
128 sub xml_parse_blast_ncbi
130 # Martin A. Hansen, February 2007.
135 my ( $blast_record, $line, @blast_query, @blast_subject, $query, $subject, @results );
137 while ( $blast_record = xml_get_blast_record( $fh ) and scalar @{ $blast_record } > 0 )
139 foreach $line ( @{ $blast_record } )
141 if ( $line =~ /<Iteration_query-ID>|<Iteration_query-def>|<Iteration_query-len>/ )
143 push @blast_query, $line;
145 elsif ( @blast_query )
147 push @blast_subject, $line;
149 if ( $line =~ /<\/Iteration_hits>/ )
151 $query = xml_parse_blast_query( \@blast_query );
152 $subject = xml_parse_blast_subject( \@blast_subject );
156 "SUBJECT" => $subject,
160 undef @blast_subject;
166 return wantarray ? @results : \@results;
170 sub xml_parse_blast_blastcl3
172 # Martin A. Hansen, February 2007.
177 my ( $blast_record, $line, @blast_query, @blast_subject, $query, $subject, @results );
179 while ( $blast_record = xml_get_blast_record( $fh ) and scalar @{ $blast_record } > 0 )
181 foreach $line ( @{ $blast_record } )
183 if ( $line =~ /<BlastOutput_query-ID>|<BlastOutput_query-def>|<BlastOutput_query-len>/ )
185 push @blast_query, $line;
187 elsif ( @blast_query )
189 push @blast_subject, $line;
191 if ( $line =~ /<\/Iteration_hits>/ )
193 $query = xml_parse_blast_query( \@blast_query );
194 $subject = xml_parse_blast_subject( \@blast_subject );
198 "SUBJECT" => $subject,
202 undef @blast_subject;
208 return wantarray ? @results : \@results;
212 sub xml_get_blast_record
214 # Martin A. Hansen, March 2007.
216 my ( $fh, # file handle to BLAST file in XML format
219 # returns list of lines
221 my ( $line, @blast_record );
223 while ( $line = <$fh> )
227 push @blast_record, $line;
229 last if $line =~ /<\/BlastOutput>/;
232 return wantarray ? @blast_record : \@blast_record;
236 sub xml_parse_blast_query
243 foreach $line ( @{ $lines } )
245 if ( $line =~ /<Iteration_query-ID>([^<]+)/ ) {
246 $hash{ "Q_ID" } = $1;
247 } elsif ( $line =~ /<Iteration_query-def>([^<]+)/ ) {
248 $hash{ "Q_DEF" } = $1;
249 } elsif ( $line =~ /<Iteration_query-len>([^<]+)/ ) {
250 $hash{ "Q_LEN" } = $1;
254 return wantarray ? %hash : \%hash;
258 sub xml_parse_blast_subject
260 # Martin A. Hansen, March 2007.
267 my ( $line, @blast_hit, @blast_hsps, $hit, $hsps, @hits );
269 foreach $line ( @{ $lines } )
271 if ( $line =~ /<Hit_id>|<Hit_def>|<Hit_accession>|<Hit_len>|<Hit_num>/ )
273 push @blast_hit, $line;
277 push @blast_hsps, $line;
279 if ( $line =~ /<\/Hit_hsps>/ )
281 $hit = xml_parse_blast_hit( \@blast_hit );
282 $hsps = xml_parse_blast_hsps( \@blast_hsps );
284 $hit->{ "HSPS" } = $hsps;
294 return wantarray ? @hits : \@hits;
298 sub xml_parse_blast_hit
305 foreach $line ( @{ $lines } )
307 if ( $line =~ /<Hit_num>([^<]+)/ ) {
308 $hash{ "S_NUM" } = $1;
309 } elsif ( $line =~ /<Hit_id>([^<]+)/ ) {
310 $hash{ "S_ID" } = $1;
311 } elsif ( $line =~ /<Hit_def>([^<]+)/ ) {
312 $hash{ "S_DEF" } = $1;
313 } elsif ( $line =~ /<Hit_accession>([^<]+)/ ) {
314 $hash{ "S_ACC" } = $1;
315 } elsif ( $line =~ /<Hit_len>([^<]+)/ ) {
316 $hash{ "S_LEN" } = $1;
320 return wantarray ? %hash : \%hash;
324 sub xml_parse_blast_hsps
326 # Martin A. Hansen, March 2007.
333 my ( $line, %hash, @hsps );
335 foreach $line ( @{ $blast_hits } )
337 if ( $line =~ /<Hsp_num>([^<]+)/ ) {
339 } elsif ( $line =~ /<Hsp_evalue>([^<]+)/ ) {
340 $hash{ "E_VAL" } = $1;
341 } elsif ( $line =~ /<Hsp_query-from>([^<]+)/ ) {
342 $hash{ "Q_BEG" } = $1 - 1;
343 } elsif ( $line =~ /<Hsp_query-to>([^<]+)/ ) {
344 $hash{ "Q_END" } = $1 - 1;
345 } elsif ( $line =~ /<Hsp_hit-from>([^<]+)/ ) {
346 $hash{ "S_BEG" } = $1 - 1;
347 } elsif ( $line =~ /<Hsp_hit-to>([^<]+)/ ) {
348 $hash{ "S_END" } = $1 - 1;
349 } elsif ( $line =~ /<Hsp_query-frame>([^<]+)/ ) {
350 $hash{ "Q_FRAME" } = $1;
351 } elsif ( $line =~ /<Hsp_hit-frame>([^<]+)/ ) {
352 $hash{ "S_FRAME" } = $1;
353 } elsif ( $line =~ /<Hsp_qseq>([^<]+)/ ) {
354 $hash{ "Q_ALIGN" } = $1;
355 } elsif ( $line =~ /<Hsp_hseq>([^<]+)/ ) {
356 $hash{ "S_ALIGN" } = $1;
357 } elsif ( $line =~ /<Hsp_midline>([^<]+)/ ) {
358 $hash{ "MIDLINE" } = $1;
359 } elsif ( $line =~ /<\/Hsp>/ ) {
360 push @hsps, dclone \%hash;
364 return wantarray ? @hsps : \@hsps;
368 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<