3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines to run BLAST and parse results.
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
32 use Storable qw( dclone );
34 use vars qw( @ISA @EXPORT );
36 @ISA = qw( Exporter );
39 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
42 # The blast report structured like this:
44 # for the first entry:
46 # 1 - blast program name
48 # 3 - query sequence name and length
49 # 4 - subject database
50 # 5 - sequences producing significant alignments
51 # 6 - one or more HSP for each subject sequence
52 # 7 - blast statistics
54 # for subsequent entries:
56 # 3 - query sequence name and length
57 # 5 - sequences producing significant alignments
58 # 6 - one or more HSP for each subject sequence
59 # 7 - blast statistics
61 # ________________________
77 # ________________________
80 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
85 # Martin A. Hansen, March 2007.
87 # determines if the results is from ncbi blast or blastcl3
88 # and parses the results in accordance.
95 my ( $results, $line,$doctype );
97 while ( $line = <$fh> )
101 if ( $line =~ /^<!DOCTYPE/ )
108 if ( $doctype eq '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "NCBI_BlastOutput.dtd">' )
110 print STDERR qq(Parsing blastcl3 results ...\n);
111 $results = &xml_parse_blast_blastcl3( $fh );
113 elsif ( $doctype eq '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">' )
115 print STDERR qq(Parsing NCBI blast results ...\n);
116 $results = &xml_parse_blast_ncbi( $fh );
120 die qq(ERROR: Could not determine doctype\n);
123 return wantarray ? @{ $results } : $results;
127 sub xml_parse_blast_ncbi
129 # Martin A. Hansen, February 2007.
134 my ( $blast_record, $line, @blast_query, @blast_subject, $query, $subject, @results );
136 while ( $blast_record = &xml_get_blast_record( $fh ) and scalar @{ $blast_record } > 0 )
138 foreach $line ( @{ $blast_record } )
140 if ( $line =~ /<Iteration_query-ID>|<Iteration_query-def>|<Iteration_query-len>/ )
142 push @blast_query, $line;
144 elsif ( @blast_query )
146 push @blast_subject, $line;
148 if ( $line =~ /<\/Iteration_hits>/ )
150 $query = &xml_parse_blast_query( \@blast_query );
151 $subject = &xml_parse_blast_subject( \@blast_subject );
155 "SUBJECT" => $subject,
159 undef @blast_subject;
165 return wantarray ? @results : \@results;
169 sub xml_parse_blast_blastcl3
171 # Martin A. Hansen, February 2007.
176 my ( $blast_record, $line, @blast_query, @blast_subject, $query, $subject, @results );
178 while ( $blast_record = &xml_get_blast_record( $fh ) and scalar @{ $blast_record } > 0 )
180 foreach $line ( @{ $blast_record } )
182 if ( $line =~ /<BlastOutput_query-ID>|<BlastOutput_query-def>|<BlastOutput_query-len>/ )
184 push @blast_query, $line;
186 elsif ( @blast_query )
188 push @blast_subject, $line;
190 if ( $line =~ /<\/Iteration_hits>/ )
192 $query = &xml_parse_blast_query( \@blast_query );
193 $subject = &xml_parse_blast_subject( \@blast_subject );
197 "SUBJECT" => $subject,
201 undef @blast_subject;
207 return wantarray ? @results : \@results;
211 sub xml_get_blast_record
213 # Martin A. Hansen, March 2007.
215 my ( $fh, # file handle to BLAST file in XML format
218 # returns list of lines
220 my ( $line, @blast_record );
222 while ( $line = <$fh> )
226 push @blast_record, $line;
228 last if $line =~ /<\/BlastOutput>/;
231 return wantarray ? @blast_record : \@blast_record;
235 sub xml_parse_blast_query
242 foreach $line ( @{ $lines } )
244 if ( $line =~ /<Iteration_query-ID>([^<]+)/ ) {
245 $hash{ "Q_ID" } = $1;
246 } elsif ( $line =~ /<Iteration_query-def>([^<]+)/ ) {
247 $hash{ "Q_DEF" } = $1;
248 } elsif ( $line =~ /<Iteration_query-len>([^<]+)/ ) {
249 $hash{ "Q_LEN" } = $1;
253 return wantarray ? %hash : \%hash;
257 sub xml_parse_blast_subject
259 # Martin A. Hansen, March 2007.
266 my ( $line, @blast_hit, @blast_hsps, $hit, $hsps, @hits );
268 foreach $line ( @{ $lines } )
270 if ( $line =~ /<Hit_id>|<Hit_def>|<Hit_accession>|<Hit_len>|<Hit_num>/ )
272 push @blast_hit, $line;
276 push @blast_hsps, $line;
278 if ( $line =~ /<\/Hit_hsps>/ )
280 $hit = &xml_parse_blast_hit( \@blast_hit );
281 $hsps = &xml_parse_blast_hsps( \@blast_hsps );
283 $hit->{ "HSPS" } = $hsps;
293 return wantarray ? @hits : \@hits;
297 sub xml_parse_blast_hit
304 foreach $line ( @{ $lines } )
306 if ( $line =~ /<Hit_num>([^<]+)/ ) {
307 $hash{ "S_NUM" } = $1;
308 } elsif ( $line =~ /<Hit_id>([^<]+)/ ) {
309 $hash{ "S_ID" } = $1;
310 } elsif ( $line =~ /<Hit_def>([^<]+)/ ) {
311 $hash{ "S_DEF" } = $1;
312 } elsif ( $line =~ /<Hit_accession>([^<]+)/ ) {
313 $hash{ "S_ACC" } = $1;
314 } elsif ( $line =~ /<Hit_len>([^<]+)/ ) {
315 $hash{ "S_LEN" } = $1;
319 return wantarray ? %hash : \%hash;
323 sub xml_parse_blast_hsps
325 # Martin A. Hansen, March 2007.
332 my ( $line, %hash, @hsps );
334 foreach $line ( @{ $blast_hits } )
336 if ( $line =~ /<Hsp_num>([^<]+)/ ) {
338 } elsif ( $line =~ /<Hsp_evalue>([^<]+)/ ) {
339 $hash{ "E_VAL" } = $1;
340 } elsif ( $line =~ /<Hsp_query-from>([^<]+)/ ) {
341 $hash{ "Q_BEG" } = $1 - 1;
342 } elsif ( $line =~ /<Hsp_query-to>([^<]+)/ ) {
343 $hash{ "Q_END" } = $1 - 1;
344 } elsif ( $line =~ /<Hsp_hit-from>([^<]+)/ ) {
345 $hash{ "S_BEG" } = $1 - 1;
346 } elsif ( $line =~ /<Hsp_hit-to>([^<]+)/ ) {
347 $hash{ "S_END" } = $1 - 1;
348 } elsif ( $line =~ /<Hsp_query-frame>([^<]+)/ ) {
349 $hash{ "Q_FRAME" } = $1;
350 } elsif ( $line =~ /<Hsp_hit-frame>([^<]+)/ ) {
351 $hash{ "S_FRAME" } = $1;
352 } elsif ( $line =~ /<Hsp_qseq>([^<]+)/ ) {
353 $hash{ "Q_ALIGN" } = $1;
354 } elsif ( $line =~ /<Hsp_hseq>([^<]+)/ ) {
355 $hash{ "S_ALIGN" } = $1;
356 } elsif ( $line =~ /<Hsp_midline>([^<]+)/ ) {
357 $hash{ "MIDLINE" } = $1;
358 } elsif ( $line =~ /<\/Hsp>/ ) {
359 push @hsps, dclone \%hash;
363 return wantarray ? @hsps : \@hsps;
367 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<