3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Stuff for interacting with NCBI Entrez
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
38 use vars qw( @ISA @EXPORT );
40 @ISA = qw( Exporter );
43 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
48 # Martin A. Hansen, March 2007.
50 # Connects to the NCBI website and retrieves a GenBank record,
53 my ( $db, # database <nucleotide|protein>
55 $type, # retrieval type <gb|gp>
60 my ( $content, @lines, $i, $seq );
62 $content = get "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=$db&id=$id&rettype=$type";
70 # Martin A. Hansen, March 2007.
72 # Connects to the NCBI website and retrieves a GenBank record,
73 # from which the sequence is parsed and returned.
75 my ( $db, # database <nucleotide|protein>
77 $type, # retrieval type <gb|gp>
82 my ( $content, @lines, $i, $seq );
84 $content = get "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=$db&id=$id&rettype=$type";
86 @lines = split "\n", $content;
90 while ( $lines[ $i ] !~ /^ORIGIN/ ) {
96 while ( $lines[ $i ] !~ /^\/\// )
98 $lines[ $i ] =~ s/^\s*\d+//;
100 $seq .= $lines[ $i ];
113 # Martin A. Hansen, February 2008.
115 # !!! NOT USED ANYMORE !!! #
117 # Reads in and parses a file in SOFT format.
119 my ( $path, # full path to SOFT file
124 my ( $fh, @lines, $i, $c, $num, %key_hash, @fields, %id_hash, $id, $seq, $count, $record, @records, $platform_id, $sample_id, $sample_title );
126 $fh = Maasha::Common::read_open( $path );
138 while ( $i < @lines )
140 if ( $lines[ $i ] =~ /^\^PLATFORM = (.+)/ )
144 elsif ( $lines[ $i ] =~ /^!platform_table_begin$/ )
146 @fields = split "\t", $lines[ $i + 1 ];
148 for ( $c = 0; $c < @fields; $c++ ) {
149 $key_hash{ $fields[ $c ] } = $c;
154 while ( $lines[ $c ] !~ /^!platform_table_end$/ )
156 @fields = split "\t", $lines[ $c ];
158 $id_hash{ $fields[ $key_hash{ "ID" } ] } = $fields[ $key_hash{ "SEQUENCE" } ];
165 elsif ( $lines[ $i ] =~ /^\^SAMPLE = (.+)/ )
169 elsif ( $lines[ $i ] =~ /^!Sample_title = (.+)/ )
173 elsif ( $lines[ $i ] =~ /^!sample_table_begin/ )
177 @fields = split "\t", $lines[ $i + 1 ];
179 for ( $c = 0; $c < @fields; $c++ ) {
180 $key_hash{ $fields[ $c ] } = $c;
185 while ( $lines[ $c ] !~ /^!sample_table_end$/ )
189 @fields = split "\t", $lines[ $c ];
191 $id = $fields[ $key_hash{ "ID_REF" } ];
192 $seq = $id_hash{ $id };
193 $count = $fields[ $key_hash{ "VALUE" } ];
197 $record->{ "SAMPLE_TITLE" } = $sample_title;
198 $record->{ "SEQ" } = $seq;
199 $record->{ "SEQ_NAME" } = join( "_", $platform_id, $sample_id, $num, $count );
201 push @records, $record;
215 return wantarray ? @records : \@records;
221 # Martin A. Hansen, June 2008.
223 # Create an index with the line numbers of the different tables
224 # in a SOFT file. The index is returned.
226 my ( $file, # file to index
231 my ( $fh, $line, $i, $c, @index, $first );
233 $fh = Maasha::Common::read_open( $file );
240 while ( $line = <$fh> )
244 if ( $line =~ /^\^/ )
253 $index[ $c - 1 ]{ "LINE_END" } = $i - 1;
266 $index[ $c - 1 ]{ "LINE_END" } = $i - 1;
270 return wantarray ? @index : \@index;
274 sub soft_get_platform
276 # Martin A. Hansen, June 2008.
278 # Given a filehandle to a SOFT file, parses all the platform tables
281 my ( $fh, # filehandle
282 $beg, # line number where platform tables begin
283 $end, # line number where platform tables end
288 my ( $line, @lines, $i, $c, @fields, %key_hash, %id_hash );
292 while ( $line = <$fh> )
296 push @lines, $line if $i >= $beg;
305 while ( $i < @lines )
307 if ( $lines[ $i ] =~ /^!platform_table_begin$/ )
309 @fields = split "\t", $lines[ $i + 1 ];
311 for ( $c = 0; $c < @fields; $c++ ) {
312 $key_hash{ $fields[ $c ] } = $c;
317 while ( $lines[ $c ] !~ /^!platform_table_end$/ )
319 @fields = split "\t", $lines[ $c ];
321 $id_hash{ $fields[ $key_hash{ "ID" } ] } = $fields[ $key_hash{ "SEQUENCE" } ] || $fields[ $key_hash{ "ID" } ];
332 return wantarray ? %id_hash : \%id_hash;
338 # Martin A. Hansen, June 2008.
340 # Given a filehandle to a SOFT file and a platform table,
341 # parses the sample records, which are returned.
343 my ( $fh, # filehandle
344 $plat_table, # hashref with platform table
345 $beg, # line number where sample table begin
346 $end, # line number where sample table end
347 $skip, # flag indicating that this sample table should not be parsed.
350 # Returns a list of hashrefs
352 my ( $line, @lines, $i, $c, $platform_id, @fields, %key_hash, $num, $sample_id, $sample_title, $id, $seq, $count, @records, $record );
356 while ( $line = <$fh> )
362 push @lines, $line if $i >= $beg;
371 return wantarray ? () : [];
378 while ( $i < @lines )
380 if ( $lines[ $i ] =~ /^\^SAMPLE = (.+)/ )
384 elsif ( $lines[ $i ] =~ /!Sample_platform_id = (.+)/ )
388 elsif ( $lines[ $i ] =~ /^!Sample_title = (.+)/ )
392 elsif ( $lines[ $i ] =~ /^!sample_table_begin/ )
396 @fields = split "\t", $lines[ $i + 1 ];
398 for ( $c = 0; $c < @fields; $c++ ) {
399 $key_hash{ $fields[ $c ] } = $c;
404 while ( $lines[ $c ] !~ /^!sample_table_end$/ )
408 @fields = split "\t", $lines[ $c ];
410 $id = $fields[ $key_hash{ "ID_REF" } ];
411 $seq = $plat_table->{ $id };
412 $count = $fields[ $key_hash{ "VALUE" } ];
416 $record->{ "SAMPLE_TITLE" } = $sample_title;
417 $record->{ "SEQ" } = $seq;
418 $record->{ "SEQ_NAME" } = join( "_", $platform_id, $sample_id, $num, $count );
420 push @records, $record;
434 return wantarray ? @records : \@records;
440 # Martin A. Hansen, July 2008.
442 # Create a BLAST index of a given FASTA file
443 # in a specified directory using a given name.
445 my ( $file, # path to FASTA file
446 $src_dir, # source directory with FASTA file
447 $dst_dir, # destination directory for BLAST index
448 $index_type, # protein or nucleotide
449 $index_name, # name of index
456 if ( $index_type =~ /protein/i ) {
462 Maasha::Filesys::dir_create_if_not_exists( $dst_dir );
463 Maasha::Common::run( "formatdb", "-p $type -i $src_dir/$file -t $index_name -l /dev/null" );
464 Maasha::Common::run( "mv", "$src_dir/*hr $dst_dir" );
465 Maasha::Common::run( "mv", "$src_dir/*in $dst_dir" );
466 Maasha::Common::run( "mv", "$src_dir/*sq $dst_dir" );
470 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<