3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for interacting with NCBI Entrez, parsing SOFT files, and creating BLAST indexes.
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
39 use vars qw( @ISA @EXPORT );
41 @ISA = qw( Exporter );
44 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
49 # Martin A. Hansen, March 2007.
51 # Connects to the NCBI Entrez efetch service and retrieves a GenBank record.
54 my ( $db, # database <nucleotide|protein>
56 $type, # retrieval type <gb|gp>
61 my ( $content, @lines, $i, $seq ); # only $content is used in the lines visible here
# NOTE(review): $db, $id and $type are interpolated into the query string without
# URI escaping, and the request is made over plain http. The declaration of $id is
# not among the lines visible here. `get` is presumably LWP::Simple::get (the import
# is not visible); if so, $content is undef when the request fails -- confirm that
# callers handle that case.
63 $content = get "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=$db&id=$id&rettype=$type";
71 # Martin A. Hansen, March 2007.
73 # Connects to the NCBI Entrez efetch service and retrieves a GenBank record,
74 # from which the sequence is parsed and returned.
76 my ( $db, # database <nucleotide|protein>
78 $type, # retrieval type <gb|gp>
83 my ( $content, @lines, $i, $seq );
# NOTE(review): the query parameters are interpolated without URI escaping and the
# request uses plain http. `get` is presumably LWP::Simple::get (import not visible);
# if it returns undef on failure, the split below operates on undef -- confirm.
85 $content = get "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=$db&id=$id&rettype=$type";
87 @lines = split "\n", $content; # one array element per line of the record
# Scan forward to the line starting with ORIGIN.
# NOTE(review): neither loop condition below compares $i with the number of lines.
# Past the end of @lines, $lines[ $i ] is undef, which never matches, so a record
# lacking an ORIGIN or // line would keep these loops running unless the loop
# bodies (not visible here) guard against it -- confirm.
91 while ( $lines[ $i ] !~ /^ORIGIN/ ) {
97 while ( $lines[ $i ] !~ /^\/\// ) # consume lines until the one starting with //
99 $lines[ $i ] =~ s/^\s*\d+//; # strip leading whitespace and the leading run of digits
101 $seq .= $lines[ $i ]; # append what remains; any further clean-up of $seq is not visible here
114 # Martin A. Hansen, February 2008.
116 # !!! NOT USED ANYMORE !!! #
118 # Reads in and parses a file in SOFT format. Platform tables are collected into an
# ID => SEQUENCE lookup, which is then used to build one record per sample table row.
120 my ( $path, # full path to SOFT file
125 my ( $fh, @lines, $i, $c, $num, %key_hash, @fields, %id_hash, $id, $seq, $count, $record, @records, $platform_id, $sample_id, $sample_title );
127 $fh = Maasha::Common::read_open( $path );
139 while ( $i < @lines ) # walk the file's lines by index; how @lines is filled is not visible here
141 if ( $lines[ $i ] =~ /^\^PLATFORM = (.+)/ ) # captures the platform identifier
145 elsif ( $lines[ $i ] =~ /^!platform_table_begin$/ )
147 @fields = split "\t", $lines[ $i + 1 ]; # the line after the marker holds the tab-separated column names
149 for ( $c = 0; $c < @fields; $c++ ) {
150 $key_hash{ $fields[ $c ] } = $c; # column name => column index
# $c is reused here as a line cursor; its re-initialisation is not visible in this view.
# NOTE(review): the condition has no bound on $c, so a missing end marker is not
# caught by this test alone -- confirm the loop body handles it.
155 while ( $lines[ $c ] !~ /^!platform_table_end$/ )
157 @fields = split "\t", $lines[ $c ];
# NOTE(review): no check is visible that the ID and SEQUENCE columns exist in this table.
159 $id_hash{ $fields[ $key_hash{ "ID" } ] } = $fields[ $key_hash{ "SEQUENCE" } ];
166 elsif ( $lines[ $i ] =~ /^\^SAMPLE = (.+)/ ) # captures the sample identifier
170 elsif ( $lines[ $i ] =~ /^!Sample_title = (.+)/ ) # captures the sample title
174 elsif ( $lines[ $i ] =~ /^!sample_table_begin/ )
# NOTE(review): %key_hash is also used for platform tables above and no reset is
# visible, so column names from an earlier table may persist -- confirm.
178 @fields = split "\t", $lines[ $i + 1 ]; # column names of the sample table
180 for ( $c = 0; $c < @fields; $c++ ) {
181 $key_hash{ $fields[ $c ] } = $c; # column name => column index
186 while ( $lines[ $c ] !~ /^!sample_table_end$/ ) # iterate table rows up to the end marker
190 @fields = split "\t", $lines[ $c ];
192 $id = $fields[ $key_hash{ "ID_REF" } ];
193 $seq = $id_hash{ $id }; # look up the sequence recorded for this ID in the platform table
194 $count = $fields[ $key_hash{ "VALUE" } ];
198 $record->{ "SAMPLE_TITLE" } = $sample_title;
199 $record->{ "SEQ" } = $seq;
200 $record->{ "SEQ_NAME" } = join( "_", $platform_id, $sample_id, $num, $count ); # where $num is set is not visible here
202 push @records, $record;
216 return wantarray ? @records : \@records; # list in list context, array reference otherwise
222 # Martin A. Hansen, June 2008.
224 # Create an index with the line numbers of the different tables
225 # in a SOFT file. The index is returned.
227 my ( $file, # file to index
232 my ( $fh, $line, $i, $c, @index, $first );
234 $fh = Maasha::Common::read_open( $file );
241 while ( $line = <$fh> ) # read the file line by line
245 if ( $line =~ /^\^/ ) # a line starting with ^ is treated as the start of a new index entry
# Close the previous entry at the line before the current one.
# NOTE(review): with $c == 0 this would address $index[ -1 ]; a guard (perhaps via
# $first) is presumably in the lines not visible here -- confirm.
254 $index[ $c - 1 ]{ "LINE_END" } = $i - 1;
267 $index[ $c - 1 ]{ "LINE_END" } = $i - 1; # close the last entry (the loop's end is not visible in this view)
271 return wantarray ? @index : \@index; # list in list context, array reference otherwise
275 sub soft_get_platform
277 # Martin A. Hansen, June 2008.
279 # Given a filehandle to a SOFT file, parses the platform tables found from line
# $beg onwards into a lookup hash keyed on the table's ID column.
282 my ( $fh, # filehandle
283 $beg, # line number where platform tables begin
284 $end, # line number where platform tables end
289 my ( $line, @lines, $i, $c, @fields, %key_hash, %id_hash );
293 while ( $line = <$fh> )
297 push @lines, $line if $i >= $beg; # keep lines from $beg on; the handling of $end is not visible here
306 while ( $i < @lines )
308 if ( $lines[ $i ] =~ /^!platform_table_begin$/ )
310 @fields = split "\t", $lines[ $i + 1 ]; # the line after the marker holds the tab-separated column names
312 for ( $c = 0; $c < @fields; $c++ ) {
313 $key_hash{ $fields[ $c ] } = $c; # column name => column index
# $c is reused here as a line cursor; its re-initialisation is not visible in this view.
# NOTE(review): the condition has no bound on $c, so a missing end marker is not
# caught by this test alone -- confirm the loop body handles it.
318 while ( $lines[ $c ] !~ /^!platform_table_end$/ )
320 @fields = split "\t", $lines[ $c ];
322 if ( defined $key_hash{ "SEQUENCE" } ) { # table has a SEQUENCE column
323 $id_hash{ $fields[ $key_hash{ "ID" } ] } = $fields[ $key_hash{ "SEQUENCE" } ]; # ID => SEQUENCE
325 $id_hash{ $fields[ $key_hash{ "ID" } ] } = $fields[ $key_hash{ "ID" } ]; # presumably the else branch (not visible): ID => ID
337 return wantarray ? %id_hash : \%id_hash; # flattened hash in list context, hash reference otherwise
343 # Martin A. Hansen, June 2008.
345 # Given a filehandle to a SOFT file and a platform table,
346 # parses sample records, which are returned.
348 my ( $fh, # filehandle
349 $plat_table, # hashref with platform table
350 $beg, # line number where sample table begin
351 $end, # line number where sample table end
352 $skip, # flag indicating that this sample table should not be parsed.
355 # Returns a list of hashrefs in list context, or a reference to that list otherwise.
357 my ( $line, @lines, $i, $c, $platform_id, @fields, %key_hash, $num, $sample_id, $sample_title, $id, $seq, $count, @records, $record );
361 while ( $line = <$fh> )
367 push @lines, $line if $i >= $beg; # keep lines from $beg on; the handling of $end is not visible here
376 return wantarray ? () : []; # empty result; presumably taken when $skip is set (condition not visible) -- confirm
383 while ( $i < @lines )
385 if ( $lines[ $i ] =~ /^\^SAMPLE = (.+)/ ) # captures the sample identifier
389 elsif ( $lines[ $i ] =~ /!Sample_platform_id = (.+)/ ) # NOTE(review): unlike the sibling patterns, this one is not anchored with ^
393 elsif ( $lines[ $i ] =~ /^!Sample_title = (.+)/ ) # captures the sample title
397 elsif ( $lines[ $i ] =~ /^!sample_table_begin/ )
401 @fields = split "\t", $lines[ $i + 1 ]; # the line after the marker holds the tab-separated column names
403 for ( $c = 0; $c < @fields; $c++ ) {
404 $key_hash{ $fields[ $c ] } = $c; # column name => column index
# $c is reused here as a line cursor; its re-initialisation is not visible in this view.
409 while ( $lines[ $c ] !~ /^!sample_table_end$/ ) # iterate table rows up to the end marker
413 @fields = split "\t", $lines[ $c ];
415 $id = $fields[ $key_hash{ "ID_REF" } ];
416 $seq = $plat_table->{ $id }; # undef when the platform table has no entry for this ID
417 $count = $fields[ $key_hash{ "VALUE" } ];
421 $record->{ "SAMPLE_TITLE" } = $sample_title;
422 $record->{ "SEQ" } = $seq;
423 $record->{ "SEQ_NAME" } = join( "_", $platform_id, $sample_id, $num, $count ); # where $num is set is not visible here
425 push @records, $record;
439 return wantarray ? @records : \@records; # list in list context, array reference otherwise
445 # Martin A. Hansen, July 2008.
447 # Create a BLAST index of a given FASTA file
448 # in a specified directory using a given name.
450 my ( $file, # path to FASTA file
451 $src_dir, # source directory with FASTA file
452 $dst_dir, # destination directory for BLAST index
453 $index_type, # protein or nucleotide
454 $index_name, # name of index
461 if ( $index_type =~ /protein/i ) { # case-insensitive match; presumably selects the value of $type used below (assignment not visible)
467 Maasha::Filesys::dir_create_if_not_exists( $dst_dir );
# NOTE(review): the arguments are built by string interpolation, so paths or names
# containing spaces or shell metacharacters are not protected -- how
# Maasha::Common::run executes the command is not visible here. Also, per the
# formatdb option list, -t sets the database title while -n sets the base name;
# confirm that -t is what is intended for $index_name.
468 Maasha::Common::run( "formatdb", "-p $type -i $src_dir/$file -t $index_name -l /dev/null" );
# Move the generated index files to the destination directory.
# NOTE(review): the patterns match every file in $src_dir ending in hr/in/sq, not
# only those produced for $file, and they rely on glob expansion happening inside
# Maasha::Common::run (presumably via a shell) -- confirm.
469 Maasha::Common::run( "mv", "$src_dir/*hr $dst_dir" );
470 Maasha::Common::run( "mv", "$src_dir/*in $dst_dir" );
471 Maasha::Common::run( "mv", "$src_dir/*sq $dst_dir" );
475 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<