3 # Copyright (C) 2007 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
25 # Routines for interacting with NCBI Entrez, parsing SOFT files and creating BLAST indexes.
28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
39 use vars qw( @ISA @EXPORT );
41 @ISA = qw( Exporter );
44 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
49 # Martin A. Hansen, March 2007.
51 # Connects to the NCBI website and retrieves a GenBank record,
54 my ( $db, # database <nucleotide|protein>
56 $type, # retrieval type <gb|gp>
61 my ( $content, @lines, $i, $seq );
# Fetch the record through the E-utilities efetch endpoint; $db, $id and $type
# are interpolated into the query string without any escaping.
# NOTE(review): the declaration of $id is not visible in this excerpt.
# NOTE(review): `get` is presumably LWP::Simple::get, which returns undef on
# failure - confirm that an undefined $content is handled further down.
# NOTE(review): the URL uses plain http; NCBI E-utilities are served over
# https nowadays - confirm and consider switching the scheme.
63 $content = get "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=$db&id=$id&rettype=$type";
71 # Martin A. Hansen, March 2007.
73 # Connects to the NCBI website and retrieves a GenBank record,
74 # from which the sequence is parsed and returned.
76 my ( $db, # database <nucleotide|protein>
78 $type, # retrieval type <gb|gp>
83 my ( $content, @lines, $i, $seq );
# Fetch the record through the E-utilities efetch endpoint; $db, $id and $type
# are interpolated into the query string without any escaping.
# NOTE(review): `get` is presumably LWP::Simple::get, which returns undef on
# failure - in that case the split below yields an empty @lines.
85 $content = get "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=$db&id=$id&rettype=$type";
87 @lines = split "\n", $content;
# Advance to the line starting with ORIGIN, which precedes the sequence lines.
# NOTE(review): no upper bound on $i is visible here. An undefined element
# never matches, so `!~` stays true and the loop does not end on its own if
# the record has no ORIGIN line (e.g. empty or failed download) - confirm.
91 while ( $lines[ $i ] !~ /^ORIGIN/ ) {
# Collect sequence lines until the line starting with "//".
# NOTE(review): same concern as above if the "//" line is missing.
97 while ( $lines[ $i ] !~ /^\/\// )
# Drop the leading whitespace and position number of each sequence line.
99 $lines[ $i ] =~ s/^\s*\d+//;
101 $seq .= $lines[ $i ];
114 # Martin A. Hansen, June 2008.
116 # Create an index with line numbers of the different tables
117 # in a SOFT file. The index is returned.
119 my ( $file, # file to index
124 my ( $fh, $line, $i, $c, @index, $first );
126 $fh = Maasha::Filesys::file_read_open( $file );
133 while ( $line = <$fh> )
# A line starting with "^" is treated as the start of a new index entry.
137 if ( $line =~ /^\^/ )
# Close the previous entry: it ends on the line before the current one.
# NOTE(review): if $c can be 0 here, $index[ -1 ] is addressed - presumably
# guarded by $first in lines not visible in this excerpt; confirm.
146 $index[ $c - 1 ]{ "LINE_END" } = $i - 1;
# Close the last entry (presumably after the read loop has finished - confirm).
159 $index[ $c - 1 ]{ "LINE_END" } = $i - 1;
# Returns the index as a list in list context, otherwise as an array reference.
163 return wantarray ? @index : \@index;
167 sub soft_get_platform
169 # Martin A. Hansen, June 2008.
171 # Given a filehandle to a SOFT file, parses all the platform tables
174 my ( $fh, # filehandle
175 $beg, # line number where platform tables begin
176 $end, # line number where platform tables end
181 my ( $line, @lines, $i, $c, @fields, %key_hash, %id_hash );
# Read from the filehandle and keep the lines from line number $beg onwards.
# NOTE(review): the handling of $end and the update of $i are not visible in
# this excerpt.
185 while ( $line = <$fh> )
189 push @lines, $line if $i >= $beg;
198 while ( $i < @lines )
200 if ( $lines[ $i ] =~ /^!platform_table_begin$/ )
# The line after the begin marker is the tab separated header row; record
# the column index of every header name in %key_hash.
# NOTE(review): no chomp is visible, so the last header name may carry a
# trailing newline - confirm.
202 @fields = split "\t", $lines[ $i + 1 ];
204 for ( $c = 0; $c < @fields; $c++ ) {
205 $key_hash{ $fields[ $c ] } = $c;
# Walk the data rows up to the end marker.
# NOTE(review): no upper bound on $c is visible; if the end marker is
# missing, undefined elements never match and the loop does not end on
# its own - confirm.
210 while ( $lines[ $c ] !~ /^!platform_table_end$/ )
212 @fields = split "\t", $lines[ $c ];
# With a SEQUENCE column the ID is mapped to its sequence.
214 if ( defined $key_hash{ "SEQUENCE" } ) {
215 $id_hash{ $fields[ $key_hash{ "ID" } ] } = $fields[ $key_hash{ "SEQUENCE" } ];
# The ID is mapped to itself here (presumably the else branch, taken when
# there is no SEQUENCE column - confirm).
217 $id_hash{ $fields[ $key_hash{ "ID" } ] } = $fields[ $key_hash{ "ID" } ];
# Returns the ID table as a hash in list context, otherwise as a hash reference.
229 return wantarray ? %id_hash : \%id_hash;
235 # Martin A. Hansen, June 2008.
237 # Given a filehandle to a SOFT file and a platform table
238 # parses sample records which are returned.
240 my ( $fh, # filehandle
241 $plat_table, # hashref with platform table
242 $beg, # line number where sample table begins
243 $end, # line number where sample table ends
244 $skip, # flag indicating that this sample table should not be parsed.
247 # Returns list of hashrefs
249 my ( $line, @lines, $i, $c, $platform_id, @fields, %key_hash, $num, $sample_id, $sample_title, $id, $seq, $count, @records, $record );
# Read from the filehandle and keep the lines from line number $beg onwards.
# NOTE(review): the handling of $end and the update of $i are not visible in
# this excerpt.
253 while ( $line = <$fh> )
259 push @lines, $line if $i >= $beg;
# Early exit with an empty list / empty array reference (presumably taken
# when $skip is set - the condition is not visible in this excerpt).
268 return wantarray ? () : [];
275 while ( $i < @lines )
277 if ( $lines[ $i ] =~ /^\^SAMPLE = (.+)/ )
# NOTE(review): unlike the neighbouring patterns this one is not anchored
# with ^, so it also matches the text in the middle of a line - confirm
# whether that is intended.
281 elsif ( $lines[ $i ] =~ /!Sample_platform_id = (.+)/ )
285 elsif ( $lines[ $i ] =~ /^!Sample_title = (.+)/ )
289 elsif ( $lines[ $i ] =~ /^!sample_table_begin/ )
# The line after the begin marker is the tab separated header row.
293 @fields = split "\t", $lines[ $i + 1 ];
295 for ( $c = 0; $c < @fields; $c++ )
# Header names SEQUENCE and COUNT are treated as aliases of ID_REF and
# VALUE, so the lookups below only need the latter two keys.
297 $fields[ $c ] = "ID_REF" if $fields[ $c ] eq "SEQUENCE";
298 $fields[ $c ] = "VALUE" if $fields[ $c ] eq "COUNT";
300 $key_hash{ $fields[ $c ] } = $c;
# Walk the data rows up to the end marker.
# NOTE(review): no upper bound on $c is visible; if the end marker is
# missing, undefined elements never match and the loop does not end on
# its own - confirm.
305 while ( $lines[ $c ] !~ /^!sample_table_end$/ )
309 @fields = split "\t", $lines[ $c ];
310 $id = $fields[ $key_hash{ "ID_REF" } ];
# Look the ID up in the platform table; fall back to the ID itself when the
# lookup is missing or yields a false value ("" or "0") because of ||.
311 $seq = $plat_table->{ $id } || $id;
312 $count = $fields[ $key_hash{ "VALUE" } ];
316 $record->{ "SAMPLE_TITLE" } = $sample_title;
317 $record->{ "SEQ" } = $seq;
# SEQ_NAME is built as <platform id>_<sample id>_<num>_<count>.
318 $record->{ "SEQ_NAME" } = join( "_", $platform_id, $sample_id, $num, $count );
320 push @records, $record;
# Returns the records as a list in list context, otherwise as an array reference.
334 return wantarray ? @records : \@records;
340 # Martin A. Hansen, July 2008.
342 # Create a BLAST index of a given FASTA file
343 # in a specified directory using a given name.
345 my ( $file, # name of FASTA file (used as $src_dir/$file below)
346 $src_dir, # source directory with FASTA file
347 $dst_dir, # destination directory for BLAST index
348 $index_type, # protein or nucleotide
349 $index_name, # name of index
# Case-insensitive match on "protein" selects the index flavour; the
# assignment to $type used below is not visible in this excerpt.
356 if ( $index_type =~ /protein/i ) {
362 Maasha::Filesys::dir_create_if_not_exists( $dst_dir );
# Build the index next to the FASTA file with formatdb, then move the
# resulting files to $dst_dir.
# NOTE(review): the argument strings are built by plain interpolation; if
# Maasha::Common::run hands them to a shell, paths or names containing
# spaces or shell metacharacters will break or be interpreted - confirm.
363 Maasha::Common::run( "formatdb", "-p $type -i $src_dir/$file -t $index_name -l /dev/null" );
# NOTE(review): the patterns below match every file in $src_dir ending in
# hr, in or sq, not only those belonging to this index - confirm that
# $src_dir holds nothing else with these endings.
364 Maasha::Common::run( "mv", "$src_dir/*hr $dst_dir" );
365 Maasha::Common::run( "mv", "$src_dir/*in $dst_dir" );
366 Maasha::Common::run( "mv", "$src_dir/*sq $dst_dir" );
370 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<