code_perl/Maasha/Match.pm

   1 package Maasha::Match;
   2
   3 # Copyright (C) 2007 Martin A. Hansen.
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21
  22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  23
  24
  25 # Routines to match sequences
  26
  27
  28 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  29
  30
  31 use strict;
  32 use Data::Dumper;
  33 use Storable qw( dclone );
  34 use Maasha::Common;
  35 use Maasha::Fasta;
  36 use Maasha::Seq;
  37 use Maasha::Berkeley_DB;
  38 use vars qw ( @ISA @EXPORT );
  39
  40 @ISA = qw( Exporter );
  41
  42
  43 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
  44
  45
  46 sub match_mummer
  47 {
  48     # Martin A. Hansen, June 2007.
  49
  50     # Match sequences using MUMmer.
  51
  52     my ( $entries1,   # FASTA entries
  53          $entries2,   # FASTA entries
  54          $options,    # additional MUMmer options - OPTIONAL
  55          $tmp_dir,    # temporary directory
  56        ) = @_;
  57
  58     # Returns a list.
  59
  60     my ( @args, $arg, $file_in1, $file_in2, $cmd, $file_out, $fh, $line, $result, @results );
  61
  62     $tmp_dir ||= $ENV{ "BP_TMP" };
  63
  64     $options->{ "word_size" } ||= 20;
  65     $options->{ "direction" } ||= "both";
  66
  67     push @args, "-c";
  68     push @args, "-L";
  69     push @args, "-F";
  70     push @args, "-l $options->{ 'word_size' }";
  71     push @args, "-maxmatch";
  72     push @args, "-n" if not Maasha::Seq::seq_guess_type( $entries1->[ 0 ]->[ 1 ] ) eq "protein";
  73     push @args, "-b" if $options->{ "direction" } =~ /^b/;
  74     push @args, "-r" if $options->{ "direction" } =~ /^r/;
  75
  76     $arg = join " ", @args;
  77
  78     $file_in1 = "$tmp_dir/muscle1.tmp";
  79     $file_in2 = "$tmp_dir/muscle2.tmp";
  80     $file_out = "$tmp_dir/muscle3.tmp";
  81
  82     map { $_->[ 0 ] =~ tr/ /_/ } @{ $entries1 };
  83     map { $_->[ 0 ] =~ tr/ /_/ } @{ $entries2 };
  84
  85     Maasha::Fasta::put_entries( $entries1, $file_in1 );
  86     Maasha::Fasta::put_entries( $entries2, $file_in2 );
  87
  88     Maasha::Common::run( "mummer", "$arg $file_in1 $file_in2 > $file_out 2>/dev/null" );
  89
  90     $fh = Maasha::Common::read_open( $file_out );
  91
  92     while ( $line = <$fh> )
  93     {
  94         chomp $line;
  95
  96         if ( $line =~ /^> (.+)Reverse\s+Len = (\d+)$/ )
  97         {
  98             $result->{ "Q_ID" }  = $1;
  99             $result->{ "Q_LEN" } = $2;
 100             $result->{ "DIR" }   = "reverse";
 101         }
 102         elsif ( $line =~ /^> (.+)Len = (\d+)$/ )
 103         {
 104             $result->{ "Q_ID" }  = $1;
 105             $result->{ "Q_LEN" } = $2;
 106             $result->{ "DIR" }   = "forward";
 107         }
 108         elsif ( $line =~ /^\s*(.\S+)\s+(\d+)\s+(\d+)\s+(\d+)$/ )
 109         {
 110             $result->{ "S_ID" }    = $1;
 111             $result->{ "S_BEG" }   = $2 - 1;
 112             $result->{ "Q_BEG" }   = $3 - 1;
 113             $result->{ "HIT_LEN" } = $4;
 114             $result->{ "S_END" }   = $result->{ "S_BEG" } + $result->{ "HIT_LEN" } - 1;
 115             $result->{ "Q_END" }   = $result->{ "Q_BEG" } + $result->{ "HIT_LEN" } - 1;
 116
 117             push @results, dclone $result;
 118         }
 119
 120     }
 121
 122     unlink $file_in1;
 123     unlink $file_in2;
 124     unlink $file_out;
 125
 126     return wantarray ? @results : \@results;
 127 }
 128
 129
 130 sub match_vmatch
 131 {
 132     # Martin A. Hansen, April 2008.
 133
 134     # Vmatches a list of records against a list of index files and the full
 135     # path to the result file is returned.
 136
 137     my ( $tmp_dir,       # directory in where to save temp files
 138          $records,       # list of records
 139          $index_files,   # list of index files
 140          $options,       # argument hash
 141        ) = @_;
 142
 143     # Returns a string.
 144
 145     my ( $query_file, $result_file, @result_files, $fh_in, $fh_out, $line, @fields, $i, $record, $vmatch_args, @index_names, @seq_names, $count_list );
 146
 147     $query_file  = "$tmp_dir/query.seq";
 148     $result_file = "$tmp_dir/vmatch.out";
 149
 150     $fh_out = Maasha::Common::write_open( $query_file );
 151
 152     foreach $record ( @{ $records } )
 153     {
 154         if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } )
 155         {
 156             next if length $record->{ "SEQ" } < 12; # assuming that the index is created for 12 as minimum length
 157
 158             push @seq_names, $record->{ "SEQ_NAME" };
 159
 160             Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh_out, 80 );
 161         }
 162     }
 163
 164     close $fh_out;
 165
 166     if ( $options->{ 'genome' } ) {
 167         $vmatch_args  = "-complete -d -p -q $query_file";
 168     } else {
 169         $vmatch_args  = "-complete -d -p -showdesc 100 -q $query_file";
 170     }
 171
 172     $vmatch_args .= " -h " . $options->{ "hamming_dist" } if $options->{ "hamming_dist" };
 173     $vmatch_args .= " -e " . $options->{ "edit_dist" }    if $options->{ "edit_dist" };
 174
 175     for ( $i = 0; $i < @{ $index_files }; $i++ )
 176     {
 177         Maasha::Common::run( "vmatch", "$vmatch_args $index_files->[ $i ] > $result_file.$i" );
 178
 179         push @result_files, "$result_file.$i";
 180     }
 181
 182     unlink $query_file;
 183
 184     $count_list = vmatch_count_hits( \@result_files ) if ( $options->{ "count" } );
 185
 186     $fh_out = Maasha::Common::write_open( $result_file );
 187
 188     for ( $i = 0; $i < @{ $index_files }; $i++ )
 189     {
 190         $index_files->[ $i ] =~ s/.+\/(.+)\.fna$/$1/ if $options->{ 'genome' };
 191
 192         $fh_in = Maasha::Common::read_open( "$result_file.$i" );
 193
 194         while ( $line = <$fh_in> )
 195         {
 196             chomp $line;
 197
 198             next if $line =~ /^#/;
 199
 200             @fields = split " ", $line;
 201
 202             next if $options->{ "max_hits" } and $count_list->[ $fields[ 5 ] ] > $options->{ 'max_hits' };
 203
 204             $fields[ 1 ] = $index_files->[ $i ];                                     # S_ID
 205             $fields[ 9 ] = $count_list->[ $fields[ 5 ] ] if $options->{ "count" };   # SCORE
 206             $fields[ 5 ] = $seq_names[ $fields[ 5 ] ];                               # Q_ID
 207
 208             print $fh_out join( "\t", @fields ), "\n";
 209         }
 210
 211         close $fh_in;
 212
 213         unlink "$result_file.$i";
 214     }
 215
 216     close $fh_out;
 217
 218     return $result_file;
 219 }
 220
 221
 222 sub vmatch_count_hits
 223 {
 224     # Martin A. Hansen, April 2008.
 225
 226     # Given a list of Vmatch result file, count duplications based
 227     # on q_id. The counts are returned in a list where the list index
 228     # corresponds to the q_id index in the query file.
 229
 230     my ( $files,   # vmatch result files
 231        ) = @_;
 232
 233     # Returns a list.
 234
 235     my ( $file, $fh_in, $line, @fields, @count_list );
 236
 237     foreach $file ( @{ $files } )
 238     {
 239         $fh_in = Maasha::Common::read_open( $file );
 240
 241         while ( $line = <$fh_in> )
 242         {
 243             chomp $line;
 244
 245             next if $line =~ /^#/;
 246
 247             @fields = split " ", $line;
 248
 249             $count_list[ $fields[ 5 ] ]++;
 250         }
 251
 252         close $fh_in;
 253     }
 254
 255     return wantarray ? @count_list : \@count_list;
 256 }
 257
 258
 259 sub vmatch_count_hits_old
 260 {
 261     # Martin A. Hansen, April 2008.
 262
 263     # Given a Vmatch result file, substitute the
 264     # score field with the times the query sequence
 265     # was found.
 266
 267     my ( $tmp_dir,     # directory in where to save temp files
 268          $path,        # full path to vmatch file
 269          $max_count,   # filter too abundant seqs - OPTIONAL
 270        ) = @_;
 271
 272     # Returns nothing.
 273
 274     my ( $fh_in, $fh_out, $line, @fields, @count_list );
 275
 276     $fh_in = Maasha::Common::read_open( $path );
 277
 278     while ( $line = <$fh_in> )
 279     {
 280         chomp $line;
 281
 282         next if $line =~ /^#/;
 283
 284         @fields = split " ", $line;
 285
 286         $count_list[ $fields[ 5 ] ]++;
 287     }
 288
 289     close $fh_in;
 290
 291     $fh_in  = Maasha::Common::read_open( $path );
 292     $fh_out = Maasha::Common::write_open( "$tmp_dir/vmatch.count" );
 293
 294     while ( $line = <$fh_in> )
 295     {
 296         chomp $line;
 297
 298         next if $line =~ /^#/;
 299
 300         @fields = split " ", $line;
 301
 302         $fields[ 9 ] = $count_list[ $fields[ 5 ] ];
 303
 304         if ( $max_count ) {
 305             print $fh_out join( "\t", @fields ), "\n" if $fields[ 9 ] <= $max_count;
 306         } else {
 307             print $fh_out join( "\t", @fields ), "\n";
 308         }
 309     }
 310
 311     close $fh_in;
 312     close $fh_out;
 313
 314     rename "$tmp_dir/vmatch.count", $path;
 315 }
 316
 317
 318 sub vmatch_count_hits_old
 319 {
 320     # Martin A. Hansen, April 2008.
 321
 322     # Given a Vmatch result file, substitute the
 323     # score field with the times the query sequence
 324     # was found.
 325
 326     my ( $tmp_dir,     # directory in where to save temp files
 327          $path,        # full path to vmatch file
 328          $max_count,   # filter too abundant seqs - OPTIONAL
 329        ) = @_;
 330
 331     # Returns nothing.
 332
 333     my ( $fh_in, $fh_out, $line, @fields, %count_hash );
 334
 335     if ( $max_count ) {
 336         %count_hash = ();
 337     } else {
 338         %count_hash = Maasha::Berkeley_DB::db_init( "$tmp_dir/hash.bdb" );
 339     }
 340
 341     $fh_in = Maasha::Common::read_open( $path );
 342
 343     while ( $line = <$fh_in> )
 344     {
 345         chomp $line;
 346
 347         next if $line =~ /^#/;
 348
 349         @fields = split " ", $line;
 350
 351         $count_hash{ $fields[ 5 ] }++;
 352     }
 353
 354     close $fh_in;
 355
 356     $fh_in  = Maasha::Common::read_open( $path );
 357     $fh_out = Maasha::Common::write_open( "$tmp_dir/vmatch.count" );
 358
 359     while ( $line = <$fh_in> )
 360     {
 361         chomp $line;
 362
 363         next if $line =~ /^#/;
 364
 365         @fields = split " ", $line;
 366
 367         $fields[ 9 ] = $count_hash{ $fields[ 5 ] };
 368
 369         if ( $max_count ) {
 370             print $fh_out join( "\t", @fields ), "\n" if $fields[ 9 ] <= $max_count;
 371         } else {
 372             print $fh_out join( "\t", @fields ), "\n";
 373         }
 374     }
 375
 376     close $fh_in;
 377     close $fh_out;
 378
 379     if ( not $max_count )
 380     {
 381         untie %count_hash;
 382         unlink "$tmp_dir/hash.bdb";
 383     }
 384
 385     rename "$tmp_dir/vmatch.count", $path;
 386 }
 387
 388
 389 sub vmatch_get_entry
 390 {
 391     # Martin A. Hansen, January 2008.
 392
 393     # Parses vmatch output records.
 394
 395     my ( $fh,   # file handle to vmatch result file.
 396        ) = @_;
 397
 398     # Returns a hash.
 399
 400     my ( $line, @fields, %record );
 401
 402     while ( $line = <$fh> )
 403     {
 404         chomp $line;
 405
 406         next if $line =~ /^#/;
 407
 408         @fields = split "\t", $line;
 409
 410         $record{ "REC_TYPE" } = "VMATCH";
 411
 412         $record{ "S_LEN" }      = $fields[ 0 ];
 413         $record{ "S_ID" }       = $fields[ 1 ];
 414         $record{ "S_BEG" }      = $fields[ 2 ];
 415
 416         if ( $fields[ 3 ] eq "D" ) {
 417             $record{ "STRAND" } = "+";
 418         } else {
 419             $record{ "STRAND" } = "-";
 420         }
 421
 422         $record{ "Q_LEN" }      = $fields[ 4 ];
 423         $record{ "Q_ID" }       = $fields[ 5 ];
 424         $record{ "Q_BEG" }      = $fields[ 6 ];
 425         $record{ "MATCH_DIST" } = $fields[ 7 ];
 426         $record{ "E_VAL" }      = $fields[ 8 ];
 427         $record{ "SCORE" }      = $fields[ 9 ];
 428         $record{ "IDENT" }      = $fields[ 10 ];
 429
 430         $record{ "Q_END" }      = $record{ "Q_BEG" } + $record{ "Q_LEN" } - 1;
 431         $record{ "S_END" }      = $record{ "S_BEG" } + $record{ "S_LEN" } - 1;
 432
 433         return wantarray ? %record : \%record;
 434     }
 435 }
 436
 437
 438 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 439
 440
 441 __END__