X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=code_perl%2FMaasha%2FMatch.pm;h=6ddd4132a05f6e8a3f044a17b558e81011ea09b2;hb=58b9ddc3ddfcbe0c4476f36b12cc6cb0dbe88b41;hp=8f52d8d99d6dc5d29a91b7c6467beab06aac5fe3;hpb=6a4043cd15c4d8f6c251e4a9c50b006f95cda2db;p=biopieces.git diff --git a/code_perl/Maasha/Match.pm b/code_perl/Maasha/Match.pm index 8f52d8d..6ddd413 100644 --- a/code_perl/Maasha/Match.pm +++ b/code_perl/Maasha/Match.pm @@ -28,15 +28,22 @@ package Maasha::Match; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< +use warnings; use strict; use Data::Dumper; use Storable qw( dclone ); use Maasha::Common; +use Maasha::Filesys; use Maasha::Fasta; use Maasha::Seq; -use Maasha::Berkeley_DB; +use Maasha::Biopieces; use vars qw ( @ISA @EXPORT ); +use constant { + SEQ_NAME => 0, + SEQ => 1, +}; + @ISA = qw( Exporter ); @@ -69,7 +76,7 @@ sub match_mummer push @args, "-F"; push @args, "-l $options->{ 'word_size' }"; push @args, "-maxmatch"; - push @args, "-n" if not &Maasha::Seq::seq_guess_type( $entries1->[ 0 ]->[ 1 ] ) eq "protein"; + push @args, "-n" if not Maasha::Seq::seq_guess_type( $entries1->[ 0 ]->[ 1 ] ) eq "PROTEIN"; push @args, "-b" if $options->{ "direction" } =~ /^b/; push @args, "-r" if $options->{ "direction" } =~ /^r/; @@ -82,12 +89,12 @@ sub match_mummer map { $_->[ 0 ] =~ tr/ /_/ } @{ $entries1 }; map { $_->[ 0 ] =~ tr/ /_/ } @{ $entries2 }; - &Maasha::Fasta::put_entries( $entries1, $file_in1 ); - &Maasha::Fasta::put_entries( $entries2, $file_in2 ); + Maasha::Fasta::put_entries( $entries1, $file_in1 ); + Maasha::Fasta::put_entries( $entries2, $file_in2 ); - &Maasha::Common::run( "mummer", "$arg $file_in1 $file_in2 > $file_out 2>/dev/null" ); + Maasha::Common::run( "mummer", "$arg $file_in1 $file_in2 > $file_out 2>/dev/null" ); - $fh = &Maasha::Common::read_open( $file_out ); + $fh = Maasha::Filesys::file_read_open( $file_out ); while ( $line = <$fh> ) { @@ -116,7 +123,6 @@ sub match_mummer push @results, dclone $result; } - } unlink $file_in1; @@ -127,311 +133,44 @@ sub match_mummer } -sub match_vmatch -{ - # Martin A. Hansen, April 2008. - - # Vmatches a list of records against a list of index files and the full - # path to the result file is returned. - - my ( $tmp_dir, # directory in where to save temp files - $records, # list of records - $index_files, # list of index files - $options, # argument hash - ) = @_; - - # Returns a string. - - my ( $query_file, $result_file, @result_files, $fh_in, $fh_out, $line, @fields, $i, $record, $vmatch_args, @index_names, @seq_names, $count_list ); - - $query_file = "$tmp_dir/query.seq"; - $result_file = "$tmp_dir/vmatch.out"; - - $fh_out = &Maasha::Common::write_open( $query_file ); - - foreach $record ( @{ $records } ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) - { - next if length $record->{ "SEQ" } < 12; # assuming that the index is created for 12 as minimum length - - push @seq_names, $record->{ "SEQ_NAME" }; - - &Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh_out, 80 ); - } - } - - close $fh_out; - - if ( $options->{ 'genome' } ) { - $vmatch_args = "-complete -d -p -q $query_file"; - } else { - $vmatch_args = "-complete -d -p -showdesc 100 -q $query_file"; - } - - $vmatch_args .= " -h " . $options->{ "hamming_dist" } if $options->{ "hamming_dist" }; - $vmatch_args .= " -e " . $options->{ "edit_dist" } if $options->{ "edit_dist" }; - - for ( $i = 0; $i < @{ $index_files }; $i++ ) - { - &Maasha::Common::run( "vmatch", "$vmatch_args $index_files->[ $i ] > $result_file.$i" ); - - push @result_files, "$result_file.$i"; - } - - unlink $query_file; - - $count_list = &vmatch_count_hits( \@result_files ) if ( $options->{ "count" } ); - - $fh_out = &Maasha::Common::write_open( $result_file ); - - for ( $i = 0; $i < @{ $index_files }; $i++ ) - { - $index_files->[ $i ] =~ s/.+\/(.+)\.fna$/$1/ if $options->{ 'genome' }; - - $fh_in = &Maasha::Common::read_open( "$result_file.$i" ); - - while ( $line = <$fh_in> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split " ", $line; - - next if $options->{ "max_hits" } and $count_list->[ $fields[ 5 ] ] > $options->{ 'max_hits' }; - - $fields[ 1 ] = $index_files->[ $i ]; # S_ID - $fields[ 9 ] = $count_list->[ $fields[ 5 ] ] if $options->{ "count" }; # SCORE - $fields[ 5 ] = $seq_names[ $fields[ 5 ] ]; # Q_ID - - print $fh_out join( "\t", @fields ), "\n"; - } - - close $fh_in; - - unlink "$result_file.$i"; - } - - close $fh_out; - - return $result_file; -} - - -sub vmatch_count_hits +sub vmatch_index { - # Martin A. Hansen, April 2008. + # Martin A. Hansen, July 2008. - # Given a list of Vmatch result file, count duplications based - # on q_id. The counts are returned in a list where the list index - # corresponds to the q_id index in the query file. + # Use mkvtree to create a vmatch index of a given file - my ( $files, # vmatch result files - ) = @_; - - # Returns a list. - - my ( $file, $fh_in, $line, @fields, @count_list ); - - foreach $file ( @{ $files } ) - { - $fh_in = &Maasha::Common::read_open( $file ); - - while ( $line = <$fh_in> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split " ", $line; - - $count_list[ $fields[ 5 ] ]++; - } - - close $fh_in; - } - - return wantarray ? @count_list : \@count_list; -} - - -sub vmatch_count_hits_old -{ - # Martin A. Hansen, April 2008. - - # Given a Vmatch result file, substitute the - # score field with the times the query sequence - # was found. - - my ( $tmp_dir, # directory in where to save temp files - $path, # full path to vmatch file - $max_count, # filter too abundant seqs - OPTIONAL + my ( $file, # FASTA file to index + $src_dir, # source directory with FASTA file + $dst_dir, # distination directory for Vmatch index + $tmp_dir, # temp directory - OPTIONAL ) = @_; # Returns nothing. - my ( $fh_in, $fh_out, $line, @fields, @count_list ); - - $fh_in = &Maasha::Common::read_open( $path ); - - while ( $line = <$fh_in> ) - { - chomp $line; + my ( $fh, $entry, $tmp_file ); - next if $line =~ /^#/; + Maasha::Filesys::dir_create_if_not_exists( $dst_dir ); - @fields = split " ", $line; +# if ( Maasha::Common::file_size( $file ) < 200_000_000 ) +# { +# &Maasha::Common::run( "mkvtree", "-db $src_dir/$file -dna -pl -allout -indexname $dst_dir/$file > /dev/null 3>&1" ); +# } +# else +# { + $fh = Maasha::Filesys::file_read_open( "$src_dir/$file" ); - $count_list[ $fields[ 5 ] ]++; - } - - close $fh_in; - - $fh_in = &Maasha::Common::read_open( $path ); - $fh_out = &Maasha::Common::write_open( "$tmp_dir/vmatch.count" ); - - while ( $line = <$fh_in> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split " ", $line; - - $fields[ 9 ] = $count_list[ $fields[ 5 ] ]; - - if ( $max_count ) { - print $fh_out join( "\t", @fields ), "\n" if $fields[ 9 ] <= $max_count; - } else { - print $fh_out join( "\t", @fields ), "\n"; - } - } - - close $fh_in; - close $fh_out; - - rename "$tmp_dir/vmatch.count", $path; -} - - -sub vmatch_count_hits_old -{ - # Martin A. Hansen, April 2008. - - # Given a Vmatch result file, substitute the - # score field with the times the query sequence - # was found. - - my ( $tmp_dir, # directory in where to save temp files - $path, # full path to vmatch file - $max_count, # filter too abundant seqs - OPTIONAL - ) = @_; - - # Returns nothing. - - my ( $fh_in, $fh_out, $line, @fields, %count_hash ); - - if ( $max_count ) { - %count_hash = (); - } else { - %count_hash = &Maasha::Berkeley_DB::db_init( "$tmp_dir/hash.bdb" ); - } - - $fh_in = &Maasha::Common::read_open( $path ); - - while ( $line = <$fh_in> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split " ", $line; - - $count_hash{ $fields[ 5 ] }++; - } - - close $fh_in; - - $fh_in = &Maasha::Common::read_open( $path ); - $fh_out = &Maasha::Common::write_open( "$tmp_dir/vmatch.count" ); - - while ( $line = <$fh_in> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split " ", $line; - - $fields[ 9 ] = $count_hash{ $fields[ 5 ] }; - - if ( $max_count ) { - print $fh_out join( "\t", @fields ), "\n" if $fields[ 9 ] <= $max_count; - } else { - print $fh_out join( "\t", @fields ), "\n"; - } - } - - close $fh_in; - close $fh_out; - - if ( not $max_count ) - { - untie %count_hash; - unlink "$tmp_dir/hash.bdb"; - } - - rename "$tmp_dir/vmatch.count", $path; -} - - -sub vmatch_get_entry -{ - # Martin A. Hansen, January 2008. - - # Parses vmatch output records. - - my ( $fh, # file handle to vmatch result file. - ) = @_; - - # Returns a hash. - - my ( $line, @fields, %record ); - - while ( $line = <$fh> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split "\t", $line; + while ( $entry = Maasha::Fasta::get_entry( $fh ) ) + { + $tmp_file = $entry->[ SEQ_NAME ] . ".fna"; - $record{ "REC_TYPE" } = "VMATCH"; + Maasha::Fasta::put_entries( [ $entry ], "$tmp_dir/$tmp_file" ); - $record{ "S_LEN" } = $fields[ 0 ]; - $record{ "S_ID" } = $fields[ 1 ]; - $record{ "S_BEG" } = $fields[ 2 ]; + Maasha::Common::run( "mkvtree", "-db $tmp_dir/$tmp_file -dna -pl -allout -indexname $dst_dir/$tmp_file > /dev/null 3>&1" ); - if ( $fields[ 3 ] eq "D" ) { - $record{ "STRAND" } = "+"; - } else { - $record{ "STRAND" } = "-"; + unlink "$tmp_dir/$tmp_file"; } - $record{ "Q_LEN" } = $fields[ 4 ]; - $record{ "Q_ID" } = $fields[ 5 ]; - $record{ "Q_BEG" } = $fields[ 6 ]; - $record{ "MATCH_DIST" } = $fields[ 7 ]; - $record{ "E_VAL" } = $fields[ 8 ]; - $record{ "SCORE" } = $fields[ 9 ]; - $record{ "IDENT" } = $fields[ 10 ]; - - $record{ "Q_END" } = $record{ "Q_BEG" } + $record{ "Q_LEN" } - 1; - $record{ "S_END" } = $record{ "S_BEG" } + $record{ "S_LEN" } - 1; - - return wantarray ? %record : \%record; - } + close $fh; }