From: martinahansen Date: Fri, 29 Aug 2008 06:49:49 +0000 (+0000) Subject: added remove_adaptor biopiece X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;ds=sidebyside;h=a8afbe8bc7930eb209cd9731d2f4666d2d587bd1;p=biopieces.git added remove_adaptor biopiece git-svn-id: http://biopieces.googlecode.com/svn/trunk@230 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/code_c/Maasha/src/inc/list.h b/code_c/Maasha/src/inc/list.h index 810aff0..33cc06b 100644 --- a/code_c/Maasha/src/inc/list.h +++ b/code_c/Maasha/src/inc/list.h @@ -89,6 +89,9 @@ void list_dl_add_after( list_dl **list_ppt, node_dl **node_ppt, node_dl **new_no /* Remove a node from a doubly linked list. */ void list_dl_remove( list_dl **list_ppt, node_dl **node_ppt ); +/* Debug function to print all elements from a doubly linked list. */ +void list_dl_print( list_dl *list_pt ); + /* >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<*/ diff --git a/code_c/Maasha/src/lib/list.c b/code_c/Maasha/src/lib/list.c index 4525dd8..edba268 100644 --- a/code_c/Maasha/src/lib/list.c +++ b/code_c/Maasha/src/lib/list.c @@ -279,3 +279,21 @@ void list_dl_remove( list_dl **list_ppt, node_dl **node_ppt ) } +void list_dl_print( list_dl *list_pt ) +{ + /* Martin A. Hansen, August 2008 */ + + /* Debug function to print all elements from a doubly linked list. */ + + node_dl *node = list_pt->first; + int i = 0; + + while ( node != NULL ) + { + printf( "Node: %d val: %s\n", i, ( char * ) node->val ); + + node = node->next; + + i++; + } +} diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index 5280748..9ca6a32 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -235,6 +235,7 @@ sub run_script elsif ( $script eq "write_solid" ) { script_write_solid( $in, $out, $options ) } elsif ( $script eq "head_records" ) { script_head_records( $in, $out, $options ) } elsif ( $script eq "remove_keys" ) { script_remove_keys( $in, $out, $options ) } + elsif ( $script eq "remove_adaptor" ) { script_remove_adaptor( $in, $out, $options ) } elsif ( $script eq "rename_keys" ) { script_rename_keys( $in, $out, $options ) } elsif ( $script eq "uniq_vals" ) { script_uniq_vals( $in, $out, $options ) } elsif ( $script eq "merge_vals" ) { script_merge_vals( $in, $out, $options ) } @@ -280,7 +281,7 @@ sub get_options # Returns hash - my ( %options, @options, $opt, @genomes ); + my ( %options, @options, $opt, @genomes, $real ); if ( $script eq "print_usage" ) { @@ -733,6 +734,15 @@ sub get_options save_keys|K=s ); } + elsif ( $script eq "remove_adaptor" ) + { + @options = qw( + adaptor|a=s + mismatches|m=s + no_remove|n + offset|o=s + ); + } elsif ( $script eq "rename_keys" ) { @options = qw( @@ -980,13 +990,15 @@ sub get_options # print STDERR Dumper( \%options ); + $real = "beg|end|word_size|wrap|chunk_size|tile_size|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size"; + foreach $opt ( keys %options ) { if ( $opt =~ /stream_in|pattern_in|exact_in/ and not -f $options{ $opt } ) { Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") ); } - elsif ( $opt =~ /beg|end|word_size|wrap|chunk_size|tile_size|len|prefix_length|num|skip|cpus|window_size|step_size/ and $options{ $opt } !~ /^\d+$/ ) + elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ ) { Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") ); } @@ -4543,6 +4555,58 @@ sub script_remove_keys } +sub script_remove_adaptor +{ + # Martin A. Hansen, August 2008. + + # Find and remove adaptor from sequences in the stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $adaptor, $seq, $adaptor_len, $seq_len, $offset, $max_match, $max_mismatch, $pos ); + + $max_mismatch = $options->{ "mismatches" } || 0; + $offset = $options->{ "offset" } || 15; + $adaptor = $options->{ "adaptor" }; + $adaptor_len = length $adaptor; + $adaptor = [ split //, uc $adaptor ]; + + $max_match = $adaptor_len - $max_mismatch; + + while ( $record = get_record( $in ) ) + { + if ( $record->{ "SEQ" } ) + { + $seq = $record->{ "SEQ" }; + $seq_len = length $seq; + $seq = [ split //, uc $seq ]; + + $pos = Maasha::Seq::find_adaptor( $adaptor, $seq, $adaptor_len, $seq_len, $offset, $max_match, $max_mismatch ); + + $record->{ "ADAPTOR_POS" } = $pos; + + if ( $pos >= 0 ) + { + if ( not $options->{ "no_remove" } ) { + $record->{ "SEQ" } = substr $record->{ "SEQ" }, 0, $pos; + } + + put_record( $record, $out ); + } + } + else + { + put_record( $record, $out ); + } + } +} + + sub script_rename_keys { # Martin A. Hansen, August 2007. diff --git a/code_perl/Maasha/Seq.pm b/code_perl/Maasha/Seq.pm index 9170518..9b3ae08 100644 --- a/code_perl/Maasha/Seq.pm +++ b/code_perl/Maasha/Seq.pm @@ -1385,6 +1385,99 @@ sub seq_word_unpack return $word; } + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ADAPTOR LOCATING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub find_adaptor +{ + # Martin A. Hansen & Selene Fernandez, August 2008 + + # Locates an adaptor sequence in a given sequence + # starting from a given offset position allowing a given + # number of mismatches (no indels). + + my ( $adaptor, # list of chars + $tag, # list of chars + $adaptor_len, # length of adaptor sequence + $tag_len, # length of tag sequence + $tag_offset, # offset position + $max_match, # number of matches indicating success + $max_mismatch, # number of mismatches + ) = @_; + + # Returns an integer. + + my ( $i, $j, $match, $mismatch ); + + $i = $tag_offset; + + while ( $i < $tag_len - ( $max_match + $max_mismatch ) + 1 ) + { + $j = 0; + + while ( $j < $adaptor_len - ( $max_match + $max_mismatch ) + 1 ) + { + return $i if check_diag( $adaptor, $tag, $adaptor_len, $tag_len, $j, $i, $max_match, $max_mismatch ); + + $j++ + } + + $i++; + } + + return -1; +} + + +sub check_diag +{ + # Martin A. Hansen & Selene Fernandez, August 2008 + + # Checks the diagonal starting at a given coordinate + # of a search space constituted by an adaptor and a tag sequence. + # Residues in the diagonal are compared between the sequences allowing + # for a given number of mismatches. We terminate when search spaca is + # exhausted or if max matches or mismatches is reached. + + my ( $adaptor, + $tag, + $adaptor_len, + $tag_len, + $adaptor_beg, + $tag_beg, + $max_match, + $max_mismatch, + ) = @_; + + # Returns boolean. + + my ( $match, $mismatch ); + + $match = 0; + $mismatch = 0; + + while ( $adaptor_beg <= $adaptor_len and $tag_beg <= $tag_len ) + { + if ( $adaptor->[ $adaptor_beg ] eq $tag->[ $tag_beg ] ) + { + $match++; + + return 1 if $match >= $max_match; + } + else + { + $mismatch++; + + return 0 if $mismatch > $max_mismatch; + } + + $adaptor_beg++; + $tag_beg++; + } + + return 0; +} # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<