elsif ( $script eq "write_solid" ) { script_write_solid( $in, $out, $options ) }
elsif ( $script eq "head_records" ) { script_head_records( $in, $out, $options ) }
elsif ( $script eq "remove_keys" ) { script_remove_keys( $in, $out, $options ) }
+ elsif ( $script eq "remove_adaptor" ) { script_remove_adaptor( $in, $out, $options ) }
elsif ( $script eq "rename_keys" ) { script_rename_keys( $in, $out, $options ) }
elsif ( $script eq "uniq_vals" ) { script_uniq_vals( $in, $out, $options ) }
elsif ( $script eq "merge_vals" ) { script_merge_vals( $in, $out, $options ) }
# Returns hash
- my ( %options, @options, $opt, @genomes );
+ my ( %options, @options, $opt, @genomes, $real );
if ( $script eq "print_usage" )
{
save_keys|K=s
);
}
+ elsif ( $script eq "remove_adaptor" )
+ {
+ @options = qw(
+ adaptor|a=s
+ mismatches|m=s
+ no_remove|n
+ offset|o=s
+ );
+ }
elsif ( $script eq "rename_keys" )
{
@options = qw(
# print STDERR Dumper( \%options );
+ $real = "beg|end|word_size|wrap|chunk_size|tile_size|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size";
+
foreach $opt ( keys %options )
{
if ( $opt =~ /stream_in|pattern_in|exact_in/ and not -f $options{ $opt } )
{
Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") );
}
- elsif ( $opt =~ /beg|end|word_size|wrap|chunk_size|tile_size|len|prefix_length|num|skip|cpus|window_size|step_size/ and $options{ $opt } !~ /^\d+$/ )
+ elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ )
{
Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") );
}
}
+sub script_remove_adaptor
+{
+ # Martin A. Hansen, August 2008.
+
+ # Find and remove adaptor from sequences in the stream.
+
+ my ( $in, # handle to in stream
+ $out, # handle to out stream
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( $record, $adaptor, $seq, $adaptor_len, $seq_len, $offset, $max_match, $max_mismatch, $pos );
+
+ $max_mismatch = $options->{ "mismatches" } || 0;
+ $offset = $options->{ "offset" } || 15;
+ $adaptor = $options->{ "adaptor" };
+ $adaptor_len = length $adaptor;
+ $adaptor = [ split //, uc $adaptor ];
+
+ $max_match = $adaptor_len - $max_mismatch;
+
+ while ( $record = get_record( $in ) )
+ {
+ if ( $record->{ "SEQ" } )
+ {
+ $seq = $record->{ "SEQ" };
+ $seq_len = length $seq;
+ $seq = [ split //, uc $seq ];
+
+ $pos = Maasha::Seq::find_adaptor( $adaptor, $seq, $adaptor_len, $seq_len, $offset, $max_match, $max_mismatch );
+
+ $record->{ "ADAPTOR_POS" } = $pos;
+
+ if ( $pos >= 0 )
+ {
+ if ( not $options->{ "no_remove" } ) {
+ $record->{ "SEQ" } = substr $record->{ "SEQ" }, 0, $pos;
+ }
+
+ put_record( $record, $out );
+ }
+ }
+ else
+ {
+ put_record( $record, $out );
+ }
+ }
+}
+
+
sub script_rename_keys
{
# Martin A. Hansen, August 2007.
return $word;
}
+
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> ADAPTOR LOCATING <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+
+sub find_adaptor
+{
+ # Martin A. Hansen & Selene Fernandez, August 2008
+
+ # Locates an adaptor sequence in a given sequence
+ # starting from a given offset position allowing a given
+ # number of mismatches (no indels).
+
+ my ( $adaptor, # list of chars
+ $tag, # list of chars
+ $adaptor_len, # length of adaptor sequence
+ $tag_len, # length of tag sequence
+ $tag_offset, # offset position
+ $max_match, # number of matches indicating success
+ $max_mismatch, # number of mismatches
+ ) = @_;
+
+ # Returns an integer.
+
+ my ( $i, $j, $match, $mismatch );
+
+ $i = $tag_offset;
+
+ while ( $i < $tag_len - ( $max_match + $max_mismatch ) + 1 )
+ {
+ $j = 0;
+
+ while ( $j < $adaptor_len - ( $max_match + $max_mismatch ) + 1 )
+ {
+ return $i if check_diag( $adaptor, $tag, $adaptor_len, $tag_len, $j, $i, $max_match, $max_mismatch );
+
+ $j++
+ }
+
+ $i++;
+ }
+
+ return -1;
+}
+
+
+sub check_diag
+{
+ # Martin A. Hansen & Selene Fernandez, August 2008
+
+ # Checks the diagonal starting at a given coordinate
+ # of a search space constituted by an adaptor and a tag sequence.
+ # Residues in the diagonal are compared between the sequences allowing
+ # for a given number of mismatches. We terminate when search spaca is
+ # exhausted or if max matches or mismatches is reached.
+
+ my ( $adaptor,
+ $tag,
+ $adaptor_len,
+ $tag_len,
+ $adaptor_beg,
+ $tag_beg,
+ $max_match,
+ $max_mismatch,
+ ) = @_;
+
+ # Returns boolean.
+
+ my ( $match, $mismatch );
+
+ $match = 0;
+ $mismatch = 0;
+
+ while ( $adaptor_beg <= $adaptor_len and $tag_beg <= $tag_len )
+ {
+ if ( $adaptor->[ $adaptor_beg ] eq $tag->[ $tag_beg ] )
+ {
+ $match++;
+
+ return 1 if $match >= $max_match;
+ }
+ else
+ {
+ $mismatch++;
+
+ return 0 if $mismatch > $max_mismatch;
+ }
+
+ $adaptor_beg++;
+ $tag_beg++;
+ }
+
+ return 0;
+}
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<