From cd6541f8b0c481181ebcf55be5aecf82001744f1 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Wed, 3 Mar 2010 07:57:08 +0000 Subject: [PATCH] added find_gaps git-svn-id: http://biopieces.googlecode.com/svn/trunk@887 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/find_gaps | 136 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) create mode 100755 bp_bin/find_gaps diff --git a/bp_bin/find_gaps b/bp_bin/find_gaps new file mode 100755 index 0000000..b39b22b --- /dev/null +++ b/bp_bin/find_gaps @@ -0,0 +1,136 @@ +#!/usr/bin/env perl + +# Copyright (C) 2007-2010 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Find stretches of N's in sequences from the stream. 

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


use warnings;
use strict;
use Maasha::Biopieces;


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


my ( $options, $in, $out, $record, $gaps, $gap, $seq_id );

# One optional switch: --min_len / -m, the minimum number of consecutive
# N's that counts as a gap (default 5, the value 0 is disallowed).
$options = Maasha::Biopieces::parse_options(
    [
        { long => 'min_len', short => 'm', type => 'uint', mandatory => 'no', default => 5, allowed => undef, disallowed => 0 },
    ]
);

$in  = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );

while ( $record = Maasha::Biopieces::get_record( $in ) )
{
    if ( exists $record->{ 'SEQ' } )
    {
        $gaps = find_gaps( $record->{ 'SEQ' }, $options->{ 'min_len' } );

        # Name the gaps after the sequence they were found in. Fall back to
        # SEQ_NAME only when S_ID is missing or empty, so that a legitimate
        # id of "0" is kept (a plain || would discard it).
        $seq_id = $record->{ 'S_ID' };
        $seq_id = $record->{ 'SEQ_NAME' } if not defined $seq_id or $seq_id eq '';

        # One record per gap is emitted ahead of the sequence record itself.
        foreach $gap ( @{ $gaps } )
        {
            $gap->{ 'S_ID' } = $seq_id;

            Maasha::Biopieces::put_record( $gap, $out );
        }
    }

    # Every incoming record is passed on unchanged, with or without SEQ.
    Maasha::Biopieces::put_record( $record, $out );
}

Maasha::Biopieces::close_stream( $in );
Maasha::Biopieces::close_stream( $out );


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


sub find_gaps
{
    # Martin A. Hansen, March 2010.

    # Find every maximal stretch of N's (upper or lower case) in a
    # sequence that is at least a given minimum long, and return a
    # list of intervals with these.
    #
    # Each interval is a hash with two keys:
    #   S_BEG - 0-based offset of the first N in the stretch.
    #   S_END - 0-based offset of the last N in the stretch (inclusive).
    #
    # An undefined or empty sequence gives no intervals. A minimum that
    # is undefined or below 1 is treated as 1; searching for a stretch
    # of zero N's would otherwise never terminate.

    my ( $seq,   # Sequence
         $min,   # Minimum stretch length
       ) = @_;

    # Returns a list in list context, otherwise a reference to the list.

    my ( @gaps );

    $min = 1 if not defined $min or $min < 1;

    if ( defined $seq )
    {
        # Scan run by run. The length test is done here and not with a
        # {min,} quantifier, so very large minimums cannot hit a regex
        # quantifier limit. $-[ 0 ] is the offset where the current match
        # starts, $+[ 0 ] the offset just past its end.
        while ( $seq =~ /[Nn]+/g )
        {
            next if $+[ 0 ] - $-[ 0 ] < $min;

            push @gaps, {
                S_BEG => $-[ 0 ],
                S_END => $+[ 0 ] - 1,
            };
        }
    }

    return wantarray ? @gaps : \@gaps;
}


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


# NOTE(review): status_set/status_log presumably bracket the run for
# Biopieces status bookkeeping - confirm against Maasha::Biopieces.

BEGIN
{
    Maasha::Biopieces::status_set();
}


END
{
    Maasha::Biopieces::status_log();
}


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


__END__
-- 
2.39.5