From d2b48a034defcacac6e9134894560818b2790426 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Wed, 30 Jun 2010 14:29:50 +0000 Subject: [PATCH] added trim_seq git-svn-id: http://biopieces.googlecode.com/svn/trunk@1006 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/trim_seq | 105 ++++++++++++++++++++++++++++++ code_perl/Maasha/Fastq.pm | 40 ++++++++++++ code_ruby/Maasha/lib/biopieces.rb | 6 +- 3 files changed, 148 insertions(+), 3 deletions(-) create mode 100755 bp_bin/trim_seq diff --git a/bp_bin/trim_seq b/bp_bin/trim_seq new file mode 100755 index 0000000..4cda524 --- /dev/null +++ b/bp_bin/trim_seq @@ -0,0 +1,105 @@ +#!/usr/bin/env perl + +# Copyright (C) 2007-2010 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# Trim sequence ends for residues with a low quality score. + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use warnings; +use strict; +use Maasha::Biopieces; +use Maasha::Fastq; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $options, $in, $out, $record, $left, $right, $pos_l, $pos_r ); + +$options = Maasha::Biopieces::parse_options( + [ + { long => 'min', short => 'm', type => 'uint', mandatory => 'no', default => 15, allowed => undef, disallowed => 0 }, + { long => 'trim', short => 't', type => 'string', mandatory => 'no', default => 'both', allowed => 'left,right,both', disallowed => undef }, + ] +); + +$right = 1 if $options->{ 'trim' } eq 'right'; +$left = 1 if $options->{ 'trim' } eq 'left'; +$right = 1 if $options->{ 'trim' } eq 'both'; +$left = 1 if $options->{ 'trim' } eq 'both'; + +$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); +$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + +while ( $record = Maasha::Biopieces::get_record( $in ) ) +{ + if ( exists $record->{ 'SEQ' } and exists $record->{ 'SCORES' } ) + { + if ( $right ) + { + $pos_r = Maasha::Fastq::trim_right( $record->{ 'SCORES' }, $options->{ 'min' } ); + + $record->{ 'SEQ' } = substr $record->{ 'SEQ' }, 0, $pos_r + 1; + $record->{ 'SCORES' } = substr $record->{ 'SCORES' }, 0, $pos_r + 1; + $record->{ 'TRIM_RIGHT' } = $pos_r; + } + + if ( $left ) + { + $pos_l = Maasha::Fastq::trim_left( $record->{ 'SCORES' }, $options->{ 'min' } ); + + $record->{ 'SEQ' } = substr $record->{ 'SEQ' }, $pos_l; + $record->{ 'SCORES' } = substr $record->{ 'SCORES' }, $pos_l; + $record->{ 'TRIM_LEFT' } = $pos_l; + } + + $record->{ 'SEQ_LEN' } = length $record->{ 'SEQ' }; + } + + Maasha::Biopieces::put_record( $record, $out ); +} + +Maasha::Biopieces::close_stream( $in ); +Maasha::Biopieces::close_stream( $out ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +BEGIN +{ + Maasha::Biopieces::status_set(); +} + + +END +{ + Maasha::Biopieces::status_log(); +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +__END__ diff --git a/code_perl/Maasha/Fastq.pm b/code_perl/Maasha/Fastq.pm index b27017b..5015ef3 100644 --- a/code_perl/Maasha/Fastq.pm +++ b/code_perl/Maasha/Fastq.pm @@ -333,6 +333,46 @@ void softmask_phred_str( char *seq, char *scores, int threshold ) } +int trim_left( char *scores, int min ) +{ + /* Martin A. Hansen, June 2010 */ + + /* Starting from the left in a score string, */ + /* locate the position when the score is above */ + /* a given min.*/ + + int pos = 0; + + while ( pos < strlen( scores ) && solexa2dec( scores[ pos ] ) <= min ) { + pos++; + } + + return pos; +} + + +int trim_right( char *scores, int min ) +{ + /* Martin A. Hansen, June 2010 */ + + /* Starting from the right in a score string, */ + /* locate the position when the score is above */ + /* a given min.*/ + + int len = strlen( scores ); + int pos = len; + + while ( pos > 0 && solexa2dec( scores[ pos ] ) <= min ) { + pos--; + } + + if ( pos == 0 ) { + pos = len; + } + + return pos; +} + END_C diff --git a/code_ruby/Maasha/lib/biopieces.rb b/code_ruby/Maasha/lib/biopieces.rb index 1dfb9fb..47b411d 100644 --- a/code_ruby/Maasha/lib/biopieces.rb +++ b/code_ruby/Maasha/lib/biopieces.rb @@ -315,7 +315,7 @@ class OptionHandler # Given the script name determine the path of the wiki file with the usage info. def wiki_path - path = ENV["BP_DIR"] + "/bp_usage/" + File.basename(@script_path, ".rb") + ".wiki" + path = ENV["BP_DIR"] + "/bp_usage/" + File.basename(@script_path) + ".wiki" raise "No such wiki file: #{path}" unless File.file? path path end @@ -507,7 +507,7 @@ class Status def log(exit_status) time1 = Time.new.strftime("%Y-%m-%d %X") user = ENV["USER"] - script = File.basename($0, ".rb") + script = File.basename($0) stream = File.open(path) time0, args, tmp_dir = stream.first.split(";") @@ -530,7 +530,7 @@ class Status # Path to status file def path user = ENV["USER"] - script = File.basename($0, ".rb") + script = File.basename($0) pid = $$ path = ENV["BP_TMP"] + "/" + [user, script, pid, "status"].join(".") end -- 2.39.2