From 0be23a86e418adce44df86123d17ea5a9552e369 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Tue, 1 Feb 2011 13:55:08 +0000 Subject: [PATCH] added analyze_assembly biopiece git-svn-id: http://biopieces.googlecode.com/svn/trunk@1247 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/analyze_assembly | 82 ++++++++++++++++++++++++++++++ bp_test/in/analyze_assembly.in | 20 ++++++++ bp_test/out/analyze_assembly.out.1 | 27 ++++++++++ bp_test/out/analyze_assembly.out.2 | 7 +++ bp_test/test/test_analyze_assembly | 11 ++++ 5 files changed, 147 insertions(+) create mode 100755 bp_bin/analyze_assembly create mode 100644 bp_test/in/analyze_assembly.in create mode 100644 bp_test/out/analyze_assembly.out.1 create mode 100644 bp_test/out/analyze_assembly.out.2 create mode 100755 bp_test/test/test_analyze_assembly diff --git a/bp_bin/analyze_assembly b/bp_bin/analyze_assembly new file mode 100755 index 0000000..610d7e1 --- /dev/null +++ b/bp_bin/analyze_assembly @@ -0,0 +1,82 @@ +#!/usr/bin/env ruby + +# Copyright (C) 2007-2011 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +# This program is part of the Biopieces framework (www.biopieces.org). 
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# Analyze assembled sequences in the stream and output stats.
#
# Every record read from the stream is written back out unless the no_stream
# flag (-x) is set. The length of each record's :SEQ value is collected, and
# after the last record one summary record is emitted with the keys
# N50, MAX, MIN, MEAN, TOTAL and COUNT (in that order).
#
# If data_out (-o) is given, bp.out is replaced with Stream.write(<file>)
# just before the summary record is emitted.
# NOTE(review): Biopieces and Stream come from the 'biopieces' library, which
# is not part of this script; presumably this redirects the summary record to
# the given file -- confirm against the library.

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<


require 'biopieces'
require 'pp'

# Returns the N50 of +lengths+: walking the lengths from longest to shortest,
# the first length at which the running sum reaches at least half of +total+.
#
# lengths - Array of Integer sequence lengths (may be empty).
# total   - Integer sum of all values in +lengths+.
#
# Returns 0 when +lengths+ is empty.
def n50(lengths, total)
  running = 0

  lengths.sort.reverse_each do |length|
    running += length

    # Integer form of `running >= total * 0.50`, so no Float is involved.
    return length if running * 2 >= total
  end

  0
end

# Builds the summary record for the collected sequence +lengths+.
#
# lengths - Array of Integer sequence lengths (may be empty).
#
# Returns a Hash with the keys :N50, :MAX, :MIN, :MEAN, :TOTAL and :COUNT,
# inserted in that order. An empty +lengths+ yields 0 for every field; the
# previous inline code raised FloatDomainError there, because 0.0 / 0.0 is
# NaN and NaN cannot be converted with to_i.
def assembly_stats(lengths)
  total = lengths.inject(0) { |sum, length| sum + length }
  count = lengths.size

  stats = {}
  stats[:N50]   = n50(lengths, total)
  stats[:MAX]   = lengths.max || 0   # Array#max is nil for an empty array
  stats[:MIN]   = lengths.min || 0
  # Truncating integer division; same result as (total.to_f / count).to_i
  # for non-negative lengths, without the division by zero.
  stats[:MEAN]  = count.zero? ? 0 : total / count
  stats[:TOTAL] = total
  stats[:COUNT] = count
  stats
end

casts = []
casts << {:long=>'no_stream', :short=>'x', :type=>'flag', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
casts << {:long=>'data_out', :short=>'o', :type=>'file', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}

bp      = Biopieces.new
options = bp.parse(ARGV, casts)
lengths = []

bp.each_record do |record|
  bp.puts record unless options[:no_stream]

  # Records without a :SEQ key are passed through but not counted.
  lengths << record[:SEQ].length if record.has_key? :SEQ
end

bp.out = Stream.write(options[:data_out]) if options[:data_out]

bp.puts assembly_stats(lengths)

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<

# NOTE: this script is embedded in a patch file. The text after __END__ is
# the remainder of that patch (the bp_test/in fixture), carried over with its
# line breaks restored. Ruby stops parsing at __END__.

__END__
diff --git a/bp_test/in/analyze_assembly.in b/bp_test/in/analyze_assembly.in
new file mode 100644
index 0000000..b973713
--- /dev/null
+++ b/bp_test/in/analyze_assembly.in
@@ -0,0 +1,20 @@
+SEQ_NAME: test1
+SEQ: ATGCACATTG
+SEQ_LEN: 10
+---
+SEQ_NAME: test2
+SEQ: ATGCACATTGATGCACATTG
+SEQ_LEN: 20
+---
+SEQ_NAME: test3
+SEQ: ATGCACATTGATGCACATTGATGCACATTG
+SEQ_LEN: 30
+---
+SEQ_NAME: test4
+SEQ: ATGCACATTGATGCACATTGATGCACATTGATGCACATTG
+SEQ_LEN: 40
+---
+SEQ_NAME: test5
+SEQ: ATGCACATTGATGCACATTGATGCACATTGATGCACATTGATGCACATTG
+SEQ_LEN: 50
+---
diff --git a/bp_test/out/analyze_assembly.out.1
b/bp_test/out/analyze_assembly.out.1 new file mode 100644 index 0000000..c47d842 --- /dev/null +++ b/bp_test/out/analyze_assembly.out.1 @@ -0,0 +1,27 @@ +SEQ_NAME: test1 +SEQ: ATGCACATTG +SEQ_LEN: 10 +--- +SEQ_NAME: test2 +SEQ: ATGCACATTGATGCACATTG +SEQ_LEN: 20 +--- +SEQ_NAME: test3 +SEQ: ATGCACATTGATGCACATTGATGCACATTG +SEQ_LEN: 30 +--- +SEQ_NAME: test4 +SEQ: ATGCACATTGATGCACATTGATGCACATTGATGCACATTG +SEQ_LEN: 40 +--- +SEQ_NAME: test5 +SEQ: ATGCACATTGATGCACATTGATGCACATTGATGCACATTGATGCACATTG +SEQ_LEN: 50 +--- +N50: 40 +MAX: 50 +MIN: 10 +MEAN: 30 +TOTAL: 150 +COUNT: 5 +--- diff --git a/bp_test/out/analyze_assembly.out.2 b/bp_test/out/analyze_assembly.out.2 new file mode 100644 index 0000000..d10623b --- /dev/null +++ b/bp_test/out/analyze_assembly.out.2 @@ -0,0 +1,7 @@ +N50: 40 +MAX: 50 +MIN: 10 +MEAN: 30 +TOTAL: 150 +COUNT: 5 +--- diff --git a/bp_test/test/test_analyze_assembly b/bp_test/test/test_analyze_assembly new file mode 100755 index 0000000..20dfa3d --- /dev/null +++ b/bp_test/test/test_analyze_assembly @@ -0,0 +1,11 @@ +#!/bin/bash + +source "$BP_DIR/bp_test/lib/test.sh" + +run "$bp -I $in -O $tmp" +assert_no_diff $tmp $out.1 +clean + +run "$bp -I $in -o $tmp -x" +assert_no_diff $tmp $out.2 +clean -- 2.39.2