]> git.donarmstrong.com Git - biopieces.git/commitdiff
upgraded read_fastq and added tests
authormartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Tue, 12 Oct 2010 09:16:31 +0000 (09:16 +0000)
committermartinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Tue, 12 Oct 2010 09:16:31 +0000 (09:16 +0000)
git-svn-id: http://biopieces.googlecode.com/svn/trunk@1107 74ccb610-7750-0410-82ae-013aeee3265d

bp_bin/read_fastq
bp_bin/trim_seq
bp_test/in/read_fastq.in [new file with mode: 0644]
bp_test/in/read_fastq.in.gz [new file with mode: 0644]
bp_test/out/read_fastq.out.1 [new file with mode: 0644]
bp_test/out/read_fastq.out.2 [new file with mode: 0644]
bp_test/test/test_read_fastq [new file with mode: 0755]
code_ruby/Maasha/lib/seq.rb

index eb926618efdeae2dff7bd5ddc50fab0fd1771b43..cb114ff0a707dd8d6873feb79f07dcf415110e1f 100755 (executable)
@@ -1,6 +1,6 @@
-#!/usr/bin/env perl
+#!/usr/bin/env ruby
 
-# Copyright (C) 2007-2009 Martin A. Hansen.
+# Copyright (C) 2007-2010 Martin A. Hansen.
 
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
 
 # http://www.gnu.org/copyleft/gpl.html
 
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-# Read FASTQ entries from one or more files.
-
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
+# This program is part of the Biopieces framework (www.biopieces.org).
 
-use warnings;
-use strict;
-use Maasha::Biopieces;
-use Maasha::Filesys;
-use Maasha::Fastq;
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
+# Read FASTQ entries from one or more files.
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
 
+require 'biopieces'
+require 'fastq'
 
-my ( $options, $in, $out, $record, $data_in, $num, $entry );
-
-$options = Maasha::Biopieces::parse_options(
-    [
-        { long => 'data_in',     short => 'i', type => 'files!', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
-        { long => 'num',         short => 'n', type => 'uint',   mandatory => 'no', default => undef, allowed => undef, disallowed => 0     },
-        { long => 'convert2dec', short => 'c', type => 'flag',   mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
-        { long => 'cutoff',      short => 'C', type => 'int',    mandatory => 'no', default => 20,    allowed => undef, disallowed => undef },
-        { long => 'soft_mask',   short => 's', type => 'flag',   mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
-    ]   
-);
-
-$in  = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
-$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
+casts = []
+casts << {:long=>'data_in', :short=>'i', :type=>'files!', :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>nil}
+casts << {:long=>'num',     :short=>'n', :type=>'uint',   :mandatory=>false, :default=>nil, :allowed=>nil, :disallowed=>'0'}
 
-while ( $record = Maasha::Biopieces::get_record( $in ) ) {
-    Maasha::Biopieces::put_record( $record, $out );
-}
+PHRED_SCORES = Regexp.new('[!"#$%&\'()*+,-./0123456789:]')
 
-if ( $options->{ 'data_in' } )
-{
-    $data_in = Maasha::Filesys::files_read_open( $options->{ 'data_in' } );
+bp = Biopieces.new
 
-    $num = 1;
+options = bp.parse(ARGV, casts)
 
-    while ( $entry = Maasha::Fastq::get_entry( $data_in ) ) 
-    {
-        if ( $record = Maasha::Fastq::fastq2biopiece( $entry ) )
-        {
-            Maasha::Fastq::softmask_phred_str( $record->{ 'SEQ' }, $record->{ 'SCORES' }, $options->{ 'cutoff' } ) if $options->{ 'soft_mask' };
-            $record->{ 'SCORES' } = Maasha::Fastq::phred_str2dec_str( $record->{ 'SCORES' } ) if $options->{ 'convert2dec' };
-
-            Maasha::Biopieces::put_record( $record, $out );
-        }
-
-        last if $options->{ "num" } and $num == $options->{ "num" };
-
-        $num++;
-    }
-
-    close $data_in;
-}
-
-Maasha::Biopieces::close_stream( $in );
-Maasha::Biopieces::close_stream( $out );
-
-
-# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+bp.each_record do |record|
+  bp.puts record
+end
 
+num  = 0
+last = false
 
-BEGIN
-{
-    Maasha::Biopieces::status_set();
-}
+if options.has_key? :data_in
+  options[:data_in].each do |file|
+    Fastq.open(file, mode='r') do |fastq|
+      fastq.each do |entry|
+        entry.convert! if entry.qual.match PHRED_SCORES
+        bp.puts entry.to_bp
+        num += 1
 
+        if options.has_key? :num and options[:num] == num
+          last = true
+          break
+        end
+      end
+    end
 
-END
-{
-    Maasha::Biopieces::status_log();
-}
+    break if last
+  end
+end
 
 
 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
index 4cda52483cd036d6de09be3146e22aa3df502d42..2abaa864a3b3b0aeefbb3a62e44e7e4a9cdf50d7 100755 (executable)
@@ -60,8 +60,8 @@ while ( $record = Maasha::Biopieces::get_record( $in ) )
         {
             $pos_r = Maasha::Fastq::trim_right( $record->{ 'SCORES' }, $options->{ 'min' } );
         
-            $record->{ 'SEQ' }       = substr $record->{ 'SEQ' }, 0, $pos_r + 1;
-            $record->{ 'SCORES' }    = substr $record->{ 'SCORES' }, 0, $pos_r + 1; 
+            $record->{ 'SEQ' }        = substr $record->{ 'SEQ' }, 0, $pos_r + 1;
+            $record->{ 'SCORES' }     = substr $record->{ 'SCORES' }, 0, $pos_r + 1; 
             $record->{ 'TRIM_RIGHT' } = $pos_r;
         }
 
diff --git a/bp_test/in/read_fastq.in b/bp_test/in/read_fastq.in
new file mode 100644 (file)
index 0000000..77726be
--- /dev/null
@@ -0,0 +1,12 @@
+@sanger
+aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
++
+!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHI
+@solexa
+bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
++
+;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
+@illumina1.3
+ccccccccccccccccccccccccccccccccccccccccc
++
+@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
diff --git a/bp_test/in/read_fastq.in.gz b/bp_test/in/read_fastq.in.gz
new file mode 100644 (file)
index 0000000..8fd4122
Binary files /dev/null and b/bp_test/in/read_fastq.in.gz differ
diff --git a/bp_test/out/read_fastq.out.1 b/bp_test/out/read_fastq.out.1
new file mode 100644 (file)
index 0000000..4944eb4
--- /dev/null
@@ -0,0 +1,15 @@
+SEQ_NAME: sanger
+SEQ: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+SEQ_LEN: 41
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
+---
+SEQ_NAME: solexa
+SEQ: bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
+SEQ_LEN: 46
+SCORES: ;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
+---
+SEQ_NAME: illumina1.3
+SEQ: ccccccccccccccccccccccccccccccccccccccccc
+SEQ_LEN: 41
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
+---
diff --git a/bp_test/out/read_fastq.out.2 b/bp_test/out/read_fastq.out.2
new file mode 100644 (file)
index 0000000..11fa8ee
--- /dev/null
@@ -0,0 +1,5 @@
+SEQ_NAME: sanger
+SEQ: aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
+SEQ_LEN: 41
+SCORES: @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
+---
diff --git a/bp_test/test/test_read_fastq b/bp_test/test/test_read_fastq
new file mode 100755 (executable)
index 0000000..82b220c
--- /dev/null
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+source "$BP_DIR/bp_test/lib/test.sh"
+
+run "$bp -i $in -O $tmp"
+assert_no_diff $tmp $out.1
+clean
+
+run "$bp -i $in.gz -O $tmp"
+assert_no_diff $tmp $out.1
+clean
+
+run "$bp -i $in -n 1 -O $tmp"
+assert_no_diff $tmp $out.2
+clean
index 2626ddcb12cf100eee97684cebdb48168a815e58..b7cc44bbdf5d2e2d0a634d6fb18debc1b9f36727 100644 (file)
@@ -3,6 +3,10 @@ DNA     = %w[a t c g]
 RNA     = %w[a u c g]
 PROTEIN = %w[f l s y c w p h q r i m t n k v a d e g]
 
+# Quality scores bases
+SCORE_PHRED  = 33
+SCORE_SOLEXA = 64
+
 # Error class for all exceptions to do with Seq.
 class SeqError < StandardError; end
 
@@ -67,6 +71,7 @@ class Seq
     record[:SEQ_NAME] = self.seq_name
     record[:SEQ]      = self.seq
     record[:SEQ_LEN]  = self.length
+    record[:SCORES]   = self.qual if self.qual
     record
   end
 
@@ -135,6 +140,17 @@ class Seq
     self.type = type.downcase
     seq_new
   end
+
+  # Method to convert the quality scores from a specified base
+  # to another base.
+  def convert!
+    self.qual.gsub! /./ do |score|
+      score_phred  = score.ord - SCORE_PHRED
+      raise SeqError, "Bad Phred score: #{score} (#{score_phred})" unless (0 .. 40).include? score_phred
+      score_solexa = score_phred + SCORE_SOLEXA
+      score        = score_solexa.chr
+    end
+  end
 end
 
 # Error class for all exceptions to do with Digest.