From daa66aef9f3627fe925b490dc460fa99bc94445a Mon Sep 17 00:00:00 2001 From: martinahansen Date: Wed, 13 Jan 2010 15:03:26 +0000 Subject: [PATCH] changed engine in cluster_seq to uclust git-svn-id: http://biopieces.googlecode.com/svn/trunk@834 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/cluster_seq | 60 ++++++++++++++++------------------------------ 1 file changed, 21 insertions(+), 39 deletions(-) diff --git a/bp_bin/cluster_seq b/bp_bin/cluster_seq index 12301c2..26037e1 100755 --- a/bp_bin/cluster_seq +++ b/bp_bin/cluster_seq @@ -39,20 +39,23 @@ use Maasha::Filesys; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -my ( $options, $in, $out, $tmp_dir, $tmp_fh1, $tmp_fh2, $fh, $record, $entry, $type, @args, $arg_str, $clusters ); +my ( $options, $in, $out, $tmp_dir, $tmp_fh1, $tmp_fh2, $fh, $record, $entry, @args, $arg_str, $clusters ); + +$tmp_dir = Maasha::Biopieces::get_tmpdir(); $options = Maasha::Biopieces::parse_options( [ - { long => 'identity', short => 'i', type => 'float', mandatory => 'no', default => "0.9", allowed => undef, disallowed => undef }, - { long => 'word_size', short => 'w', type => 'uint', mandatory => 'no', default => 7, allowed => undef, disallowed => 0 }, - { long => 'fast_clust', short => 'f', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + { long => 'identity', short => 'i', type => 'float', mandatory => 'no', default => "0.9", allowed => undef, disallowed => undef }, + { long => 'word_size', short => 'w', type => 'uint', mandatory => 'no', default => 7, allowed => undef, disallowed => 0 }, + { long => 'fast_clust', short => 'f', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef }, + + { long => 'tmp_dir', short => 't', type => 'dir!', mandatory => 'no', default => $tmp_dir, allowed => undef, disallowed => undef }, ] ); $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); -$tmp_dir = Maasha::Biopieces::get_tmpdir(); $tmp_fh1 = Maasha::Filesys::file_write_open( "$tmp_dir/cluster.fasta" ); $tmp_fh2 = Maasha::Filesys::file_write_open( "$tmp_dir/cluster.stream" ); @@ -60,8 +63,6 @@ while ( $record = Maasha::Biopieces::get_record( $in ) ) { if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { - $type = Maasha::Seq::seq_guess_type( $record->{ 'SEQ' } ) if not $type; - Maasha::Fasta::put_entry( $entry, $tmp_fh1 ); } @@ -71,23 +72,18 @@ while ( $record = Maasha::Biopieces::get_record( $in ) ) close $tmp_fh1; close $tmp_fh2; -push @args, "-d 120"; -push @args, "-i $tmp_dir/cluster.fasta"; -push @args, "-o $tmp_dir/cluster.out"; -push @args, "-n $options->{ 'word_size' }"; -push @args, "-c $options->{ 'identity' }"; -push @args, "-g 1" if not $options->{ 'fast_clust' }; +push @args, "--ucluster $tmp_dir/cluster.fasta"; +push @args, "--id $options->{ 'identity' }"; +push @args, "--tmpdir $options->{ 'tmp_dir' }"; +push @args, "--output $tmp_dir/cluster.uc"; +push @args, "--quiet" if not $options->{ 'verbose' }; push @args, "> /dev/null 2>&1" if not $options->{ 'verbose' }; $arg_str = join " ", @args; -if ( $type =~ /protein/i ) { - Maasha::Common::run( "cdhit", $arg_str ); -} else { - Maasha::Common::run( "cdhit-est", $arg_str ); -} +Maasha::Common::run( "uclust", $arg_str ); -$clusters = parse_clusters( "$tmp_dir/cluster.out.clstr" ); +$clusters = parse_clusters( "$tmp_dir/cluster.uc" ); $tmp_fh2 = Maasha::Filesys::file_read_open( "$tmp_dir/cluster.stream" ); @@ -111,7 +107,7 @@ sub parse_clusters { # Martin A. Hansen, January 2010. - # Parses a CD-hit cluster file and returns a hash with + # Parses a uclust uc cluster file and returns a hash with # sequence name as key and cluster number as value. my ( $file, # cluster file @@ -119,31 +115,17 @@ sub parse_clusters # Returns a hash. - my ( $block, $fh, @lines, $line, %clusters, $seq_name, $cluster ); - - local $/ = "\n>"; + my ( $fh, $line, %clusters, $seq_name, $cluster ); $fh = Maasha::Filesys::file_read_open( $file ); - while ( $block = <$fh> ) + while ( $line = <$fh> ) { - chomp $block; - - @lines = split "\n", $block; - - $cluster = shift @lines; - - $cluster =~ s/>?Cluster (\d+)/$1/; + chomp $line; - foreach $line ( @lines ) - { - if ( $line =~ />(.*)/ ) - { - $seq_name = $1; + ( $cluster, undef, undef, undef, $seq_name ) = split "\t", $line; - $clusters{ $seq_name } = $cluster; - } - } + $clusters{ $seq_name } = $cluster; } close $fh; -- 2.39.2