3 # Copyright (C) 2007-2009 Martin A. Hansen.
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
22 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
24 # Cluster sequences in the stream with CD-HIT (cdhit or cdhit-est) and add a CLUSTER key to clustered records.
26 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
33 use Maasha::Biopieces;
39 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# Main script flow, as far as the visible statements show:
#   1. Parse command line options.
#   2. Read records from the input stream; records that convert to FASTA are
#      written to cluster.fasta, and records are written to cluster.stream.
#   3. Run cdhit or cdhit-est on cluster.fasta.
#   4. Parse the resulting cluster.out.clstr file.
#   5. Re-read cluster.stream, add a CLUSTER key where the record's SEQ_NAME is
#      found among the parsed clusters, and emit records to the output stream.
# NOTE(review): brace-only lines, any 'else' lines and the end of the
# parse_options() call are not visible in this view, so nesting described
# below is presumed from statement order - confirm against the full file.
# NOTE(review): $fh is declared here but not used in any visible statement.
42 my ( $options, $in, $out, $tmp_dir, $tmp_fh1, $tmp_fh2, $fh, $record, $entry, $type, @args, $arg_str, $clusters );
44 $options = Maasha::Biopieces::parse_options(
# identity is forwarded to the clustering program as "-c" (see @args below).
46 { long => 'identity', short => 'i', type => 'float', mandatory => 'no', default => "0.9", allowed => undef, disallowed => undef },
# word_size is forwarded as "-n"; a value of 0 is disallowed.
47 { long => 'word_size', short => 'w', type => 'uint', mandatory => 'no', default => 7, allowed => undef, disallowed => 0 },
# fast_clust is a flag; when it is NOT given, "-g 1" is added to the command.
48 { long => 'fast_clust', short => 'f', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
# The 'stream_in', 'stream_out' and 'verbose' keys used below are not declared
# in the visible option list - presumably supplied by parse_options(); verify.
52 $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
53 $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
# Two temporary files: FASTA input for the clustering program, and a copy of
# the record stream that is re-read once cluster numbers are known.
55 $tmp_dir = Maasha::Biopieces::get_tmpdir();
56 $tmp_fh1 = Maasha::Filesys::file_write_open( "$tmp_dir/cluster.fasta" );
57 $tmp_fh2 = Maasha::Filesys::file_write_open( "$tmp_dir/cluster.stream" );
59 while ( $record = Maasha::Biopieces::get_record( $in ) )
# Only records for which biopiece2fasta() returns a true value are written as FASTA.
61 if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
# The sequence type is guessed once, from the first record reaching this
# statement, and reused afterwards.
63 $type = Maasha::Seq::seq_guess_type( $record->{ 'SEQ' } ) if not $type;
65 Maasha::Fasta::put_entry( $entry, $tmp_fh1 );
# Presumably executed for every record (outside the 'if' above) so that
# non-sequence records are passed through later - TODO confirm nesting.
68 Maasha::Biopieces::put_record( $record, $tmp_fh2 );
# NOTE(review): no close of $tmp_fh1 / $tmp_fh2 is visible before the external
# program runs and before cluster.stream is reopened for reading; confirm the
# unseen lines close (flush) both handles.
# Build the command line arguments for the clustering program.
75 push @args, "-i $tmp_dir/cluster.fasta";
76 push @args, "-o $tmp_dir/cluster.out";
77 push @args, "-n $options->{ 'word_size' }";
78 push @args, "-c $options->{ 'identity' }";
79 push @args, "-g 1" if not $options->{ 'fast_clust' };
# Unless verbose is set, the program's stdout and stderr are discarded.
80 push @args, "> /dev/null 2>&1" if not $options->{ 'verbose' };
# The arguments are joined into one string that includes shell redirection, so
# Maasha::Common::run() presumably hands it to a shell - verify. Paths are
# interpolated unquoted; a $tmp_dir containing spaces would break the command.
82 $arg_str = join " ", @args;
# NOTE(review): if no record produced a FASTA entry, $type is still undef here
# and the match operates on an undefined value - confirm this case is handled.
84 if ( $type =~ /protein/i ) {
85 Maasha::Common::run( "cdhit", $arg_str );
# Presumably the 'else' branch for non-protein sequences (else line not visible).
87 Maasha::Common::run( "cdhit-est", $arg_str );
# The clustering program's cluster listing is read from "<output>.clstr".
# Assigned to a scalar, so parse_clusters() returns a hash reference here.
90 $clusters = parse_clusters( "$tmp_dir/cluster.out.clstr" );
# $tmp_fh2 is reused, now as a read handle on the saved record stream.
92 $tmp_fh2 = Maasha::Filesys::file_read_open( "$tmp_dir/cluster.stream" );
94 while ( $record = Maasha::Biopieces::get_record( $tmp_fh2 ) )
# Records whose SEQ_NAME is a key in the parsed clusters get a CLUSTER value.
96 if ( exists $clusters->{ $record->{ 'SEQ_NAME' } } ) {
97 $record->{ 'CLUSTER' } = $clusters->{ $record->{ 'SEQ_NAME' } };
# Presumably emitted for every record, clustered or not - TODO confirm nesting.
100 Maasha::Biopieces::put_record( $record, $out );
103 Maasha::Biopieces::close_stream( $in );
104 Maasha::Biopieces::close_stream( $out );
107 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
112 # Martin A. Hansen, January 2010.
114 # Parses a CD-hit cluster file and returns a hash with
115 # sequence name as key and cluster number as value.
# Body of parse_clusters (the name is taken from the call site in the main
# script; the 'sub' line, brace lines and some statements are not visible in
# this view).
#
# Argument:
#   $file - path of the cluster file to parse.
# Returns:
#   the %clusters hash in list context, a reference to it in scalar context.
117 my ( $file, # cluster file
122 my ( $block, $fh, @lines, $line, %clusters, $seq_name, $cluster );
126 $fh = Maasha::Filesys::file_read_open( $file );
# Each read yields one block that is then split into several lines, so the
# input record separator is presumably changed in an unseen statement so that
# one block corresponds to one cluster - TODO confirm.
128 while ( $block = <$fh> )
132 @lines = split "\n", $block;
# The first line of a block is the cluster header; the remaining lines are
# treated as the cluster's members.
134 $cluster = shift @lines;
# Reduce the header (optional '>' followed by "Cluster N") to the number N.
136 $cluster =~ s/>?Cluster (\d+)/$1/;
138 foreach $line ( @lines )
# Member lines are recognised by a '>' followed by the sequence name.
# NOTE(review): the capture is greedy to the end of the line; if member lines
# carry trailing text after the name (CD-HIT normally appends "..." and more),
# it would be captured too unless an unseen statement trims it - verify.
140 if ( $line =~ />(.*)/ )
# $seq_name is presumably assigned from the capture above in a line that is
# not visible here - TODO confirm.
144 $clusters{ $seq_name } = $cluster;
# NOTE(review): no close of $fh is visible before returning - confirm.
# Context-sensitive return: flat hash in list context, hashref otherwise.
151 return wantarray ? %clusters : \%clusters;
155 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
# Biopieces status bookkeeping: status_set() and status_log() are called with
# no arguments. The enclosing block lines are not visible in this view -
# presumably a BEGIN block and an END block respectively; TODO confirm.
160 Maasha::Biopieces::status_set();
166 Maasha::Biopieces::status_log();
170 # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<