# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-my ( $options, $in, $out, $tmp_dir, $tmp_fh1, $tmp_fh2, $fh, $record, $entry, $type, @args, $arg_str, $clusters );
+my ( $options, $in, $out, $tmp_dir, $tmp_fh1, $tmp_fh2, $fh, $record, $entry, @args, $arg_str, $clusters );
+
+$tmp_dir = Maasha::Biopieces::get_tmpdir();
$options = Maasha::Biopieces::parse_options(
[
- { long => 'identity', short => 'i', type => 'float', mandatory => 'no', default => "0.9", allowed => undef, disallowed => undef },
- { long => 'word_size', short => 'w', type => 'uint', mandatory => 'no', default => 7, allowed => undef, disallowed => 0 },
- { long => 'fast_clust', short => 'f', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+ { long => 'identity', short => 'i', type => 'float', mandatory => 'no', default => "0.9", allowed => undef, disallowed => undef },
+ { long => 'word_size', short => 'w', type => 'uint', mandatory => 'no', default => 7, allowed => undef, disallowed => 0 },
+ { long => 'fast_clust', short => 'f', type => 'flag', mandatory => 'no', default => undef, allowed => undef, disallowed => undef },
+
+ { long => 'tmp_dir', short => 't', type => 'dir!', mandatory => 'no', default => $tmp_dir, allowed => undef, disallowed => undef },
]
);
$in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } );
$out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } );
-$tmp_dir = Maasha::Biopieces::get_tmpdir();
$tmp_fh1 = Maasha::Filesys::file_write_open( "$tmp_dir/cluster.fasta" );
$tmp_fh2 = Maasha::Filesys::file_write_open( "$tmp_dir/cluster.stream" );
{
if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) )
{
- $type = Maasha::Seq::seq_guess_type( $record->{ 'SEQ' } ) if not $type;
-
Maasha::Fasta::put_entry( $entry, $tmp_fh1 );
}
close $tmp_fh1;
close $tmp_fh2;
-push @args, "-d 120";
-push @args, "-i $tmp_dir/cluster.fasta";
-push @args, "-o $tmp_dir/cluster.out";
-push @args, "-n $options->{ 'word_size' }";
-push @args, "-c $options->{ 'identity' }";
-push @args, "-g 1" if not $options->{ 'fast_clust' };
+push @args, "--ucluster $tmp_dir/cluster.fasta";
+push @args, "--id $options->{ 'identity' }";
+push @args, "--tmpdir $options->{ 'tmp_dir' }";
+push @args, "--output $tmp_dir/cluster.uc";
+push @args, "--quiet" if not $options->{ 'verbose' };
push @args, "> /dev/null 2>&1" if not $options->{ 'verbose' };
$arg_str = join " ", @args;
-if ( $type =~ /protein/i ) {
- Maasha::Common::run( "cdhit", $arg_str );
-} else {
- Maasha::Common::run( "cdhit-est", $arg_str );
-}
+Maasha::Common::run( "uclust", $arg_str );
-$clusters = parse_clusters( "$tmp_dir/cluster.out.clstr" );
+$clusters = parse_clusters( "$tmp_dir/cluster.uc" );
$tmp_fh2 = Maasha::Filesys::file_read_open( "$tmp_dir/cluster.stream" );
{
# Martin A. Hansen, January 2010.
- # Parses a CD-hit cluster file and returns a hash with
+ # Parses a uclust uc cluster file and returns a hash with
# sequence name as key and cluster number as value.
my ( $file, # cluster file
# Returns a hash.
- my ( $block, $fh, @lines, $line, %clusters, $seq_name, $cluster );
-
- local $/ = "\n>";
+ my ( $fh, $line, %clusters, $seq_name, $cluster );
$fh = Maasha::Filesys::file_read_open( $file );
- while ( $block = <$fh> )
+ while ( $line = <$fh> )
{
- chomp $block;
-
- @lines = split "\n", $block;
-
- $cluster = shift @lines;
-
- $cluster =~ s/>?Cluster (\d+)/$1/;
+ chomp $line;
- foreach $line ( @lines )
- {
- if ( $line =~ />(.*)/ )
- {
- $seq_name = $1;
+ ( $cluster, undef, undef, undef, $seq_name ) = split "\t", $line;
- $clusters{ $seq_name } = $cluster;
- }
- }
+ $clusters{ $seq_name } = $cluster;
}
close $fh;