$options = get_options( $script );
- $script = "print_usage" if ( -t STDIN and not @ARGV and $script ne "list_biopieces" );
+ if ( $script ne "list_biopieces" and $script ne "list_genomes" ) {
+ $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } );
+ }
- $in = read_stream( $options->{ "stream_in" } );
- $out = write_stream( $options->{ "stream_out" } );
+ $in = read_stream( $options->{ "stream_in" } );
+ $out = write_stream( $options->{ "stream_out" } );
if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) }
elsif ( $script eq "list_biopieces" ) { script_list_biopieces( $in, $out, $options ) }
+ elsif ( $script eq "list_genomes" ) { script_list_genomes( $in, $out, $options ) }
elsif ( $script eq "read_fasta" ) { script_read_fasta( $in, $out, $options ) }
elsif ( $script eq "read_tab" ) { script_read_tab( $in, $out, $options ) }
elsif ( $script eq "read_psl" ) { script_read_psl( $in, $out, $options ) }
elsif ( $script eq "read_solexa" ) { script_read_solexa( $in, $out, $options ) }
elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) }
elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) }
+ elsif ( $script eq "format_genome" ) { script_format_genome( $in, $out, $options ) }
elsif ( $script eq "length_seq" ) { script_length_seq( $in, $out, $options ) }
elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) }
elsif ( $script eq "shuffle_seq" ) { script_shuffle_seq( $in, $out, $options ) }
password|p=s
);
}
+ elsif ( $script eq "format_genome" )
+ {
+ @options = qw(
+ no_stream|x
+ dir|d=s
+ genome|g=s
+ formats|f=s
+ );
+ }
elsif ( $script eq "length_seq" )
{
@options = qw(
$options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" };
$options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" };
$options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" };
+ $options{ "formats" } = [ split ",", $options{ "formats" } ] if defined $options{ "formats" };
# ---- check arguments ----
{
Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") );
}
- elsif ( $opt eq "genome" )
+ elsif ( $opt eq "genome" and $script ne "format_genome" )
{
- @genomes = Maasha::Config::genomes();
-
- if ( not grep $options{ $opt }, @genomes ) {
- Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'INST_DIR' }/conf/genomes.conf") );
+ @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
+ map { $_ =~ s/.*\/(.+)$/$1/ } @genomes;
+
+ if ( not grep { $_ =~ /^$options{ $opt }$/ } @genomes ) {
+ Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") );
}
}
elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb)/ )
Maasha::Common::error( qq(both --database and --genome specified) ) if $script eq "blast_seq" and $options{ "genome" } and $options{ "database" };
Maasha::Common::error( qq(no --index_name or --genome specified) ) if $script eq "vmatch_seq" and not $options{ "genome" } and not $options{ "index_name" };
Maasha::Common::error( qq(both --index and --genome specified) ) if $script eq "vmatch_seq" and $options{ "genome" } and $options{ "index" };
- Maasha::Common::error( qq(no --genome specified) ) if $script =~ /get_genome_seq|get_genome_align|get_genome_phastcons|blat_seq|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" };
+ Maasha::Common::error( qq(no --genome specified) ) if $script =~ /format_genome|get_genome_seq|get_genome_align|get_genome_phastcons|blat_seq|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" };
Maasha::Common::error( qq(no --key specified) ) if $script =~ /plot_lendist|plot_histogram/ and not $options{ "key" };
Maasha::Common::error( qq(no --keys speficied) ) if $script =~ /sort_records|count_vals|sum_vals|mean_vals|median_vals|length_vals/ and not $options{ "keys" };
}
+sub script_list_genomes
+{
+ # Martin A. Hansen, January 2008.
+
+ # Prints the synopsis from the usage for each of the biopieces.
+
+ my ( $in, # handle to in stream
+ $out, # handle to out stream
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( @genomes, $genome, @formats, $format, %hash, %found, @row );
+
+ @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" );
+
+ foreach $genome ( @genomes )
+ {
+ next if $genome =~ /\.$/;
+
+ @formats = Maasha::Common::ls_dirs( $genome );
+
+ foreach $format ( @formats )
+ {
+ if ( $format =~ /\/([^\/]+)\/(\w+)$/ )
+ {
+ $hash{ $1 }{ $2 } = 1;
+
+ $found{ $2 } = 1;
+ }
+ }
+ }
+
+ @row = "Genome";
+
+ map { push @row, $_ } sort keys %found;
+
+ print join( "\t", @row ), "\n";
+
+ foreach $genome ( sort keys %hash )
+ {
+ @row = $genome;
+
+ foreach $format ( sort keys %found )
+ {
+ if ( exists $hash{ $genome }{ $format } ) {
+ push @row, "yes";
+ } else {
+ push @row, "no";
+ }
+ }
+
+ print join( "\t", @row ), "\n";
+ }
+}
+
+
sub script_read_fasta
{
# Martin A. Hansen, August 2007.
}
+sub script_format_genome
+{
+ # Martin A. Hansen, Juli 2008.
+
+ # Format a genome to speficed formats.
+
+ my ( $in, # handle to in stream
+ $out, # handle to out stream
+ $options, # options hash
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( $dir, $genome, $fasta_dir, $fh_out, $record, $format, $index );
+
+ $dir = $options->{ 'dir' } || $ENV{ 'BP_DATA' };
+ $genome = $options->{ 'genome' };
+
+ Maasha::Common::error( "Directory: $dir does not exist" ) if not -d $dir;
+ Maasha::Common::dir_create_if_not_exists( "$dir/genomes" );
+ Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome" );
+
+ if ( -f "$dir/genomes/$genome/fasta/$genome.fna" )
+ {
+ $fasta_dir = "$dir/genomes/$genome/fasta";
+ }
+ elsif ( grep { $_ =~ /fasta/i } @{ $options->{ "formats" } } )
+ {
+ Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome/fasta" );
+
+ $fasta_dir = "$dir/genomes/$genome/fasta";
+
+ $fh_out = Maasha::Common::write_open( "$fasta_dir/$genome.fna" );
+ }
+ else
+ {
+ $fasta_dir = $BP_TMP;
+
+ $fh_out = Maasha::Common::write_open( "$fasta_dir/$genome.fna" );
+ }
+
+ while ( $record = get_record( $in ) )
+ {
+ if ( $fh_out and $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) {
+ Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh_out, $options->{ "wrap" } );
+ }
+
+ put_record( $record, $out ) if not $options->{ "no_stream" };
+ }
+
+ foreach $format ( @{ $options->{ 'formats' } } )
+ {
+ if ( $format =~ /^fasta$/i ) { Maasha::Fasta::fasta_index( "$fasta_dir/$genome.fna", "$dir/genomes/$genome/fasta/$genome.index" ) }
+ elsif ( $format =~ /^blast$/i ) { Maasha::NCBI::blast_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/blast", "dna", $genome ) }
+ elsif ( $format =~ /^blat$/i ) { print STDERR "BLAT FORMAT NOT IMPLEMENTED" }
+ elsif ( $format =~ /^vmatch$/i ) { Maasha::Match::vmatch_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/vmatch", $BP_TMP ) }
+ }
+
+ close $fh_out if $fh_out;
+}
+
+
sub script_length_seq
{
# Martin A. Hansen, August 2007.
# Returns nothing.
- my ( $record, $genome_file, $index_file, $index, $fh, $index_head, $index_beg, $index_len, $beg, $len, %lookup_hash, @begs, @lens, $i );
+ my ( $record, $genome, $genome_file, $index_file, $index, $fh, $index_head, $index_beg, $index_len, $beg, $len, %lookup_hash, @begs, @lens, $i );
$options->{ "flank" } ||= 0;
if ( $options->{ "genome" } )
{
- $genome_file = Maasha::Config::genome_fasta( $options->{ 'genome' } );
- $index_file = Maasha::Config::genome_fasta_index( $options->{ 'genome' } );
+ $genome = $options->{ "genome" };
+
+ $genome_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.fna";
+ $index_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.index";
$fh = Maasha::Common::read_open( $genome_file );
$index = Maasha::Fasta::index_retrieve( $index_file );
$patterns = Maasha::Patscan::read_patterns( $options->{ "patterns_in" } );
}
- $genome_file = Maasha::Config::genome_fasta( $options->{ 'genome' } ) if $options->{ 'genome' };
+ $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna" if $options->{ 'genome' };
push @args, "-c" if $options->{ "comp" };
push @args, "-m $options->{ 'max_hits' }" if $options->{ 'max_hits' };
$options->{ "filter" } = "T" if $options->{ "filter" };
$options->{ "cpus" } ||= 1;
- $options->{ "database" } = Maasha::Config::genome_blast( $options->{ 'genome' } ) if $options->{ 'genome' };
+ $options->{ "database" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/blast/$options->{ 'genome' }.fna"if $options->{ 'genome' };
$tmp_in = "$BP_TMP/blast_query.seq";
$tmp_out = "$BP_TMP/blast.result";
my ( $blat_args, $genome_file, $query_file, $fh_in, $fh_out, $type, $record, $result_file, $entries );
- $genome_file = Maasha::Config::genome_fasta( $options->{ "genome" } );
+ $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna";
$options->{ 'tile_size' } ||= 11;
$options->{ 'one_off' } ||= 0;
$blat_args .= " -minIdentity=$options->{ 'min_identity' }";
$blat_args .= " -minScore=$options->{ 'min_score' }";
$blat_args .= " -stepSize=$options->{ 'step_size' }";
- $blat_args .= " -ooc=" . Maasha::Config::genome_blat_ooc( $options->{ "genome" }, 11 ) if $options->{ 'ooc' };
+# $blat_args .= " -ooc=" . Maasha::Config::genome_blat_ooc( $options->{ "genome" }, 11 ) if $options->{ 'ooc' };
$query_file = "$BP_TMP/blat.seq";
# Returns nothing.
- my ( @index_files, @records, $result_file, $fh_in, $record );
+ my ( @index_files, @records, $result_file, $fh_in, $record, %hash );
$options->{ 'count' } = 1 if $options->{ 'max_hits' };
- if ( $options->{ "index_name" } ) {
+ if ( $options->{ "index_name" } )
+ {
@index_files = $options->{ "index_name" };
- } else {
- @index_files = Maasha::Config::genome_vmatch( $options->{ "genome" } );
+ }
+ else
+ {
+ @index_files = Maasha::Common::ls_files( "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/vmatch" );
+
+ map { $_ =~ /^(.+)\.[a-z1]{3,4}$/; $hash{ $1 } = 1 } @index_files;
+
+ @index_files = sort keys %hash;
}
while ( $record = get_record( $in ) )
use Maasha::Common;
use Maasha::Fasta;
use Maasha::Seq;
-use Maasha::Berkeley_DB;
use vars qw ( @ISA @EXPORT );
+use constant {
+ SEQ_NAME => 0,
+ SEQ => 1,
+};
+
@ISA = qw( Exporter );
}
-sub vmatch_count_hits_old
-{
- # Martin A. Hansen, April 2008.
-
- # Given a Vmatch result file, substitute the
- # score field with the times the query sequence
- # was found.
-
- my ( $tmp_dir, # directory in where to save temp files
- $path, # full path to vmatch file
- $max_count, # filter too abundant seqs - OPTIONAL
- ) = @_;
-
- # Returns nothing.
-
- my ( $fh_in, $fh_out, $line, @fields, @count_list );
-
- $fh_in = Maasha::Common::read_open( $path );
-
- while ( $line = <$fh_in> )
- {
- chomp $line;
-
- next if $line =~ /^#/;
-
- @fields = split " ", $line;
-
- $count_list[ $fields[ 5 ] ]++;
- }
-
- close $fh_in;
-
- $fh_in = Maasha::Common::read_open( $path );
- $fh_out = Maasha::Common::write_open( "$tmp_dir/vmatch.count" );
-
- while ( $line = <$fh_in> )
- {
- chomp $line;
-
- next if $line =~ /^#/;
-
- @fields = split " ", $line;
-
- $fields[ 9 ] = $count_list[ $fields[ 5 ] ];
-
- if ( $max_count ) {
- print $fh_out join( "\t", @fields ), "\n" if $fields[ 9 ] <= $max_count;
- } else {
- print $fh_out join( "\t", @fields ), "\n";
- }
- }
-
- close $fh_in;
- close $fh_out;
-
- rename "$tmp_dir/vmatch.count", $path;
-}
-
-
-sub vmatch_count_hits_old
-{
- # Martin A. Hansen, April 2008.
-
- # Given a Vmatch result file, substitute the
- # score field with the times the query sequence
- # was found.
-
- my ( $tmp_dir, # directory in where to save temp files
- $path, # full path to vmatch file
- $max_count, # filter too abundant seqs - OPTIONAL
- ) = @_;
-
- # Returns nothing.
-
- my ( $fh_in, $fh_out, $line, @fields, %count_hash );
-
- if ( $max_count ) {
- %count_hash = ();
- } else {
- %count_hash = Maasha::Berkeley_DB::db_init( "$tmp_dir/hash.bdb" );
- }
-
- $fh_in = Maasha::Common::read_open( $path );
-
- while ( $line = <$fh_in> )
- {
- chomp $line;
-
- next if $line =~ /^#/;
-
- @fields = split " ", $line;
-
- $count_hash{ $fields[ 5 ] }++;
- }
-
- close $fh_in;
-
- $fh_in = Maasha::Common::read_open( $path );
- $fh_out = Maasha::Common::write_open( "$tmp_dir/vmatch.count" );
-
- while ( $line = <$fh_in> )
- {
- chomp $line;
-
- next if $line =~ /^#/;
-
- @fields = split " ", $line;
-
- $fields[ 9 ] = $count_hash{ $fields[ 5 ] };
-
- if ( $max_count ) {
- print $fh_out join( "\t", @fields ), "\n" if $fields[ 9 ] <= $max_count;
- } else {
- print $fh_out join( "\t", @fields ), "\n";
- }
- }
-
- close $fh_in;
- close $fh_out;
-
- if ( not $max_count )
- {
- untie %count_hash;
- unlink "$tmp_dir/hash.bdb";
- }
-
- rename "$tmp_dir/vmatch.count", $path;
-}
-
-
sub vmatch_get_entry
{
# Martin A. Hansen, January 2008.
}
+sub vmatch_index
+{
+ # Martin A. Hansen, July 2008.
+
+ # Use mkvtree to create a vmatch index of a given file
+
+ my ( $file, # FASTA file to index
+ $src_dir, # source directory with FASTA file
+ $dst_dir, # distination directory for Vmatch index
+ $tmp_dir, # temp directory - OPTIONAL
+ ) = @_;
+
+ # Returns nothing.
+
+ my ( $fh, $entry, $tmp_file );
+
+ Maasha::Common::dir_create_if_not_exists( $dst_dir );
+
+ if ( Maasha::Common::file_size( $file ) < 250_000_000 )
+ {
+ &Maasha::Common::run( "mkvtree", "-db $src_dir/$file -dna -pl -allout -indexname $dst_dir/$file > /dev/null 3>&1" );
+ }
+ else
+ {
+ $fh = Maasha::Common::read_open( "$src_dir/$file" );
+
+ while ( $entry = Maasha::Fasta::get_entry( $fh ) )
+ {
+ $tmp_file = $entry->[ SEQ_NAME ] . ".fna";
+
+ Maasha::Fasta::put_entries( [ $entry ], "$tmp_dir/$tmp_file" )
+
+ &Maasha::Common::run( "mkvtree", "-db $tmp_dir/$tmp_file -dna -pl -allout -indexname $dst_dir/$tmp_file > /dev/null 3>&1" );
+
+ unlink "$tmp_dir/$tmp_file";
+ }
+
+ close $fh;
+ }
+}
+
+
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<