From 5cbfb1e379cfbe79bd871359fd3455b590c9b396 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Thu, 10 Jul 2008 06:47:48 +0000 Subject: [PATCH] added format_genome and list_genomes git-svn-id: http://biopieces.googlecode.com/svn/trunk@141 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/format_genome | 6 ++ bp_bin/list_genomes | 6 ++ bp_conf/bashrc | 3 +- code_perl/Maasha/Biopieces.pm | 184 ++++++++++++++++++++++++++++++---- code_perl/Maasha/Fasta.pm | 20 ++++ code_perl/Maasha/Match.pm | 178 +++++++++----------------------- code_perl/Maasha/NCBI.pm | 33 ++++++ 7 files changed, 277 insertions(+), 153 deletions(-) create mode 100755 bp_bin/format_genome create mode 100755 bp_bin/list_genomes diff --git a/bp_bin/format_genome b/bp_bin/format_genome new file mode 100755 index 0000000..4cd1d44 --- /dev/null +++ b/bp_bin/format_genome @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biopieces; diff --git a/bp_bin/list_genomes b/bp_bin/list_genomes new file mode 100755 index 0000000..4cd1d44 --- /dev/null +++ b/bp_bin/list_genomes @@ -0,0 +1,6 @@ +#!/usr/bin/env perl + +use warnings; +use strict; + +use Maasha::Biopieces; diff --git a/bp_conf/bashrc b/bp_conf/bashrc index 10814f1..cdeadf3 100644 --- a/bp_conf/bashrc +++ b/bp_conf/bashrc @@ -7,7 +7,7 @@ ### Allso, it is possible to change the temporary directory from the system's tmp. export BP_DIR="/Users/m.hansen/biopieces" # Directory where biopieces are installed -export BP_DATA="/Users/m.hansen/bio_piece_data" # Contains genomic data etc. +export BP_DATA="/Users/m.hansen/BP_DATA" # Contains genomic data etc. export BP_TMP="/tmp" # Required temporary directory. @@ -15,7 +15,6 @@ export BP_TMP="/tmp" # Required temporary directory export BP_BIN="$BP_DIR/bp_bin" # Directory with biopiece executables. - ### The following directories hold the biopiece libraries, modules, gems, etc - one per programming language. export BP_PERL="$BP_DIR/code_perl" # Direcotry with Perl code. diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index 7badb51..0523a45 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -157,13 +157,16 @@ sub run_script $options = get_options( $script ); - $script = "print_usage" if ( -t STDIN and not @ARGV and $script ne "list_biopieces" ); + if ( $script ne "list_biopieces" and $script ne "list_genomes" ) { + $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } ); + } - $in = read_stream( $options->{ "stream_in" } ); - $out = write_stream( $options->{ "stream_out" } ); + $in = read_stream( $options->{ "stream_in" } ); + $out = write_stream( $options->{ "stream_out" } ); if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) } elsif ( $script eq "list_biopieces" ) { script_list_biopieces( $in, $out, $options ) } + elsif ( $script eq "list_genomes" ) { script_list_genomes( $in, $out, $options ) } elsif ( $script eq "read_fasta" ) { script_read_fasta( $in, $out, $options ) } elsif ( $script eq "read_tab" ) { script_read_tab( $in, $out, $options ) } elsif ( $script eq "read_psl" ) { script_read_psl( $in, $out, $options ) } @@ -178,6 +181,7 @@ sub run_script elsif ( $script eq "read_solexa" ) { script_read_solexa( $in, $out, $options ) } elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) } elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) } + elsif ( $script eq "format_genome" ) { script_format_genome( $in, $out, $options ) } elsif ( $script eq "length_seq" ) { script_length_seq( $in, $out, $options ) } elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) } elsif ( $script eq "shuffle_seq" ) { script_shuffle_seq( $in, $out, $options ) } @@ -389,6 +393,15 @@ sub get_options password|p=s ); } + elsif ( $script eq "format_genome" ) + { + @options = qw( + no_stream|x + dir|d=s + genome|g=s + formats|f=s + ); + } elsif ( $script eq "length_seq" ) { @options = qw( @@ -908,6 +921,7 @@ sub get_options $options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" }; $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" }; $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" }; + $options{ "formats" } = [ split ",", $options{ "formats" } ] if defined $options{ "formats" }; # ---- check arguments ---- @@ -948,12 +962,13 @@ sub get_options { Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") ); } - elsif ( $opt eq "genome" ) + elsif ( $opt eq "genome" and $script ne "format_genome" ) { - @genomes = Maasha::Config::genomes(); - - if ( not grep $options{ $opt }, @genomes ) { - Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'INST_DIR' }/conf/genomes.conf") ); + @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" ); + map { $_ =~ s/.*\/(.+)$/$1/ } @genomes; + + if ( not grep { $_ =~ /^$options{ $opt }$/ } @genomes ) { + Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") ); } } elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb)/ ) @@ -972,7 +987,7 @@ sub get_options Maasha::Common::error( qq(both --database and --genome specified) ) if $script eq "blast_seq" and $options{ "genome" } and $options{ "database" }; Maasha::Common::error( qq(no --index_name or --genome specified) ) if $script eq "vmatch_seq" and not $options{ "genome" } and not $options{ "index_name" }; Maasha::Common::error( qq(both --index and --genome specified) ) if $script eq "vmatch_seq" and $options{ "genome" } and $options{ "index" }; - Maasha::Common::error( qq(no --genome specified) ) if $script =~ /get_genome_seq|get_genome_align|get_genome_phastcons|blat_seq|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" }; + Maasha::Common::error( qq(no --genome specified) ) if $script =~ /format_genome|get_genome_seq|get_genome_align|get_genome_phastcons|blat_seq|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" }; Maasha::Common::error( qq(no --key specified) ) if $script =~ /plot_lendist|plot_histogram/ and not $options{ "key" }; Maasha::Common::error( qq(no --keys speficied) ) if $script =~ /sort_records|count_vals|sum_vals|mean_vals|median_vals|length_vals/ and not $options{ "keys" }; @@ -1063,6 +1078,64 @@ sub script_list_biopieces } +sub script_list_genomes +{ + # Martin A. Hansen, January 2008. + + # Prints the synopsis from the usage for each of the biopieces. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( @genomes, $genome, @formats, $format, %hash, %found, @row ); + + @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" ); + + foreach $genome ( @genomes ) + { + next if $genome =~ /\.$/; + + @formats = Maasha::Common::ls_dirs( $genome ); + + foreach $format ( @formats ) + { + if ( $format =~ /\/([^\/]+)\/(\w+)$/ ) + { + $hash{ $1 }{ $2 } = 1; + + $found{ $2 } = 1; + } + } + } + + @row = "Genome"; + + map { push @row, $_ } sort keys %found; + + print join( "\t", @row ), "\n"; + + foreach $genome ( sort keys %hash ) + { + @row = $genome; + + foreach $format ( sort keys %found ) + { + if ( exists $hash{ $genome }{ $format } ) { + push @row, "yes"; + } else { + push @row, "no"; + } + } + + print join( "\t", @row ), "\n"; + } +} + + sub script_read_fasta { # Martin A. Hansen, August 2007. @@ -1864,6 +1937,68 @@ sub script_read_mysql } +sub script_format_genome +{ + # Martin A. Hansen, Juli 2008. + + # Format a genome to speficed formats. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $dir, $genome, $fasta_dir, $fh_out, $record, $format, $index ); + + $dir = $options->{ 'dir' } || $ENV{ 'BP_DATA' }; + $genome = $options->{ 'genome' }; + + Maasha::Common::error( "Directory: $dir does not exist" ) if not -d $dir; + Maasha::Common::dir_create_if_not_exists( "$dir/genomes" ); + Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome" ); + + if ( -f "$dir/genomes/$genome/fasta/$genome.fna" ) + { + $fasta_dir = "$dir/genomes/$genome/fasta"; + } + elsif ( grep { $_ =~ /fasta/i } @{ $options->{ "formats" } } ) + { + Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome/fasta" ); + + $fasta_dir = "$dir/genomes/$genome/fasta"; + + $fh_out = Maasha::Common::write_open( "$fasta_dir/$genome.fna" ); + } + else + { + $fasta_dir = $BP_TMP; + + $fh_out = Maasha::Common::write_open( "$fasta_dir/$genome.fna" ); + } + + while ( $record = get_record( $in ) ) + { + if ( $fh_out and $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { + Maasha::Fasta::put_entry( [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ], $fh_out, $options->{ "wrap" } ); + } + + put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + foreach $format ( @{ $options->{ 'formats' } } ) + { + if ( $format =~ /^fasta$/i ) { Maasha::Fasta::fasta_index( "$fasta_dir/$genome.fna", "$dir/genomes/$genome/fasta/$genome.index" ) } + elsif ( $format =~ /^blast$/i ) { Maasha::NCBI::blast_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/blast", "dna", $genome ) } + elsif ( $format =~ /^blat$/i ) { print STDERR "BLAT FORMAT NOT IMPLEMENTED" } + elsif ( $format =~ /^vmatch$/i ) { Maasha::Match::vmatch_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/vmatch", $BP_TMP ) } + } + + close $fh_out if $fh_out; +} + + sub script_length_seq { # Martin A. Hansen, August 2007. @@ -2478,14 +2613,16 @@ sub script_get_genome_seq # Returns nothing. - my ( $record, $genome_file, $index_file, $index, $fh, $index_head, $index_beg, $index_len, $beg, $len, %lookup_hash, @begs, @lens, $i ); + my ( $record, $genome, $genome_file, $index_file, $index, $fh, $index_head, $index_beg, $index_len, $beg, $len, %lookup_hash, @begs, @lens, $i ); $options->{ "flank" } ||= 0; if ( $options->{ "genome" } ) { - $genome_file = Maasha::Config::genome_fasta( $options->{ 'genome' } ); - $index_file = Maasha::Config::genome_fasta_index( $options->{ 'genome' } ); + $genome = $options->{ "genome" }; + + $genome_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.fna"; + $index_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.index"; $fh = Maasha::Common::read_open( $genome_file ); $index = Maasha::Fasta::index_retrieve( $index_file ); @@ -3029,7 +3166,7 @@ sub script_patscan_seq $patterns = Maasha::Patscan::read_patterns( $options->{ "patterns_in" } ); } - $genome_file = Maasha::Config::genome_fasta( $options->{ 'genome' } ) if $options->{ 'genome' }; + $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna" if $options->{ 'genome' }; push @args, "-c" if $options->{ "comp" }; push @args, "-m $options->{ 'max_hits' }" if $options->{ 'max_hits' }; @@ -3176,7 +3313,7 @@ sub script_blast_seq $options->{ "filter" } = "T" if $options->{ "filter" }; $options->{ "cpus" } ||= 1; - $options->{ "database" } = Maasha::Config::genome_blast( $options->{ 'genome' } ) if $options->{ 'genome' }; + $options->{ "database" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/blast/$options->{ 'genome' }.fna"if $options->{ 'genome' }; $tmp_in = "$BP_TMP/blast_query.seq"; $tmp_out = "$BP_TMP/blast.result"; @@ -3281,7 +3418,7 @@ sub script_blat_seq my ( $blat_args, $genome_file, $query_file, $fh_in, $fh_out, $type, $record, $result_file, $entries ); - $genome_file = Maasha::Config::genome_fasta( $options->{ "genome" } ); + $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna"; $options->{ 'tile_size' } ||= 11; $options->{ 'one_off' } ||= 0; @@ -3294,7 +3431,7 @@ sub script_blat_seq $blat_args .= " -minIdentity=$options->{ 'min_identity' }"; $blat_args .= " -minScore=$options->{ 'min_score' }"; $blat_args .= " -stepSize=$options->{ 'step_size' }"; - $blat_args .= " -ooc=" . Maasha::Config::genome_blat_ooc( $options->{ "genome" }, 11 ) if $options->{ 'ooc' }; +# $blat_args .= " -ooc=" . Maasha::Config::genome_blat_ooc( $options->{ "genome" }, 11 ) if $options->{ 'ooc' }; $query_file = "$BP_TMP/blat.seq"; @@ -3433,14 +3570,21 @@ sub script_vmatch_seq # Returns nothing. - my ( @index_files, @records, $result_file, $fh_in, $record ); + my ( @index_files, @records, $result_file, $fh_in, $record, %hash ); $options->{ 'count' } = 1 if $options->{ 'max_hits' }; - if ( $options->{ "index_name" } ) { + if ( $options->{ "index_name" } ) + { @index_files = $options->{ "index_name" }; - } else { - @index_files = Maasha::Config::genome_vmatch( $options->{ "genome" } ); + } + else + { + @index_files = Maasha::Common::ls_files( "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/vmatch" ); + + map { $_ =~ /^(.+)\.[a-z1]{3,4}$/; $hash{ $1 } = 1 } @index_files; + + @index_files = sort keys %hash; } while ( $record = get_record( $in ) ) diff --git a/code_perl/Maasha/Fasta.pm b/code_perl/Maasha/Fasta.pm index 3df2ddc..82307f9 100644 --- a/code_perl/Maasha/Fasta.pm +++ b/code_perl/Maasha/Fasta.pm @@ -328,6 +328,26 @@ sub fasta_reformat } +sub fasta_index +{ + # Martin A. Hansen, July 2008. + + # Given a path to a FASTA file create a FASTA index + # and save to the specified path. + + my ( $fasta_path, # path to FASTA file + $index_path, # path to index file + ) = @_; + + # Returns nothing. + + my ( $index ); + + $index = index_create( $fasta_path ); + + Maasha::Fasta::index_store( $index_path, $index ); +} + sub index_create { # Matin A. Hansen, December 2004. diff --git a/code_perl/Maasha/Match.pm b/code_perl/Maasha/Match.pm index 5a37650..f5a919e 100644 --- a/code_perl/Maasha/Match.pm +++ b/code_perl/Maasha/Match.pm @@ -34,9 +34,13 @@ use Storable qw( dclone ); use Maasha::Common; use Maasha::Fasta; use Maasha::Seq; -use Maasha::Berkeley_DB; use vars qw ( @ISA @EXPORT ); +use constant { + SEQ_NAME => 0, + SEQ => 1, +}; + @ISA = qw( Exporter ); @@ -256,136 +260,6 @@ sub vmatch_count_hits } -sub vmatch_count_hits_old -{ - # Martin A. Hansen, April 2008. - - # Given a Vmatch result file, substitute the - # score field with the times the query sequence - # was found. - - my ( $tmp_dir, # directory in where to save temp files - $path, # full path to vmatch file - $max_count, # filter too abundant seqs - OPTIONAL - ) = @_; - - # Returns nothing. - - my ( $fh_in, $fh_out, $line, @fields, @count_list ); - - $fh_in = Maasha::Common::read_open( $path ); - - while ( $line = <$fh_in> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split " ", $line; - - $count_list[ $fields[ 5 ] ]++; - } - - close $fh_in; - - $fh_in = Maasha::Common::read_open( $path ); - $fh_out = Maasha::Common::write_open( "$tmp_dir/vmatch.count" ); - - while ( $line = <$fh_in> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split " ", $line; - - $fields[ 9 ] = $count_list[ $fields[ 5 ] ]; - - if ( $max_count ) { - print $fh_out join( "\t", @fields ), "\n" if $fields[ 9 ] <= $max_count; - } else { - print $fh_out join( "\t", @fields ), "\n"; - } - } - - close $fh_in; - close $fh_out; - - rename "$tmp_dir/vmatch.count", $path; -} - - -sub vmatch_count_hits_old -{ - # Martin A. Hansen, April 2008. - - # Given a Vmatch result file, substitute the - # score field with the times the query sequence - # was found. - - my ( $tmp_dir, # directory in where to save temp files - $path, # full path to vmatch file - $max_count, # filter too abundant seqs - OPTIONAL - ) = @_; - - # Returns nothing. - - my ( $fh_in, $fh_out, $line, @fields, %count_hash ); - - if ( $max_count ) { - %count_hash = (); - } else { - %count_hash = Maasha::Berkeley_DB::db_init( "$tmp_dir/hash.bdb" ); - } - - $fh_in = Maasha::Common::read_open( $path ); - - while ( $line = <$fh_in> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split " ", $line; - - $count_hash{ $fields[ 5 ] }++; - } - - close $fh_in; - - $fh_in = Maasha::Common::read_open( $path ); - $fh_out = Maasha::Common::write_open( "$tmp_dir/vmatch.count" ); - - while ( $line = <$fh_in> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split " ", $line; - - $fields[ 9 ] = $count_hash{ $fields[ 5 ] }; - - if ( $max_count ) { - print $fh_out join( "\t", @fields ), "\n" if $fields[ 9 ] <= $max_count; - } else { - print $fh_out join( "\t", @fields ), "\n"; - } - } - - close $fh_in; - close $fh_out; - - if ( not $max_count ) - { - untie %count_hash; - unlink "$tmp_dir/hash.bdb"; - } - - rename "$tmp_dir/vmatch.count", $path; -} - - sub vmatch_get_entry { # Martin A. Hansen, January 2008. @@ -435,6 +309,48 @@ sub vmatch_get_entry } +sub vmatch_index +{ + # Martin A. Hansen, July 2008. + + # Use mkvtree to create a vmatch index of a given file + + my ( $file, # FASTA file to index + $src_dir, # source directory with FASTA file + $dst_dir, # distination directory for Vmatch index + $tmp_dir, # temp directory - OPTIONAL + ) = @_; + + # Returns nothing. + + my ( $fh, $entry, $tmp_file ); + + Maasha::Common::dir_create_if_not_exists( $dst_dir ); + + if ( Maasha::Common::file_size( $file ) < 250_000_000 ) + { + &Maasha::Common::run( "mkvtree", "-db $src_dir/$file -dna -pl -allout -indexname $dst_dir/$file > /dev/null 3>&1" ); + } + else + { + $fh = Maasha::Common::read_open( "$src_dir/$file" ); + + while ( $entry = Maasha::Fasta::get_entry( $fh ) ) + { + $tmp_file = $entry->[ SEQ_NAME ] . ".fna"; + + Maasha::Fasta::put_entries( [ $entry ], "$tmp_dir/$tmp_file" ) + + &Maasha::Common::run( "mkvtree", "-db $tmp_dir/$tmp_file -dna -pl -allout -indexname $dst_dir/$tmp_file > /dev/null 3>&1" ); + + unlink "$tmp_dir/$tmp_file"; + } + + close $fh; + } +} + + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< diff --git a/code_perl/Maasha/NCBI.pm b/code_perl/Maasha/NCBI.pm index 689fd7d..e6b7407 100644 --- a/code_perl/Maasha/NCBI.pm +++ b/code_perl/Maasha/NCBI.pm @@ -32,6 +32,7 @@ use strict; use Data::Dumper; use LWP::Simple; use Maasha::Common; +use Maasha::Seq; use vars qw( @ISA @EXPORT ); @@ -422,6 +423,38 @@ sub soft_get_sample } +sub blast_index +{ + # Martin A. Hansen, Juli 2008. + + # Create a BLAST index of a given FASTA file + # in a specified directory using a given name. + + my ( $file, # path to FASTA file + $src_dir, # source directory with FASTA file + $dst_dir, # destination directory for BLAST index + $index_type, # protein or nucleotide + $index_name, # name of index + ) = @_; + + # Returns nothing. + + my ( $type ); + + if ( $index_type =~ /protein/i ) { + $type = "T"; + } else { + $type = "F"; + } + + Maasha::Common::dir_create_if_not_exists( $dst_dir ); + Maasha::Common::run( "formatdb", "-p $type -i $src_dir/$file -t $index_name -l /dev/null" ); + Maasha::Common::run( "mv", "$src_dir/*hr $dst_dir" ); + Maasha::Common::run( "mv", "$src_dir/*in $dst_dir" ); + Maasha::Common::run( "mv", "$src_dir/*sq $dst_dir" ); +} + + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -- 2.39.5