From c32b13419ca8e524a5c98869048d0be670ee76d0 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Tue, 12 May 2009 20:37:35 +0000 Subject: [PATCH] split Biopieces.pm into BioRun.pm and Biopieces.pm git-svn-id: http://biopieces.googlecode.com/svn/trunk@370 74ccb610-7750-0410-82ae-013aeee3265d --- bp_bin/add_ident | 2 +- bp_bin/align_seq | 2 +- bp_bin/analyze_bed | 2 +- bp_bin/analyze_seq | 2 +- bp_bin/analyze_tags | 2 +- bp_bin/analyze_vals | 2 +- bp_bin/assemble_tag_contigs | 2 +- bp_bin/blast_seq | 2 +- bp_bin/blat_seq | 2 +- bp_bin/calc_bit_scores | 2 +- bp_bin/calc_fixedstep | 2 +- bp_bin/complement_seq | 2 +- bp_bin/complexity_seq | 2 +- bp_bin/compute | 2 +- bp_bin/count_records | 2 +- bp_bin/count_vals | 2 +- bp_bin/create_blast_db | 2 +- bp_bin/create_vmatch_index | 2 +- bp_bin/create_weight_matrix | 2 +- bp_bin/extract_seq | 2 +- bp_bin/flip_tab | 2 +- bp_bin/fold_seq | 2 +- bp_bin/format_genome | 2 +- bp_bin/get_genome_align | 2 +- bp_bin/get_genome_phastcons | 2 +- bp_bin/get_genome_seq | 2 +- bp_bin/grab | 2 +- bp_bin/head_records | 2 +- bp_bin/invert_align | 2 +- bp_bin/length_seq | 2 +- bp_bin/length_vals | 2 +- bp_bin/list_biopieces | 2 +- bp_bin/list_genomes | 2 +- bp_bin/match_seq | 2 +- bp_bin/max_vals | 2 +- bp_bin/mean_vals | 2 +- bp_bin/median_vals | 2 +- bp_bin/merge_records | 2 +- bp_bin/merge_vals | 2 +- bp_bin/min_vals | 2 +- bp_bin/oligo_freq | 2 +- bp_bin/patscan_seq | 2 +- bp_bin/plot_chrdist | 2 +- bp_bin/plot_histogram | 2 +- bp_bin/plot_karyogram | 2 +- bp_bin/plot_lendist | 2 +- bp_bin/plot_matches | 2 +- bp_bin/plot_phastcons_profiles | 2 +- bp_bin/plot_seqlogo | 2 +- bp_bin/print_usage | 2 +- bp_bin/random_records | 2 +- bp_bin/read_2bit | 2 +- bp_bin/read_bed | 2 +- bp_bin/read_blast_tab | 2 +- bp_bin/read_embl | 2 +- bp_bin/read_fasta | 2 +- bp_bin/read_fixedstep | 2 +- bp_bin/read_gff | 2 +- bp_bin/read_mysql | 2 +- bp_bin/read_phastcons | 2 +- bp_bin/read_psl | 2 +- bp_bin/read_soft | 2 +- bp_bin/read_solexa | 2 +- bp_bin/read_solid | 2 +- bp_bin/read_stockholm | 2 +- bp_bin/read_tab | 2 +- bp_bin/read_ucsc_config | 2 +- bp_bin/remove_adaptor | 2 +- bp_bin/remove_indels | 2 +- bp_bin/remove_keys | 2 +- bp_bin/remove_mysql_tables | 2 +- bp_bin/remove_ucsc_tracks | 2 +- bp_bin/rename_keys | 2 +- bp_bin/reverse_seq | 2 +- bp_bin/shuffle_seq | 2 +- bp_bin/soap_seq | 2 +- bp_bin/sort_records | 2 +- bp_bin/split_bed | 2 +- bp_bin/split_seq | 2 +- bp_bin/sum_vals | 2 +- bp_bin/tile_seq | 2 +- bp_bin/translate_seq | 2 +- bp_bin/transliterate_seq | 2 +- bp_bin/transliterate_vals | 2 +- bp_bin/uniq_vals | 2 +- bp_bin/upload_to_ucsc | 2 +- bp_bin/uppercase_seq | 2 +- bp_bin/vmatch_seq | 2 +- bp_bin/write_2bit | 2 +- bp_bin/write_align | 2 +- bp_bin/write_bed | 2 +- bp_bin/write_blast | 2 +- bp_bin/write_fasta | 2 +- bp_bin/write_fixedstep | 2 +- bp_bin/write_psl | 2 +- bp_bin/write_solid | 2 +- bp_bin/write_tab | 2 +- bp_bin/write_ucsc_config | 2 +- code_perl/Maasha/BioRun.pm | 6445 +++++++++++++++++++++++++++++++ code_perl/Maasha/Biopieces.pm | 6471 +------------------------------- 100 files changed, 6572 insertions(+), 6540 deletions(-) create mode 100644 code_perl/Maasha/BioRun.pm diff --git a/bp_bin/add_ident b/bp_bin/add_ident index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/add_ident +++ b/bp_bin/add_ident @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/align_seq b/bp_bin/align_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/align_seq +++ b/bp_bin/align_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/analyze_bed b/bp_bin/analyze_bed index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/analyze_bed +++ b/bp_bin/analyze_bed @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/analyze_seq b/bp_bin/analyze_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/analyze_seq +++ b/bp_bin/analyze_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/analyze_tags b/bp_bin/analyze_tags index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/analyze_tags +++ b/bp_bin/analyze_tags @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/analyze_vals b/bp_bin/analyze_vals index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/analyze_vals +++ b/bp_bin/analyze_vals @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/assemble_tag_contigs b/bp_bin/assemble_tag_contigs index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/assemble_tag_contigs +++ b/bp_bin/assemble_tag_contigs @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/blast_seq b/bp_bin/blast_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/blast_seq +++ b/bp_bin/blast_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/blat_seq b/bp_bin/blat_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/blat_seq +++ b/bp_bin/blat_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/calc_bit_scores b/bp_bin/calc_bit_scores index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/calc_bit_scores +++ b/bp_bin/calc_bit_scores @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/calc_fixedstep b/bp_bin/calc_fixedstep index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/calc_fixedstep +++ b/bp_bin/calc_fixedstep @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/complement_seq b/bp_bin/complement_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/complement_seq +++ b/bp_bin/complement_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/complexity_seq b/bp_bin/complexity_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/complexity_seq +++ b/bp_bin/complexity_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/compute b/bp_bin/compute index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/compute +++ b/bp_bin/compute @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/count_records b/bp_bin/count_records index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/count_records +++ b/bp_bin/count_records @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/count_vals b/bp_bin/count_vals index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/count_vals +++ b/bp_bin/count_vals @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/create_blast_db b/bp_bin/create_blast_db index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/create_blast_db +++ b/bp_bin/create_blast_db @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/create_vmatch_index b/bp_bin/create_vmatch_index index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/create_vmatch_index +++ b/bp_bin/create_vmatch_index @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/create_weight_matrix b/bp_bin/create_weight_matrix index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/create_weight_matrix +++ b/bp_bin/create_weight_matrix @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/extract_seq b/bp_bin/extract_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/extract_seq +++ b/bp_bin/extract_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/flip_tab b/bp_bin/flip_tab index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/flip_tab +++ b/bp_bin/flip_tab @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/fold_seq b/bp_bin/fold_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/fold_seq +++ b/bp_bin/fold_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/format_genome b/bp_bin/format_genome index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/format_genome +++ b/bp_bin/format_genome @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/get_genome_align b/bp_bin/get_genome_align index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/get_genome_align +++ b/bp_bin/get_genome_align @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/get_genome_phastcons b/bp_bin/get_genome_phastcons index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/get_genome_phastcons +++ b/bp_bin/get_genome_phastcons @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/get_genome_seq b/bp_bin/get_genome_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/get_genome_seq +++ b/bp_bin/get_genome_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/grab b/bp_bin/grab index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/grab +++ b/bp_bin/grab @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/head_records b/bp_bin/head_records index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/head_records +++ b/bp_bin/head_records @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/invert_align b/bp_bin/invert_align index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/invert_align +++ b/bp_bin/invert_align @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/length_seq b/bp_bin/length_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/length_seq +++ b/bp_bin/length_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/length_vals b/bp_bin/length_vals index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/length_vals +++ b/bp_bin/length_vals @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/list_biopieces b/bp_bin/list_biopieces index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/list_biopieces +++ b/bp_bin/list_biopieces @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/list_genomes b/bp_bin/list_genomes index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/list_genomes +++ b/bp_bin/list_genomes @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/match_seq b/bp_bin/match_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/match_seq +++ b/bp_bin/match_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/max_vals b/bp_bin/max_vals index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/max_vals +++ b/bp_bin/max_vals @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/mean_vals b/bp_bin/mean_vals index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/mean_vals +++ b/bp_bin/mean_vals @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/median_vals b/bp_bin/median_vals index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/median_vals +++ b/bp_bin/median_vals @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/merge_records b/bp_bin/merge_records index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/merge_records +++ b/bp_bin/merge_records @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/merge_vals b/bp_bin/merge_vals index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/merge_vals +++ b/bp_bin/merge_vals @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/min_vals b/bp_bin/min_vals index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/min_vals +++ b/bp_bin/min_vals @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/oligo_freq b/bp_bin/oligo_freq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/oligo_freq +++ b/bp_bin/oligo_freq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/patscan_seq b/bp_bin/patscan_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/patscan_seq +++ b/bp_bin/patscan_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/plot_chrdist b/bp_bin/plot_chrdist index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/plot_chrdist +++ b/bp_bin/plot_chrdist @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/plot_histogram b/bp_bin/plot_histogram index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/plot_histogram +++ b/bp_bin/plot_histogram @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/plot_karyogram b/bp_bin/plot_karyogram index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/plot_karyogram +++ b/bp_bin/plot_karyogram @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/plot_lendist b/bp_bin/plot_lendist index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/plot_lendist +++ b/bp_bin/plot_lendist @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/plot_matches b/bp_bin/plot_matches index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/plot_matches +++ b/bp_bin/plot_matches @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/plot_phastcons_profiles b/bp_bin/plot_phastcons_profiles index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/plot_phastcons_profiles +++ b/bp_bin/plot_phastcons_profiles @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/plot_seqlogo b/bp_bin/plot_seqlogo index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/plot_seqlogo +++ b/bp_bin/plot_seqlogo @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/print_usage b/bp_bin/print_usage index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/print_usage +++ b/bp_bin/print_usage @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/random_records b/bp_bin/random_records index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/random_records +++ b/bp_bin/random_records @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_2bit b/bp_bin/read_2bit index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_2bit +++ b/bp_bin/read_2bit @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_bed b/bp_bin/read_bed index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_bed +++ b/bp_bin/read_bed @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_blast_tab b/bp_bin/read_blast_tab index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_blast_tab +++ b/bp_bin/read_blast_tab @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_embl b/bp_bin/read_embl index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_embl +++ b/bp_bin/read_embl @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_fasta b/bp_bin/read_fasta index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_fasta +++ b/bp_bin/read_fasta @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_fixedstep b/bp_bin/read_fixedstep index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_fixedstep +++ b/bp_bin/read_fixedstep @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_gff b/bp_bin/read_gff index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_gff +++ b/bp_bin/read_gff @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_mysql b/bp_bin/read_mysql index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_mysql +++ b/bp_bin/read_mysql @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_phastcons b/bp_bin/read_phastcons index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_phastcons +++ b/bp_bin/read_phastcons @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_psl b/bp_bin/read_psl index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_psl +++ b/bp_bin/read_psl @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_soft b/bp_bin/read_soft index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_soft +++ b/bp_bin/read_soft @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_solexa b/bp_bin/read_solexa index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_solexa +++ b/bp_bin/read_solexa @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_solid b/bp_bin/read_solid index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_solid +++ b/bp_bin/read_solid @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_stockholm b/bp_bin/read_stockholm index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_stockholm +++ b/bp_bin/read_stockholm @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_tab b/bp_bin/read_tab index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_tab +++ b/bp_bin/read_tab @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/read_ucsc_config b/bp_bin/read_ucsc_config index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/read_ucsc_config +++ b/bp_bin/read_ucsc_config @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/remove_adaptor b/bp_bin/remove_adaptor index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/remove_adaptor +++ b/bp_bin/remove_adaptor @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/remove_indels b/bp_bin/remove_indels index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/remove_indels +++ b/bp_bin/remove_indels @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/remove_keys b/bp_bin/remove_keys index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/remove_keys +++ b/bp_bin/remove_keys @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/remove_mysql_tables b/bp_bin/remove_mysql_tables index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/remove_mysql_tables +++ b/bp_bin/remove_mysql_tables @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/remove_ucsc_tracks b/bp_bin/remove_ucsc_tracks index 4cd1d44..fdf5bd2 100644 --- a/bp_bin/remove_ucsc_tracks +++ b/bp_bin/remove_ucsc_tracks @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/rename_keys b/bp_bin/rename_keys index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/rename_keys +++ b/bp_bin/rename_keys @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/reverse_seq b/bp_bin/reverse_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/reverse_seq +++ b/bp_bin/reverse_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/shuffle_seq b/bp_bin/shuffle_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/shuffle_seq +++ b/bp_bin/shuffle_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/soap_seq b/bp_bin/soap_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/soap_seq +++ b/bp_bin/soap_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/sort_records b/bp_bin/sort_records index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/sort_records +++ b/bp_bin/sort_records @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/split_bed b/bp_bin/split_bed index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/split_bed +++ b/bp_bin/split_bed @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/split_seq b/bp_bin/split_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/split_seq +++ b/bp_bin/split_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/sum_vals b/bp_bin/sum_vals index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/sum_vals +++ b/bp_bin/sum_vals @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/tile_seq b/bp_bin/tile_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/tile_seq +++ b/bp_bin/tile_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/translate_seq b/bp_bin/translate_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/translate_seq +++ b/bp_bin/translate_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/transliterate_seq b/bp_bin/transliterate_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/transliterate_seq +++ b/bp_bin/transliterate_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/transliterate_vals b/bp_bin/transliterate_vals index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/transliterate_vals +++ b/bp_bin/transliterate_vals @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/uniq_vals b/bp_bin/uniq_vals index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/uniq_vals +++ b/bp_bin/uniq_vals @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/upload_to_ucsc b/bp_bin/upload_to_ucsc index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/upload_to_ucsc +++ b/bp_bin/upload_to_ucsc @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/uppercase_seq b/bp_bin/uppercase_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/uppercase_seq +++ b/bp_bin/uppercase_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/vmatch_seq b/bp_bin/vmatch_seq index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/vmatch_seq +++ b/bp_bin/vmatch_seq @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/write_2bit b/bp_bin/write_2bit index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/write_2bit +++ b/bp_bin/write_2bit @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/write_align b/bp_bin/write_align index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/write_align +++ b/bp_bin/write_align @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/write_bed b/bp_bin/write_bed index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/write_bed +++ b/bp_bin/write_bed @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/write_blast b/bp_bin/write_blast index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/write_blast +++ b/bp_bin/write_blast @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/write_fasta b/bp_bin/write_fasta index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/write_fasta +++ b/bp_bin/write_fasta @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/write_fixedstep b/bp_bin/write_fixedstep index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/write_fixedstep +++ b/bp_bin/write_fixedstep @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/write_psl b/bp_bin/write_psl index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/write_psl +++ b/bp_bin/write_psl @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/write_solid b/bp_bin/write_solid index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/write_solid +++ b/bp_bin/write_solid @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/write_tab b/bp_bin/write_tab index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/write_tab +++ b/bp_bin/write_tab @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/bp_bin/write_ucsc_config b/bp_bin/write_ucsc_config index 4cd1d44..fdf5bd2 100755 --- a/bp_bin/write_ucsc_config +++ b/bp_bin/write_ucsc_config @@ -3,4 +3,4 @@ use warnings; use strict; -use Maasha::Biopieces; +use Maasha::BioRun; diff --git a/code_perl/Maasha/BioRun.pm b/code_perl/Maasha/BioRun.pm new file mode 100644 index 0000000..0f8cf30 --- /dev/null +++ b/code_perl/Maasha/BioRun.pm @@ -0,0 +1,6445 @@ +package Maasha::BioRun; + + +# Copyright (C) 2007-2009 Martin A. Hansen. + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> DESCRIPTION <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +# Routines that contains Biopieces which are run. + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +use strict; +use Data::Dumper; +use Getopt::Long qw( :config bundling ); +use Time::HiRes qw( gettimeofday ); +use Storable qw( dclone ); +use Maasha::Biopieces; +use Maasha::Config; +use Maasha::Common; +use Maasha::Filesys; +use Maasha::Fasta; +use Maasha::Align; +use Maasha::Matrix; +use Maasha::Match; +use Maasha::EMBL; +use Maasha::Stockholm; +use Maasha::Seq; +use Maasha::Patscan; +use Maasha::Plot; +use Maasha::Calc; +use Maasha::UCSC; +use Maasha::UCSC::BED; +use Maasha::UCSC::Wiggle; +use Maasha::NCBI; +use Maasha::GFF; +use Maasha::TwoBit; +use Maasha::Solid; +use Maasha::Solexa; +use Maasha::SQL; +use Maasha::Gwiki; + +use vars qw( @ISA @EXPORT_OK ); + +require Exporter; + +@ISA = qw( Exporter ); + +use constant { + SEQ_NAME => 0, + SEQ => 1, +}; + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> GLOBALS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +my ( $script, $BP_TMP ); + +$script = Maasha::Common::get_scriptname(); +$BP_TMP = Maasha::Common::get_tmpdir(); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RUN SCRIPT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +run_script( $script ); + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub run_script +{ + # Martin A. Hansen, August 2007. + + # Run a specific script. + + my ( $script, # script name + ) = @_; + + # Returns nothing. + + my ( $t0, $t1, $options, $in, $out ); + + Maasha::Biopieces::log_biopiece(); + + $t0 = gettimeofday(); + + $options = get_options( $script ); + + $options->{ "SCRIPT" } = $script; + + if ( $script ne "list_biopieces" and $script ne "list_genomes" ) { + $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } ); + } + + $in = Maasha::Biopieces::read_stream( $options->{ "stream_in" } ); + $out = Maasha::Biopieces::write_stream( $options->{ "stream_out" } ); + + if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) } + elsif ( $script eq "list_biopieces" ) { script_list_biopieces( $in, $out, $options ) } + elsif ( $script eq "list_genomes" ) { script_list_genomes( $in, $out, $options ) } + elsif ( $script eq "read_fasta" ) { script_read_fasta( $in, $out, $options ) } + elsif ( $script eq "read_tab" ) { script_read_tab( $in, $out, $options ) } + elsif ( $script eq "read_psl" ) { script_read_psl( $in, $out, $options ) } + elsif ( $script eq "read_bed" ) { script_read_bed( $in, $out, $options ) } + elsif ( $script eq "read_fixedstep" ) { script_read_fixedstep( $in, $out, $options ) } + elsif ( $script eq "read_blast_tab" ) { script_read_blast_tab( $in, $out, $options ) } + elsif ( $script eq "read_embl" ) { script_read_embl( $in, $out, $options ) } + elsif ( $script eq "read_stockholm" ) { script_read_stockholm( $in, $out, $options ) } + elsif ( $script eq "read_phastcons" ) { script_read_phastcons( $in, $out, $options ) } + elsif ( $script eq "read_soft" ) { script_read_soft( $in, $out, $options ) } + elsif ( $script eq "read_gff" ) { script_read_gff( $in, $out, $options ) } + elsif ( $script eq "read_2bit" ) { script_read_2bit( $in, $out, $options ) } + elsif ( $script eq "read_solexa" ) { script_read_solexa( $in, $out, $options ) } + elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) } + elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) } + elsif ( $script eq "read_ucsc_config" ) { script_read_ucsc_config( $in, $out, $options ) } + elsif ( $script eq "assemble_tag_contigs" ) { script_assemble_tag_contigs( $in, $out, $options ) } + elsif ( $script eq "format_genome" ) { script_format_genome( $in, $out, $options ) } + elsif ( $script eq "length_seq" ) { script_length_seq( $in, $out, $options ) } + elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) } + elsif ( $script eq "shuffle_seq" ) { script_shuffle_seq( $in, $out, $options ) } + elsif ( $script eq "analyze_seq" ) { script_analyze_seq( $in, $out, $options ) } + elsif ( $script eq "analyze_tags" ) { script_analyze_tags( $in, $out, $options ) } + elsif ( $script eq "complexity_seq" ) { script_complexity_seq( $in, $out, $options ) } + elsif ( $script eq "oligo_freq" ) { script_oligo_freq( $in, $out, $options ) } + elsif ( $script eq "create_weight_matrix" ) { script_create_weight_matrix( $in, $out, $options ) } + elsif ( $script eq "calc_bit_scores" ) { script_calc_bit_scores( $in, $out, $options ) } + elsif ( $script eq "calc_fixedstep" ) { script_calc_fixedstep( $in, $out, $options ) } + elsif ( $script eq "reverse_seq" ) { script_reverse_seq( $in, $out, $options ) } + elsif ( $script eq "complement_seq" ) { script_complement_seq( $in, $out, $options ) } + elsif ( $script eq "remove_indels" ) { script_remove_indels( $in, $out, $options ) } + elsif ( $script eq "transliterate_seq" ) { script_transliterate_seq( $in, $out, $options ) } + elsif ( $script eq "transliterate_vals" ) { script_transliterate_vals( $in, $out, $options ) } + elsif ( $script eq "translate_seq" ) { script_translate_seq( $in, $out, $options ) } + elsif ( $script eq "extract_seq" ) { script_extract_seq( $in, $out, $options ) } + elsif ( $script eq "get_genome_seq" ) { script_get_genome_seq( $in, $out, $options ) } + elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) } + elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) } + elsif ( $script eq "fold_seq" ) { script_fold_seq( $in, $out, $options ) } + elsif ( $script eq "split_seq" ) { script_split_seq( $in, $out, $options ) } + elsif ( $script eq "split_bed" ) { script_split_bed( $in, $out, $options ) } + elsif ( $script eq "align_seq" ) { script_align_seq( $in, $out, $options ) } + elsif ( $script eq "tile_seq" ) { script_tile_seq( $in, $out, $options ) } + elsif ( $script eq "invert_align" ) { script_invert_align( $in, $out, $options ) } + elsif ( $script eq "patscan_seq" ) { script_patscan_seq( $in, $out, $options ) } + elsif ( $script eq "create_blast_db" ) { script_create_blast_db( $in, $out, $options ) } + elsif ( $script eq "blast_seq" ) { script_blast_seq( $in, $out, $options ) } + elsif ( $script eq "blat_seq" ) { script_blat_seq( $in, $out, $options ) } + elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) } + elsif ( $script eq "match_seq" ) { script_match_seq( $in, $out, $options ) } + elsif ( $script eq "create_vmatch_index" ) { script_create_vmatch_index( $in, $out, $options ) } + elsif ( $script eq "vmatch_seq" ) { script_vmatch_seq( $in, $out, $options ) } + elsif ( $script eq "write_fasta" ) { script_write_fasta( $in, $out, $options ) } + elsif ( $script eq "write_align" ) { script_write_align( $in, $out, $options ) } + elsif ( $script eq "write_blast" ) { script_write_blast( $in, $out, $options ) } + elsif ( $script eq "write_tab" ) { script_write_tab( $in, $out, $options ) } + elsif ( $script eq "write_bed" ) { script_write_bed( $in, $out, $options ) } + elsif ( $script eq "write_psl" ) { script_write_psl( $in, $out, $options ) } + elsif ( $script eq "write_fixedstep" ) { script_write_fixedstep( $in, $out, $options ) } + elsif ( $script eq "write_2bit" ) { script_write_2bit( $in, $out, $options ) } + elsif ( $script eq "write_solid" ) { script_write_solid( $in, $out, $options ) } + elsif ( $script eq "write_ucsc_config" ) { script_write_ucsc_config( $in, $out, $options ) } + elsif ( $script eq "head_records" ) { script_head_records( $in, $out, $options ) } + elsif ( $script eq "remove_keys" ) { script_remove_keys( $in, $out, $options ) } + elsif ( $script eq "remove_adaptor" ) { script_remove_adaptor( $in, $out, $options ) } + elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) } + elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) } + elsif ( $script eq "rename_keys" ) { script_rename_keys( $in, $out, $options ) } + elsif ( $script eq "uniq_vals" ) { script_uniq_vals( $in, $out, $options ) } + elsif ( $script eq "merge_vals" ) { script_merge_vals( $in, $out, $options ) } + elsif ( $script eq "merge_records" ) { script_merge_records( $in, $out, $options ) } + elsif ( $script eq "grab" ) { script_grab( $in, $out, $options ) } + elsif ( $script eq "compute" ) { script_compute( $in, $out, $options ) } + elsif ( $script eq "flip_tab" ) { script_flip_tab( $in, $out, $options ) } + elsif ( $script eq "add_ident" ) { script_add_ident( $in, $out, $options ) } + elsif ( $script eq "count_records" ) { script_count_records( $in, $out, $options ) } + elsif ( $script eq "random_records" ) { script_random_records( $in, $out, $options ) } + elsif ( $script eq "sort_records" ) { script_sort_records( $in, $out, $options ) } + elsif ( $script eq "count_vals" ) { script_count_vals( $in, $out, $options ) } + elsif ( $script eq "plot_histogram" ) { script_plot_histogram( $in, $out, $options ) } + elsif ( $script eq "plot_lendist" ) { script_plot_lendist( $in, $out, $options ) } + elsif ( $script eq "plot_chrdist" ) { script_plot_chrdist( $in, $out, $options ) } + elsif ( $script eq "plot_karyogram" ) { script_plot_karyogram( $in, $out, $options ) } + elsif ( $script eq "plot_matches" ) { script_plot_matches( $in, $out, $options ) } + elsif ( $script eq "plot_seqlogo" ) { script_plot_seqlogo( $in, $out, $options ) } + elsif ( $script eq "plot_phastcons_profiles" ) { script_plot_phastcons_profiles( $in, $out, $options ) } + elsif ( $script eq "analyze_bed" ) { script_analyze_bed( $in, $out, $options ) } + elsif ( $script eq "analyze_vals" ) { script_analyze_vals( $in, $out, $options ) } + elsif ( $script eq "length_vals" ) { script_length_vals( $in, $out, $options ) } + elsif ( $script eq "sum_vals" ) { script_sum_vals( $in, $out, $options ) } + elsif ( $script eq "mean_vals" ) { script_mean_vals( $in, $out, $options ) } + elsif ( $script eq "median_vals" ) { script_median_vals( $in, $out, $options ) } + elsif ( $script eq "max_vals" ) { script_max_vals( $in, $out, $options ) } + elsif ( $script eq "min_vals" ) { script_min_vals( $in, $out, $options ) } + elsif ( $script eq "upload_to_ucsc" ) { script_upload_to_ucsc( $in, $out, $options ) } + + close $in if defined $in; + close $out; + + $t1 = gettimeofday(); + + print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' }; +} + + +sub get_options +{ + # Martin A. Hansen, February 2008. + + # Gets options from commandline and checks these vigerously. + + my ( $script, # name of script + ) = @_; + + # Returns hash + + my ( %options, @options, $opt, @genomes, $real ); + + if ( $script eq "print_usage" ) + { + @options = qw( + data_in|i=s + ); + } + elsif ( $script eq "read_fasta" ) + { + @options = qw( + data_in|i=s + num|n=s + ); + } + elsif ( $script eq "read_tab" ) + { + @options = qw( + data_in|i=s + delimit|d=s + cols|c=s + keys|k=s + skip|s=s + num|n=s + ); + } + elsif ( $script eq "read_psl" ) + { + @options = qw( + data_in|i=s + num|n=s + ); + } + elsif ( $script eq "read_bed" ) + { + @options = qw( + data_in|i=s + cols|c=s + num|n=s + check|C + ); + } + elsif ( $script eq "read_fixedstep" ) + { + @options = qw( + data_in|i=s + num|n=s + ); + } + elsif ( $script eq "read_blast_tab" ) + { + @options = qw( + data_in|i=s + num|n=s + ); + } + elsif ( $script eq "read_embl" ) + { + @options = qw( + data_in|i=s + num|n=s + keys|k=s + feats|f=s + quals|q=s + ); + } + elsif ( $script eq "read_stockholm" ) + { + @options = qw( + data_in|i=s + num|n=s + ); + } + elsif ( $script eq "read_phastcons" ) + { + @options = qw( + data_in|i=s + num|n=s + min|m=s + dist|d=s + threshold|t=f + gap|g=s + ); + } + elsif ( $script eq "read_soft" ) + { + @options = qw( + data_in|i=s + samples|s=s + num|n=s + ); + } + elsif ( $script eq "read_gff" ) + { + @options = qw( + data_in|i=s + num|n=s + ); + } + elsif ( $script eq "read_2bit" ) + { + @options = qw( + data_in|i=s + num|n=s + no_mask|N + ); + } + elsif ( $script eq "read_solexa" ) + { + @options = qw( + data_in|i=s + num|n=s + format|f=s + quality|q=s + ); + } + elsif ( $script eq "read_solid" ) + { + @options = qw( + data_in|i=s + num|n=s + quality|q=s + ); + } + elsif ( $script eq "read_mysql" ) + { + @options = qw( + database|d=s + query|q=s + user|u=s + password|p=s + ); + } + elsif ( $script eq "read_ucsc_config" ) + { + @options = qw( + data_in|i=s + num|n=s + ); + } + elsif ( $script eq "assemble_tag_contigs" ) + { + @options = qw( + check|C + ); + } + elsif ( $script eq "calc_fixedstep" ) + { + @options = qw( + check|C + ); + } + elsif ( $script eq "format_genome" ) + { + @options = qw( + no_stream|x + dir|d=s + genome|g=s + formats|f=s + ); + } + elsif ( $script eq "length_seq" ) + { + @options = qw( + no_stream|x + data_out|o=s + ); + } + elsif ( $script eq "oligo_freq" ) + { + @options = qw( + word_size|w=s + all|a + ); + } + elsif ( $script eq "create_weight_matrix" ) + { + @options = qw( + percent|p + ); + } + elsif ( $script eq "transliterate_seq" ) + { + @options = qw( + search|s=s + replace|r=s + delete|d=s + ); + } + elsif ( $script eq "transliterate_vals" ) + { + @options = qw( + keys|k=s + search|s=s + replace|r=s + delete|d=s + ); + } + elsif ( $script eq "translate_seq" ) + { + @options = qw( + frames|f=s + ); + } + elsif ( $script eq "extract_seq" ) + { + @options = qw( + beg|b=s + end|e=s + len|l=s + ); + } + elsif ( $script eq "get_genome_seq" ) + { + @options = qw( + genome|g=s + chr|c=s + beg|b=s + end|e=s + len|l=s + flank|f=s + mask|m + splice|s + ); + } + elsif ( $script eq "get_genome_align" ) + { + @options = qw( + genome|g=s + chr|c=s + beg|b=s + end|e=s + len|l=s + strand|s=s + ); + } + elsif ( $script eq "get_genome_phastcons" ) + { + @options = qw( + genome|g=s + chr|c=s + beg|b=s + end|e=s + len|l=s + flank|f=s + ); + } + elsif ( $script eq "split_seq" ) + { + @options = qw( + word_size|w=s + uniq|u + ); + } + elsif ( $script eq "split_bed" ) + { + @options = qw( + window_size|w=s + step_size|s=s + ); + } + elsif ( $script eq "tile_seq" ) + { + @options = qw( + identity|i=s + supress_indels|s + ); + } + elsif ( $script eq "invert_align" ) + { + @options = qw( + soft|s + ); + } + elsif ( $script eq "patscan_seq" ) + { + @options = qw( + patterns|p=s + patterns_in|P=s + comp|c + max_hits|h=s + max_misses|m=s + genome|g=s + ); + } + elsif ( $script eq "create_blast_db" ) + { + @options = qw( + no_stream|x + database|d=s + ); + } + elsif ( $script eq "blast_seq" ) + { + @options = qw( + database|d=s + genome|g=s + program|p=s + e_val|e=f + filter|f + cpus|c=s + no_filter|F + ); + } + elsif ( $script eq "blat_seq" ) + { + @options = qw( + genome|g=s + tile_size|t=s + step_size|s=s + min_identity|m=s + min_score|M=s + one_off|o=s + ooc|c + ); + } + elsif ( $script eq "soap_seq" ) + { + @options = qw( + in_file|i=s + genome|g=s + seed_size|s=s + mismatches|m=s + gap_size|G=s + cpus|c=s + ); + } + elsif ( $script eq "match_seq" ) + { + @options = qw( + word_size|w=s + direction|d=s + ); + } + elsif ( $script eq "create_vmatch_index" ) + { + @options = qw( + index_name|i=s + prefix_length|p=s + no_stream|x + ); + } + elsif ( $script eq "vmatch_seq" ) + { + @options = qw( + genome|g=s + index_name|i=s + count|c + max_hits|m=s + hamming_dist|h=s + edit_dist|e=s + ); + } + elsif ( $script eq "write_fasta" ) + { + @options = qw( + wrap|w=s + no_stream|x + data_out|o=s + compress|Z + ); + } + elsif ( $script eq "write_align" ) + { + @options = qw( + wrap|w=s + no_stream|x + no_ruler|R + no_consensus|C + data_out|o=s + ); + } + elsif ( $script eq "write_blast" ) + { + @options = qw( + no_stream|x + data_out|o=s + comment|c + compress|Z + ); + } + elsif ( $script eq "write_tab" ) + { + @options = qw( + no_stream|x + data_out|o=s + delimit|d=s + keys|k=s + no_keys|K=s + comment|c + compress|Z + ); + } + elsif ( $script eq "write_bed" ) + { + @options = qw( + cols|c=s + check|C + no_stream|x + data_out|o=s + compress|Z + ); + } + elsif ( $script eq "write_psl" ) + { + @options = qw( + no_stream|x + data_out|o=s + compress|Z + ); + } + elsif ( $script eq "write_fixedstep" ) + { + @options = qw( + no_stream|x + data_out|o=s + compress|Z + ); + } + elsif ( $script eq "write_2bit" ) + { + @options = qw( + no_stream|x + data_out|o=s + no_mask|N + ); + } + elsif ( $script eq "write_solid" ) + { + @options = qw( + wrap|w=s + no_stream|x + data_out|o=s + compress|Z + ); + } + elsif ( $script eq "write_ucsc_config" ) + { + @options = qw( + no_stream|x + data_out|o=s + ); + } + elsif ( $script eq "plot_seqlogo" ) + { + @options = qw( + no_stream|x + data_out|o=s + ); + } + elsif ( $script eq "plot_phastcons_profiles" ) + { + @options = qw( + no_stream|x + data_out|o=s + genome|g=s + mean|m + median|M + flank|f=s + terminal|t=s + title|T=s + xlabel|X=s + ylabel|Y=s + ); + } + elsif ( $script eq "analyze_vals" ) + { + @options = qw( + no_stream|x + keys|k=s + ); + } + elsif ( $script eq "head_records" ) + { + @options = qw( + num|n=s + ); + } + elsif ( $script eq "remove_keys" ) + { + @options = qw( + keys|k=s + save_keys|K=s + ); + } + elsif ( $script eq "remove_adaptor" ) + { + @options = qw( + adaptor|a=s + mismatches|m=s + remove|r=s + offset|o=s + ); + } + elsif ( $script eq "remove_mysql_tables" ) + { + @options = qw( + database|d=s + tables|t=s + keys|k=s + user|u=s + password|p=s + no_stream|x + ); + } + elsif ( $script eq "remove_ucsc_tracks" ) + { + @options = qw( + database|d=s + tracks|t=s + keys|k=s + config_file|c=s + user|u=s + password|p=s + no_stream|x + ); + } + elsif ( $script eq "rename_keys" ) + { + @options = qw( + keys|k=s + ); + } + elsif ( $script eq "uniq_vals" ) + { + @options = qw( + key|k=s + invert|i + ); + } + elsif ( $script eq "merge_vals" ) + { + @options = qw( + keys|k=s + delimit|d=s + ); + } + elsif ( $script eq "merge_records" ) + { + @options = qw( + keys|k=s + merge|m=s + ); + } + elsif ( $script eq "grab" ) + { + @options = qw( + patterns|p=s + patterns_in|P=s + regex|r=s + eval|e=s + exact_in|E=s + invert|i + case_insensitive|c + keys|k=s + keys_only|K + vals_only|V + ); + } + elsif ( $script eq "compute" ) + { + @options = qw( + eval|e=s + ); + } + elsif ( $script eq "add_ident" ) + { + @options = qw( + prefix|p=s + key|k=s + ); + } + elsif ( $script eq "count_records" ) + { + @options = qw( + no_stream|x + data_out|o=s + ); + } + elsif ( $script eq "random_records" ) + { + @options = qw( + num|n=s + ); + } + elsif ( $script eq "sort_records" ) + { + @options = qw( + reverse|r + keys|k=s + ); + } + elsif ( $script eq "count_vals" ) + { + @options = qw( + keys|k=s + ); + } + elsif ( $script eq "plot_histogram" ) + { + @options = qw( + no_stream|x + data_out|o=s + terminal|t=s + title|T=s + xlabel|X=s + ylabel|Y=s + key|k=s + sort|s=s + ); + } + elsif ( $script eq "plot_lendist" ) + { + @options = qw( + no_stream|x + data_out|o=s + terminal|t=s + title|T=s + xlabel|X=s + ylabel|Y=s + key|k=s + ); + } + elsif ( $script eq "plot_chrdist" ) + { + @options = qw( + no_stream|x + data_out|o=s + terminal|t=s + title|T=s + xlabel|X=s + ylabel|Y=s + ); + } + elsif ( $script eq "plot_karyogram" ) + { + @options = qw( + no_stream|x + data_out|o=s + genome|g=s + feat_color|f=s + ); + } + elsif ( $script eq "plot_matches" ) + { + @options = qw( + no_stream|x + data_out|o=s + terminal|t=s + title|T=s + xlabel|X=s + ylabel|Y=s + direction|d=s + ); + } + elsif ( $script eq "length_vals" ) + { + @options = qw( + keys|k=s + ); + } + elsif ( $script eq "sum_vals" ) + { + @options = qw( + no_stream|x + data_out|o=s + keys|k=s + ); + } + elsif ( $script eq "mean_vals" ) + { + @options = qw( + no_stream|x + data_out|o=s + keys|k=s + ); + } + elsif ( $script eq "median_vals" ) + { + @options = qw( + no_stream|x + data_out|o=s + keys|k=s + ); + } + elsif ( $script eq "max_vals" ) + { + @options = qw( + no_stream|x + data_out|o=s + keys|k=s + ); + } + elsif ( $script eq "min_vals" ) + { + @options = qw( + no_stream|x + data_out|o=s + keys|k=s + ); + } + elsif ( $script eq "upload_to_ucsc" ) + { + @options = qw( + no_stream|x + database|d=s + table|t=s + short_label|s=s + long_label|l=s + group|g=s + priority|p=f + use_score|u + visibility|V=s + color|c=s + check|C + ); + } + + push @options, qw( + stream_in|I=s + stream_out|O=s + verbose|v + help|? + ); + +# print STDERR Dumper( \@options ); + + GetOptions( + \%options, + @options, + ); + +# print STDERR Dumper( \%options ); + + if ( -t STDIN && scalar( keys %options ) == 0 or $options{ "help" } ) { + return wantarray ? %options : \%options; + } + + $options{ "cols" } = [ split ",", $options{ "cols" } ] if defined $options{ "cols" }; + $options{ "keys" } = [ split ",", $options{ "keys" } ] if defined $options{ "keys" }; + $options{ "no_keys" } = [ split ",", $options{ "no_keys" } ] if defined $options{ "no_keys" }; + $options{ "save_keys" } = [ split ",", $options{ "save_keys" } ] if defined $options{ "save_keys" }; + $options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" }; + $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" }; + $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" }; + $options{ "formats" } = [ split ",", $options{ "formats" } ] if defined $options{ "formats" }; + $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" }; + $options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" }; + $options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" }; + + # ---- check arguments ---- + + if ( $options{ 'data_in' } ) + { + $options{ "files" } = Maasha::Biopieces::getopt_files( $options{ 'data_in' } ); + + Maasha::Common::error( qq(Argument to --data_in must be a valid file or fileglob expression) ) if scalar @{ $options{ "files" } } == 0; + } + + map { Maasha::Common::error( qq(Argument to --cols must be a whole numbers - not "$_") ) if $_ !~ /^\d+$/ } @{ $options{ "cols" } } if $options{ "cols" }; + + # print STDERR Dumper( \%options ); + + $real = "beg|end|word_size|wrap|tile_size|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size"; + + foreach $opt ( keys %options ) + { + if ( $opt =~ /stream_in|pattern_in|exact_in/ and not -f $options{ $opt } ) + { + Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") ); + } + elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ ) + { + Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") ); + } + elsif ( $opt =~ /max_hits|max_hits|max_misses|dist|edit_dist|flank|gap|hamming_dist|priority/ and $options{ $opt } !~ /^-?\d+$/ ) + { + Maasha::Common::error( qq(Argument to --$opt must be an integer - not "$options{ $opt }") ); + } + elsif ( $opt =~ /identity|threshold/ and $options{ $opt } !~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/ ) + { + Maasha::Common::error( qq(Argument to --$opt must be a decimal number - not "$options{ $opt }") ); + } + elsif ( $opt =~ /e_val/ and $options{ $opt } !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ ) + { + Maasha::Common::error( qq(Argument to --$opt must be a float - not "$options{ $opt }") ); + } + elsif ( $opt =~ /strand/ and $options{ $opt } !~ /^(\+|-)$/ ) + { + Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") ); + } + elsif ( $opt eq "genome" and $script ne "format_genome" ) + { + @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" ); + map { $_ =~ s/.*\/(.+)$/$1/ } @genomes; + + if ( not grep { $_ =~ /^$options{ $opt }$/ } @genomes ) { + Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") ); + } + } + elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb|x11)/ ) + { + Maasha::Common::error( qq(Bad --$opt argument "$options{ $opt }") ); + } + elsif ( $opt eq "table" and $options{ $opt } =~ /(-|\.)/ ) + { + Maasha::Common::error( qq(Character '$1' is not allowed in table name: $options{ $opt }) ); + } + elsif ( $opt eq "merge" and $options{ $opt } !~ /^(AandB|AorB|BorA|AnotB|BnotA)$/ ) + { + Maasha::Common::error( qq(Argument to --$opt must be AandB, AorB, BorA, AnotB, or BnotA - not "$options{ $opt }") ); + } + elsif ( $opt eq "format" and $script eq "read_solexa" and $options{ $opt } !~ /octal|decimal/ ) + { + Maasha::Common::error( qq(Argument to --$opt must be octal or decimal - not "$options{ $opt }") ); + } + elsif ( $opt eq "remove" and $script eq "remove_adaptor" and $options{ $opt } !~ /before|after|skip/ ) + { + Maasha::Common::error( qq(Argument to --$opt must be before, after, or skip - not "$options{ $opt }") ); + } + } + + Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" }; + Maasha::Common::error( qq(no --database specified) ) if $script eq "create_blast_db" and not $options{ "database" }; + Maasha::Common::error( qq(no --index_name specified) ) if $script =~ /create_vmatch_index/ and not $options{ "index_name" }; + Maasha::Common::error( qq(no --database or --genome specified) ) if $script eq "blast_seq" and not $options{ "genome" } and not $options{ "database" }; + Maasha::Common::error( qq(both --database and --genome specified) ) if $script eq "blast_seq" and $options{ "genome" } and $options{ "database" }; + Maasha::Common::error( qq(no --index_name or --genome specified) ) if $script eq "vmatch_seq" and not $options{ "genome" } and not $options{ "index_name" }; + Maasha::Common::error( qq(both --index and --genome specified) ) if $script eq "vmatch_seq" and $options{ "genome" } and $options{ "index_name" }; + Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" }; + Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" }; + Maasha::Common::error( qq(no --genome specified) ) if $script =~ /format_genome|get_genome_seq|get_genome_align|get_genome_phastcons|blat_seq|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" }; + Maasha::Common::error( qq(no --key specified) ) if $script =~ /plot_lendist|plot_histogram/ and not $options{ "key" }; + Maasha::Common::error( qq(no --keys speficied) ) if $script =~ /sort_records|count_vals|sum_vals|mean_vals|median_vals|length_vals/ and not $options{ "keys" }; + + if ( $script eq "upload_to_ucsc" ) + { + Maasha::Common::error( qq(no --database specified) ) if not $options{ "database" }; + Maasha::Common::error( qq(no --table specified) ) if not $options{ "table" }; + } + + return wantarray ? %options : \%options; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SCRIPTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub script_print_usage +{ + # Martin A. Hansen, January 2008. + + # Retrieves usage information from file and + # prints this nicely formatted. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $file, $wiki, $lines ); + + if ( $options->{ 'data_in' } ) { + $file = $options->{ 'data_in' }; + } else { + $file = join "", $ENV{ 'BP_DIR' }, "/bp_usage/", $options->{ 'SCRIPT' }, ".wiki"; + } + + $wiki = Maasha::Gwiki::gwiki_read( $file ); + + ( $wiki->[ 2 ], $wiki->[ 3 ], $wiki->[ 0 ], $wiki->[ 1 ] ) = ( $wiki->[ 0 ], $wiki->[ 1 ], $wiki->[ 2 ], $wiki->[ 3 ] ); + + if ( not $options->{ "help" } ) { + @{ $wiki } = grep { $_->[ 0 ]->{ 'SECTION' } =~ /Biopiece|summary|Usage|Options|Help/ } @{ $wiki }; + } + + $lines = Maasha::Gwiki::gwiki2ascii( $wiki ); + + print STDERR "$_\n" foreach @{ $lines }; + + exit; +} + + +sub script_list_biopieces +{ + # Martin A. Hansen, January 2008. + + # Prints the summary from the usage for each of the biopieces. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( @files, $file, $wiki, $program, $summary ); + + @files = Maasha::Common::ls_files( "$ENV{ 'BP_DIR' }/bp_usage" ); + + foreach $file ( sort @files ) + { + if ( $file =~ /\/([a-z0-9_]+)\.wiki$/ ) + { + $program = $1; + + $wiki = Maasha::Gwiki::gwiki_read( $file ); + + $summary = $wiki->[ 0 ]->[ 0 ]->{ 'TEXT' }; + $summary =~ s/^#summary\s+//; + + printf( "%-30s%s\n", $program, $summary ); + } + } + + exit; +} + + +sub script_list_genomes +{ + # Martin A. Hansen, January 2008. + + # Prints the synopsis from the usage for each of the biopieces. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( @genomes, $genome, @formats, $format, %hash, %found, @row ); + + @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" ); + + foreach $genome ( @genomes ) + { + next if $genome =~ /\.$/; + + @formats = Maasha::Common::ls_dirs( $genome ); + + foreach $format ( @formats ) + { + if ( $format =~ /\/([^\/]+)\/(\w+)$/ ) + { + $hash{ $1 }{ $2 } = 1; + + $found{ $2 } = 1; + } + } + } + + @row = "Genome"; + + map { push @row, $_ } sort keys %found; + + print join( "\t", @row ), "\n"; + + foreach $genome ( sort keys %hash ) + { + @row = $genome; + + foreach $format ( sort keys %found ) + { + if ( exists $hash{ $genome }{ $format } ) { + push @row, "yes"; + } else { + push @row, "no"; + } + } + + print join( "\t", @row ), "\n"; + } +} + + +sub script_read_fasta +{ + # Martin A. Hansen, August 2007. + + # Read sequences from FASTA file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $file, $data_in, $entry, $num ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = Maasha::Common::read_open( $file ); + + while ( $entry = Maasha::Fasta::get_entry( $data_in ) ) + { + if ( defined $entry->[ SEQ_NAME ] and $entry->[ SEQ ] ) + { + $record = { + SEQ_NAME => $entry->[ SEQ_NAME ], + SEQ => $entry->[ SEQ ], + SEQ_LEN => length $entry->[ SEQ ], + }; + + Maasha::Biopieces::put_record( $record, $out ); + } + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_tab +{ + # Martin A. Hansen, August 2007. + + # Read table or table columns from stream or file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $file, $line, @fields, @fields2, $i, $record, $data_in, $skip, $num ); + + $options->{ 'delimit' } ||= '\s+'; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $skip = $options->{ 'skip' } ||= 0; + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = Maasha::Common::read_open( $file ); + + while ( $line = <$data_in> ) + { + if ( $skip ) + { + $skip--; + next; + } + + next if $line =~ /^#|^$/; + + chomp $line; + + undef $record; + undef @fields2; + + @fields = split /$options->{'delimit'}/, $line; + + if ( $options->{ "cols" } ) { + map { push @fields2, $fields[ $_ ] } @{ $options->{ "cols" } }; + } else { + @fields2 = @fields; + } + + for ( $i = 0; $i < @fields2; $i++ ) + { + if ( $options->{ "keys" }->[ $i ] ) { + $record->{ $options->{ "keys" }->[ $i ] } = $fields2[ $i ]; + } else { + $record->{ "V" . $i } = $fields2[ $i ]; + } + } + + Maasha::Biopieces::put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_psl +{ + # Martin A. Hansen, August 2007. + + # Read psl table from stream or file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $file, $data_in, $num ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = Maasha::Common::read_open( $file ); + + while ( $record = Maasha::UCSC::psl_get_entry( $data_in ) ) + { + Maasha::Biopieces::put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + } + + NUM: +} + + +sub script_read_bed +{ + # Martin A. Hansen, August 2007. + + # Read bed table from stream or file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $cols, $file, $record, $bed_entry, $data_in, $num ); + + $cols = $options->{ 'cols' }->[ 0 ]; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = Maasha::Common::read_open( $file ); + + while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $data_in, $cols, $options->{ 'check' } ) ) + { + $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry ); + + Maasha::Biopieces::put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_fixedstep +{ + # Martin A. Hansen, Juli 2008. + + # Read fixedstep wiggle format from stream or file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $file, $record, $entry, $data_in, $num ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = Maasha::Common::read_open( $file ); + + while ( $entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $data_in ) ) + { + $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $entry ); + + Maasha::Biopieces::put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_blast_tab +{ + # Martin A. Hansen, September 2007. + + # Read tabular BLAST output from NCBI blast run with -m8 or -m9. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $file, $line, @fields, $strand, $record, $data_in, $num ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = Maasha::Common::read_open( $file ); + + while ( $line = <$data_in> ) + { + chomp $line; + + next if $line =~ /^#/; + + @fields = split /\t/, $line; + + $record->{ "REC_TYPE" } = "BLAST"; + $record->{ "Q_ID" } = $fields[ 0 ]; + $record->{ "S_ID" } = $fields[ 1 ]; + $record->{ "IDENT" } = $fields[ 2 ]; + $record->{ "ALIGN_LEN" } = $fields[ 3 ]; + $record->{ "MISMATCHES" } = $fields[ 4 ]; + $record->{ "GAPS" } = $fields[ 5 ]; + $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based + $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based + $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based + $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based + $record->{ "E_VAL" } = $fields[ 10 ]; + $record->{ "BIT_SCORE" } = $fields[ 11 ]; + + if ( $record->{ "S_BEG" } > $record->{ "S_END" } ) + { + $record->{ "STRAND" } = '-'; + + ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } ); + } + else + { + $record->{ "STRAND" } = '+'; + } + + Maasha::Biopieces::put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_embl +{ + # Martin A. Hansen, August 2007. + + # Read EMBL format. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( %options2, $file, $data_in, $num, $entry, $record ); + + map { $options2{ "keys" }{ $_ } = 1 } @{ $options->{ "keys" } }; + map { $options2{ "feats" }{ $_ } = 1 } @{ $options->{ "feats" } }; + map { $options2{ "quals" }{ $_ } = 1 } @{ $options->{ "quals" } }; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = Maasha::Common::read_open( $file ); + + while ( $entry = Maasha::EMBL::get_embl_entry( $data_in ) ) + { + $record = Maasha::EMBL::parse_embl_entry( $entry, \%options2 ); + + my ( $feat, $feat2, $qual, $qual_val, $record_copy ); + + $record_copy = dclone $record; + + delete $record_copy->{ "FT" }; + + Maasha::Biopieces::put_record( $record_copy, $out ); + + delete $record_copy->{ "SEQ" }; + + foreach $feat ( keys %{ $record->{ "FT" } } ) + { + $record_copy->{ "FEAT_TYPE" } = $feat; + + foreach $feat2 ( @{ $record->{ "FT" }->{ $feat } } ) + { + foreach $qual ( keys %{ $feat2 } ) + { + $qual_val = join "; ", @{ $feat2->{ $qual } }; + + $qual =~ s/^_//; + $qual = uc $qual; + + $record_copy->{ $qual } = $qual_val; + } + + Maasha::Biopieces::put_record( $record_copy, $out ); + } + } + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_stockholm +{ + # Martin A. Hansen, August 2007. + + # Read Stockholm format. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $data_in, $file, $num, $entry, $record, $record_anno, $record_align, $key, $seq ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = Maasha::Common::read_open( $file ); + + while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) ) + { + $record = Maasha::Stockholm::parse_stockholm_entry( $entry ); + + undef $record_anno; + + foreach $key ( keys %{ $record->{ "GF" } } ) { + $record_anno->{ $key } = $record->{ "GF" }->{ $key }; + } + + $record_anno->{ "ALIGN" } = $num; + + Maasha::Biopieces::put_record( $record_anno, $out ); + + foreach $seq ( @{ $record->{ "ALIGN" } } ) + { + undef $record_align; + + $record_align = { + SEQ_NAME => $seq->[ 0 ], + SEQ => $seq->[ 1 ], + }; + + Maasha::Biopieces::put_record( $record_align, $out ); + } + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_phastcons +{ + # Martin A. Hansen, December 2007. + + # Read PhastCons format. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $data_in, $file, $num, $entry, @records, $record ); + + $options->{ "min" } ||= 10; + $options->{ "dist" } ||= 25; + $options->{ "threshold" } ||= 0.8; + $options->{ "gap" } ||= 5; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = Maasha::Common::read_open( $file ); + + while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) ) + { + @records = Maasha::UCSC::phastcons_parse_entry( $entry, $options ); + + foreach $record ( @records ) + { + $record->{ "REC_TYPE" } = "BED"; + $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1; + + Maasha::Biopieces::put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_soft +{ + # Martin A. Hansen, December 2007. + + # Read soft format. + # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $data_in, $file, $num, $records, $record, $soft_index, $fh, @platforms, $plat_table, @samples, $sample, $old_end, $skip ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + print STDERR "Creating index for file: $file\n" if $options->{ "verbose" }; + + $soft_index = Maasha::NCBI::soft_index_file( $file ); + + $fh = Maasha::Common::read_open( $file ); + + @platforms = grep { $_->{ "SECTION" } =~ /PLATFORM/ } @{ $soft_index }; + + print STDERR "Getting platform tables for file: $file\n" if $options->{ "verbose" }; + + $plat_table = Maasha::NCBI::soft_get_platform( $fh, $platforms[ 0 ]->{ "LINE_BEG" }, $platforms[ -1 ]->{ "LINE_END" } ); + + @samples = grep { $_->{ "SECTION" } =~ /SAMPLE/ } @{ $soft_index }; + + $old_end = $platforms[ -1 ]->{ "LINE_END" }; + + foreach $sample ( @samples ) + { + $skip = 0; + $skip = 1 if ( $options->{ "samples" } and grep { $sample->{ "SECTION" } !~ /$_/ } @{ $options->{ "samples" } } ); + + print STDERR "Getting samples for dataset: $sample->{ 'SECTION' }\n" if $options->{ "verbose" } and not $skip; + + $records = Maasha::NCBI::soft_get_sample( $fh, $plat_table, $sample->{ "LINE_BEG" } - $old_end - 1, $sample->{ "LINE_END" } - $old_end - 1, $skip ); + + foreach $record ( @{ $records } ) + { + Maasha::Biopieces::put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + $old_end = $sample->{ "LINE_END" }; + } + + close $fh; + } + + NUM: + + close $data_in if $data_in; + close $fh if $fh; +} + + +sub script_read_gff +{ + # Martin A. Hansen, February 2008. + + # Read soft format. + # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $data_in, $file, $fh, $num, $record, $entry ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $fh = Maasha::Common::read_open( $file ); + + while ( $entry = Maasha::GFF::get_entry( $fh ) ) + { + Maasha::Biopieces::put_record( $entry, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $fh; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_2bit +{ + # Martin A. Hansen, March 2008. + + # Read sequences from 2bit file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $file, $data_in, $mask, $toc, $line, $num ); + + $mask = 1 if not $options->{ "no_mask" }; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = Maasha::Common::read_open( $file ); + + $toc = Maasha::TwoBit::twobit_get_TOC( $data_in ); + + foreach $line ( @{ $toc } ) + { + $record->{ "SEQ_NAME" } = $line->[ 0 ]; + $record->{ "SEQ" } = Maasha::TwoBit::twobit_get_seq( $data_in, $line->[ 1 ], undef, undef, $mask ); + $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; + + Maasha::Biopieces::put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_solexa +{ + # Martin A. Hansen, March 2008. + + # Read Solexa sequence reads from file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $file, $data_in, $entry, $num, @seqs, @scores, $i ); + + $options->{ "format" } ||= "octal"; + $options->{ "quality" } ||= 20; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = Maasha::Common::read_open( $file ); + + if ( $options->{ "format" } eq "octal" ) + { + while ( $entry = Maasha::Solexa::solexa_get_entry_octal( $data_in ) ) + { + $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } ); + + Maasha::Biopieces::put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + } + else + { + while ( $entry = Maasha::Solexa::solexa_get_entry_decimal( $data_in ) ) + { + $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } ); + + Maasha::Biopieces::put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_solid +{ + # Martin A. Hansen, April 2008. + + # Read Solid sequence from file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $file, $data_in, $line, $num, $seq_name, $seq_cs, $seq_qual, @scores, @seqs, $i ); + + $options->{ "quality" } ||= 15; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = Maasha::Common::read_open( $file ); + + while ( $line = <$data_in> ) + { + chomp $line; + + ( $seq_name, $seq_cs, $seq_qual ) = split /\t/, $line; + + @scores = split /,/, $seq_qual; + @seqs = split //, Maasha::Solid::color_space2seq( $seq_cs ); + + for ( $i = 0; $i < @seqs; $i++ ) { + $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $options->{ "quality" }; + } + + $record = { + REC_TYPE => 'SOLID', + SEQ_NAME => $seq_name, + SEQ_CS => $seq_cs, + SEQ_QUAL => join( ";", @scores ), + SEQ_LEN => length $seq_cs, + SEQ => join( "", @seqs ), + SCORE_MEAN => sprintf( "%.2f", Maasha::Calc::mean( \@scores ) ), + }; + + Maasha::Biopieces::put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_read_mysql +{ + # Martin A. Hansen, May 2008. + + # Read a MySQL query into stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $dbh, $results ); + + $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user(); + $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password(); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } ); + + $results = Maasha::SQL::query_hashref_list( $dbh, $options->{ "query" } ); + + Maasha::SQL::disconnect( $dbh ); + + map { Maasha::Biopieces::put_record( $_ ) } @{ $results }; +} + + +sub script_read_ucsc_config +{ + # Martin A. Hansen, November 2008. + + # Read track entries from UCSC Genome Browser '.ra' files. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $file, $data_in, $entry, $num ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + $num = 1; + + foreach $file ( @{ $options->{ "files" } } ) + { + $data_in = Maasha::Common::read_open( $file ); + + while ( $record = Maasha::UCSC::ucsc_config_entry_get( $data_in ) ) + { + $record->{ 'REC_TYPE' } = "UCSC Config"; + + Maasha::Biopieces::put_record( $record, $out ); + + goto NUM if $options->{ "num" } and $num == $options->{ "num" }; + + $num++; + } + + close $data_in; + } + + NUM: + + close $data_in if $data_in; +} + + +sub script_assemble_tag_contigs +{ + # Martin A. Hansen, November 2008. + + # Assemble tags from the stream into + # tag contigs. + + # The current implementation is quite + # slow because of heavy use of temporary + # files. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $bed_file, $tag_file, $fh_in, $fh_out, $cols, $record, $bed_entry, $file_hash, $chr, $strand ); + + $bed_file = "$BP_TMP/assemble_tag_contigs.bed"; + $fh_out = Maasha::Filesys::file_write_open( $bed_file ); + $cols = 6; # we only need the first 6 BED columns + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) + { + $strand = $record->{ 'STRAND' } || '+'; + + Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh_out, $cols, $options->{ 'check' } ); + } + } + + close $fh_out; + + $file_hash = Maasha::UCSC::BED::bed_file_split_on_chr( $bed_file, $BP_TMP, $cols ); + + unlink $bed_file; + + foreach $chr ( sort keys %{ $file_hash } ) + { + $bed_file = $file_hash->{ $chr }; + $tag_file = "$bed_file.tc"; + + Maasha::Common::run( "bed2tag_contigs", "< $bed_file > $tag_file" ); + + $fh_in = Maasha::Filesys::file_read_open( $tag_file ); + + while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $fh_in, $cols, $options->{ 'check' } ) ) + { + if ( $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + } + + close $fh_in; + + unlink $bed_file; + unlink $tag_file; + } +} + + +sub script_format_genome +{ + # Martin A. Hansen, Juli 2008. + + # Format a genome to speficed formats. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $dir, $genome, $fasta_dir, $phastcons_dir, $vals, $fh_out, $record, $format, $index, $entry ); + + $dir = $options->{ 'dir' } || $ENV{ 'BP_DATA' }; + $genome = $options->{ 'genome' }; + + Maasha::Common::error( "Directory: $dir does not exist" ) if not -d $dir; + Maasha::Common::dir_create_if_not_exists( "$dir/genomes" ); + Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome" ); + + if ( grep { $_ =~ /fasta|blast|vmatch/i } @{ $options->{ "formats" } } ) + { + if ( -f "$dir/genomes/$genome/fasta/$genome.fna" ) + { + $fasta_dir = "$dir/genomes/$genome/fasta"; + } + else + { + Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome/fasta" ); + + $fasta_dir = "$dir/genomes/$genome/fasta"; + + $fh_out = Maasha::Common::write_open( "$fasta_dir/$genome.fna" ); + } + } + elsif ( grep { $_ =~ /phastcons/i } @{ $options->{ "formats" } } ) + { + Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome/phastcons" ); + + $phastcons_dir = "$dir/genomes/$genome/phastcons"; + + $fh_out = Maasha::Common::write_open( "$phastcons_dir/$genome.pp" ); + } + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $fh_out and $entry = Maasha::Fasta::biopiece2fasta( $record ) ) + { + Maasha::Fasta::put_entry( $entry, $fh_out, $options->{ "wrap" } ); + } + elsif ( $fh_out and $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } ) + { + print $fh_out "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n"; + + $vals = $record->{ 'VALS' }; + + $vals =~ tr/,/\n/; + + print $fh_out "$vals\n"; + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + foreach $format ( @{ $options->{ 'formats' } } ) + { + if ( $format =~ /^fasta$/i ) { Maasha::Fasta::fasta_index( "$fasta_dir/$genome.fna", "$dir/genomes/$genome/fasta/$genome.index" ) } + elsif ( $format =~ /^blast$/i ) { Maasha::NCBI::blast_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/blast", "dna", $genome ) } + elsif ( $format =~ /^blat$/i ) { print STDERR "BLAT FORMAT NOT IMPLEMENTED" } + elsif ( $format =~ /^vmatch$/i ) { Maasha::Match::vmatch_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/vmatch", $BP_TMP ) } + elsif ( $format =~ /^phastcons$/i ) { Maasha::UCSC::phastcons_index( "$genome.pp", $phastcons_dir ) } + } + + close $fh_out if $fh_out; +} + + +sub script_length_seq +{ + # Martin A. Hansen, August 2007. + + # Determine the length of sequences in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $total ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ" } ) + { + $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; + $total += $record->{ "SEQ_LEN" }; + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + Maasha::Biopieces::put_record( { TOTAL_SEQ_LEN => $total }, $out ); +} + + +sub script_uppercase_seq +{ + # Martin A. Hansen, August 2007. + + # Uppercases sequences in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + ) = @_; + + # Returns nothing. + + my ( $record ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + $record->{ "SEQ" } = uc $record->{ "SEQ" } if $record->{ "SEQ" }; + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_shuffle_seq +{ + # Martin A. Hansen, December 2007. + + # Shuffle sequences in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + ) = @_; + + # Returns nothing. + + my ( $record ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + $record->{ "SEQ" } = Maasha::Seq::seq_shuffle( $record->{ "SEQ" } ) if $record->{ "SEQ" }; + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_analyze_seq +{ + # Martin A. Hansen, August 2007. + + # Analyze sequence composition of sequences in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + ) = @_; + + # Returns nothing. + + my ( $record, $analysis ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ" } ) + { + $analysis = Maasha::Seq::seq_analyze( $record->{ "SEQ" } ); + + map { $record->{ $_ } = $analysis->{ $_ } } keys %{ $analysis }; + } + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_analyze_tags +{ + # Martin A. Hansen, August 2008. + + # Analyze sequence tags in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + ) = @_; + + # Returns nothing. + + my ( $record, $analysis, %len_hash, %clone_hash, $clones, $key, $tag_record ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + { + if ( $record->{ "SEQ_NAME" } =~ /_(\d+)$/ ) + { + $clones = $1; + + $len_hash{ length( $record->{ "SEQ" } ) }++; + $clone_hash{ length( $record->{ "SEQ" } ) } += $clones; + } + } + elsif ( $record->{ "Q_ID" } and $record->{ "BED_LEN" } ) + { + if ( $record->{ "Q_ID" } =~ /_(\d+)$/ ) + { + $clones = $1; + + $len_hash{ $record->{ "BED_LEN" } }++; + $clone_hash{ $record->{ "BED_LEN" } } += $clones; + } + } + } + + foreach $key ( sort { $a <=> $b } keys %len_hash ) + { + $tag_record->{ "TAG_LEN" } = $key; + $tag_record->{ "TAG_COUNT" } = $len_hash{ $key }; + $tag_record->{ "TAG_CLONES" } = $clone_hash{ $key }; + + Maasha::Biopieces::put_record( $tag_record, $out ); + } +} + + +sub script_complexity_seq +{ + # Martin A. Hansen, May 2008. + + # Generates an index calculated as the most common di-residue over + # the sequence length for all sequences in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + ) = @_; + + # Returns nothing. + + my ( $record, $index ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + $record->{ "SEQ_COMPLEXITY" } = sprintf( "%.2f", Maasha::Seq::seq_complexity( $record->{ "SEQ" } ) ) if $record->{ "SEQ" }; + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_oligo_freq +{ + # Martin A. Hansen, August 2007. + + # Determine the length of sequences in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, %oligos, @freq_table ); + + $options->{ "word_size" } ||= 7; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ" } ) + { + map { $oligos{ $_ }++ } Maasha::Seq::seq2oligos( \$record->{ "SEQ" }, $options->{ "word_size" } ); + + if ( not $options->{ "all" } ) + { + @freq_table = Maasha::Seq::oligo_freq( \%oligos ); + + map { Maasha::Biopieces::put_record( $_, $out ) } @freq_table; + + undef %oligos; + } + } + + Maasha::Biopieces::put_record( $record, $out ); + } + + if ( $options->{ "all" } ) + { + @freq_table = Maasha::Seq::oligo_freq( \%oligos ); + + map { Maasha::Biopieces::put_record( $_, $out ) } @freq_table; + } +} + + +sub script_create_weight_matrix +{ + # Martin A. Hansen, August 2007. + + # Creates a weight matrix from an alignmnet. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $count, $i, $res, %freq_hash, %res_hash, $freq ); + + $count = 0; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ" } ) + { + for ( $i = 0; $i < length $record->{ "SEQ" }; $i++ ) + { + $res = substr $record->{ "SEQ" }, $i, 1; + + $freq_hash{ $i }{ $res }++; + $res_hash{ $res } = 1; + } + + $count++; + } + else + { + Maasha::Biopieces::put_record( $record, $out ); + } + } + + foreach $res ( sort keys %res_hash ) + { + undef $record; + + $record->{ "V0" } = $res; + + for ( $i = 0; $i < keys %freq_hash; $i++ ) + { + $freq = $freq_hash{ $i }{ $res } || 0; + + if ( $options->{ "percent" } ) { + $freq = sprintf( "%.0f", 100 * $freq / $count ) if $freq > 0; + } + + $record->{ "V" . ( $i + 1 ) } = $freq; + } + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_calc_bit_scores +{ + # Martin A. Hansen, March 2007. + + # Calculates the bit scores for each position from an alignmnet in the stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + ) = @_; + + # Returns nothing. + + my ( $record, $type, $count, $i, $res, %freq_hash, $bit_max, $bit_height, $bit_diff ); + + $count = 0; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ" } ) + { + $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type; + + for ( $i = 0; $i < length $record->{ "SEQ" }; $i++ ) + { + $res = substr $record->{ "SEQ" }, $i, 1; + + next if $res =~ /-|_|~|\./; + + $freq_hash{ $i }{ $res }++; + } + + $count++; + } + else + { + Maasha::Biopieces::put_record( $record, $out ); + } + } + + undef $record; + + if ( $type eq "protein" ) { + $bit_max = 4; + } else { + $bit_max = 2; + } + + for ( $i = 0; $i < keys %freq_hash; $i++ ) + { + $bit_height = Maasha::Seq::seqlogo_calc_bit_height( $freq_hash{ $i }, $count ); + + $bit_diff = $bit_max - $bit_height; + + $record->{ "V" . ( $i ) } = sprintf( "%.2f", $bit_diff ); + } + + Maasha::Biopieces::put_record( $record, $out ); +} + + +sub script_calc_fixedstep +{ + # Martin A. Hansen, September 2008. + + # Calculates fixedstep entries from data in the stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $bed_file, $fh_in, $fh_out, $record, $file_hash, $chr, $bed_entry, $fixedstep_file, $fixedstep_entry ); + + $bed_file = "$BP_TMP/calc_fixedstep.bed"; + $fh_out = Maasha::Filesys::file_write_open( $bed_file ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record ) ) { + Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh_out, undef, $options->{ 'check' } ); + } + } + + close $fh_out; + + $file_hash = Maasha::UCSC::BED::bed_file_split_on_chr( $bed_file, $BP_TMP ); + + unlink $bed_file; + + foreach $chr ( sort keys %{ $file_hash } ) + { + $bed_file = $file_hash->{ $chr }; + + $fixedstep_file = Maasha::UCSC::Wiggle::fixedstep_calc( $bed_file, $chr, $options->{ 'use_score' }, $options->{ 'use_log10' } ); + + #$fixedstep_file = "$bed_file.fixedstep"; + + # Maasha::Common::run( "bed2fixedstep", "< $bed_file > $fixedstep_file" ); + + + $fh_in = Maasha::Filesys::file_read_open( $fixedstep_file ); + + while ( $fixedstep_entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $fh_in ) ) + { + if ( $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $fixedstep_entry ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + } + + close $fh_in; + + unlink $bed_file; + unlink $fixedstep_file; + } +} + + +sub script_reverse_seq +{ + # Martin A. Hansen, August 2007. + + # Reverse sequence in record. + + my ( $in, # handle to in stream + $out, # handle to out stream + ) = @_; + + # Returns nothing. + + my ( $record ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ" } ) { + $record->{ "SEQ" } = reverse $record->{ "SEQ" }; + } + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_complement_seq +{ + # Martin A. Hansen, August 2007. + + # Complement sequence in record. + + my ( $in, # handle to in stream + $out, # handle to out stream + ) = @_; + + # Returns nothing. + + my ( $record, $type ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ" } ) + { + if ( not $type ) { + $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ); + } + + if ( $type eq "rna" ) { + Maasha::Seq::rna_comp( \$record->{ "SEQ" } ); + } elsif ( $type eq "dna" ) { + Maasha::Seq::dna_comp( \$record->{ "SEQ" } ); + } + } + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_remove_indels +{ + # Martin A. Hansen, August 2007. + + # Remove indels from sequences in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + ) = @_; + + # Returns nothing. + + my ( $record ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + $record->{ 'SEQ' } =~ tr/-~.//d if $record->{ "SEQ" }; + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_transliterate_seq +{ + # Martin A. Hansen, August 2007. + + # Transliterate chars from sequence in record. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $search, $replace, $delete ); + + $search = $options->{ "search" } || ""; + $replace = $options->{ "replace" } || ""; + $delete = $options->{ "delete" } || ""; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ" } ) + { + if ( $search and $replace ) { + eval "\$record->{ 'SEQ' } =~ tr/$search/$replace/"; + } elsif ( $delete ) { + eval "\$record->{ 'SEQ' } =~ tr/$delete//d"; + } + } + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_transliterate_vals +{ + # Martin A. Hansen, April 2008. + + # Transliterate chars from values in record. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $search, $replace, $delete, $key ); + + $search = $options->{ "search" } || ""; + $replace = $options->{ "replace" } || ""; + $delete = $options->{ "delete" } || ""; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + foreach $key ( @{ $options->{ "keys" } } ) + { + if ( exists $record->{ $key } ) + { + if ( $search and $replace ) { + eval "\$record->{ $key } =~ tr/$search/$replace/"; + } elsif ( $delete ) { + eval "\$record->{ $key } =~ tr/$delete//d"; + } + } + } + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_translate_seq +{ + # Martin A. Hansen, February 2008. + + # Translate DNA sequence into protein sequence. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $frame, %new_record ); + + $options->{ "frames" } ||= [ 1, 2, 3, -1, -2, -3 ]; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ" } ) + { + if ( Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) eq "dna" ) + { + foreach $frame ( @{ $options->{ "frames" } } ) + { + %new_record = %{ $record }; + + $new_record{ "SEQ" } = Maasha::Seq::translate( $record->{ "SEQ" }, $frame ); + $new_record{ "SEQ_LEN" } = length $new_record{ "SEQ" }; + $new_record{ "FRAME" } = $frame; + + Maasha::Biopieces::put_record( \%new_record, $out ); + } + } + } + else + { + Maasha::Biopieces::put_record( $record, $out ); + } + } +} + + +sub script_extract_seq +{ + # Martin A. Hansen, August 2007. + + # Extract subsequences from sequences in record. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $beg, $end, $len, $record ); + + if ( not defined $options->{ "beg" } or $options->{ "beg" } < 0 ) { + $beg = 0; + } else { + $beg = $options->{ "beg" } - 1; # correcting for start offset + } + + if ( defined $options->{ "end" } and $options->{ "end" } - 1 < $beg ) { + $end = $beg - 1; + } elsif ( defined $options->{ "end" } ) { + $end = $options->{ "end" } - 1; # correcting for start offset + } + + $len = $options->{ "len" }; + +# print "beg->$beg, end->$end, len->$len\n"; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ" } ) + { + if ( defined $beg and defined $end ) + { + if ( $end - $beg + 1 > length $record->{ "SEQ" } ) { + $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg; + } else { + $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg, $end - $beg + 1; + } + } + elsif ( defined $beg and defined $len ) + { + if ( $len > length $record->{ "SEQ" } ) { + $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg; + } else { + $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg, $len; + } + } + elsif ( defined $beg ) + { + $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg; + } + } + + $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_get_genome_seq +{ + # Martin A. Hansen, December 2007. + + # Gets a subsequence from a genome. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $genome, $genome_file, $index_file, $index, $fh, $index_head, $index_beg, $index_len, $beg, $len, %lookup_hash, @begs, @lens, $i, $seq ); + + $options->{ "flank" } ||= 0; + + if ( $options->{ "genome" } ) + { + $genome = $options->{ "genome" }; + + $genome_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.fna"; + $index_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.index"; + + $fh = Maasha::Common::read_open( $genome_file ); + $index = Maasha::Fasta::index_retrieve( $index_file ); + + shift @{ $index }; # Get rid of the file size info + + map { $lookup_hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index }; + + if ( exists $lookup_hash{ $options->{ "chr" } } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) ) + { + ( $index_beg, $index_len ) = @{ $lookup_hash{ $options->{ "chr" } } }; + + $beg = $index_beg + $options->{ "beg" } - 1; + + if ( $options->{ "len" } ) { + $len = $options->{ "len" }; + } elsif ( $options->{ "end" } ) { + $len = ( $options->{ "end" } - $options->{ "beg" } + 1 ); + } + + $beg -= $options->{ "flank" }; + $len += 2 * $options->{ "flank" }; + + if ( $beg <= $index_beg ) + { + $len -= $index_beg - $beg; + $beg = $index_beg; + } + + $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len; + + next if $beg > $index_beg + $index_len; + + $record->{ "CHR" } = $options->{ "chr" }; + $record->{ "CHR_BEG" } = $beg - $index_beg; + $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1; + + $record->{ "SEQ" } = Maasha::Common::file_read( $fh, $beg, $len ); + $record->{ "SEQ_LEN" } = $len; + + Maasha::Biopieces::put_record( $record, $out ); + } + } + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $options->{ "genome" } and not $record->{ "SEQ" } ) + { + if ( $record->{ "REC_TYPE" } eq "BED" and exists $lookup_hash{ $record->{ "CHR" } } ) + { + ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "CHR" } } }; + + $beg = $record->{ "CHR_BEG" } + $index_beg; + $len = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1; + } + elsif ( $record->{ "REC_TYPE" } eq "PSL" and exists $lookup_hash{ $record->{ "S_ID" } } ) + { + ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } }; + + $beg = $record->{ "S_BEG" } + $index_beg; + $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1; + } + elsif ( $record->{ "REC_TYPE" } eq "BLAST" and exists $lookup_hash{ $record->{ "S_ID" } } ) + { + ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } }; + + $beg = $record->{ "S_BEG" } + $index_beg; + $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1; + } + + $beg -= $options->{ "flank" }; + $len += 2 * $options->{ "flank" }; + + if ( $beg <= $index_beg ) + { + $len -= $index_beg - $beg; + $beg = $index_beg; + } + + $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len; + + next if $beg > $index_beg + $index_len; + + $record->{ "CHR_BEG" } = $beg - $index_beg; + $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1; + + $record->{ "SEQ" } = Maasha::Common::file_read( $fh, $beg, $len ); + + if ( $record->{ "STRAND" } and $record->{ "STRAND" } eq "-" ) + { + Maasha::Seq::dna_comp( \$record->{ "SEQ" } ); + $record->{ "SEQ" } = reverse $record->{ "SEQ" }; + } + + if ( $options->{ "mask" } ) + { + if ( $record->{ "BLOCK_COUNT" } > 1 ) # uppercase hit block segments and lowercase the rest. + { + $record->{ "SEQ" } = lc $record->{ "SEQ" }; + + @begs = split ",", $record->{ "Q_BEGS" }; + @lens = split ",", $record->{ "BLOCK_LENS" }; + + for ( $i = 0; $i < @begs; $i++ ) { + substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ], uc substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ]; + } + } + } + elsif ( $options->{ "splice" } ) + { + if ( $record->{ "BLOCK_COUNT" } > 1 ) # splice block sequences + { + $seq = ""; + @begs = split ",", $record->{ "Q_BEGS" }; + @lens = split ",", $record->{ "BLOCK_LENS" }; + + for ( $i = 0; $i < @begs; $i++ ) { + $seq .= substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ]; + } + + $record->{ "SEQ" } = $seq; + } + } + + $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; + } + + Maasha::Biopieces::put_record( $record, $out ); + } + + close $fh if $fh; +} + + +sub script_get_genome_align +{ + # Martin A. Hansen, April 2008. + + # Gets a subalignment from a multiple genome alignment. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $maf_track, $align, $align_num, $beg, $end, $len, $entry ); + + $options->{ "strand" } ||= "+"; + + $align_num = 1; + + $maf_track = Maasha::Config::maf_track( $options->{ "genome" } ); + + if ( $options->{ "chr" } and $options->{ "beg" } and ( $options->{ "end" } or $options->{ "len" } ) ) + { + $beg = $options->{ "beg" } - 1; + + if ( $options->{ "end" } ) { + $end = $options->{ "end" }; + } elsif ( $options->{ "len" } ) { + $end = $beg + $options->{ "len" }; + } + + $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $options->{ "chr" }, $beg, $end, $options->{ "strand" } ); + + foreach $entry ( @{ $align } ) + { + $entry->{ "CHR" } = $record->{ "CHR" }; + $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" }; + $entry->{ "CHR_END" } = $record->{ "CHR_END" }; + $entry->{ "STRAND" } = $record->{ "STRAND" } || '+'; + $entry->{ "Q_ID" } = $record->{ "Q_ID" }; + $entry->{ "SCORE" } = $record->{ "SCORE" }; + + Maasha::Biopieces::put_record( $entry, $out ); + } + } + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "REC_TYPE" } eq "BED" ) + { + $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } ); + } + elsif ( $record->{ "REC_TYPE" } eq "VMATCH" ) + { + $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" } + 1, $record->{ "STRAND" } ); + } + elsif ( $record->{ "REC_TYPE" } eq "PSL" ) + { + $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } ); + } + elsif ( $record->{ "REC_TYPE" } eq "BLAST" ) + { + $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } ); + } + + foreach $entry ( @{ $align } ) + { + $entry->{ "CHR" } = $record->{ "CHR" }; + $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" }; + $entry->{ "CHR_END" } = $record->{ "CHR_END" }; + $entry->{ "STRAND" } = $record->{ "STRAND" }; + $entry->{ "Q_ID" } = $record->{ "Q_ID" }; + $entry->{ "SCORE" } = $record->{ "SCORE" }; + + Maasha::Biopieces::put_record( $entry, $out ); + } + + $align_num++; + } +} + + +sub script_get_genome_phastcons +{ + # Martin A. Hansen, February 2008. + + # Get phastcons scores from genome intervals. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $scores, $record ); + + $options->{ "flank" } ||= 0; + + $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } ); + $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } ); + + $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index ); + $fh_phastcons = Maasha::Common::read_open( $phastcons_file ); + + if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) ) + { + $options->{ "beg" } -= 1; # request is 1-based + $options->{ "end" } -= 1; # request is 1-based + + if ( $options->{ "len" } ) { + $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1; + } + + $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } ); + + $record->{ "CHR" } = $options->{ "chr" }; + $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" }; + $record->{ "CHR_END" } = $options->{ "end" } + $options->{ "flank" }; + + $record->{ "PHASTCONS" } = join ",", @{ $scores }; + $record->{ "PHAST_COUNT" } = scalar @{ $scores }; # DEBUG + + Maasha::Biopieces::put_record( $record, $out ); + } + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "REC_TYPE" } eq "BED" ) + { + $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } ); + } + elsif ( $record->{ "REC_TYPE" } eq "PSL" ) + { + $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); + } + elsif ( $record->{ "REC_TYPE" } eq "BLAST" ) + { + $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); + } + + $record->{ "PHASTCONS" } = join ",", @{ $scores } if @{ $scores }; +# $record->{ "PHAST_COUNT" } = @{ $scores } if @{ $scores }; # DEBUG + + Maasha::Biopieces::put_record( $record, $out ); + } + + close $fh_phastcons if $fh_phastcons; +} + + +sub script_fold_seq +{ + # Martin A. Hansen, December 2007. + + # Folds sequences in stream into secondary structures. + + my ( $in, # handle to in stream + $out, # handle to out stream + ) = @_; + + # Returns nothing. + + my ( $record, $type, $struct, $index ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ" } ) + { + if ( not $type ) { + $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ); + } + + if ( $type ne "protein" ) + { + ( $struct, $index ) = Maasha::Seq::fold_struct_rnafold( $record->{ "SEQ" } ); + $record->{ "SEC_STRUCT" } = $struct; + $record->{ "FREE_ENERGY" } = $index; + $record->{ "SCORE" } = abs int $index; + $record->{ "SIZE" } = length $struct; + $record->{ "CONF" } = "1," x $record->{ "SIZE" }; + } + } + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_split_seq +{ + # Martin A. Hansen, August 2007. + + # Split a sequence in stream into words. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $new_record, $i, $subseq, %lookup ); + + $options->{ "word_size" } ||= 7; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + { + for ( $i = 0; $i < length( $record->{ "SEQ" } ) - $options->{ "word_size" } + 1; $i++ ) + { + $subseq = substr $record->{ "SEQ" }, $i, $options->{ "word_size" }; + + if ( $options->{ "uniq" } and not $lookup{ $subseq } ) + { + $new_record->{ "SEQ_NAME" } = $record->{ "SEQ_NAME" } . "[" . ( $i + 1 ) . "-" . ( $i + $options->{ "word_size" } ) . "]"; + $new_record->{ "SEQ" } = $subseq; + + Maasha::Biopieces::put_record( $new_record, $out ); + + $lookup{ $subseq } = 1; + } + else + { + $new_record->{ "SEQ_NAME" } = $record->{ "SEQ_NAME" } . "[" . ( $i + 1 ) . "-" . ( $i + $options->{ "word_size" } ) . "]"; + $new_record->{ "SEQ" } = $subseq; + + Maasha::Biopieces::put_record( $new_record, $out ); + } + } + } + else + { + Maasha::Biopieces::put_record( $record, $out ); + } + } +} + + +sub script_split_bed +{ + # Martin A. Hansen, June 2008. + + # Split a BED record into overlapping windows. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $new_record, $i ); + + $options->{ "window_size" } ||= 20; + $options->{ "step_size" } ||= 1; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) + { + $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1; + + for ( $i = 0; $i < $record->{ "BED_LEN" } - $options->{ "window_size" }; $i += $options->{ "step_size" } ) + { + $new_record->{ "REC_TYPE" } = "BED"; + $new_record->{ "CHR" } = $record->{ "CHR" }; + $new_record->{ "CHR_BEG" } = $record->{ "CHR_BEG" } + $i; + $new_record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $i + $options->{ "window_size" }; + $new_record->{ "BED_LEN" } = $options->{ "window_size" }; + $new_record->{ "Q_ID" } = $record->{ "Q_ID" } . "_$i"; + $new_record->{ "SCORE" } = $record->{ "SCORE" }; + $new_record->{ "STRAND" } = $record->{ "STRAND" }; + + Maasha::Biopieces::put_record( $new_record, $out ); + } + } + else + { + Maasha::Biopieces::put_record( $record, $out ); + } + } +} + + +sub script_align_seq +{ + # Martin A. Hansen, August 2007. + + # Align sequences in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + ) = @_; + + # Returns nothing. + + my ( $record, @entries, $entry ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { + push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; + } elsif ( $record->{ "Q_ID" } and $record->{ "SEQ" } ) { + push @entries, [ $record->{ "Q_ID" }, $record->{ "SEQ" } ]; + } else { + Maasha::Biopieces::put_record( $record, $out ); + } + } + + @entries = Maasha::Align::align( \@entries ); + + foreach $entry ( @entries ) + { + if ( $entry->[ SEQ_NAME ] and $entry->[ SEQ ] ) + { + $record = { + SEQ_NAME => $entry->[ SEQ_NAME ], + SEQ => $entry->[ SEQ ], + }; + + Maasha::Biopieces::put_record( $record, $out ); + } + } +} + + +sub script_tile_seq +{ + # Martin A. Hansen, February 2008. + + # Using the first sequence in stream as reference, tile + # all subsequent sequences based on pairwise alignments. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $first, $ref_entry, @entries ); + + $first = 1; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + { + if ( $first ) + { + $ref_entry = [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; + + $first = 0; + } + else + { + push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; + } + } + else + { + Maasha::Biopieces::put_record( $record, $out ); + } + } + + @entries = Maasha::Align::align_tile( $ref_entry, \@entries, $options ); + + map { Maasha::Biopieces::put_record( { SEQ_NAME => $_->[ SEQ_NAME ], SEQ => $_->[ SEQ ] }, $out ) } @entries; +} + + +sub script_invert_align +{ + # Martin A. Hansen, February 2008. + + # Inverts an alignment showing only non-mathing residues + # using the first sequence as reference. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, @entries ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) + { + push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; + } + else + { + Maasha::Biopieces::put_record( $record, $out ); + } + } + + Maasha::Align::align_invert( \@entries, $options->{ "soft" } ); + + map { Maasha::Biopieces::put_record( { SEQ_NAME => $_->[ SEQ_NAME ], SEQ => $_->[ SEQ ] }, $out ) } @entries; +} + + +sub script_patscan_seq +{ + # Martin A. Hansen, August 2007. + + # Locates patterns in sequences using scan_for_matches. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $genome_file, @args, $arg, $type, $seq_file, $pat_file, $out_file, $fh_in, $fh_out, $record, $patterns, $pattern, $entry, $result, %head_hash, $i ); + + if ( $options->{ "patterns" } ) { + $patterns = Maasha::Patscan::parse_patterns( $options->{ "patterns" } ); + } elsif ( -f $options->{ "patterns_in" } ) { + $patterns = Maasha::Patscan::read_patterns( $options->{ "patterns_in" } ); + } + + $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna" if $options->{ 'genome' }; + + push @args, "-c" if $options->{ "comp" }; + push @args, "-m $options->{ 'max_hits' }" if $options->{ 'max_hits' }; + push @args, "-n $options->{ 'max_misses' }" if $options->{ 'max_hits' }; + + $seq_file = "$BP_TMP/patscan.seq"; + $pat_file = "$BP_TMP/patscan.pat"; + $out_file = "$BP_TMP/patscan.out"; + + $fh_out = Maasha::Common::write_open( $seq_file ); + + $i = 0; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ" } and $record->{ "SEQ_NAME" } ) + { + $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type; + + Maasha::Fasta::put_entry( [ $i, $record->{ "SEQ" } ], $fh_out ); + + $head_hash{ $i } = $record->{ "SEQ_NAME" }; + + $i++; + } + } + + close $fh_out; + + $arg = join " ", @args; + $arg .= " -p" if $type eq "protein"; + + foreach $pattern ( @{ $patterns } ) + { + $fh_out = Maasha::Common::write_open( $pat_file ); + + print $fh_out "$pattern\n"; + + close $fh_out; + + if ( $options->{ 'genome' } ) { + `scan_for_matches $arg $pat_file < $genome_file > $out_file`; + # Maasha::Common::run( "scan_for_matches", "$arg $pat_file < $genome_file > $out_file" ); + } else { + `scan_for_matches $arg $pat_file < $seq_file > $out_file`; + # Maasha::Common::run( "scan_for_matches", "$arg $pat_file < $seq_file > $out_file" ); + } + + $fh_in = Maasha::Common::read_open( $out_file ); + + while ( $entry = Maasha::Fasta::get_entry( $fh_in ) ) + { + $result = Maasha::Patscan::parse_scan_result( $entry, $pattern ); + + if ( $options->{ 'genome' } ) + { + $result->{ "CHR" } = $result->{ "S_ID" }; + $result->{ "CHR_BEG" } = $result->{ "S_BEG" }; + $result->{ "CHR_END" } = $result->{ "S_END" }; + + delete $result->{ "S_ID" }; + delete $result->{ "S_BEG" }; + delete $result->{ "S_END" }; + } + else + { + $result->{ "S_ID" } = $head_hash{ $result->{ "S_ID" } }; + } + + Maasha::Biopieces::put_record( $result, $out ); + } + + close $fh_in; + } + + unlink $pat_file; + unlink $seq_file; + unlink $out_file; +} + + +sub script_create_blast_db +{ + # Martin A. Hansen, September 2007. + + # Creates a NCBI BLAST database with formatdb + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $fh, $seq_type, $path, $record, $entry ); + + $path = $options->{ "database" }; + + $fh = Maasha::Common::write_open( $path ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) + { + $seq_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $seq_type; + + Maasha::Fasta::put_entry( $entry, $fh ); + } + } + + close $fh; + + if ( $seq_type eq "protein" ) { + Maasha::Common::run( "formatdb", "-p T -i $path -t $options->{ 'database' }" ); + } else { + Maasha::Common::run( "formatdb", "-p F -i $path -t $options->{ 'database' }" ); + } + + unlink $path; +} + + +sub script_blast_seq +{ + # Martin A. Hansen, September 2007. + + # BLASTs sequences in stream against a given database. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $genome, $q_type, $s_type, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry ); + + $options->{ "e_val" } = 10 if not defined $options->{ "e_val" }; + $options->{ "filter" } = "F"; + $options->{ "filter" } = "T" if $options->{ "filter" }; + $options->{ "cpus" } ||= 1; + + $options->{ "database" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/blast/$options->{ 'genome' }.fna" if $options->{ 'genome' }; + + $tmp_in = "$BP_TMP/blast_query.seq"; + $tmp_out = "$BP_TMP/blast.result"; + + $fh_out = Maasha::Filesys::file_write_open( $tmp_in ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) + { + $q_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $q_type; + + Maasha::Fasta::put_entry( $entry, $fh_out ); + } + + Maasha::Biopieces::put_record( $record, $out ); + } + + close $fh_out; + + if ( -f $options->{ 'database' } . ".phr" ) { + $s_type = "protein"; + } else { + $s_type = "nucleotide"; + } + + if ( not $options->{ 'program' } ) + { + if ( $q_type ne "protein" and $s_type ne "protein" ) { + $options->{ 'program' } = "blastn"; + } elsif ( $q_type eq "protein" and $s_type eq "protein" ) { + $options->{ 'program' } = "blastp"; + } elsif ( $q_type ne "protein" and $s_type eq "protein" ) { + $options->{ 'program' } = "blastx"; + } elsif ( $q_type eq "protein" and $s_type ne "protein" ) { + $options->{ 'program' } = "tblastn"; + } + } + + if ( $options->{ 'verbose' } ) + { + Maasha::Common::run( + "blastall", + join( " ", + "-p $options->{ 'program' }", + "-e $options->{ 'e_val' }", + "-a $options->{ 'cpus' }", + "-m 8", + "-i $tmp_in", + "-d $options->{ 'database' }", + "-F $options->{ 'filter' }", + "-o $tmp_out", + ), + 1 + ); + } + else + { + Maasha::Common::run( + "blastall", + join( " ", + "-p $options->{ 'program' }", + "-e $options->{ 'e_val' }", + "-a $options->{ 'cpus' }", + "-m 8", + "-i $tmp_in", + "-d $options->{ 'database' }", + "-F $options->{ 'filter' }", + "-o $tmp_out", + "> /dev/null 2>&1" + ), + 1 + ); + } + + unlink $tmp_in; + + $fh_out = Maasha::Filesys::file_read_open( $tmp_out ); + + undef $record; + + while ( $line = <$fh_out> ) + { + chomp $line; + + next if $line =~ /^#/; + + @fields = split /\s+/, $line; + + $record->{ "REC_TYPE" } = "BLAST"; + $record->{ "Q_ID" } = $fields[ 0 ]; + $record->{ "S_ID" } = $fields[ 1 ]; + $record->{ "IDENT" } = $fields[ 2 ]; + $record->{ "ALIGN_LEN" } = $fields[ 3 ]; + $record->{ "MISMATCHES" } = $fields[ 4 ]; + $record->{ "GAPS" } = $fields[ 5 ]; + $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based + $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based + $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based + $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based + $record->{ "E_VAL" } = $fields[ 10 ]; + $record->{ "BIT_SCORE" } = $fields[ 11 ]; + + if ( $record->{ "S_BEG" } > $record->{ "S_END" } ) + { + $record->{ "STRAND" } = '-'; + + ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } ); + } + else + { + $record->{ "STRAND" } = '+'; + } + + Maasha::Biopieces::put_record( $record, $out ); + } + + close $fh_out; + + unlink $tmp_out; +} + + +sub script_blat_seq +{ + # Martin A. Hansen, August 2007. + + # BLATs sequences in stream against a given genome. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $blat_args, $genome_file, $query_file, $fh_in, $fh_out, $type, $record, $result_file, $entries, $entry ); + + $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna"; + + $options->{ 'tile_size' } ||= 11; + $options->{ 'one_off' } ||= 0; + $options->{ 'min_identity' } ||= 90; + $options->{ 'min_score' } ||= 0; + $options->{ 'step_size' } ||= $options->{ 'tile_size' }; + + $blat_args .= " -tileSize=$options->{ 'tile_size' }"; + $blat_args .= " -oneOff=$options->{ 'one_off' }"; + $blat_args .= " -minIdentity=$options->{ 'min_identity' }"; + $blat_args .= " -minScore=$options->{ 'min_score' }"; + $blat_args .= " -stepSize=$options->{ 'step_size' }"; +# $blat_args .= " -ooc=" . Maasha::Config::genome_blat_ooc( $options->{ "genome" }, 11 ) if $options->{ 'ooc' }; + + $query_file = "$BP_TMP/blat.seq"; + + $fh_out = Maasha::Common::write_open( $query_file ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) + { + $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $type; + Maasha::Fasta::put_entry( $entry, $fh_out, 80 ); + } + + Maasha::Biopieces::put_record( $record, $out ); + } + + close $fh_out; + + $blat_args .= " -t=dnax" if $type eq "protein"; + $blat_args .= " -q=$type"; + + $result_file = "$BP_TMP/blat.psl"; + + Maasha::Common::run( "blat", "$genome_file $query_file $blat_args $result_file > /dev/null 2>&1" ); + + unlink $query_file; + + $entries = Maasha::UCSC::psl_get_entries( $result_file ); + + map { Maasha::Biopieces::put_record( $_, $out ) } @{ $entries }; + + unlink $result_file; +} + + +sub script_soap_seq +{ + # Martin A. Hansen, July 2008. + + # soap sequences in stream against a given file or genome. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $genome, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry, $count, $args ); + + $options->{ "seed_size" } ||= 10; + $options->{ "mismatches" } ||= 2; + $options->{ "gap_size" } ||= 0; + $options->{ "cpus" } ||= 1; + + if ( $options->{ "genome" } ) { + $options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna"; + } + + $tmp_in = "$BP_TMP/soap_query.seq"; + $tmp_out = "$BP_TMP/soap.result"; + + $fh_out = Maasha::Common::write_open( $tmp_in ); + + $count = 0; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) + { + Maasha::Fasta::put_entry( $entry, $fh_out ); + + $count++; + } + + Maasha::Biopieces::put_record( $record, $out ); + } + + close $fh_out; + + if ( $count > 0 ) + { + $args = join( " ", + "-s $options->{ 'seed_size' }", + "-r 2", + "-a $tmp_in", + "-v $options->{ 'mismatches' }", + "-g $options->{ 'gap_size' }", + "-p $options->{ 'cpus' }", + "-d $options->{ 'in_file' }", + "-o $tmp_out", + ); + + $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' }; + + Maasha::Common::run( "soap", $args, 1 ); + + unlink $tmp_in; + + $fh_out = Maasha::Common::read_open( $tmp_out ); + + undef $record; + + while ( $line = <$fh_out> ) + { + chomp $line; + + @fields = split /\t/, $line; + + $record->{ "REC_TYPE" } = "SOAP"; + $record->{ "Q_ID" } = $fields[ 0 ]; + $record->{ "SCORE" } = $fields[ 3 ]; + $record->{ "STRAND" } = $fields[ 6 ]; + $record->{ "S_ID" } = $fields[ 7 ]; + $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based + $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2; + + Maasha::Biopieces::put_record( $record, $out ); + } + + close $fh_out; + } + + unlink $tmp_out; +} + + +sub script_match_seq +{ + # Martin A. Hansen, August 2007. + + # BLATs sequences in stream against a given genome. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, @entries, $results ); + + $options->{ "word_size" } ||= 20; + $options->{ "direction" } ||= "both"; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { + push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; + } + + Maasha::Biopieces::put_record( $record, $out ); + } + + if ( @entries == 1 ) + { + $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 0 ] ], $options, $BP_TMP ); + + map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results }; + } + elsif ( @entries == 2 ) + { + $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 1 ] ], $options, $BP_TMP ); + + map { Maasha::Biopieces::put_record( $_, $out ) } @{ $results }; + } +} + + +sub script_create_vmatch_index +{ + # Martin A. Hansen, January 2008. + + # Create a vmatch index from sequences in the stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $file_tmp, $fh_tmp, $type, $entry ); + + if ( $options->{ "index_name" } ) + { + $file_tmp = $options->{ 'index_name' }; + $fh_tmp = Maasha::Common::write_open( $file_tmp ); + } + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $options->{ "index_name" } and $entry = Maasha::Fasta::biopiece2fasta( $record ) ) + { + Maasha::Fasta::put_entry( $entry, $fh_tmp ); + + $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not defined $type; + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + if ( $options->{ "index_name" } ) + { + close $fh_tmp; + + if ( $type eq "protein" ) { + Maasha::Common::run( "mkvtree", "-db $file_tmp -protein -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" ); + } else { + Maasha::Common::run( "mkvtree", "-db $file_tmp -dna -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" ); + } + + unlink $file_tmp; + } +} + + +sub script_vmatch_seq +{ + # Martin A. Hansen, August 2007. + + # Vmatches sequences in stream against a given genome. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( @index_files, @records, $result_file, $fh_in, $record, %hash ); + + $options->{ 'count' } = 1 if $options->{ 'max_hits' }; + + if ( $options->{ "index_name" } ) + { + @index_files = $options->{ "index_name" }; + } + else + { + @index_files = Maasha::Common::ls_files( "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/vmatch" ); + + map { $_ =~ /^(.+)\.[a-z1]{3,4}$/; $hash{ $1 } = 1 } @index_files; + + @index_files = sort keys %hash; + } + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + push @records, $record; + + Maasha::Biopieces::put_record( $record, $out ); + } + + $result_file = Maasha::Match::match_vmatch( $BP_TMP, \@records, \@index_files, $options ); + + undef @records; + + $fh_in = Maasha::Common::read_open( $result_file ); + + while ( $record = Maasha::Match::vmatch_get_entry( $fh_in ) ) { + Maasha::Biopieces::put_record( $record, $out ); + } + + close $fh_in; + + unlink $result_file; +} + + +sub script_write_fasta +{ + # Martin A. Hansen, August 2007. + + # Write FASTA entries from sequences in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $fh, $entry ); + + $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { + Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } ); + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + close $fh; +} + + +sub script_write_align +{ + # Martin A. Hansen, August 2007. + + # Write pretty alignments aligned sequences in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $fh, $record, @entries ); + + $fh = write_stream( $options->{ "data_out" } ) ; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { + push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + if ( scalar( @entries ) == 2 ) { + Maasha::Align::align_print_pairwise( $entries[ 0 ], $entries[ 1 ], $fh, $options->{ "wrap" } ); + } elsif ( scalar ( @entries ) > 2 ) { + Maasha::Align::align_print_multi( \@entries, $fh, $options->{ "wrap" }, $options->{ "no_ruler" }, $options->{ "no_consensus" } ); + } + + close $fh if $fh; +} + + +sub script_write_blast +{ + # Martin A. Hansen, November 2007. + + # Write data in blast table format (-m8 and 9). + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $fh, $record, $first ); + + $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ) ; + + $first = 1; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "REC_TYPE" } eq "BLAST" ) + { + if ( $options->{ "comment" } and $first ) + { + print "# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score\n"; + + $first = 0; + } + + if ( $record->{ "STRAND" } eq "-" ) { + ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } ); + } + + print $fh join( "\t", + $record->{ "Q_ID" }, + $record->{ "S_ID" }, + $record->{ "IDENT" }, + $record->{ "ALIGN_LEN" }, + $record->{ "MISMATCHES" }, + $record->{ "GAPS" }, + $record->{ "Q_BEG" } + 1, + $record->{ "Q_END" } + 1, + $record->{ "S_BEG" } + 1, + $record->{ "S_END" } + 1, + $record->{ "E_VAL" }, + $record->{ "BIT_SCORE" } + ), "\n"; + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + close $fh; +} + + +sub script_write_tab +{ + # Martin A. Hansen, August 2007. + + # Write data as table. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $fh, $record, $key, @keys, @vals, $ok, %no_keys, $A, $B ); + + $options->{ "delimit" } ||= "\t"; + + map { $no_keys{ $_ } = 1 } @{ $options->{ "no_keys" } }; + + $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + undef @vals; + $ok = 1; + + if ( $options->{ "keys" } ) + { + map { $ok = 0 if not exists $record->{ $_ } } @{ $options->{ "keys" } }; + + if ( $ok ) + { + foreach $key ( @{ $options->{ "keys" } } ) + { + if ( exists $record->{ $key } ) + { + push @keys, $key if $options->{ "comment" }; + push @vals, $record->{ $key }; + } + } + } + } + else + { + foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } ) + { + next if exists $no_keys{ $key }; + + push @keys, $key if $options->{ "comment" }; + push @vals, $record->{ $key }; + } + } + + if ( @keys and $options->{ "comment" } ) + { + print $fh "#", join( $options->{ "delimit" }, @keys ), "\n"; + + delete $options->{ "comment" }; + } + + print $fh join( $options->{ "delimit" }, @vals ), "\n" if @vals; + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + close $fh; +} + + +sub script_write_bed +{ + # Martin A. Hansen, August 2007. + + # Write BED format for the UCSC genome browser using records in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $cols, $fh, $record, $bed_entry, $new_record ); + + $cols = $options->{ 'cols' }->[ 0 ]; + + $fh = write_stream( $options->{ 'data_out' }, $options->{ 'compress' } ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + $record = Maasha::UCSC::psl2record( $record ) if $record->{ 'tBaseInsert' }; # Dirty addition to allow Affy data from MySQL to be dumped + + if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) { + Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh, $cols, $options->{ 'check' } ); + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' }; + } + + close $fh; +} + + +sub script_write_psl +{ + # Martin A. Hansen, August 2007. + + # Write PSL output from stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $fh, $record, @output, $first ); + + $first = 1; + + $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + + if ( $record->{ "REC_TYPE" } and $record->{ "REC_TYPE" } eq "PSL" ) + { + Maasha::UCSC::psl_put_header( $fh ) if $first; + Maasha::UCSC::psl_put_entry( $record, $fh ); + $first = 0; + } + } + + close $fh; +} + + +sub script_write_fixedstep +{ + # Martin A. Hansen, Juli 2008. + + # Write fixedStep entries from recrods in the stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $fh, $record, $entry ); + + $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) { + Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh ); + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + close $fh; +} + + +sub script_write_2bit +{ + # Martin A. Hansen, March 2008. + + # Write sequence entries from stream in 2bit format. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $mask, $tmp_file, $fh_tmp, $fh_in, $fh_out, $entry ); + + $mask = 1 if not $options->{ "no_mask" }; + + $tmp_file = "$BP_TMP/write_2bit.fna"; + $fh_tmp = Maasha::Common::write_open( $tmp_file ); + + $fh_out = write_stream( $options->{ "data_out" } ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { + Maasha::Fasta::put_entry( $entry, $fh_tmp ); + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + close $fh_tmp; + + $fh_in = Maasha::Common::read_open( $tmp_file ); + + Maasha::TwoBit::fasta2twobit( $fh_in, $fh_out, $mask ); + + close $fh_in; + close $fh_out; + + unlink $tmp_file; +} + + +sub script_write_solid +{ + # Martin A. Hansen, April 2008. + + # Write di-base encoded Solid sequence from entries in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $fh, $entry ); + + $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) + { + $entry->[ SEQ ] = Maasha::Solid::seq2color_space( uc $entry->[ SEQ ] ); + + Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } ); + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + close $fh; +} + + +sub script_write_ucsc_config +{ + # Martin A. Hansen, November 2008. + + # Write UCSC Genome Broser configuration (.ra file type) from + # records in the stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $fh ); + + $fh = write_stream( $options->{ "data_out" } ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + Maasha::UCSC::ucsc_config_entry_put( $record, $fh ) if $record->{ "REC_TYPE" } eq "UCSC Config"; + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + close $fh; +} + + +sub script_plot_seqlogo +{ + # Martin A. Hansen, August 2007. + + # Calculates and writes a sequence logo for alignments. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, @entries, $logo, $fh ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { + push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + $logo = Maasha::Plot::seq_logo( \@entries ); + + $fh = write_stream( $options->{ "data_out" } ); + + print $fh $logo; + + close $fh; +} + + +sub script_plot_phastcons_profiles +{ + # Martin A. Hansen, January 2008. + + # Plots PhastCons profiles. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $record, $scores, $AoA, $plot, $fh ); + + $options->{ "title" } ||= "PhastCons Profiles"; + + $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } ); + $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } ); + + $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index ); + $fh_phastcons = Maasha::Common::read_open( $phastcons_file ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) + { + $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, + $record->{ "CHR_BEG" }, + $record->{ "CHR_END" }, + $options->{ "flank" } ); + + push @{ $AoA }, [ @{ $scores } ]; + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + Maasha::UCSC::phastcons_normalize( $AoA ); + + $AoA = [ [ Maasha::UCSC::phastcons_mean( $AoA ) ] ] if $options->{ "mean" }; + $AoA = [ [ Maasha::UCSC::phastcons_median( $AoA ) ] ] if $options->{ "median" }; + + $AoA = Maasha::Matrix::matrix_flip( $AoA ); + + $plot = Maasha::Plot::lineplot_simple( $AoA, $options, $BP_TMP ); + + $fh = write_stream( $options->{ "data_out" } ); + + print $fh "$_\n" foreach @{ $plot }; + + close $fh; +} + + +sub script_analyze_bed +{ + # Martin A. Hansen, March 2008. + + # Analyze BED entries in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + $record = Maasha::UCSC::bed_analyze( $record ) if $record->{ "REC_TYPE" } eq "BED"; + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_analyze_vals +{ + # Martin A. Hansen, August 2007. + + # Analyze values for given keys in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $key, @keys, %key_hash, $analysis, $len ); + + map { $key_hash{ $_ } = 1 } @{ $options->{ "keys" } }; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + foreach $key ( keys %{ $record } ) + { + next if $options->{ "keys" } and not exists $key_hash{ $key }; + + $analysis->{ $key }->{ "COUNT" }++; + + if ( Maasha::Calc::is_a_number( $record->{ $key } ) ) + { + $analysis->{ $key }->{ "TYPE" } = "num"; + $analysis->{ $key }->{ "SUM" } += $record->{ $key }; + $analysis->{ $key }->{ "MAX" } = $record->{ $key } if $record->{ $key } > $analysis->{ $key }->{ "MAX" } or not $analysis->{ $key }->{ "MAX" }; + $analysis->{ $key }->{ "MIN" } = $record->{ $key } if $record->{ $key } < $analysis->{ $key }->{ "MIN" } or not $analysis->{ $key }->{ "MIN" }; + } + else + { + $len = length $record->{ $key }; + + $analysis->{ $key }->{ "TYPE" } = "alph"; + $analysis->{ $key }->{ "SUM" } += $len; + $analysis->{ $key }->{ "MAX" } = $len if $len > $analysis->{ $key }->{ "MAX" } or not $analysis->{ $key }->{ "MAX" }; + $analysis->{ $key }->{ "MIN" } = $len if $len < $analysis->{ $key }->{ "MIM" } or not $analysis->{ $key }->{ "MIN" }; + } + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + foreach $key ( keys %{ $analysis } ) + { + $analysis->{ $key }->{ "MEAN" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" } / $analysis->{ $key }->{ "COUNT" }; + $analysis->{ $key }->{ "SUM" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" }; + } + + my ( $keys, $types, $counts, $mins, $maxs, $sums, $means ); + + $keys = "KEY "; + $types = "TYPE "; + $counts = "COUNT"; + $mins = "MIN "; + $maxs = "MAX "; + $sums = "SUM "; + $means = "MEAN "; + + if ( $options->{ "keys" } ) { + @keys = @{ $options->{ "keys" } }; + } else { + @keys = keys %{ $analysis }; + } + + foreach $key ( @keys ) + { + $keys .= sprintf "% 15s", $key; + $types .= sprintf "% 15s", $analysis->{ $key }->{ "TYPE" }; + $counts .= sprintf "% 15s", $analysis->{ $key }->{ "COUNT" }; + $mins .= sprintf "% 15s", $analysis->{ $key }->{ "MIN" }; + $maxs .= sprintf "% 15s", $analysis->{ $key }->{ "MAX" }; + $sums .= sprintf "% 15s", $analysis->{ $key }->{ "SUM" }; + $means .= sprintf "% 15s", $analysis->{ $key }->{ "MEAN" }; + } + + print $out "$keys\n"; + print $out "$types\n"; + print $out "$counts\n"; + print $out "$mins\n"; + print $out "$maxs\n"; + print $out "$sums\n"; + print $out "$means\n"; +} + + +sub script_head_records +{ + # Martin A. Hansen, August 2007. + + # Display the first sequences in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $count ); + + $options->{ "num" } ||= 10; + + $count = 0; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + $count++; + + Maasha::Biopieces::put_record( $record, $out ); + + last if $count == $options->{ "num" }; + } +} + + +sub script_remove_keys +{ + # Martin A. Hansen, August 2007. + + # Remove keys from stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $new_record ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $options->{ "keys" } ) + { + map { delete $record->{ $_ } } @{ $options->{ "keys" } }; + } + elsif ( $options->{ "save_keys" } ) + { + map { $new_record->{ $_ } = $record->{ $_ } if exists $record->{ $_ } } @{ $options->{ "save_keys" } }; + + $record = $new_record; + } + + Maasha::Biopieces::put_record( $record, $out ) if keys %{ $record }; + } +} + + +sub script_remove_adaptor +{ + # Martin A. Hansen, August 2008. + + # Find and remove adaptor from sequences in the stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $adaptor, $seq, $adaptor_len, $seq_len, $offset, $max_match, $max_mismatch, $pos ); + + $options->{ "remove" } ||= "after"; + + $max_mismatch = $options->{ "mismatches" } || 0; + $offset = $options->{ "offset" }; + + if ( not defined $offset ) { + $offset = 0; + } else { + $offset--; + } + + $adaptor = uc $options->{ "adaptor" }; + $adaptor_len = length $adaptor; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "SEQ" } ) + { + $seq = uc $record->{ "SEQ" }; + $seq_len = length $seq; + + $pos = Maasha::Common::index_m( $seq, $adaptor, $seq_len, $adaptor_len, $offset, $max_mismatch ); + + $record->{ "ADAPTOR_POS" } = $pos; + + if ( $pos >= 0 and $options->{ "remove" } ne "skip" ) + { + if ( $options->{ "remove" } eq "after" ) + { + $record->{ "SEQ" } = substr $record->{ "SEQ" }, 0, $pos; + $record->{ "SEQ_LEN" } = $pos; + } + else + { + $record->{ "SEQ" } = substr $record->{ "SEQ" }, $pos + $adaptor_len; + $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; + } + } + + Maasha::Biopieces::put_record( $record, $out ); + } + else + { + Maasha::Biopieces::put_record( $record, $out ); + } + } +} + + +sub script_remove_mysql_tables +{ + # Martin A. Hansen, November 2008. + + # Remove MySQL tables from values in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, %table_hash, $dbh, $table ); + + $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user(); + $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password(); + + map { $table_hash{ $_ } = 1 } @{ $options->{ 'tables' } }; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + map { $table_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } }; + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' }; + } + + $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } ); + + foreach $table ( sort keys %table_hash ) + { + if ( Maasha::SQL::table_exists( $dbh, $table ) ) + { + print STDERR qq(Removing table "$table" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' }; + Maasha::SQL::delete_table( $dbh, $table ); + print STDERR "done.\n" if $options->{ 'verbose' }; + } + else + { + print STDERR qq(WARNING: table "$table" not found in database "$options->{ 'database' }\n"); + } + } + + Maasha::SQL::disconnect( $dbh ); +} + + +sub script_remove_ucsc_tracks +{ + # Martin A. Hansen, November 2008. + + # Remove track from MySQL tables and config file. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, %track_hash, $fh_in, $fh_out, $track, @tracks, @new_tracks, $dbh ); + + $options->{ 'user' } ||= Maasha::UCSC::ucsc_get_user(); + $options->{ 'password' } ||= Maasha::UCSC::ucsc_get_password(); + $options->{ 'config_file' } ||= "$ENV{ 'HOME' }/ucsc/my_tracks.ra"; + + map { $track_hash{ $_ } = 1 } @{ $options->{ 'tracks' } }; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + map { $track_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } }; + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ 'no_stream' }; + } + + $fh_in = Maasha::Filesys::file_read_open( $options->{ 'config_file' } ); + + while ( $track = Maasha::UCSC::ucsc_config_entry_get( $fh_in ) ) { + push @tracks, $track; + } + + close $fh_in; + + foreach $track ( @tracks ) + { + if ( $track->{ 'database' } eq $options->{ 'database' } and exists $track_hash{ $track->{ 'track' } } ) { + print STDERR qq(Removing track "$track->{ 'track' }" from config file.\n) if $options->{ 'verbose' }; + } else { + push @new_tracks, $track; + } + } + + rename "$options->{ 'config_file' }", "$options->{ 'config_file' }~"; + + $fh_out = Maasha::Filesys::file_write_open( $options->{ 'config_file' } ); + + map { Maasha::UCSC::ucsc_config_entry_put( $_, $fh_out ) } @new_tracks; + + close $fh_out; + + # ---- locate track in database ---- + + $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } ); + + foreach $track ( sort keys %track_hash ) + { + if ( Maasha::SQL::table_exists( $dbh, $track ) ) + { + print STDERR qq(Removing table "$track" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' }; + Maasha::SQL::delete_table( $dbh, $track ); + print STDERR "done.\n" if $options->{ 'verbose' }; + } + else + { + print STDERR qq(WARNING: table "$track" not found in database "$options->{ 'database' }\n"); + } + } + + Maasha::SQL::disconnect( $dbh ); + + Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" ); +} + + +sub script_rename_keys +{ + # Martin A. Hansen, August 2007. + + # Rename keys in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( exists $record->{ $options->{ "keys" }->[ 0 ] } ) + { + $record->{ $options->{ "keys" }->[ 1 ] } = $record->{ $options->{ "keys" }->[ 0 ] }; + + delete $record->{ $options->{ "keys" }->[ 0 ] }; + } + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_uniq_vals +{ + # Martin A. Hansen, August 2007. + + # Find unique values in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( %hash, $record ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ $options->{ "key" } } ) + { + if ( not $hash{ $record->{ $options->{ "key" } } } and not $options->{ "invert" } ) + { + Maasha::Biopieces::put_record( $record, $out ); + + $hash{ $record->{ $options->{ "key" } } } = 1; + } + elsif ( $hash{ $record->{ $options->{ "key" } } } and $options->{ "invert" } ) + { + Maasha::Biopieces::put_record( $record, $out ); + } + else + { + $hash{ $record->{ $options->{ "key" } } } = 1; + } + } + else + { + Maasha::Biopieces::put_record( $record, $out ); + } + } +} + + +sub script_merge_vals +{ + # Martin A. Hansen, August 2007. + + # Rename keys in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, @join, $i ); + + $options->{ "delimit" } ||= '_'; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( exists $record->{ $options->{ "keys" }->[ 0 ] } ) + { + @join = $record->{ $options->{ "keys" }->[ 0 ] }; + + for ( $i = 1; $i < @{ $options->{ "keys" } }; $i++ ) { + push @join, $record->{ $options->{ "keys" }->[ $i ] } if exists $record->{ $options->{ "keys" }->[ $i ] }; + } + + $record->{ $options->{ "keys" }->[ 0 ] } = join $options->{ "delimit" }, @join; + } + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_merge_records +{ + # Martin A. Hansen, July 2008. + + # Merges records in the stream based on identical values of two given keys. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $merge, $record, $file1, $file2, $fh1, $fh2, $key1, $key2, @keys1, @keys2, @vals1, @vals2, + $num1, $num2, $num, $cmp, $i ); + + $merge = $options->{ "merge" } || "AandB"; + + $file1 = "$BP_TMP/merge_records1.tmp"; + $file2 = "$BP_TMP/merge_records2.tmp"; + + $fh1 = Maasha::Common::write_open( $file1 ); + $fh2 = Maasha::Common::write_open( $file2 ); + + $key1 = $options->{ "keys" }->[ 0 ]; + $key2 = $options->{ "keys" }->[ 1 ]; + + $num = $key2 =~ s/n$//; + $num1 = 0; + $num2 = 0; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( exists $record->{ $key1 } ) + { + @keys1 = $key1; + @vals1 = $record->{ $key1 }; + + delete $record->{ $key1 }; + + map { push @keys1, $_; push @vals1, $record->{ $_ } } keys %{ $record }; + + print $fh1 join( "\t", @vals1 ), "\n"; + + $num1++; + } + elsif ( exists $record->{ $key2 } ) + { + @keys2 = $key2; + @vals2 = $record->{ $key2 }; + + delete $record->{ $key2 }; + + map { push @keys2, $_; push @vals2, $record->{ $_ } } keys %{ $record }; + + print $fh2 join( "\t", @vals2 ), "\n"; + + $num2++; + } + } + + close $fh1; + close $fh2; + + if ( $num ) + { + Maasha::Common::run( "sort", "-k 1,1n $file1 > $file1.sort" ) and rename "$file1.sort", $file1; + Maasha::Common::run( "sort", "-k 1,1n $file2 > $file2.sort" ) and rename "$file2.sort", $file2; + } + else + { + Maasha::Common::run( "sort", "-k 1,1 $file1 > $file1.sort" ) and rename "$file1.sort", $file1; + Maasha::Common::run( "sort", "-k 1,1 $file2 > $file2.sort" ) and rename "$file2.sort", $file2; + } + + $fh1 = Maasha::Common::read_open( $file1 ); + $fh2 = Maasha::Common::read_open( $file2 ); + + @vals1 = Maasha::Common::get_fields( $fh1 ); + @vals2 = Maasha::Common::get_fields( $fh2 ); + + while ( $num1 > 0 and $num2 > 0 ) + { + undef $record; + + if ( $num ) { + $cmp = $vals1[ 0 ] <=> $vals2[ 0 ]; + } else { + $cmp = $vals1[ 0 ] cmp $vals2[ 0 ]; + } + + if ( $cmp < 0 ) + { + if ( $merge =~ /^(AorB|AnotB)$/ ) + { + for ( $i = 0; $i < @keys1; $i++ ) { + $record->{ $keys1[ $i ] } = $vals1[ $i ]; + } + + Maasha::Biopieces::put_record( $record, $out ); + } + + @vals1 = Maasha::Common::get_fields( $fh1 ); + $num1--; + } + elsif ( $cmp > 0 ) + { + if ( $merge =~ /^(BorA|BnotA)$/ ) + { + for ( $i = 0; $i < @keys2; $i++ ) { + $record->{ $keys2[ $i ] } = $vals2[ $i ]; + } + + Maasha::Biopieces::put_record( $record, $out ); + } + + @vals2 = Maasha::Common::get_fields( $fh2 ); + $num2--; + } + else + { + if ( $merge =~ /^(AandB|AorB|BorA)$/ ) + { + for ( $i = 0; $i < @keys1; $i++ ) { + $record->{ $keys1[ $i ] } = $vals1[ $i ]; + } + + for ( $i = 1; $i < @keys2; $i++ ) { + $record->{ $keys2[ $i ] } = $vals2[ $i ]; + } + + Maasha::Biopieces::put_record( $record, $out ); + } + + @vals1 = Maasha::Common::get_fields( $fh1 ); + @vals2 = Maasha::Common::get_fields( $fh2 ); + $num1--; + $num2--; + } + } + + close $fh1; + close $fh2; + + unlink $file1; + unlink $file2; + + if ( $num1 > 0 and $merge =~ /^(AorB|AnotB)$/ ) + { + undef $record; + + for ( $i = 0; $i < @keys1; $i++ ) { + $record->{ $keys1[ $i ] } = $vals1[ $i ]; + } + + Maasha::Biopieces::put_record( $record, $out ); + } + + if ( $num2 > 0 and $merge =~ /^(BorA|BnotA)$/ ) + { + undef $record; + + for ( $i = 0; $i < @keys2; $i++ ) { + $record->{ $keys2[ $i ] } = $vals2[ $i ]; + } + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_grab +{ + # Martin A. Hansen, August 2007. + + # Grab for records in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $keys, $vals_only, $keys_only, $invert, $patterns, $pattern, $regex, $record, $key, $op, $val, %lookup_hash, $found ); + + $keys = $options->{ 'keys' }; + $vals_only = $options->{ 'vals_only' }; + $keys_only = $options->{ 'keys_only' }; + $invert = $options->{ 'invert' }; + + if ( $options->{ 'patterns' } ) + { + $patterns = [ split ",", $options->{ 'patterns' } ]; + } + elsif ( -f $options->{ 'patterns_in' } ) + { + $patterns = Maasha::Patscan::read_patterns( $options->{ 'patterns_in' } ); + } + elsif ( $options->{ 'regex' } ) + { + if ( $options->{ 'case_insensitive' } ) { + $regex = qr/$options->{ 'regex' }/i; + } else { + $regex = qr/$options->{ 'regex' }/; + } + } + elsif ( -f $options->{ 'exact_in' } ) + { + $patterns = Maasha::Patscan::read_patterns( $options->{ 'exact_in' } ); + + map { $lookup_hash{ $_ } = 1 } @{ $patterns }; + + undef $patterns; + } + elsif ( $options->{ 'eval' } ) + { + if ( $options->{ 'eval' } =~ /^([^><=! ]+)\s*(>=|<=|>|<|=|!=|eq|ne)\s*(.+)$/ ) + { + $key = $1; + $op = $2; + $val = $3; + } + } + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + $found = 0; + + if ( %lookup_hash ) { + $found = grab_lookup( \%lookup_hash, $record, $keys, $vals_only, $keys_only ); + } elsif ( $patterns ) { + $found = grab_patterns( $patterns, $record, $keys, $vals_only, $keys_only ); + } elsif ( $regex ) { + $found = grab_regex( $regex, $record, $keys, $vals_only, $keys_only ); + } elsif ( $op ) { + $found = grab_eval( $key, $op, $val, $record ); + } + + if ( $found and not $invert ) { + Maasha::Biopieces::put_record( $record, $out ); + } elsif ( not $found and $invert ) { + Maasha::Biopieces::put_record( $record, $out ); + } + } +} + + +sub script_compute +{ + # Martin A. Hansen, August 2007. + + # Evaluate extression for records in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $eval_key, @keys, $eval_val ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $options->{ "eval" } ) + { + if ( $options->{ "eval" } =~ /^(\S+)\s*=\s*(.+)$/ ) + { + $eval_key = $1; + $eval_val = $2; + + if ( not @keys ) + { + @keys = split /\s+|\+|-|\*|\/|\*\*/, $eval_val; + + @keys = grep { exists $record->{ $_ } } @keys; + } + + map { $eval_val =~ s/\Q$_\E/$record->{ $_ }/g } @keys; + + $record->{ $eval_key } = eval "$eval_val"; + Maasha::Common::error( qq(eval "$eval_key = $eval_val" failed -> $@) ) if $@; + } + else + { + Maasha::Common::error( qq(Bad compute expression: "$options->{ 'eval' }"\n) ); + } + } + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_flip_tab +{ + # Martin A. Hansen, June 2008. + + # Flip a table. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $key, $A, $B, @rows, @matrix, $row, $i ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + undef @rows; + + foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } ) + { + push @rows, $record->{ $key }; + + } + + push @matrix, [ @rows ]; + } + + undef $record; + + @matrix = Maasha::Matrix::matrix_flip( \@matrix ); + + foreach $row ( @matrix ) + { + for ( $i = 0; $i < @{ $row }; $i++ ) { + $record->{ "V$i" } = $row->[ $i ]; + } + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_add_ident +{ + # Martin A. Hansen, May 2008. + + # Add a unique identifier to each record in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $key, $prefix, $i ); + + $key = $options->{ "key" } || "ID"; + $prefix = $options->{ "prefix" } || "ID"; + + $i = 0; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + $record->{ $key } = sprintf( "$prefix%08d", $i ); + + Maasha::Biopieces::put_record( $record, $out ); + + $i++; + } +} + + +sub script_count_records +{ + # Martin A. Hansen, August 2007. + + # Count records in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $count, $result, $fh, $line ); + + $count = 0; + + if ( $options->{ "no_stream" } ) + { + while ( $line = <$in> ) + { + chomp $line; + + $count++ if $line eq "---"; + } + } + else + { + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + Maasha::Biopieces::put_record( $record, $out ); + + $count++; + } + } + + $result = { "RECORDS_COUNT" => $count }; + + $fh = write_stream( $options->{ "data_out" } ); + + Maasha::Biopieces::put_record( $result, $fh ); + + close $fh; +} + + +sub script_random_records +{ + # Martin A. Hansen, August 2007. + + # Pick a number or random records from stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $tmp_file, $fh_out, $fh_in, $count, $i, %rand_hash, $rand, $max ); + + $options->{ "num" } ||= 10; + + $tmp_file = "$BP_TMP/random_records.tmp"; + + $fh_out = Maasha::Common::write_open( $tmp_file ); + + $count = 0; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + Maasha::Biopieces::put_record( $record, $fh_out ); + + $count++; + } + + close $fh_out; + + $max = 0; + $i = 0; + + Maasha::Common::error( qq(Requested random records > records in stream) ) if $options->{ "num" } > $count; + + while ( $i < $options->{ "num" } ) + { + $rand = int( rand( $count ) ); + + if ( not exists $rand_hash{ $rand } ) + { + $rand_hash{ $rand } = 1; + + $max = $rand if $rand > $max; + + $i++; + } + } + + $fh_in = Maasha::Common::read_open( $tmp_file ); + + $count = 0; + + while ( $record = Maasha::Biopieces::get_record( $fh_in ) ) + { + Maasha::Biopieces::put_record( $record, $out ) if exists $rand_hash{ $count }; + + last if $count == $max; + + $count++; + } + + close $fh_in; + + unlink $tmp_file; +} + + +sub script_sort_records +{ + # Martin A. Hansen, August 2007. + + # Sort to sort records according to keys. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( @keys, $key, @sort_cmd, $sort_str, $sort_sub, @records, $record, $i ); + + foreach $key ( @{ $options->{ "keys" } } ) + { + if ( $key =~ s/n$// ) { + push @sort_cmd, qq(\$a->{ "$key" } <=> \$b->{ "$key" }); + } else { + push @sort_cmd, qq(\$a->{ "$key" } cmp \$b->{ "$key" }); + } + } + + $sort_str = join " or ", @sort_cmd; + $sort_sub = eval "sub { $sort_str }"; # NB security issue! + + while ( $record = Maasha::Biopieces::get_record( $in ) ) { + push @records, $record; + } + + @records = sort $sort_sub @records; + + if ( $options->{ "reverse" } ) + { + for ( $i = scalar @records - 1; $i >= 0; $i-- ) { + Maasha::Biopieces::put_record( $records[ $i ], $out ); + } + } + else + { + for ( $i = 0; $i < scalar @records; $i++ ) { + Maasha::Biopieces::put_record( $records[ $i ], $out ); + } + } +} + + +sub script_count_vals +{ + # Martin A. Hansen, August 2007. + + # Count records in stream. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $num, $record, %count_hash, @records, $tmp_file, $fh_out, $fh_in, $cache ); + + $tmp_file = "$BP_TMP/count_cache.tmp"; + + $fh_out = Maasha::Common::write_open( $tmp_file ); + + $cache = 0; + $num = 0; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + map { $count_hash{ $_ }{ $record->{ $_ } }++ if exists $record->{ $_ } } @{ $options->{ "keys" } }; + + push @records, $record; + + if ( scalar @records > 5_000_000 ) # too many records to hold in memory - use disk cache + { + map { Maasha::Biopieces::put_record( $_, $fh_out ) } @records; + + undef @records; + + $cache = 1; + } + + print STDERR "verbose: records read $num\n" if ( $options->{ 'verbose' } and ( $num % 1_000_000 ) == 0 ); + + $num++; + } + + close $fh_out; + + if ( $cache ) + { + $num = 0; + + $fh_in = Maasha::Common::read_open( $tmp_file ); + + while ( $record = Maasha::Biopieces::get_record( $fh_in ) ) + { + map { $record->{ $_ . "_COUNT" } = $count_hash{ $_ }{ $record->{ $_ } } if exists $record->{ $_ } } @{ $options->{ "keys" } }; + + Maasha::Biopieces::put_record( $record, $out ); + + print STDERR "verbose: cache read $num\n" if ( $options->{ 'verbose' } and ( $num % 1_000_000 ) == 0 ); + + $num++; + } + + close $fh_in; + } + + foreach $record ( @records ) + { + map { $record->{ $_ . "_COUNT" } = $count_hash{ $_ }{ $record->{ $_ } } if exists $record->{ $_ } } @{ $options->{ "keys" } }; + + Maasha::Biopieces::put_record( $record, $out ); + } + + unlink $tmp_file; +} + + +sub script_plot_histogram +{ + # Martin A. Hansen, September 2007. + + # Plot a simple histogram for a given key using GNU plot. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, %data_hash, $max, @data_list, $i, $result, $fh ); + + $options->{ "title" } ||= "Histogram"; + $options->{ "sort" } ||= "num"; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } }; + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + if ( $options->{ "sort" } eq "num" ) { + map { push @data_list, [ $_, $data_hash{ $_ } ] } sort { $a <=> $b } keys %data_hash; + } else { + map { push @data_list, [ $_, $data_hash{ $_ } ] } sort keys %data_hash; + } + + $result = Maasha::Plot::histogram_simple( \@data_list, $options ); + + $fh = write_stream( $options->{ "data_out" } ); + + print $fh "$_\n" foreach @{ $result }; + + close $fh; +} + + +sub script_plot_lendist +{ + # Martin A. Hansen, August 2007. + + # Plot length distribution using GNU plot. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, %data_hash, $max, @data_list, $i, $result, $fh ); + + $options->{ "title" } ||= "Length Distribution"; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } }; + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + $max = Maasha::Calc::list_max( [ keys %data_hash ] ); + + for ( $i = 0; $i < $max; $i++ ) { + push @data_list, [ $i, $data_hash{ $i } || 0 ]; + } + + $result = Maasha::Plot::histogram_lendist( \@data_list, $options ); + + $fh = write_stream( $options->{ "data_out" } ); + + print $fh "$_\n" foreach @{ $result }; + + close $fh; +} + + +sub script_plot_chrdist +{ + # Martin A. Hansen, August 2007. + + # Plot chromosome distribution using GNU plot. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, %data_hash, @data_list, $elem, $sort_key, $count, $result, $fh ); + + $options->{ "title" } ||= "Chromosome Distribution"; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "CHR" } ) { # generic + $data_hash{ $record->{ "CHR" } }++; + } elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "S_ID" } =~ /^chr/i ) { # patscan + $data_hash{ $record->{ "S_ID" } }++; + } elsif ( $record->{ "REC_TYPE" } eq "PSL" and $record->{ "S_ID" } =~ /^chr/i ) { # BLAT / PSL + $data_hash{ $record->{ "S_ID" } }++; + } elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/i ) { # BLAST + $data_hash{ $record->{ "S_ID" } }++; + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + foreach $elem ( keys %data_hash ) + { + $sort_key = $elem; + + $sort_key =~ s/chr//i; + + $sort_key =~ s/^X(.*)/99$1/; + $sort_key =~ s/^Y(.*)/99$1/; + $sort_key =~ s/^Z(.*)/999$1/; + $sort_key =~ s/^M(.*)/9999$1/; + $sort_key =~ s/^U(.*)/99999$1/; + + $count = $sort_key =~ tr/_//; + + $sort_key =~ s/_.*/"999999" x $count/ex; + + push @data_list, [ $elem, $data_hash{ $elem }, $sort_key ]; + } + + @data_list = sort { $a->[ 2 ] <=> $b->[ 2 ] } @data_list; + + $result = Maasha::Plot::histogram_chrdist( \@data_list, $options ); + + $fh = write_stream( $options->{ "data_out" } ); + + print $fh "$_\n" foreach @{ $result }; + + close $fh; +} + + +sub script_plot_karyogram +{ + # Martin A. Hansen, August 2007. + + # Plot hits on karyogram. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( %options, $record, @data, $fh, $result, %data_hash ); + + $options->{ "genome" } ||= "human"; + $options->{ "feat_color" } ||= "black"; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) + { + push @{ $data_hash{ $record->{ "CHR" } } }, [ $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "feat_color" } ]; + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + $result = Maasha::Plot::karyogram( \%data_hash, \%options ); + + $fh = write_stream( $options->{ "data_out" } ); + + print $fh $result; + + close $fh; +} + + +sub script_plot_matches +{ + # Martin A. Hansen, August 2007. + + # Plot matches in 2D generating a dotplot. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, @data, $fh, $result, %data_hash ); + + $options->{ "direction" } ||= "both"; + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + if ( defined $record->{ "Q_BEG" } and defined $record->{ "S_BEG" } and $record->{ "Q_END" } and $record->{ "S_END" } ) { + push @data, $record; + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + $options->{ "title" } ||= "plot_matches"; + $options->{ "xlabel" } ||= $data[ 0 ]->{ "Q_ID" }; + $options->{ "ylabel" } ||= $data[ 0 ]->{ "S_ID" }; + + $result = Maasha::Plot::dotplot_matches( \@data, $options, $BP_TMP ); + + $fh = write_stream( $options->{ "data_out" } ); + + print $fh "$_\n" foreach @{ $result }; + + close $fh; +} + + +sub script_length_vals +{ + # Martin A. Hansen, August 2007. + + # Determine the length of the value for given keys. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $key ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + foreach $key ( @{ $options->{ "keys" } } ) + { + if ( $record->{ $key } ) { + $record->{ $key . "_LEN" } = length $record->{ $key }; + } + } + + Maasha::Biopieces::put_record( $record, $out ); + } +} + + +sub script_sum_vals +{ + # Martin A. Hansen, August 2007. + + # Calculates the sums for values of given keys. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $key, %sum_hash, $fh ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + foreach $key ( @{ $options->{ "keys" } } ) + { + if ( $record->{ $key } ) { + $sum_hash{ $key } += $record->{ $key }; + } + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + $fh = write_stream( $options->{ "data_out" } ); + + foreach $key ( @{ $options->{ "keys" } } ) { + Maasha::Biopieces::put_record( { $key . "_SUM" => $sum_hash{ $key } || 0 } , $fh ); + } + + close $fh; +} + + +sub script_mean_vals +{ + # Martin A. Hansen, August 2007. + + # Calculate the mean of values of given keys. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $key, %sum_hash, %count_hash, $mean, $fh ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + foreach $key ( @{ $options->{ "keys" } } ) + { + if ( $record->{ $key } ) + { + $sum_hash{ $key } += $record->{ $key }; + $count_hash{ $key }++; + } + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + $fh = write_stream( $options->{ "data_out" } ); + + foreach $key ( @{ $options->{ "keys" } } ) + { + if ( $count_hash{ $key } ) { + $mean = sprintf( "%.2f", ( $sum_hash{ $key } / $count_hash{ $key } ) ); + } else { + $mean = "N/A"; + } + + Maasha::Biopieces::put_record( { $key . "_MEAN" => $mean } , $fh ); + } + + close $fh; +} + + +sub script_median_vals +{ + # Martin A. Hansen, March 2008. + + # Calculate the median values of given keys. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $key, %median_hash, $median, $fh ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + foreach $key ( @{ $options->{ "keys" } } ) { + push @{ $median_hash{ $key } }, $record->{ $key } if defined $record->{ $key }; + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + $fh = write_stream( $options->{ "data_out" } ); + + foreach $key ( @{ $options->{ "keys" } } ) + { + if ( $median_hash{ $key } ) { + $median = Maasha::Calc::median( $median_hash{ $key } ); + } else { + $median = "N/A"; + } + + Maasha::Biopieces::put_record( { $key . "_MEDIAN" => $median } , $fh ); + } + + close $fh; +} + + +sub script_max_vals +{ + # Martin A. Hansen, February 2008. + + # Determine the maximum values of given keys. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $key, $fh, %max_hash, $max_record ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + foreach $key ( @{ $options->{ "keys" } } ) + { + if ( $record->{ $key } ) + { + $max_hash{ $key } = $record->{ $key } if $record->{ $key } > $max_hash{ $key }; + } + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + $fh = write_stream( $options->{ "data_out" } ); + + foreach $key ( @{ $options->{ "keys" } } ) + { + $max_record->{ $key . "_MAX" } = $max_hash{ $key }; + } + + Maasha::Biopieces::put_record( $max_record, $fh ); + + close $fh; +} + + +sub script_min_vals +{ + # Martin A. Hansen, February 2008. + + # Determine the minimum values of given keys. + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $key, $fh, %min_hash, $min_record ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + foreach $key ( @{ $options->{ "keys" } } ) + { + if ( defined $record->{ $key } ) + { + if ( exists $min_hash{ $key } ) { + $min_hash{ $key } = $record->{ $key } if $record->{ $key } < $min_hash{ $key }; + } else { + $min_hash{ $key } = $record->{ $key }; + } + } + } + + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + } + + $fh = write_stream( $options->{ "data_out" } ); + + foreach $key ( @{ $options->{ "keys" } } ) + { + $min_record->{ $key . "_MIN" } = $min_hash{ $key }; + } + + Maasha::Biopieces::put_record( $min_record, $fh ); + + close $fh; +} + + +sub script_upload_to_ucsc +{ + # Martin A. Hansen, August 2007. + + # Calculate the mean of values of given keys. + + # This routine has developed into the most ugly hack. Do something! + + my ( $in, # handle to in stream + $out, # handle to out stream + $options, # options hash + ) = @_; + + # Returns nothing. + + my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $entry ); + + $options->{ "short_label" } ||= $options->{ 'table' }; + $options->{ "long_label" } ||= $options->{ 'table' }; + $options->{ "group" } ||= $ENV{ "LOGNAME" }; + $options->{ "priority" } ||= 1; + $options->{ "visibility" } ||= "pack"; + $options->{ "color" } ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) ); + $options->{ "chunk_size" } ||= 10_000_000_000; # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go. + + $file = "$BP_TMP/ucsc_upload.tmp"; + $append = 0; + $first = 1; + $i = 0; + + $fh_out = Maasha::Common::write_open( $file ); + + while ( $record = Maasha::Biopieces::get_record( $in ) ) + { + Maasha::Biopieces::put_record( $record, $out ) if not $options->{ "no_stream" }; + + if ( $record->{ "REC_TYPE" } eq "fixed_step" ) + { + $format = "WIGGLE"; + + if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) { + Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh_out ); + } + } + elsif ( $record->{ "REC_TYPE" } eq "PSL" ) + { + $format = "PSL"; + + Maasha::UCSC::psl_put_header( $fh_out ) if $first; + Maasha::UCSC::psl_put_entry( $record, $fh_out ); + + $first = 0; + } + elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } ) + { + # chrom chromStart chromEnd name score strand size secStr conf + + $format = "BED_SS"; + + print $fh_out join ( "\t", + $record->{ "CHR" }, + $record->{ "CHR_BEG" }, + $record->{ "CHR_END" } + 1, + $record->{ "Q_ID" }, + $record->{ "SCORE" }, + $record->{ "STRAND" }, + $record->{ "SIZE" }, + $record->{ "SEC_STRUCT" }, + $record->{ "CONF" }, + ), "\n"; + } + elsif ( $record->{ "REC_TYPE" } eq "BED" ) + { + $format = "BED"; + $columns = $record->{ "BED_COLS" }; + + if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { + Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } ); + } + } + elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } ) + { + $format = "BED"; + $columns = 6; + + if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { + Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } ); + } + } + elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ ) + { + $format = "BED"; + $columns = 6; + + $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000; + + if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { + Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } ); + } + } + elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i ) + { + $format = "BED"; + $columns = 6; + + if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { + Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } ); + } + } + + if ( $i == $options->{ "chunk_size" } ) + { + close $fh_out; + + if ( $format eq "BED" ) { + Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append ); + } elsif ( $format eq "PSL" ) { + Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append ); + } + + unlink $file; + + $first = 1; + + $append = 1; + + $fh_out = Maasha::Common::write_open( $file ); + } + + $i++; + } + + close $fh_out; + + if ( exists $options->{ "database" } and $options->{ "table" } ) + { + if ( $format eq "BED" ) + { + $type = "bed $columns"; + + Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append ); + } + elsif ( $format eq "BED_SS" ) + { + $type = "type bed 6 +"; + + Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append ); + } + elsif ( $format eq "PSL" ) + { + $type = "psl"; + + Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append ); + } + elsif ( $format eq "WIGGLE" ) + { + $options->{ "visibility" } = "full"; + + $wig_file = "$options->{ 'table' }.wig"; + $wib_file = "$options->{ 'table' }.wib"; + + $wib_dir = "$ENV{ 'HOME' }/ucsc/wib"; + + Maasha::Common::dir_create_if_not_exists( $wib_dir ); + + if ( $options->{ 'verbose' } ) { + `cd $BP_TMP && wigEncode $file $wig_file $wib_file`; + } else { + `cd $BP_TMP && wigEncode $file $wig_file $wib_file > /dev/null 2>&1`; + } + + Maasha::Common::run( "mv", "$BP_TMP/$wib_file $wib_dir" ); + + unlink $file; + + $file = $wig_file; + + $type = "wig 0"; + + Maasha::UCSC::wiggle_upload_to_ucsc( $BP_TMP, $wib_dir, $file, $options ); + } + + unlink $file; + + Maasha::UCSC::ucsc_update_config( $options, $type ); + } +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + + +sub grab_lookup +{ + # Martin A. Hansen, November 2009. + + # Uses keys from a lookup hash to search records. Optionally, a list of + # keys can be given so the lookup is limited to these, also, flags + # can be given to limit lookup to keys or vals only. Returns 1 if lookup + # succeeded, else 0. + + my ( $lookup_hash, # hashref with patterns + $record, # hashref + $keys, # list of keys - OPTIONAL + $vals_only, # only vals flag - OPTIONAL + $keys_only, # only keys flag - OPTIONAL + ) = @_; + + # Returns boolean. + + if ( $keys ) + { + map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } @{ $keys }; + } + else + { + if ( not $vals_only ) { + map { return 1 if exists $lookup_hash->{ $_ } } keys %{ $record }; + } + + if ( not $keys_only ) { + map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } keys %{ $record }; + } + } + + return 0; +} + + +sub grab_patterns +{ + # Martin A. Hansen, November 2009. + + # Uses patterns to match records containing the pattern as a substring. + # Returns 1 if the record is matched, else 0. + + my ( $patterns, # list of patterns + $record, # hashref + $keys, # list of keys - OPTIONAL + $vals_only, # only vals flag - OPTIONAL + $keys_only, # only keys flag - OPTIONAL + ) = @_; + + # Returns boolean. + + my ( $pattern ); + + foreach $pattern ( @{ $patterns } ) + { + if ( $keys ) + { + map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } @{ $keys }; + } + else + { + if ( not $vals_only ) { + map { return 1 if index( $_, $pattern ) >= 0 } keys %{ $record }; + } + + if ( not $keys_only ) { + map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } keys %{ $record }; + } + } + } + + return 0; +} + + +sub grab_regex +{ + # Martin A. Hansen, November 2009. + + # Uses regex to match records. + # Returns 1 if the record is matched, else 0. + + my ( $regex, # regex to match + $record, # hashref + $keys, # list of keys - OPTIONAL + $vals_only, # only vals flag - OPTIONAL + $keys_only, # only keys flag - OPTIONAL + ) = @_; + + # Returns boolean. + + if ( $keys ) + { + map { return 1 if $record->{ $_ } =~ /$regex/ } @{ $keys }; + } + else + { + if ( not $vals_only ) { + map { return 1 if $_ =~ /$regex/ } keys %{ $record }; + } + + if ( not $keys_only ) { + map { return 1 if $record->{ $_ } =~ /$regex/ } keys %{ $record }; + } + } + + return 0; +} + + +sub grab_eval +{ + # Martin A. Hansen, November 2009. + + # Test if the value of a given record key evaluates according + # to a given operator. Returns 1 if eval is OK, else 0. + + my ( $key, # record key + $op, # operator + $val, # value + $record, # hashref + ) = @_; + + # Returns boolean. + + if ( defined $record->{ $key } ) + { + return 1 if ( $op eq "<" and $record->{ $key } < $val ); + return 1 if ( $op eq ">" and $record->{ $key } > $val ); + return 1 if ( $op eq ">=" and $record->{ $key } >= $val ); + return 1 if ( $op eq "<=" and $record->{ $key } <= $val ); + return 1 if ( $op eq "=" and $record->{ $key } == $val ); + return 1 if ( $op eq "!=" and $record->{ $key } != $val ); + return 1 if ( $op eq "eq" and $record->{ $key } eq $val ); + return 1 if ( $op eq "ne" and $record->{ $key } ne $val ); + } + + return 0; +} + + +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + +1; + +__END__ diff --git a/code_perl/Maasha/Biopieces.pm b/code_perl/Maasha/Biopieces.pm index cba6c52..8d8f403 100644 --- a/code_perl/Maasha/Biopieces.pm +++ b/code_perl/Maasha/Biopieces.pm @@ -29,6467 +29,53 @@ package Maasha::Biopieces; # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< -use strict; -use Data::Dumper; -use Getopt::Long qw( :config bundling ); -use Time::HiRes qw( gettimeofday ); -use Storable qw( dclone ); -use Maasha::Config; -use Maasha::Common; -use Maasha::Filesys; -use Maasha::Fasta; -use Maasha::Align; -use Maasha::Matrix; -use Maasha::Match; -use Maasha::EMBL; -use Maasha::Stockholm; -use Maasha::Seq; -use Maasha::Patscan; -use Maasha::Plot; -use Maasha::Calc; -use Maasha::UCSC; -use Maasha::UCSC::BED; -use Maasha::UCSC::Wiggle; -use Maasha::NCBI; -use Maasha::GFF; -use Maasha::TwoBit; -use Maasha::Solid; -use Maasha::Solexa; -use Maasha::SQL; -use Maasha::Gwiki; - -use vars qw( @ISA @EXPORT_OK ); - -require Exporter; - -@ISA = qw( Exporter ); - -@EXPORT_OK = qw( - read_stream - write_stream - get_record - put_record -); - -use constant { - SEQ_NAME => 0, - SEQ => 1, -}; - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SIGNAL HANDLER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -$SIG{ '__DIE__' } = \&sig_handler; -$SIG{ 'INT' } = \&sig_handler; -$SIG{ 'TERM' } = \&sig_handler; - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> GLOBALS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -my ( $script, $BP_TMP ); - -$script = Maasha::Common::get_scriptname(); -$BP_TMP = Maasha::Common::get_tmpdir(); - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> LOG <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -my ( $log_global, $log_local ); - -$log_global = Maasha::Common::append_open( "$ENV{ 'BP_LOG' }/biopieces.log" ); -$log_local = Maasha::Common::append_open( "$ENV{ 'HOME' }/.biopieces.log" ); - -$log_global->autoflush( 1 ); -$log_local->autoflush( 1 ); - -&log( $log_global, $script, \@ARGV ); -&log( $log_local, $script, \@ARGV ); - -close $log_global; -close $log_local; - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> RUN SCRIPT <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -run_script( $script ); - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -sub log -{ - # Martin A. Hansen, January 2008. - - # Log messages to logfile. - - my ( $fh, # filehandle to logfile - $script, # script name - $argv, # reference to @ARGV - ) = @_; - - # Returns nothing. - - my ( $time_stamp, $user ); - - $time_stamp = Maasha::Common::time_stamp(); - - $user = $ENV{ 'USER' }; - - $script = "biopieces" if $script eq "-e"; - - print $fh "$time_stamp\t$user\t$script ", join( " ", @{ $argv } ), "\n"; -} - - -sub run_script -{ - # Martin A. Hansen, August 2007. - - # Run a specific script. - - my ( $script, # script name - ) = @_; - - # Returns nothing. - - my ( $t0, $t1, $options, $in, $out ); - - $t0 = gettimeofday(); - - $options = get_options( $script ); - - $options->{ "SCRIPT" } = $script; - - if ( $script ne "list_biopieces" and $script ne "list_genomes" ) { - $script = "print_usage" if ( -t STDIN and keys %{ $options } <= 1 or $options->{ 'help' } ); - } - - $in = read_stream( $options->{ "stream_in" } ); - $out = write_stream( $options->{ "stream_out" } ); - - if ( $script eq "print_usage" ) { script_print_usage( $in, $out, $options ) } - elsif ( $script eq "list_biopieces" ) { script_list_biopieces( $in, $out, $options ) } - elsif ( $script eq "list_genomes" ) { script_list_genomes( $in, $out, $options ) } - elsif ( $script eq "read_fasta" ) { script_read_fasta( $in, $out, $options ) } - elsif ( $script eq "read_tab" ) { script_read_tab( $in, $out, $options ) } - elsif ( $script eq "read_psl" ) { script_read_psl( $in, $out, $options ) } - elsif ( $script eq "read_bed" ) { script_read_bed( $in, $out, $options ) } - elsif ( $script eq "read_fixedstep" ) { script_read_fixedstep( $in, $out, $options ) } - elsif ( $script eq "read_blast_tab" ) { script_read_blast_tab( $in, $out, $options ) } - elsif ( $script eq "read_embl" ) { script_read_embl( $in, $out, $options ) } - elsif ( $script eq "read_stockholm" ) { script_read_stockholm( $in, $out, $options ) } - elsif ( $script eq "read_phastcons" ) { script_read_phastcons( $in, $out, $options ) } - elsif ( $script eq "read_soft" ) { script_read_soft( $in, $out, $options ) } - elsif ( $script eq "read_gff" ) { script_read_gff( $in, $out, $options ) } - elsif ( $script eq "read_2bit" ) { script_read_2bit( $in, $out, $options ) } - elsif ( $script eq "read_solexa" ) { script_read_solexa( $in, $out, $options ) } - elsif ( $script eq "read_solid" ) { script_read_solid( $in, $out, $options ) } - elsif ( $script eq "read_mysql" ) { script_read_mysql( $in, $out, $options ) } - elsif ( $script eq "read_ucsc_config" ) { script_read_ucsc_config( $in, $out, $options ) } - elsif ( $script eq "assemble_tag_contigs" ) { script_assemble_tag_contigs( $in, $out, $options ) } - elsif ( $script eq "format_genome" ) { script_format_genome( $in, $out, $options ) } - elsif ( $script eq "length_seq" ) { script_length_seq( $in, $out, $options ) } - elsif ( $script eq "uppercase_seq" ) { script_uppercase_seq( $in, $out, $options ) } - elsif ( $script eq "shuffle_seq" ) { script_shuffle_seq( $in, $out, $options ) } - elsif ( $script eq "analyze_seq" ) { script_analyze_seq( $in, $out, $options ) } - elsif ( $script eq "analyze_tags" ) { script_analyze_tags( $in, $out, $options ) } - elsif ( $script eq "complexity_seq" ) { script_complexity_seq( $in, $out, $options ) } - elsif ( $script eq "oligo_freq" ) { script_oligo_freq( $in, $out, $options ) } - elsif ( $script eq "create_weight_matrix" ) { script_create_weight_matrix( $in, $out, $options ) } - elsif ( $script eq "calc_bit_scores" ) { script_calc_bit_scores( $in, $out, $options ) } - elsif ( $script eq "calc_fixedstep" ) { script_calc_fixedstep( $in, $out, $options ) } - elsif ( $script eq "reverse_seq" ) { script_reverse_seq( $in, $out, $options ) } - elsif ( $script eq "complement_seq" ) { script_complement_seq( $in, $out, $options ) } - elsif ( $script eq "remove_indels" ) { script_remove_indels( $in, $out, $options ) } - elsif ( $script eq "transliterate_seq" ) { script_transliterate_seq( $in, $out, $options ) } - elsif ( $script eq "transliterate_vals" ) { script_transliterate_vals( $in, $out, $options ) } - elsif ( $script eq "translate_seq" ) { script_translate_seq( $in, $out, $options ) } - elsif ( $script eq "extract_seq" ) { script_extract_seq( $in, $out, $options ) } - elsif ( $script eq "get_genome_seq" ) { script_get_genome_seq( $in, $out, $options ) } - elsif ( $script eq "get_genome_align" ) { script_get_genome_align( $in, $out, $options ) } - elsif ( $script eq "get_genome_phastcons" ) { script_get_genome_phastcons( $in, $out, $options ) } - elsif ( $script eq "fold_seq" ) { script_fold_seq( $in, $out, $options ) } - elsif ( $script eq "split_seq" ) { script_split_seq( $in, $out, $options ) } - elsif ( $script eq "split_bed" ) { script_split_bed( $in, $out, $options ) } - elsif ( $script eq "align_seq" ) { script_align_seq( $in, $out, $options ) } - elsif ( $script eq "tile_seq" ) { script_tile_seq( $in, $out, $options ) } - elsif ( $script eq "invert_align" ) { script_invert_align( $in, $out, $options ) } - elsif ( $script eq "patscan_seq" ) { script_patscan_seq( $in, $out, $options ) } - elsif ( $script eq "create_blast_db" ) { script_create_blast_db( $in, $out, $options ) } - elsif ( $script eq "blast_seq" ) { script_blast_seq( $in, $out, $options ) } - elsif ( $script eq "blat_seq" ) { script_blat_seq( $in, $out, $options ) } - elsif ( $script eq "soap_seq" ) { script_soap_seq( $in, $out, $options ) } - elsif ( $script eq "match_seq" ) { script_match_seq( $in, $out, $options ) } - elsif ( $script eq "create_vmatch_index" ) { script_create_vmatch_index( $in, $out, $options ) } - elsif ( $script eq "vmatch_seq" ) { script_vmatch_seq( $in, $out, $options ) } - elsif ( $script eq "write_fasta" ) { script_write_fasta( $in, $out, $options ) } - elsif ( $script eq "write_align" ) { script_write_align( $in, $out, $options ) } - elsif ( $script eq "write_blast" ) { script_write_blast( $in, $out, $options ) } - elsif ( $script eq "write_tab" ) { script_write_tab( $in, $out, $options ) } - elsif ( $script eq "write_bed" ) { script_write_bed( $in, $out, $options ) } - elsif ( $script eq "write_psl" ) { script_write_psl( $in, $out, $options ) } - elsif ( $script eq "write_fixedstep" ) { script_write_fixedstep( $in, $out, $options ) } - elsif ( $script eq "write_2bit" ) { script_write_2bit( $in, $out, $options ) } - elsif ( $script eq "write_solid" ) { script_write_solid( $in, $out, $options ) } - elsif ( $script eq "write_ucsc_config" ) { script_write_ucsc_config( $in, $out, $options ) } - elsif ( $script eq "head_records" ) { script_head_records( $in, $out, $options ) } - elsif ( $script eq "remove_keys" ) { script_remove_keys( $in, $out, $options ) } - elsif ( $script eq "remove_adaptor" ) { script_remove_adaptor( $in, $out, $options ) } - elsif ( $script eq "remove_mysql_tables" ) { script_remove_mysql_tables( $in, $out, $options ) } - elsif ( $script eq "remove_ucsc_tracks" ) { script_remove_ucsc_tracks( $in, $out, $options ) } - elsif ( $script eq "rename_keys" ) { script_rename_keys( $in, $out, $options ) } - elsif ( $script eq "uniq_vals" ) { script_uniq_vals( $in, $out, $options ) } - elsif ( $script eq "merge_vals" ) { script_merge_vals( $in, $out, $options ) } - elsif ( $script eq "merge_records" ) { script_merge_records( $in, $out, $options ) } - elsif ( $script eq "grab" ) { script_grab( $in, $out, $options ) } - elsif ( $script eq "compute" ) { script_compute( $in, $out, $options ) } - elsif ( $script eq "flip_tab" ) { script_flip_tab( $in, $out, $options ) } - elsif ( $script eq "add_ident" ) { script_add_ident( $in, $out, $options ) } - elsif ( $script eq "count_records" ) { script_count_records( $in, $out, $options ) } - elsif ( $script eq "random_records" ) { script_random_records( $in, $out, $options ) } - elsif ( $script eq "sort_records" ) { script_sort_records( $in, $out, $options ) } - elsif ( $script eq "count_vals" ) { script_count_vals( $in, $out, $options ) } - elsif ( $script eq "plot_histogram" ) { script_plot_histogram( $in, $out, $options ) } - elsif ( $script eq "plot_lendist" ) { script_plot_lendist( $in, $out, $options ) } - elsif ( $script eq "plot_chrdist" ) { script_plot_chrdist( $in, $out, $options ) } - elsif ( $script eq "plot_karyogram" ) { script_plot_karyogram( $in, $out, $options ) } - elsif ( $script eq "plot_matches" ) { script_plot_matches( $in, $out, $options ) } - elsif ( $script eq "plot_seqlogo" ) { script_plot_seqlogo( $in, $out, $options ) } - elsif ( $script eq "plot_phastcons_profiles" ) { script_plot_phastcons_profiles( $in, $out, $options ) } - elsif ( $script eq "analyze_bed" ) { script_analyze_bed( $in, $out, $options ) } - elsif ( $script eq "analyze_vals" ) { script_analyze_vals( $in, $out, $options ) } - elsif ( $script eq "length_vals" ) { script_length_vals( $in, $out, $options ) } - elsif ( $script eq "sum_vals" ) { script_sum_vals( $in, $out, $options ) } - elsif ( $script eq "mean_vals" ) { script_mean_vals( $in, $out, $options ) } - elsif ( $script eq "median_vals" ) { script_median_vals( $in, $out, $options ) } - elsif ( $script eq "max_vals" ) { script_max_vals( $in, $out, $options ) } - elsif ( $script eq "min_vals" ) { script_min_vals( $in, $out, $options ) } - elsif ( $script eq "upload_to_ucsc" ) { script_upload_to_ucsc( $in, $out, $options ) } - - close $in if defined $in; - close $out; - - $t1 = gettimeofday(); - - print STDERR "Program: $script" . ( " " x ( 25 - length( $script ) ) ) . sprintf( "Run time: %.4f\n", ( $t1 - $t0 ) ) if $options->{ 'verbose' }; -} - - -sub get_options -{ - # Martin A. Hansen, February 2008. - - # Gets options from commandline and checks these vigerously. - - my ( $script, # name of script - ) = @_; - - # Returns hash - - my ( %options, @options, $opt, @genomes, $real ); - - if ( $script eq "print_usage" ) - { - @options = qw( - data_in|i=s - ); - } - elsif ( $script eq "read_fasta" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "read_tab" ) - { - @options = qw( - data_in|i=s - delimit|d=s - cols|c=s - keys|k=s - skip|s=s - num|n=s - ); - } - elsif ( $script eq "read_psl" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "read_bed" ) - { - @options = qw( - data_in|i=s - cols|c=s - num|n=s - check|C - ); - } - elsif ( $script eq "read_fixedstep" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "read_blast_tab" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "read_embl" ) - { - @options = qw( - data_in|i=s - num|n=s - keys|k=s - feats|f=s - quals|q=s - ); - } - elsif ( $script eq "read_stockholm" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "read_phastcons" ) - { - @options = qw( - data_in|i=s - num|n=s - min|m=s - dist|d=s - threshold|t=f - gap|g=s - ); - } - elsif ( $script eq "read_soft" ) - { - @options = qw( - data_in|i=s - samples|s=s - num|n=s - ); - } - elsif ( $script eq "read_gff" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "read_2bit" ) - { - @options = qw( - data_in|i=s - num|n=s - no_mask|N - ); - } - elsif ( $script eq "read_solexa" ) - { - @options = qw( - data_in|i=s - num|n=s - format|f=s - quality|q=s - ); - } - elsif ( $script eq "read_solid" ) - { - @options = qw( - data_in|i=s - num|n=s - quality|q=s - ); - } - elsif ( $script eq "read_mysql" ) - { - @options = qw( - database|d=s - query|q=s - user|u=s - password|p=s - ); - } - elsif ( $script eq "read_ucsc_config" ) - { - @options = qw( - data_in|i=s - num|n=s - ); - } - elsif ( $script eq "assemble_tag_contigs" ) - { - @options = qw( - check|C - ); - } - elsif ( $script eq "calc_fixedstep" ) - { - @options = qw( - check|C - ); - } - elsif ( $script eq "format_genome" ) - { - @options = qw( - no_stream|x - dir|d=s - genome|g=s - formats|f=s - ); - } - elsif ( $script eq "length_seq" ) - { - @options = qw( - no_stream|x - data_out|o=s - ); - } - elsif ( $script eq "oligo_freq" ) - { - @options = qw( - word_size|w=s - all|a - ); - } - elsif ( $script eq "create_weight_matrix" ) - { - @options = qw( - percent|p - ); - } - elsif ( $script eq "transliterate_seq" ) - { - @options = qw( - search|s=s - replace|r=s - delete|d=s - ); - } - elsif ( $script eq "transliterate_vals" ) - { - @options = qw( - keys|k=s - search|s=s - replace|r=s - delete|d=s - ); - } - elsif ( $script eq "translate_seq" ) - { - @options = qw( - frames|f=s - ); - } - elsif ( $script eq "extract_seq" ) - { - @options = qw( - beg|b=s - end|e=s - len|l=s - ); - } - elsif ( $script eq "get_genome_seq" ) - { - @options = qw( - genome|g=s - chr|c=s - beg|b=s - end|e=s - len|l=s - flank|f=s - mask|m - splice|s - ); - } - elsif ( $script eq "get_genome_align" ) - { - @options = qw( - genome|g=s - chr|c=s - beg|b=s - end|e=s - len|l=s - strand|s=s - ); - } - elsif ( $script eq "get_genome_phastcons" ) - { - @options = qw( - genome|g=s - chr|c=s - beg|b=s - end|e=s - len|l=s - flank|f=s - ); - } - elsif ( $script eq "split_seq" ) - { - @options = qw( - word_size|w=s - uniq|u - ); - } - elsif ( $script eq "split_bed" ) - { - @options = qw( - window_size|w=s - step_size|s=s - ); - } - elsif ( $script eq "tile_seq" ) - { - @options = qw( - identity|i=s - supress_indels|s - ); - } - elsif ( $script eq "invert_align" ) - { - @options = qw( - soft|s - ); - } - elsif ( $script eq "patscan_seq" ) - { - @options = qw( - patterns|p=s - patterns_in|P=s - comp|c - max_hits|h=s - max_misses|m=s - genome|g=s - ); - } - elsif ( $script eq "create_blast_db" ) - { - @options = qw( - no_stream|x - database|d=s - ); - } - elsif ( $script eq "blast_seq" ) - { - @options = qw( - database|d=s - genome|g=s - program|p=s - e_val|e=f - filter|f - cpus|c=s - no_filter|F - ); - } - elsif ( $script eq "blat_seq" ) - { - @options = qw( - genome|g=s - tile_size|t=s - step_size|s=s - min_identity|m=s - min_score|M=s - one_off|o=s - ooc|c - ); - } - elsif ( $script eq "soap_seq" ) - { - @options = qw( - in_file|i=s - genome|g=s - seed_size|s=s - mismatches|m=s - gap_size|G=s - cpus|c=s - ); - } - elsif ( $script eq "match_seq" ) - { - @options = qw( - word_size|w=s - direction|d=s - ); - } - elsif ( $script eq "create_vmatch_index" ) - { - @options = qw( - index_name|i=s - prefix_length|p=s - no_stream|x - ); - } - elsif ( $script eq "vmatch_seq" ) - { - @options = qw( - genome|g=s - index_name|i=s - count|c - max_hits|m=s - hamming_dist|h=s - edit_dist|e=s - ); - } - elsif ( $script eq "write_fasta" ) - { - @options = qw( - wrap|w=s - no_stream|x - data_out|o=s - compress|Z - ); - } - elsif ( $script eq "write_align" ) - { - @options = qw( - wrap|w=s - no_stream|x - no_ruler|R - no_consensus|C - data_out|o=s - ); - } - elsif ( $script eq "write_blast" ) - { - @options = qw( - no_stream|x - data_out|o=s - comment|c - compress|Z - ); - } - elsif ( $script eq "write_tab" ) - { - @options = qw( - no_stream|x - data_out|o=s - delimit|d=s - keys|k=s - no_keys|K=s - comment|c - compress|Z - ); - } - elsif ( $script eq "write_bed" ) - { - @options = qw( - cols|c=s - check|C - no_stream|x - data_out|o=s - compress|Z - ); - } - elsif ( $script eq "write_psl" ) - { - @options = qw( - no_stream|x - data_out|o=s - compress|Z - ); - } - elsif ( $script eq "write_fixedstep" ) - { - @options = qw( - no_stream|x - data_out|o=s - compress|Z - ); - } - elsif ( $script eq "write_2bit" ) - { - @options = qw( - no_stream|x - data_out|o=s - no_mask|N - ); - } - elsif ( $script eq "write_solid" ) - { - @options = qw( - wrap|w=s - no_stream|x - data_out|o=s - compress|Z - ); - } - elsif ( $script eq "write_ucsc_config" ) - { - @options = qw( - no_stream|x - data_out|o=s - ); - } - elsif ( $script eq "plot_seqlogo" ) - { - @options = qw( - no_stream|x - data_out|o=s - ); - } - elsif ( $script eq "plot_phastcons_profiles" ) - { - @options = qw( - no_stream|x - data_out|o=s - genome|g=s - mean|m - median|M - flank|f=s - terminal|t=s - title|T=s - xlabel|X=s - ylabel|Y=s - ); - } - elsif ( $script eq "analyze_vals" ) - { - @options = qw( - no_stream|x - keys|k=s - ); - } - elsif ( $script eq "head_records" ) - { - @options = qw( - num|n=s - ); - } - elsif ( $script eq "remove_keys" ) - { - @options = qw( - keys|k=s - save_keys|K=s - ); - } - elsif ( $script eq "remove_adaptor" ) - { - @options = qw( - adaptor|a=s - mismatches|m=s - remove|r=s - offset|o=s - ); - } - elsif ( $script eq "remove_mysql_tables" ) - { - @options = qw( - database|d=s - tables|t=s - keys|k=s - user|u=s - password|p=s - no_stream|x - ); - } - elsif ( $script eq "remove_ucsc_tracks" ) - { - @options = qw( - database|d=s - tracks|t=s - keys|k=s - config_file|c=s - user|u=s - password|p=s - no_stream|x - ); - } - elsif ( $script eq "rename_keys" ) - { - @options = qw( - keys|k=s - ); - } - elsif ( $script eq "uniq_vals" ) - { - @options = qw( - key|k=s - invert|i - ); - } - elsif ( $script eq "merge_vals" ) - { - @options = qw( - keys|k=s - delimit|d=s - ); - } - elsif ( $script eq "merge_records" ) - { - @options = qw( - keys|k=s - merge|m=s - ); - } - elsif ( $script eq "grab" ) - { - @options = qw( - patterns|p=s - patterns_in|P=s - regex|r=s - eval|e=s - exact_in|E=s - invert|i - case_insensitive|c - keys|k=s - keys_only|K - vals_only|V - ); - } - elsif ( $script eq "compute" ) - { - @options = qw( - eval|e=s - ); - } - elsif ( $script eq "add_ident" ) - { - @options = qw( - prefix|p=s - key|k=s - ); - } - elsif ( $script eq "count_records" ) - { - @options = qw( - no_stream|x - data_out|o=s - ); - } - elsif ( $script eq "random_records" ) - { - @options = qw( - num|n=s - ); - } - elsif ( $script eq "sort_records" ) - { - @options = qw( - reverse|r - keys|k=s - ); - } - elsif ( $script eq "count_vals" ) - { - @options = qw( - keys|k=s - ); - } - elsif ( $script eq "plot_histogram" ) - { - @options = qw( - no_stream|x - data_out|o=s - terminal|t=s - title|T=s - xlabel|X=s - ylabel|Y=s - key|k=s - sort|s=s - ); - } - elsif ( $script eq "plot_lendist" ) - { - @options = qw( - no_stream|x - data_out|o=s - terminal|t=s - title|T=s - xlabel|X=s - ylabel|Y=s - key|k=s - ); - } - elsif ( $script eq "plot_chrdist" ) - { - @options = qw( - no_stream|x - data_out|o=s - terminal|t=s - title|T=s - xlabel|X=s - ylabel|Y=s - ); - } - elsif ( $script eq "plot_karyogram" ) - { - @options = qw( - no_stream|x - data_out|o=s - genome|g=s - feat_color|f=s - ); - } - elsif ( $script eq "plot_matches" ) - { - @options = qw( - no_stream|x - data_out|o=s - terminal|t=s - title|T=s - xlabel|X=s - ylabel|Y=s - direction|d=s - ); - } - elsif ( $script eq "length_vals" ) - { - @options = qw( - keys|k=s - ); - } - elsif ( $script eq "sum_vals" ) - { - @options = qw( - no_stream|x - data_out|o=s - keys|k=s - ); - } - elsif ( $script eq "mean_vals" ) - { - @options = qw( - no_stream|x - data_out|o=s - keys|k=s - ); - } - elsif ( $script eq "median_vals" ) - { - @options = qw( - no_stream|x - data_out|o=s - keys|k=s - ); - } - elsif ( $script eq "max_vals" ) - { - @options = qw( - no_stream|x - data_out|o=s - keys|k=s - ); - } - elsif ( $script eq "min_vals" ) - { - @options = qw( - no_stream|x - data_out|o=s - keys|k=s - ); - } - elsif ( $script eq "upload_to_ucsc" ) - { - @options = qw( - no_stream|x - database|d=s - table|t=s - short_label|s=s - long_label|l=s - group|g=s - priority|p=f - use_score|u - visibility|V=s - color|c=s - check|C - ); - } - - push @options, qw( - stream_in|I=s - stream_out|O=s - verbose|v - help|? - ); - -# print STDERR Dumper( \@options ); - - GetOptions( - \%options, - @options, - ); - -# print STDERR Dumper( \%options ); - - if ( -t STDIN && scalar( keys %options ) == 0 or $options{ "help" } ) { - return wantarray ? %options : \%options; - } - - $options{ "cols" } = [ split ",", $options{ "cols" } ] if defined $options{ "cols" }; - $options{ "keys" } = [ split ",", $options{ "keys" } ] if defined $options{ "keys" }; - $options{ "no_keys" } = [ split ",", $options{ "no_keys" } ] if defined $options{ "no_keys" }; - $options{ "save_keys" } = [ split ",", $options{ "save_keys" } ] if defined $options{ "save_keys" }; - $options{ "quals" } = [ split ",", $options{ "quals" } ] if defined $options{ "quals" }; - $options{ "feats" } = [ split ",", $options{ "feats" } ] if defined $options{ "feats" }; - $options{ "frames" } = [ split ",", $options{ "frames" } ] if defined $options{ "frames" }; - $options{ "formats" } = [ split ",", $options{ "formats" } ] if defined $options{ "formats" }; - $options{ "samples" } = [ split ",", $options{ "samples" } ] if defined $options{ "samples" }; - $options{ "tables" } = [ split ",", $options{ "tables" } ] if defined $options{ "tables" }; - $options{ "tracks" } = [ split ",", $options{ "tracks" } ] if defined $options{ "tracks" }; - - # ---- check arguments ---- - - if ( $options{ 'data_in' } ) - { - $options{ "files" } = getopt_files( $options{ 'data_in' } ); - - Maasha::Common::error( qq(Argument to --data_in must be a valid file or fileglob expression) ) if scalar @{ $options{ "files" } } == 0; - } - - map { Maasha::Common::error( qq(Argument to --cols must be a whole numbers - not "$_") ) if $_ !~ /^\d+$/ } @{ $options{ "cols" } } if $options{ "cols" }; - - # print STDERR Dumper( \%options ); - - $real = "beg|end|word_size|wrap|tile_size|len|prefix_length|mismatches|offset|num|skip|cpus|window_size|step_size"; - - foreach $opt ( keys %options ) - { - if ( $opt =~ /stream_in|pattern_in|exact_in/ and not -f $options{ $opt } ) - { - Maasha::Common::error( qq(Argument to --$opt must be a valid file or fileglob expression - not "$options{ $opt }") ); - } - elsif ( $opt =~ /$real/ and $options{ $opt } !~ /^\d+$/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be a whole number - not "$options{ $opt }") ); - } - elsif ( $opt =~ /max_hits|max_hits|max_misses|dist|edit_dist|flank|gap|hamming_dist|priority/ and $options{ $opt } !~ /^-?\d+$/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be an integer - not "$options{ $opt }") ); - } - elsif ( $opt =~ /identity|threshold/ and $options{ $opt } !~ /^-?(?:\d+(?:\.\d*)?|\.\d+)$/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be a decimal number - not "$options{ $opt }") ); - } - elsif ( $opt =~ /e_val/ and $options{ $opt } !~ /^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be a float - not "$options{ $opt }") ); - } - elsif ( $opt =~ /strand/ and $options{ $opt } !~ /^(\+|-)$/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be "+" or "-" - not "$options{ $opt }") ); - } - elsif ( $opt eq "genome" and $script ne "format_genome" ) - { - @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" ); - map { $_ =~ s/.*\/(.+)$/$1/ } @genomes; - - if ( not grep { $_ =~ /^$options{ $opt }$/ } @genomes ) { - Maasha::Common::error( qq(Genome $options{ $opt } not found in "$ENV{ 'BP_DATA' }/genomes/") ); - } - } - elsif ( $opt eq "terminal" and not $options{ $opt } =~ /^(svg|post|dumb|x11)/ ) - { - Maasha::Common::error( qq(Bad --$opt argument "$options{ $opt }") ); - } - elsif ( $opt eq "table" and $options{ $opt } =~ /(-|\.)/ ) - { - Maasha::Common::error( qq(Character '$1' is not allowed in table name: $options{ $opt }) ); - } - elsif ( $opt eq "merge" and $options{ $opt } !~ /^(AandB|AorB|BorA|AnotB|BnotA)$/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be AandB, AorB, BorA, AnotB, or BnotA - not "$options{ $opt }") ); - } - elsif ( $opt eq "format" and $script eq "read_solexa" and $options{ $opt } !~ /octal|decimal/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be octal or decimal - not "$options{ $opt }") ); - } - elsif ( $opt eq "remove" and $script eq "remove_adaptor" and $options{ $opt } !~ /before|after|skip/ ) - { - Maasha::Common::error( qq(Argument to --$opt must be before, after, or skip - not "$options{ $opt }") ); - } - } - - Maasha::Common::error( qq(no --database specified) ) if $script eq "remove_ucsc_tracks" and not $options{ "database" }; - Maasha::Common::error( qq(no --database specified) ) if $script eq "create_blast_db" and not $options{ "database" }; - Maasha::Common::error( qq(no --index_name specified) ) if $script =~ /create_vmatch_index/ and not $options{ "index_name" }; - Maasha::Common::error( qq(no --database or --genome specified) ) if $script eq "blast_seq" and not $options{ "genome" } and not $options{ "database" }; - Maasha::Common::error( qq(both --database and --genome specified) ) if $script eq "blast_seq" and $options{ "genome" } and $options{ "database" }; - Maasha::Common::error( qq(no --index_name or --genome specified) ) if $script eq "vmatch_seq" and not $options{ "genome" } and not $options{ "index_name" }; - Maasha::Common::error( qq(both --index and --genome specified) ) if $script eq "vmatch_seq" and $options{ "genome" } and $options{ "index_name" }; - Maasha::Common::error( qq(no --in_file or --genome specified) ) if $script eq "soap_seq" and not $options{ "genome" } and not $options{ "in_file" }; - Maasha::Common::error( qq(both --in_file and --genome specified) ) if $script eq "soap_seq" and $options{ "genome" } and $options{ "in_file" }; - Maasha::Common::error( qq(no --genome specified) ) if $script =~ /format_genome|get_genome_seq|get_genome_align|get_genome_phastcons|blat_seq|plot_phastcons_profiles|plot_karyogram/ and not $options{ "genome" }; - Maasha::Common::error( qq(no --key specified) ) if $script =~ /plot_lendist|plot_histogram/ and not $options{ "key" }; - Maasha::Common::error( qq(no --keys speficied) ) if $script =~ /sort_records|count_vals|sum_vals|mean_vals|median_vals|length_vals/ and not $options{ "keys" }; - - if ( $script eq "upload_to_ucsc" ) - { - Maasha::Common::error( qq(no --database specified) ) if not $options{ "database" }; - Maasha::Common::error( qq(no --table specified) ) if not $options{ "table" }; - } - - return wantarray ? %options : \%options; -} - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SCRIPTS <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -sub script_print_usage -{ - # Martin A. Hansen, January 2008. - - # Retrieves usage information from file and - # prints this nicely formatted. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $file, $wiki, $lines ); - - if ( $options->{ 'data_in' } ) { - $file = $options->{ 'data_in' }; - } else { - $file = join "", $ENV{ 'BP_DIR' }, "/bp_usage/", $options->{ 'SCRIPT' }, ".wiki"; - } - - $wiki = Maasha::Gwiki::gwiki_read( $file ); - - ( $wiki->[ 2 ], $wiki->[ 3 ], $wiki->[ 0 ], $wiki->[ 1 ] ) = ( $wiki->[ 0 ], $wiki->[ 1 ], $wiki->[ 2 ], $wiki->[ 3 ] ); - - if ( not $options->{ "help" } ) { - @{ $wiki } = grep { $_->[ 0 ]->{ 'SECTION' } =~ /Biopiece|summary|Usage|Options|Help/ } @{ $wiki }; - } - - $lines = Maasha::Gwiki::gwiki2ascii( $wiki ); - - print STDERR "$_\n" foreach @{ $lines }; - - exit; -} - - -sub script_list_biopieces -{ - # Martin A. Hansen, January 2008. - - # Prints the summary from the usage for each of the biopieces. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( @files, $file, $wiki, $program, $summary ); - - @files = Maasha::Common::ls_files( "$ENV{ 'BP_DIR' }/bp_usage" ); - - foreach $file ( sort @files ) - { - if ( $file =~ /\/([a-z0-9_]+)\.wiki$/ ) - { - $program = $1; - - $wiki = Maasha::Gwiki::gwiki_read( $file ); - - $summary = $wiki->[ 0 ]->[ 0 ]->{ 'TEXT' }; - $summary =~ s/^#summary\s+//; - - printf( "%-30s%s\n", $program, $summary ); - } - } - - exit; -} - - -sub script_list_genomes -{ - # Martin A. Hansen, January 2008. - - # Prints the synopsis from the usage for each of the biopieces. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( @genomes, $genome, @formats, $format, %hash, %found, @row ); - - @genomes = Maasha::Common::ls_dirs( "$ENV{ 'BP_DATA' }/genomes" ); - - foreach $genome ( @genomes ) - { - next if $genome =~ /\.$/; - - @formats = Maasha::Common::ls_dirs( $genome ); - - foreach $format ( @formats ) - { - if ( $format =~ /\/([^\/]+)\/(\w+)$/ ) - { - $hash{ $1 }{ $2 } = 1; - - $found{ $2 } = 1; - } - } - } - - @row = "Genome"; - - map { push @row, $_ } sort keys %found; - - print join( "\t", @row ), "\n"; - - foreach $genome ( sort keys %hash ) - { - @row = $genome; - - foreach $format ( sort keys %found ) - { - if ( exists $hash{ $genome }{ $format } ) { - push @row, "yes"; - } else { - push @row, "no"; - } - } - - print join( "\t", @row ), "\n"; - } -} - - -sub script_read_fasta -{ - # Martin A. Hansen, August 2007. - - # Read sequences from FASTA file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file, $data_in, $entry, $num ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::Fasta::get_entry( $data_in ) ) - { - if ( defined $entry->[ SEQ_NAME ] and $entry->[ SEQ ] ) - { - $record = { - SEQ_NAME => $entry->[ SEQ_NAME ], - SEQ => $entry->[ SEQ ], - SEQ_LEN => length $entry->[ SEQ ], - }; - - put_record( $record, $out ); - } - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_tab -{ - # Martin A. Hansen, August 2007. - - # Read table or table columns from stream or file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $file, $line, @fields, @fields2, $i, $record, $data_in, $skip, $num ); - - $options->{ 'delimit' } ||= '\s+'; - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $skip = $options->{ 'skip' } ||= 0; - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $line = <$data_in> ) - { - if ( $skip ) - { - $skip--; - next; - } - - next if $line =~ /^#|^$/; - - chomp $line; - - undef $record; - undef @fields2; - - @fields = split /$options->{'delimit'}/, $line; - - if ( $options->{ "cols" } ) { - map { push @fields2, $fields[ $_ ] } @{ $options->{ "cols" } }; - } else { - @fields2 = @fields; - } - - for ( $i = 0; $i < @fields2; $i++ ) - { - if ( $options->{ "keys" }->[ $i ] ) { - $record->{ $options->{ "keys" }->[ $i ] } = $fields2[ $i ]; - } else { - $record->{ "V" . $i } = $fields2[ $i ]; - } - } - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_psl -{ - # Martin A. Hansen, August 2007. - - # Read psl table from stream or file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file, $data_in, $num ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $record = Maasha::UCSC::psl_get_entry( $data_in ) ) - { - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - } - - NUM: -} - - -sub script_read_bed -{ - # Martin A. Hansen, August 2007. - - # Read bed table from stream or file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $cols, $file, $record, $bed_entry, $data_in, $num ); - - $cols = $options->{ 'cols' }->[ 0 ]; - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $data_in, $cols, $options->{ 'check' } ) ) - { - $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry ); - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_fixedstep -{ - # Martin A. Hansen, Juli 2008. - - # Read fixedstep wiggle format from stream or file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $file, $record, $entry, $data_in, $num ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $data_in ) ) - { - $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $entry ); - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_blast_tab -{ - # Martin A. Hansen, September 2007. - - # Read tabular BLAST output from NCBI blast run with -m8 or -m9. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $file, $line, @fields, $strand, $record, $data_in, $num ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $line = <$data_in> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split /\t/, $line; - - $record->{ "REC_TYPE" } = "BLAST"; - $record->{ "Q_ID" } = $fields[ 0 ]; - $record->{ "S_ID" } = $fields[ 1 ]; - $record->{ "IDENT" } = $fields[ 2 ]; - $record->{ "ALIGN_LEN" } = $fields[ 3 ]; - $record->{ "MISMATCHES" } = $fields[ 4 ]; - $record->{ "GAPS" } = $fields[ 5 ]; - $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based - $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based - $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based - $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based - $record->{ "E_VAL" } = $fields[ 10 ]; - $record->{ "BIT_SCORE" } = $fields[ 11 ]; - - if ( $record->{ "S_BEG" } > $record->{ "S_END" } ) - { - $record->{ "STRAND" } = '-'; - - ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } ); - } - else - { - $record->{ "STRAND" } = '+'; - } - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_embl -{ - # Martin A. Hansen, August 2007. - - # Read EMBL format. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( %options2, $file, $data_in, $num, $entry, $record ); - - map { $options2{ "keys" }{ $_ } = 1 } @{ $options->{ "keys" } }; - map { $options2{ "feats" }{ $_ } = 1 } @{ $options->{ "feats" } }; - map { $options2{ "quals" }{ $_ } = 1 } @{ $options->{ "quals" } }; - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::EMBL::get_embl_entry( $data_in ) ) - { - $record = Maasha::EMBL::parse_embl_entry( $entry, \%options2 ); - - my ( $feat, $feat2, $qual, $qual_val, $record_copy ); - - $record_copy = dclone $record; - - delete $record_copy->{ "FT" }; - - put_record( $record_copy, $out ); - - delete $record_copy->{ "SEQ" }; - - foreach $feat ( keys %{ $record->{ "FT" } } ) - { - $record_copy->{ "FEAT_TYPE" } = $feat; - - foreach $feat2 ( @{ $record->{ "FT" }->{ $feat } } ) - { - foreach $qual ( keys %{ $feat2 } ) - { - $qual_val = join "; ", @{ $feat2->{ $qual } }; - - $qual =~ s/^_//; - $qual = uc $qual; - - $record_copy->{ $qual } = $qual_val; - } - - put_record( $record_copy, $out ); - } - } - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_stockholm -{ - # Martin A. Hansen, August 2007. - - # Read Stockholm format. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $data_in, $file, $num, $entry, $record, $record_anno, $record_align, $key, $seq ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::Stockholm::get_stockholm_entry( $data_in ) ) - { - $record = Maasha::Stockholm::parse_stockholm_entry( $entry ); - - undef $record_anno; - - foreach $key ( keys %{ $record->{ "GF" } } ) { - $record_anno->{ $key } = $record->{ "GF" }->{ $key }; - } - - $record_anno->{ "ALIGN" } = $num; - - put_record( $record_anno, $out ); - - foreach $seq ( @{ $record->{ "ALIGN" } } ) - { - undef $record_align; - - $record_align = { - SEQ_NAME => $seq->[ 0 ], - SEQ => $seq->[ 1 ], - }; - - put_record( $record_align, $out ); - } - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_phastcons -{ - # Martin A. Hansen, December 2007. - - # Read PhastCons format. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $data_in, $file, $num, $entry, @records, $record ); - - $options->{ "min" } ||= 10; - $options->{ "dist" } ||= 25; - $options->{ "threshold" } ||= 0.8; - $options->{ "gap" } ||= 5; - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::UCSC::fixedstep_get_entry( $data_in ) ) - { - @records = Maasha::UCSC::phastcons_parse_entry( $entry, $options ); - - foreach $record ( @records ) - { - $record->{ "REC_TYPE" } = "BED"; - $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1; - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_soft -{ - # Martin A. Hansen, December 2007. - - # Read soft format. - # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $data_in, $file, $num, $records, $record, $soft_index, $fh, @platforms, $plat_table, @samples, $sample, $old_end, $skip ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - print STDERR "Creating index for file: $file\n" if $options->{ "verbose" }; - - $soft_index = Maasha::NCBI::soft_index_file( $file ); - - $fh = Maasha::Common::read_open( $file ); - - @platforms = grep { $_->{ "SECTION" } =~ /PLATFORM/ } @{ $soft_index }; - - print STDERR "Getting platform tables for file: $file\n" if $options->{ "verbose" }; - - $plat_table = Maasha::NCBI::soft_get_platform( $fh, $platforms[ 0 ]->{ "LINE_BEG" }, $platforms[ -1 ]->{ "LINE_END" } ); - - @samples = grep { $_->{ "SECTION" } =~ /SAMPLE/ } @{ $soft_index }; - - $old_end = $platforms[ -1 ]->{ "LINE_END" }; - - foreach $sample ( @samples ) - { - $skip = 0; - $skip = 1 if ( $options->{ "samples" } and grep { $sample->{ "SECTION" } !~ /$_/ } @{ $options->{ "samples" } } ); - - print STDERR "Getting samples for dataset: $sample->{ 'SECTION' }\n" if $options->{ "verbose" } and not $skip; - - $records = Maasha::NCBI::soft_get_sample( $fh, $plat_table, $sample->{ "LINE_BEG" } - $old_end - 1, $sample->{ "LINE_END" } - $old_end - 1, $skip ); - - foreach $record ( @{ $records } ) - { - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - $old_end = $sample->{ "LINE_END" }; - } - - close $fh; - } - - NUM: - - close $data_in if $data_in; - close $fh if $fh; -} - - -sub script_read_gff -{ - # Martin A. Hansen, February 2008. - - # Read soft format. - # http://www.ncbi.nlm.nih.gov/geo/info/soft2.html - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $data_in, $file, $fh, $num, $record, $entry ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $fh = Maasha::Common::read_open( $file ); - - while ( $entry = Maasha::GFF::get_entry( $fh ) ) - { - put_record( $entry, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $fh; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_2bit -{ - # Martin A. Hansen, March 2008. - - # Read sequences from 2bit file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file, $data_in, $mask, $toc, $line, $num ); - - $mask = 1 if not $options->{ "no_mask" }; - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - $toc = Maasha::TwoBit::twobit_get_TOC( $data_in ); - - foreach $line ( @{ $toc } ) - { - $record->{ "SEQ_NAME" } = $line->[ 0 ]; - $record->{ "SEQ" } = Maasha::TwoBit::twobit_get_seq( $data_in, $line->[ 1 ], undef, undef, $mask ); - $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_solexa -{ - # Martin A. Hansen, March 2008. - - # Read Solexa sequence reads from file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file, $data_in, $entry, $num, @seqs, @scores, $i ); - - $options->{ "format" } ||= "octal"; - $options->{ "quality" } ||= 20; - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - if ( $options->{ "format" } eq "octal" ) - { - while ( $entry = Maasha::Solexa::solexa_get_entry_octal( $data_in ) ) - { - $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } ); - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - } - else - { - while ( $entry = Maasha::Solexa::solexa_get_entry_decimal( $data_in ) ) - { - $record = Maasha::Solexa::solexa2biopiece( $entry, $options->{ "quality" } ); - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_solid -{ - # Martin A. Hansen, April 2008. - - # Read Solid sequence from file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file, $data_in, $line, $num, $seq_name, $seq_cs, $seq_qual, @scores, @seqs, $i ); - - $options->{ "quality" } ||= 15; - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $line = <$data_in> ) - { - chomp $line; - - ( $seq_name, $seq_cs, $seq_qual ) = split /\t/, $line; - - @scores = split /,/, $seq_qual; - @seqs = split //, Maasha::Solid::color_space2seq( $seq_cs ); - - for ( $i = 0; $i < @seqs; $i++ ) { - $seqs[ $i ] = lc $seqs[ $i ] if $scores[ $i ] < $options->{ "quality" }; - } - - $record = { - REC_TYPE => 'SOLID', - SEQ_NAME => $seq_name, - SEQ_CS => $seq_cs, - SEQ_QUAL => join( ";", @scores ), - SEQ_LEN => length $seq_cs, - SEQ => join( "", @seqs ), - SCORE_MEAN => sprintf( "%.2f", Maasha::Calc::mean( \@scores ) ), - }; - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_read_mysql -{ - # Martin A. Hansen, May 2008. - - # Read a MySQL query into stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $dbh, $results ); - - $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user(); - $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password(); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } ); - - $results = Maasha::SQL::query_hashref_list( $dbh, $options->{ "query" } ); - - Maasha::SQL::disconnect( $dbh ); - - map { put_record( $_ ) } @{ $results }; -} - - -sub script_read_ucsc_config -{ - # Martin A. Hansen, November 2008. - - # Read track entries from UCSC Genome Browser '.ra' files. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file, $data_in, $entry, $num ); - - while ( $record = get_record( $in ) ) { - put_record( $record, $out ); - } - - $num = 1; - - foreach $file ( @{ $options->{ "files" } } ) - { - $data_in = Maasha::Common::read_open( $file ); - - while ( $record = Maasha::UCSC::ucsc_config_entry_get( $data_in ) ) - { - $record->{ 'REC_TYPE' } = "UCSC Config"; - - put_record( $record, $out ); - - goto NUM if $options->{ "num" } and $num == $options->{ "num" }; - - $num++; - } - - close $data_in; - } - - NUM: - - close $data_in if $data_in; -} - - -sub script_assemble_tag_contigs -{ - # Martin A. Hansen, November 2008. - - # Assemble tags from the stream into - # tag contigs. - - # The current implementation is quite - # slow because of heavy use of temporary - # files. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $bed_file, $tag_file, $fh_in, $fh_out, $cols, $record, $bed_entry, $file_hash, $chr, $strand ); - - $bed_file = "$BP_TMP/assemble_tag_contigs.bed"; - $fh_out = Maasha::Filesys::file_write_open( $bed_file ); - $cols = 6; # we only need the first 6 BED columns - - while ( $record = get_record( $in ) ) - { - if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) - { - $strand = $record->{ 'STRAND' } || '+'; - - Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh_out, $cols, $options->{ 'check' } ); - } - } - - close $fh_out; - - $file_hash = Maasha::UCSC::BED::bed_file_split_on_chr( $bed_file, $BP_TMP, $cols ); - - unlink $bed_file; - - foreach $chr ( sort keys %{ $file_hash } ) - { - $bed_file = $file_hash->{ $chr }; - $tag_file = "$bed_file.tc"; - - Maasha::Common::run( "bed2tag_contigs", "< $bed_file > $tag_file" ); - - $fh_in = Maasha::Filesys::file_read_open( $tag_file ); - - while ( $bed_entry = Maasha::UCSC::BED::bed_entry_get( $fh_in, $cols, $options->{ 'check' } ) ) - { - if ( $record = Maasha::UCSC::BED::bed2biopiece( $bed_entry ) ) { - put_record( $record, $out ); - } - } - - close $fh_in; - - unlink $bed_file; - unlink $tag_file; - } -} - - -sub script_format_genome -{ - # Martin A. Hansen, Juli 2008. - - # Format a genome to speficed formats. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $dir, $genome, $fasta_dir, $phastcons_dir, $vals, $fh_out, $record, $format, $index, $entry ); - - $dir = $options->{ 'dir' } || $ENV{ 'BP_DATA' }; - $genome = $options->{ 'genome' }; - - Maasha::Common::error( "Directory: $dir does not exist" ) if not -d $dir; - Maasha::Common::dir_create_if_not_exists( "$dir/genomes" ); - Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome" ); - - if ( grep { $_ =~ /fasta|blast|vmatch/i } @{ $options->{ "formats" } } ) - { - if ( -f "$dir/genomes/$genome/fasta/$genome.fna" ) - { - $fasta_dir = "$dir/genomes/$genome/fasta"; - } - else - { - Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome/fasta" ); - - $fasta_dir = "$dir/genomes/$genome/fasta"; - - $fh_out = Maasha::Common::write_open( "$fasta_dir/$genome.fna" ); - } - } - elsif ( grep { $_ =~ /phastcons/i } @{ $options->{ "formats" } } ) - { - Maasha::Common::dir_create_if_not_exists( "$dir/genomes/$genome/phastcons" ); - - $phastcons_dir = "$dir/genomes/$genome/phastcons"; - - $fh_out = Maasha::Common::write_open( "$phastcons_dir/$genome.pp" ); - } - - while ( $record = get_record( $in ) ) - { - if ( $fh_out and $entry = Maasha::Fasta::biopiece2fasta( $record ) ) - { - Maasha::Fasta::put_entry( $entry, $fh_out, $options->{ "wrap" } ); - } - elsif ( $fh_out and $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "STEP" } and $record->{ "VALS" } ) - { - print $fh_out "fixedStep chrom=$record->{ 'CHR' } start=$record->{ 'CHR_BEG' } step=$record->{ 'STEP' }\n"; - - $vals = $record->{ 'VALS' }; - - $vals =~ tr/,/\n/; - - print $fh_out "$vals\n"; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - foreach $format ( @{ $options->{ 'formats' } } ) - { - if ( $format =~ /^fasta$/i ) { Maasha::Fasta::fasta_index( "$fasta_dir/$genome.fna", "$dir/genomes/$genome/fasta/$genome.index" ) } - elsif ( $format =~ /^blast$/i ) { Maasha::NCBI::blast_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/blast", "dna", $genome ) } - elsif ( $format =~ /^blat$/i ) { print STDERR "BLAT FORMAT NOT IMPLEMENTED" } - elsif ( $format =~ /^vmatch$/i ) { Maasha::Match::vmatch_index( "$genome.fna", $fasta_dir, "$dir/genomes/$genome/vmatch", $BP_TMP ) } - elsif ( $format =~ /^phastcons$/i ) { Maasha::UCSC::phastcons_index( "$genome.pp", $phastcons_dir ) } - } - - close $fh_out if $fh_out; -} - - -sub script_length_seq -{ - # Martin A. Hansen, August 2007. - - # Determine the length of sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $total ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; - $total += $record->{ "SEQ_LEN" }; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - put_record( { TOTAL_SEQ_LEN => $total }, $out ); -} - - -sub script_uppercase_seq -{ - # Martin A. Hansen, August 2007. - - # Uppercases sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record ); - - while ( $record = get_record( $in ) ) - { - $record->{ "SEQ" } = uc $record->{ "SEQ" } if $record->{ "SEQ" }; - - put_record( $record, $out ); - } -} - - -sub script_shuffle_seq -{ - # Martin A. Hansen, December 2007. - - # Shuffle sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record ); - - while ( $record = get_record( $in ) ) - { - $record->{ "SEQ" } = Maasha::Seq::seq_shuffle( $record->{ "SEQ" } ) if $record->{ "SEQ" }; - - put_record( $record, $out ); - } -} - - -sub script_analyze_seq -{ - # Martin A. Hansen, August 2007. - - # Analyze sequence composition of sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record, $analysis ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - $analysis = Maasha::Seq::seq_analyze( $record->{ "SEQ" } ); - - map { $record->{ $_ } = $analysis->{ $_ } } keys %{ $analysis }; - } - - put_record( $record, $out ); - } -} - - -sub script_analyze_tags -{ - # Martin A. Hansen, August 2008. - - # Analyze sequence tags in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record, $analysis, %len_hash, %clone_hash, $clones, $key, $tag_record ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) - { - if ( $record->{ "SEQ_NAME" } =~ /_(\d+)$/ ) - { - $clones = $1; - - $len_hash{ length( $record->{ "SEQ" } ) }++; - $clone_hash{ length( $record->{ "SEQ" } ) } += $clones; - } - } - elsif ( $record->{ "Q_ID" } and $record->{ "BED_LEN" } ) - { - if ( $record->{ "Q_ID" } =~ /_(\d+)$/ ) - { - $clones = $1; - - $len_hash{ $record->{ "BED_LEN" } }++; - $clone_hash{ $record->{ "BED_LEN" } } += $clones; - } - } - } - - foreach $key ( sort { $a <=> $b } keys %len_hash ) - { - $tag_record->{ "TAG_LEN" } = $key; - $tag_record->{ "TAG_COUNT" } = $len_hash{ $key }; - $tag_record->{ "TAG_CLONES" } = $clone_hash{ $key }; - - put_record( $tag_record, $out ); - } -} - - -sub script_complexity_seq -{ - # Martin A. Hansen, May 2008. - - # Generates an index calculated as the most common di-residue over - # the sequence length for all sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record, $index ); - - while ( $record = get_record( $in ) ) - { - $record->{ "SEQ_COMPLEXITY" } = sprintf( "%.2f", Maasha::Seq::seq_complexity( $record->{ "SEQ" } ) ) if $record->{ "SEQ" }; - - put_record( $record, $out ); - } -} - - -sub script_oligo_freq -{ - # Martin A. Hansen, August 2007. - - # Determine the length of sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, %oligos, @freq_table ); - - $options->{ "word_size" } ||= 7; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - map { $oligos{ $_ }++ } Maasha::Seq::seq2oligos( \$record->{ "SEQ" }, $options->{ "word_size" } ); - - if ( not $options->{ "all" } ) - { - @freq_table = Maasha::Seq::oligo_freq( \%oligos ); - - map { put_record( $_, $out ) } @freq_table; - - undef %oligos; - } - } - - put_record( $record, $out ); - } - - if ( $options->{ "all" } ) - { - @freq_table = Maasha::Seq::oligo_freq( \%oligos ); - - map { put_record( $_, $out ) } @freq_table; - } -} - - -sub script_create_weight_matrix -{ - # Martin A. Hansen, August 2007. - - # Creates a weight matrix from an alignmnet. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $count, $i, $res, %freq_hash, %res_hash, $freq ); - - $count = 0; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - for ( $i = 0; $i < length $record->{ "SEQ" }; $i++ ) - { - $res = substr $record->{ "SEQ" }, $i, 1; - - $freq_hash{ $i }{ $res }++; - $res_hash{ $res } = 1; - } - - $count++; - } - else - { - put_record( $record, $out ); - } - } - - foreach $res ( sort keys %res_hash ) - { - undef $record; - - $record->{ "V0" } = $res; - - for ( $i = 0; $i < keys %freq_hash; $i++ ) - { - $freq = $freq_hash{ $i }{ $res } || 0; - - if ( $options->{ "percent" } ) { - $freq = sprintf( "%.0f", 100 * $freq / $count ) if $freq > 0; - } - - $record->{ "V" . ( $i + 1 ) } = $freq; - } - - put_record( $record, $out ); - } -} - - -sub script_calc_bit_scores -{ - # Martin A. Hansen, March 2007. - - # Calculates the bit scores for each position from an alignmnet in the stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record, $type, $count, $i, $res, %freq_hash, $bit_max, $bit_height, $bit_diff ); - - $count = 0; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type; - - for ( $i = 0; $i < length $record->{ "SEQ" }; $i++ ) - { - $res = substr $record->{ "SEQ" }, $i, 1; - - next if $res =~ /-|_|~|\./; - - $freq_hash{ $i }{ $res }++; - } - - $count++; - } - else - { - put_record( $record, $out ); - } - } - - undef $record; - - if ( $type eq "protein" ) { - $bit_max = 4; - } else { - $bit_max = 2; - } - - for ( $i = 0; $i < keys %freq_hash; $i++ ) - { - $bit_height = Maasha::Seq::seqlogo_calc_bit_height( $freq_hash{ $i }, $count ); - - $bit_diff = $bit_max - $bit_height; - - $record->{ "V" . ( $i ) } = sprintf( "%.2f", $bit_diff ); - } - - put_record( $record, $out ); -} - - -sub script_calc_fixedstep -{ - # Martin A. Hansen, September 2008. - - # Calculates fixedstep entries from data in the stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $bed_file, $fh_in, $fh_out, $record, $file_hash, $chr, $bed_entry, $fixedstep_file, $fixedstep_entry ); - - $bed_file = "$BP_TMP/calc_fixedstep.bed"; - $fh_out = Maasha::Filesys::file_write_open( $bed_file ); - - while ( $record = get_record( $in ) ) - { - if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record ) ) { - Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh_out, undef, $options->{ 'check' } ); - } - } - - close $fh_out; - - $file_hash = Maasha::UCSC::BED::bed_file_split_on_chr( $bed_file, $BP_TMP ); - - unlink $bed_file; - - foreach $chr ( sort keys %{ $file_hash } ) - { - $bed_file = $file_hash->{ $chr }; - - $fixedstep_file = Maasha::UCSC::Wiggle::fixedstep_calc( $bed_file, $chr, $options->{ 'use_score' }, $options->{ 'use_log10' } ); - - #$fixedstep_file = "$bed_file.fixedstep"; - - # Maasha::Common::run( "bed2fixedstep", "< $bed_file > $fixedstep_file" ); - - - $fh_in = Maasha::Filesys::file_read_open( $fixedstep_file ); - - while ( $fixedstep_entry = Maasha::UCSC::Wiggle::fixedstep_entry_get( $fh_in ) ) - { - if ( $record = Maasha::UCSC::Wiggle::fixedstep2biopiece( $fixedstep_entry ) ) { - put_record( $record, $out ); - } - } - - close $fh_in; - - unlink $bed_file; - unlink $fixedstep_file; - } -} - - -sub script_reverse_seq -{ - # Martin A. Hansen, August 2007. - - # Reverse sequence in record. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) { - $record->{ "SEQ" } = reverse $record->{ "SEQ" }; - } - - put_record( $record, $out ); - } -} - - -sub script_complement_seq -{ - # Martin A. Hansen, August 2007. - - # Complement sequence in record. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record, $type ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - if ( not $type ) { - $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ); - } - - if ( $type eq "rna" ) { - Maasha::Seq::rna_comp( \$record->{ "SEQ" } ); - } elsif ( $type eq "dna" ) { - Maasha::Seq::dna_comp( \$record->{ "SEQ" } ); - } - } - - put_record( $record, $out ); - } -} - - -sub script_remove_indels -{ - # Martin A. Hansen, August 2007. - - # Remove indels from sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record ); - - while ( $record = get_record( $in ) ) - { - $record->{ 'SEQ' } =~ tr/-~.//d if $record->{ "SEQ" }; - - put_record( $record, $out ); - } -} - - -sub script_transliterate_seq -{ - # Martin A. Hansen, August 2007. - - # Transliterate chars from sequence in record. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $search, $replace, $delete ); - - $search = $options->{ "search" } || ""; - $replace = $options->{ "replace" } || ""; - $delete = $options->{ "delete" } || ""; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - if ( $search and $replace ) { - eval "\$record->{ 'SEQ' } =~ tr/$search/$replace/"; - } elsif ( $delete ) { - eval "\$record->{ 'SEQ' } =~ tr/$delete//d"; - } - } - - put_record( $record, $out ); - } -} - - -sub script_transliterate_vals -{ - # Martin A. Hansen, April 2008. - - # Transliterate chars from values in record. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $search, $replace, $delete, $key ); - - $search = $options->{ "search" } || ""; - $replace = $options->{ "replace" } || ""; - $delete = $options->{ "delete" } || ""; - - while ( $record = get_record( $in ) ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( exists $record->{ $key } ) - { - if ( $search and $replace ) { - eval "\$record->{ $key } =~ tr/$search/$replace/"; - } elsif ( $delete ) { - eval "\$record->{ $key } =~ tr/$delete//d"; - } - } - } - - put_record( $record, $out ); - } -} - - -sub script_translate_seq -{ - # Martin A. Hansen, February 2008. - - # Translate DNA sequence into protein sequence. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $frame, %new_record ); - - $options->{ "frames" } ||= [ 1, 2, 3, -1, -2, -3 ]; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - if ( Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) eq "dna" ) - { - foreach $frame ( @{ $options->{ "frames" } } ) - { - %new_record = %{ $record }; - - $new_record{ "SEQ" } = Maasha::Seq::translate( $record->{ "SEQ" }, $frame ); - $new_record{ "SEQ_LEN" } = length $new_record{ "SEQ" }; - $new_record{ "FRAME" } = $frame; - - put_record( \%new_record, $out ); - } - } - } - else - { - put_record( $record, $out ); - } - } -} - - -sub script_extract_seq -{ - # Martin A. Hansen, August 2007. - - # Extract subsequences from sequences in record. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $beg, $end, $len, $record ); - - if ( not defined $options->{ "beg" } or $options->{ "beg" } < 0 ) { - $beg = 0; - } else { - $beg = $options->{ "beg" } - 1; # correcting for start offset - } - - if ( defined $options->{ "end" } and $options->{ "end" } - 1 < $beg ) { - $end = $beg - 1; - } elsif ( defined $options->{ "end" } ) { - $end = $options->{ "end" } - 1; # correcting for start offset - } - - $len = $options->{ "len" }; - -# print "beg->$beg, end->$end, len->$len\n"; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - if ( defined $beg and defined $end ) - { - if ( $end - $beg + 1 > length $record->{ "SEQ" } ) { - $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg; - } else { - $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg, $end - $beg + 1; - } - } - elsif ( defined $beg and defined $len ) - { - if ( $len > length $record->{ "SEQ" } ) { - $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg; - } else { - $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg, $len; - } - } - elsif ( defined $beg ) - { - $record->{ "SEQ" } = substr $record->{ "SEQ" }, $beg; - } - } - - $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; - - put_record( $record, $out ); - } -} - - -sub script_get_genome_seq -{ - # Martin A. Hansen, December 2007. - - # Gets a subsequence from a genome. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $genome, $genome_file, $index_file, $index, $fh, $index_head, $index_beg, $index_len, $beg, $len, %lookup_hash, @begs, @lens, $i, $seq ); - - $options->{ "flank" } ||= 0; - - if ( $options->{ "genome" } ) - { - $genome = $options->{ "genome" }; - - $genome_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.fna"; - $index_file = "$ENV{ 'BP_DATA' }/genomes/$genome/fasta/$genome.index"; - - $fh = Maasha::Common::read_open( $genome_file ); - $index = Maasha::Fasta::index_retrieve( $index_file ); - - shift @{ $index }; # Get rid of the file size info - - map { $lookup_hash{ $_->[ 0 ] } = [ $_->[ 1 ], $_->[ 2 ] ] } @{ $index }; - - if ( exists $lookup_hash{ $options->{ "chr" } } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) ) - { - ( $index_beg, $index_len ) = @{ $lookup_hash{ $options->{ "chr" } } }; - - $beg = $index_beg + $options->{ "beg" } - 1; - - if ( $options->{ "len" } ) { - $len = $options->{ "len" }; - } elsif ( $options->{ "end" } ) { - $len = ( $options->{ "end" } - $options->{ "beg" } + 1 ); - } - - $beg -= $options->{ "flank" }; - $len += 2 * $options->{ "flank" }; - - if ( $beg <= $index_beg ) - { - $len -= $index_beg - $beg; - $beg = $index_beg; - } - - $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len; - - next if $beg > $index_beg + $index_len; - - $record->{ "CHR" } = $options->{ "chr" }; - $record->{ "CHR_BEG" } = $beg - $index_beg; - $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1; - - $record->{ "SEQ" } = Maasha::Common::file_read( $fh, $beg, $len ); - $record->{ "SEQ_LEN" } = $len; - - put_record( $record, $out ); - } - } - - while ( $record = get_record( $in ) ) - { - if ( $options->{ "genome" } and not $record->{ "SEQ" } ) - { - if ( $record->{ "REC_TYPE" } eq "BED" and exists $lookup_hash{ $record->{ "CHR" } } ) - { - ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "CHR" } } }; - - $beg = $record->{ "CHR_BEG" } + $index_beg; - $len = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1; - } - elsif ( $record->{ "REC_TYPE" } eq "PSL" and exists $lookup_hash{ $record->{ "S_ID" } } ) - { - ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } }; - - $beg = $record->{ "S_BEG" } + $index_beg; - $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1; - } - elsif ( $record->{ "REC_TYPE" } eq "BLAST" and exists $lookup_hash{ $record->{ "S_ID" } } ) - { - ( $index_beg, $index_len ) = @{ $lookup_hash{ $record->{ "S_ID" } } }; - - $beg = $record->{ "S_BEG" } + $index_beg; - $len = $record->{ "S_END" } - $record->{ "S_BEG" } + 1; - } - - $beg -= $options->{ "flank" }; - $len += 2 * $options->{ "flank" }; - - if ( $beg <= $index_beg ) - { - $len -= $index_beg - $beg; - $beg = $index_beg; - } - - $len = $index_beg + $index_len - $beg if $beg + $len > $index_beg + $index_len; - - next if $beg > $index_beg + $index_len; - - $record->{ "CHR_BEG" } = $beg - $index_beg; - $record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $len - 1; - - $record->{ "SEQ" } = Maasha::Common::file_read( $fh, $beg, $len ); - - if ( $record->{ "STRAND" } and $record->{ "STRAND" } eq "-" ) - { - Maasha::Seq::dna_comp( \$record->{ "SEQ" } ); - $record->{ "SEQ" } = reverse $record->{ "SEQ" }; - } - - if ( $options->{ "mask" } ) - { - if ( $record->{ "BLOCK_COUNT" } > 1 ) # uppercase hit block segments and lowercase the rest. - { - $record->{ "SEQ" } = lc $record->{ "SEQ" }; - - @begs = split ",", $record->{ "Q_BEGS" }; - @lens = split ",", $record->{ "BLOCK_LENS" }; - - for ( $i = 0; $i < @begs; $i++ ) { - substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ], uc substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ]; - } - } - } - elsif ( $options->{ "splice" } ) - { - if ( $record->{ "BLOCK_COUNT" } > 1 ) # splice block sequences - { - $seq = ""; - @begs = split ",", $record->{ "Q_BEGS" }; - @lens = split ",", $record->{ "BLOCK_LENS" }; - - for ( $i = 0; $i < @begs; $i++ ) { - $seq .= substr $record->{ "SEQ" }, $begs[ $i ], $lens[ $i ]; - } - - $record->{ "SEQ" } = $seq; - } - } - - $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; - } - - put_record( $record, $out ); - } - - close $fh if $fh; -} - - -sub script_get_genome_align -{ - # Martin A. Hansen, April 2008. - - # Gets a subalignment from a multiple genome alignment. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $maf_track, $align, $align_num, $beg, $end, $len, $entry ); - - $options->{ "strand" } ||= "+"; - - $align_num = 1; - - $maf_track = Maasha::Config::maf_track( $options->{ "genome" } ); - - if ( $options->{ "chr" } and $options->{ "beg" } and ( $options->{ "end" } or $options->{ "len" } ) ) - { - $beg = $options->{ "beg" } - 1; - - if ( $options->{ "end" } ) { - $end = $options->{ "end" }; - } elsif ( $options->{ "len" } ) { - $end = $beg + $options->{ "len" }; - } - - $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $options->{ "chr" }, $beg, $end, $options->{ "strand" } ); - - foreach $entry ( @{ $align } ) - { - $entry->{ "CHR" } = $record->{ "CHR" }; - $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" }; - $entry->{ "CHR_END" } = $record->{ "CHR_END" }; - $entry->{ "STRAND" } = $record->{ "STRAND" } || '+'; - $entry->{ "Q_ID" } = $record->{ "Q_ID" }; - $entry->{ "SCORE" } = $record->{ "SCORE" }; - - put_record( $entry, $out ); - } - } - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "REC_TYPE" } eq "BED" ) - { - $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $record->{ "STRAND" } ); - } - elsif ( $record->{ "REC_TYPE" } eq "VMATCH" ) - { - $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" } + 1, $record->{ "STRAND" } ); - } - elsif ( $record->{ "REC_TYPE" } eq "PSL" ) - { - $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } ); - } - elsif ( $record->{ "REC_TYPE" } eq "BLAST" ) - { - $align = Maasha::UCSC::maf_extract( $BP_TMP, $options->{ "genome" }, $maf_track, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $record->{ "STRAND" } ); - } - - foreach $entry ( @{ $align } ) - { - $entry->{ "CHR" } = $record->{ "CHR" }; - $entry->{ "CHR_BEG" } = $record->{ "CHR_BEG" }; - $entry->{ "CHR_END" } = $record->{ "CHR_END" }; - $entry->{ "STRAND" } = $record->{ "STRAND" }; - $entry->{ "Q_ID" } = $record->{ "Q_ID" }; - $entry->{ "SCORE" } = $record->{ "SCORE" }; - - put_record( $entry, $out ); - } - - $align_num++; - } -} - - -sub script_get_genome_phastcons -{ - # Martin A. Hansen, February 2008. - - # Get phastcons scores from genome intervals. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $scores, $record ); - - $options->{ "flank" } ||= 0; - - $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } ); - $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } ); - - $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index ); - $fh_phastcons = Maasha::Common::read_open( $phastcons_file ); - - if ( defined $options->{ "chr" } and defined $options->{ "beg" } and ( defined $options->{ "end" } or defined $options->{ "len" } ) ) - { - $options->{ "beg" } -= 1; # request is 1-based - $options->{ "end" } -= 1; # request is 1-based - - if ( $options->{ "len" } ) { - $options->{ "end" } = $options->{ "beg" } + $options->{ "len" } - 1; - } - - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $options->{ "chr" }, $options->{ "beg" }, $options->{ "end" }, $options->{ "flank" } ); - - $record->{ "CHR" } = $options->{ "chr" }; - $record->{ "CHR_BEG" } = $options->{ "beg" } - $options->{ "flank" }; - $record->{ "CHR_END" } = $options->{ "end" } + $options->{ "flank" }; - - $record->{ "PHASTCONS" } = join ",", @{ $scores }; - $record->{ "PHAST_COUNT" } = scalar @{ $scores }; # DEBUG - - put_record( $record, $out ); - } - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "REC_TYPE" } eq "BED" ) - { - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "flank" } ); - } - elsif ( $record->{ "REC_TYPE" } eq "PSL" ) - { - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); - } - elsif ( $record->{ "REC_TYPE" } eq "BLAST" ) - { - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "S_ID" }, $record->{ "S_BEG" }, $record->{ "S_END" }, $options->{ "flank" } ); - } - - $record->{ "PHASTCONS" } = join ",", @{ $scores } if @{ $scores }; -# $record->{ "PHAST_COUNT" } = @{ $scores } if @{ $scores }; # DEBUG - - put_record( $record, $out ); - } - - close $fh_phastcons if $fh_phastcons; -} - - -sub script_fold_seq -{ - # Martin A. Hansen, December 2007. - - # Folds sequences in stream into secondary structures. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record, $type, $struct, $index ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - if ( not $type ) { - $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ); - } - - if ( $type ne "protein" ) - { - ( $struct, $index ) = Maasha::Seq::fold_struct_rnafold( $record->{ "SEQ" } ); - $record->{ "SEC_STRUCT" } = $struct; - $record->{ "FREE_ENERGY" } = $index; - $record->{ "SCORE" } = abs int $index; - $record->{ "SIZE" } = length $struct; - $record->{ "CONF" } = "1," x $record->{ "SIZE" }; - } - } - - put_record( $record, $out ); - } -} - - -sub script_split_seq -{ - # Martin A. Hansen, August 2007. - - # Split a sequence in stream into words. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $new_record, $i, $subseq, %lookup ); - - $options->{ "word_size" } ||= 7; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) - { - for ( $i = 0; $i < length( $record->{ "SEQ" } ) - $options->{ "word_size" } + 1; $i++ ) - { - $subseq = substr $record->{ "SEQ" }, $i, $options->{ "word_size" }; - - if ( $options->{ "uniq" } and not $lookup{ $subseq } ) - { - $new_record->{ "SEQ_NAME" } = $record->{ "SEQ_NAME" } . "[" . ( $i + 1 ) . "-" . ( $i + $options->{ "word_size" } ) . "]"; - $new_record->{ "SEQ" } = $subseq; - - put_record( $new_record, $out ); - - $lookup{ $subseq } = 1; - } - else - { - $new_record->{ "SEQ_NAME" } = $record->{ "SEQ_NAME" } . "[" . ( $i + 1 ) . "-" . ( $i + $options->{ "word_size" } ) . "]"; - $new_record->{ "SEQ" } = $subseq; - - put_record( $new_record, $out ); - } - } - } - else - { - put_record( $record, $out ); - } - } -} - - -sub script_split_bed -{ - # Martin A. Hansen, June 2008. - - # Split a BED record into overlapping windows. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $new_record, $i ); - - $options->{ "window_size" } ||= 20; - $options->{ "step_size" } ||= 1; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) - { - $record->{ "BED_LEN" } = $record->{ "CHR_END" } - $record->{ "CHR_BEG" } + 1; - - for ( $i = 0; $i < $record->{ "BED_LEN" } - $options->{ "window_size" }; $i += $options->{ "step_size" } ) - { - $new_record->{ "REC_TYPE" } = "BED"; - $new_record->{ "CHR" } = $record->{ "CHR" }; - $new_record->{ "CHR_BEG" } = $record->{ "CHR_BEG" } + $i; - $new_record->{ "CHR_END" } = $record->{ "CHR_BEG" } + $i + $options->{ "window_size" }; - $new_record->{ "BED_LEN" } = $options->{ "window_size" }; - $new_record->{ "Q_ID" } = $record->{ "Q_ID" } . "_$i"; - $new_record->{ "SCORE" } = $record->{ "SCORE" }; - $new_record->{ "STRAND" } = $record->{ "STRAND" }; - - put_record( $new_record, $out ); - } - } - else - { - put_record( $record, $out ); - } - } -} - - -sub script_align_seq -{ - # Martin A. Hansen, August 2007. - - # Align sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - ) = @_; - - # Returns nothing. - - my ( $record, @entries, $entry ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { - push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - } elsif ( $record->{ "Q_ID" } and $record->{ "SEQ" } ) { - push @entries, [ $record->{ "Q_ID" }, $record->{ "SEQ" } ]; - } else { - put_record( $record, $out ); - } - } - - @entries = Maasha::Align::align( \@entries ); - - foreach $entry ( @entries ) - { - if ( $entry->[ SEQ_NAME ] and $entry->[ SEQ ] ) - { - $record = { - SEQ_NAME => $entry->[ SEQ_NAME ], - SEQ => $entry->[ SEQ ], - }; - - put_record( $record, $out ); - } - } -} - - -sub script_tile_seq -{ - # Martin A. Hansen, February 2008. - - # Using the first sequence in stream as reference, tile - # all subsequent sequences based on pairwise alignments. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $first, $ref_entry, @entries ); - - $first = 1; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) - { - if ( $first ) - { - $ref_entry = [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - - $first = 0; - } - else - { - push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - } - } - else - { - put_record( $record, $out ); - } - } - - @entries = Maasha::Align::align_tile( $ref_entry, \@entries, $options ); - - map { put_record( { SEQ_NAME => $_->[ SEQ_NAME ], SEQ => $_->[ SEQ ] }, $out ) } @entries; -} - - -sub script_invert_align -{ - # Martin A. Hansen, February 2008. - - # Inverts an alignment showing only non-mathing residues - # using the first sequence as reference. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, @entries ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) - { - push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - } - else - { - put_record( $record, $out ); - } - } - - Maasha::Align::align_invert( \@entries, $options->{ "soft" } ); - - map { put_record( { SEQ_NAME => $_->[ SEQ_NAME ], SEQ => $_->[ SEQ ] }, $out ) } @entries; -} - - -sub script_patscan_seq -{ - # Martin A. Hansen, August 2007. - - # Locates patterns in sequences using scan_for_matches. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $genome_file, @args, $arg, $type, $seq_file, $pat_file, $out_file, $fh_in, $fh_out, $record, $patterns, $pattern, $entry, $result, %head_hash, $i ); - - if ( $options->{ "patterns" } ) { - $patterns = Maasha::Patscan::parse_patterns( $options->{ "patterns" } ); - } elsif ( -f $options->{ "patterns_in" } ) { - $patterns = Maasha::Patscan::read_patterns( $options->{ "patterns_in" } ); - } - - $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna" if $options->{ 'genome' }; - - push @args, "-c" if $options->{ "comp" }; - push @args, "-m $options->{ 'max_hits' }" if $options->{ 'max_hits' }; - push @args, "-n $options->{ 'max_misses' }" if $options->{ 'max_hits' }; - - $seq_file = "$BP_TMP/patscan.seq"; - $pat_file = "$BP_TMP/patscan.pat"; - $out_file = "$BP_TMP/patscan.out"; - - $fh_out = Maasha::Common::write_open( $seq_file ); - - $i = 0; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } and $record->{ "SEQ_NAME" } ) - { - $type = Maasha::Seq::seq_guess_type( $record->{ "SEQ" } ) if not $type; - - Maasha::Fasta::put_entry( [ $i, $record->{ "SEQ" } ], $fh_out ); - - $head_hash{ $i } = $record->{ "SEQ_NAME" }; - - $i++; - } - } - - close $fh_out; - - $arg = join " ", @args; - $arg .= " -p" if $type eq "protein"; - - foreach $pattern ( @{ $patterns } ) - { - $fh_out = Maasha::Common::write_open( $pat_file ); - - print $fh_out "$pattern\n"; - - close $fh_out; - - if ( $options->{ 'genome' } ) { - `scan_for_matches $arg $pat_file < $genome_file > $out_file`; - # Maasha::Common::run( "scan_for_matches", "$arg $pat_file < $genome_file > $out_file" ); - } else { - `scan_for_matches $arg $pat_file < $seq_file > $out_file`; - # Maasha::Common::run( "scan_for_matches", "$arg $pat_file < $seq_file > $out_file" ); - } - - $fh_in = Maasha::Common::read_open( $out_file ); - - while ( $entry = Maasha::Fasta::get_entry( $fh_in ) ) - { - $result = Maasha::Patscan::parse_scan_result( $entry, $pattern ); - - if ( $options->{ 'genome' } ) - { - $result->{ "CHR" } = $result->{ "S_ID" }; - $result->{ "CHR_BEG" } = $result->{ "S_BEG" }; - $result->{ "CHR_END" } = $result->{ "S_END" }; - - delete $result->{ "S_ID" }; - delete $result->{ "S_BEG" }; - delete $result->{ "S_END" }; - } - else - { - $result->{ "S_ID" } = $head_hash{ $result->{ "S_ID" } }; - } - - put_record( $result, $out ); - } - - close $fh_in; - } - - unlink $pat_file; - unlink $seq_file; - unlink $out_file; -} - - -sub script_create_blast_db -{ - # Martin A. Hansen, September 2007. - - # Creates a NCBI BLAST database with formatdb - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $fh, $seq_type, $path, $record, $entry ); - - $path = $options->{ "database" }; - - $fh = Maasha::Common::write_open( $path ); - - while ( $record = get_record( $in ) ) - { - put_record( $record, $out ) if not $options->{ "no_stream" }; - - if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) - { - $seq_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $seq_type; - - Maasha::Fasta::put_entry( $entry, $fh ); - } - } - - close $fh; - - if ( $seq_type eq "protein" ) { - Maasha::Common::run( "formatdb", "-p T -i $path -t $options->{ 'database' }" ); - } else { - Maasha::Common::run( "formatdb", "-p F -i $path -t $options->{ 'database' }" ); - } - - unlink $path; -} - - -sub script_blast_seq -{ - # Martin A. Hansen, September 2007. - - # BLASTs sequences in stream against a given database. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $genome, $q_type, $s_type, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry ); - - $options->{ "e_val" } = 10 if not defined $options->{ "e_val" }; - $options->{ "filter" } = "F"; - $options->{ "filter" } = "T" if $options->{ "filter" }; - $options->{ "cpus" } ||= 1; - - $options->{ "database" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/blast/$options->{ 'genome' }.fna" if $options->{ 'genome' }; - - $tmp_in = "$BP_TMP/blast_query.seq"; - $tmp_out = "$BP_TMP/blast.result"; - - $fh_out = Maasha::Filesys::file_write_open( $tmp_in ); - - while ( $record = get_record( $in ) ) - { - if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) - { - $q_type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $q_type; - - Maasha::Fasta::put_entry( $entry, $fh_out ); - } - - put_record( $record, $out ); - } - - close $fh_out; - - if ( -f $options->{ 'database' } . ".phr" ) { - $s_type = "protein"; - } else { - $s_type = "nucleotide"; - } - - if ( not $options->{ 'program' } ) - { - if ( $q_type ne "protein" and $s_type ne "protein" ) { - $options->{ 'program' } = "blastn"; - } elsif ( $q_type eq "protein" and $s_type eq "protein" ) { - $options->{ 'program' } = "blastp"; - } elsif ( $q_type ne "protein" and $s_type eq "protein" ) { - $options->{ 'program' } = "blastx"; - } elsif ( $q_type eq "protein" and $s_type ne "protein" ) { - $options->{ 'program' } = "tblastn"; - } - } - - if ( $options->{ 'verbose' } ) - { - Maasha::Common::run( - "blastall", - join( " ", - "-p $options->{ 'program' }", - "-e $options->{ 'e_val' }", - "-a $options->{ 'cpus' }", - "-m 8", - "-i $tmp_in", - "-d $options->{ 'database' }", - "-F $options->{ 'filter' }", - "-o $tmp_out", - ), - 1 - ); - } - else - { - Maasha::Common::run( - "blastall", - join( " ", - "-p $options->{ 'program' }", - "-e $options->{ 'e_val' }", - "-a $options->{ 'cpus' }", - "-m 8", - "-i $tmp_in", - "-d $options->{ 'database' }", - "-F $options->{ 'filter' }", - "-o $tmp_out", - "> /dev/null 2>&1" - ), - 1 - ); - } - - unlink $tmp_in; - - $fh_out = Maasha::Filesys::file_read_open( $tmp_out ); - - undef $record; - - while ( $line = <$fh_out> ) - { - chomp $line; - - next if $line =~ /^#/; - - @fields = split /\s+/, $line; - - $record->{ "REC_TYPE" } = "BLAST"; - $record->{ "Q_ID" } = $fields[ 0 ]; - $record->{ "S_ID" } = $fields[ 1 ]; - $record->{ "IDENT" } = $fields[ 2 ]; - $record->{ "ALIGN_LEN" } = $fields[ 3 ]; - $record->{ "MISMATCHES" } = $fields[ 4 ]; - $record->{ "GAPS" } = $fields[ 5 ]; - $record->{ "Q_BEG" } = $fields[ 6 ] - 1; # BLAST is 1-based - $record->{ "Q_END" } = $fields[ 7 ] - 1; # BLAST is 1-based - $record->{ "S_BEG" } = $fields[ 8 ] - 1; # BLAST is 1-based - $record->{ "S_END" } = $fields[ 9 ] - 1; # BLAST is 1-based - $record->{ "E_VAL" } = $fields[ 10 ]; - $record->{ "BIT_SCORE" } = $fields[ 11 ]; - - if ( $record->{ "S_BEG" } > $record->{ "S_END" } ) - { - $record->{ "STRAND" } = '-'; - - ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } ); - } - else - { - $record->{ "STRAND" } = '+'; - } - - put_record( $record, $out ); - } - - close $fh_out; - - unlink $tmp_out; -} - - -sub script_blat_seq -{ - # Martin A. Hansen, August 2007. - - # BLATs sequences in stream against a given genome. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $blat_args, $genome_file, $query_file, $fh_in, $fh_out, $type, $record, $result_file, $entries, $entry ); - - $genome_file = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna"; - - $options->{ 'tile_size' } ||= 11; - $options->{ 'one_off' } ||= 0; - $options->{ 'min_identity' } ||= 90; - $options->{ 'min_score' } ||= 0; - $options->{ 'step_size' } ||= $options->{ 'tile_size' }; - - $blat_args .= " -tileSize=$options->{ 'tile_size' }"; - $blat_args .= " -oneOff=$options->{ 'one_off' }"; - $blat_args .= " -minIdentity=$options->{ 'min_identity' }"; - $blat_args .= " -minScore=$options->{ 'min_score' }"; - $blat_args .= " -stepSize=$options->{ 'step_size' }"; -# $blat_args .= " -ooc=" . Maasha::Config::genome_blat_ooc( $options->{ "genome" }, 11 ) if $options->{ 'ooc' }; - - $query_file = "$BP_TMP/blat.seq"; - - $fh_out = Maasha::Common::write_open( $query_file ); - - while ( $record = get_record( $in ) ) - { - if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) - { - $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not $type; - Maasha::Fasta::put_entry( $entry, $fh_out, 80 ); - } - - put_record( $record, $out ); - } - - close $fh_out; - - $blat_args .= " -t=dnax" if $type eq "protein"; - $blat_args .= " -q=$type"; - - $result_file = "$BP_TMP/blat.psl"; - - Maasha::Common::run( "blat", "$genome_file $query_file $blat_args $result_file > /dev/null 2>&1" ); - - unlink $query_file; - - $entries = Maasha::UCSC::psl_get_entries( $result_file ); - - map { put_record( $_, $out ) } @{ $entries }; - - unlink $result_file; -} - - -sub script_soap_seq -{ - # Martin A. Hansen, July 2008. - - # soap sequences in stream against a given file or genome. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $genome, $tmp_in, $tmp_out, $fh_in, $fh_out, $record, $line, @fields, $entry, $count, $args ); - - $options->{ "seed_size" } ||= 10; - $options->{ "mismatches" } ||= 2; - $options->{ "gap_size" } ||= 0; - $options->{ "cpus" } ||= 1; - - if ( $options->{ "genome" } ) { - $options->{ "in_file" } = "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/fasta/$options->{ 'genome' }.fna"; - } - - $tmp_in = "$BP_TMP/soap_query.seq"; - $tmp_out = "$BP_TMP/soap.result"; - - $fh_out = Maasha::Common::write_open( $tmp_in ); - - $count = 0; - - while ( $record = get_record( $in ) ) - { - if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) - { - Maasha::Fasta::put_entry( $entry, $fh_out ); - - $count++; - } - - put_record( $record, $out ); - } - - close $fh_out; - - if ( $count > 0 ) - { - $args = join( " ", - "-s $options->{ 'seed_size' }", - "-r 2", - "-a $tmp_in", - "-v $options->{ 'mismatches' }", - "-g $options->{ 'gap_size' }", - "-p $options->{ 'cpus' }", - "-d $options->{ 'in_file' }", - "-o $tmp_out", - ); - - $args .= " > /dev/null 2>&1" if not $options->{ 'verbose' }; - - Maasha::Common::run( "soap", $args, 1 ); - - unlink $tmp_in; - - $fh_out = Maasha::Common::read_open( $tmp_out ); - - undef $record; - - while ( $line = <$fh_out> ) - { - chomp $line; - - @fields = split /\t/, $line; - - $record->{ "REC_TYPE" } = "SOAP"; - $record->{ "Q_ID" } = $fields[ 0 ]; - $record->{ "SCORE" } = $fields[ 3 ]; - $record->{ "STRAND" } = $fields[ 6 ]; - $record->{ "S_ID" } = $fields[ 7 ]; - $record->{ "S_BEG" } = $fields[ 8 ] - 1; # soap is 1-based - $record->{ "S_END" } = $fields[ 8 ] + $fields[ 5 ] - 2; - - put_record( $record, $out ); - } - - close $fh_out; - } - - unlink $tmp_out; -} - - -sub script_match_seq -{ - # Martin A. Hansen, August 2007. - - # BLATs sequences in stream against a given genome. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, @entries, $results ); - - $options->{ "word_size" } ||= 20; - $options->{ "direction" } ||= "both"; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { - push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - } - - put_record( $record, $out ); - } - - if ( @entries == 1 ) - { - $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 0 ] ], $options, $BP_TMP ); - - map { put_record( $_, $out ) } @{ $results }; - } - elsif ( @entries == 2 ) - { - $results = Maasha::Match::match_mummer( [ $entries[ 0 ] ], [ $entries[ 1 ] ], $options, $BP_TMP ); - - map { put_record( $_, $out ) } @{ $results }; - } -} - - -sub script_create_vmatch_index -{ - # Martin A. Hansen, January 2008. - - # Create a vmatch index from sequences in the stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $file_tmp, $fh_tmp, $type, $entry ); - - if ( $options->{ "index_name" } ) - { - $file_tmp = $options->{ 'index_name' }; - $fh_tmp = Maasha::Common::write_open( $file_tmp ); - } - - while ( $record = get_record( $in ) ) - { - if ( $options->{ "index_name" } and $entry = Maasha::Fasta::biopiece2fasta( $record ) ) - { - Maasha::Fasta::put_entry( $entry, $fh_tmp ); - - $type = Maasha::Seq::seq_guess_type( $entry->[ SEQ ] ) if not defined $type; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - if ( $options->{ "index_name" } ) - { - close $fh_tmp; - - if ( $type eq "protein" ) { - Maasha::Common::run( "mkvtree", "-db $file_tmp -protein -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" ); - } else { - Maasha::Common::run( "mkvtree", "-db $file_tmp -dna -pl $options->{ 'prefix_length' } -allout -indexname $file_tmp > /dev/null 2>&1" ); - } - - unlink $file_tmp; - } -} - - -sub script_vmatch_seq -{ - # Martin A. Hansen, August 2007. - - # Vmatches sequences in stream against a given genome. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( @index_files, @records, $result_file, $fh_in, $record, %hash ); - - $options->{ 'count' } = 1 if $options->{ 'max_hits' }; - - if ( $options->{ "index_name" } ) - { - @index_files = $options->{ "index_name" }; - } - else - { - @index_files = Maasha::Common::ls_files( "$ENV{ 'BP_DATA' }/genomes/$options->{ 'genome' }/vmatch" ); - - map { $_ =~ /^(.+)\.[a-z1]{3,4}$/; $hash{ $1 } = 1 } @index_files; - - @index_files = sort keys %hash; - } - - while ( $record = get_record( $in ) ) - { - push @records, $record; - - put_record( $record, $out ); - } - - $result_file = Maasha::Match::match_vmatch( $BP_TMP, \@records, \@index_files, $options ); - - undef @records; - - $fh_in = Maasha::Common::read_open( $result_file ); - - while ( $record = Maasha::Match::vmatch_get_entry( $fh_in ) ) { - put_record( $record, $out ); - } - - close $fh_in; - - unlink $result_file; -} - - -sub script_write_fasta -{ - # Martin A. Hansen, August 2007. - - # Write FASTA entries from sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $fh, $entry ); - - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); - - while ( $record = get_record( $in ) ) - { - if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { - Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } ); - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh; -} - - -sub script_write_align -{ - # Martin A. Hansen, August 2007. - - # Write pretty alignments aligned sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $fh, $record, @entries ); - - $fh = write_stream( $options->{ "data_out" } ) ; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { - push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - if ( scalar( @entries ) == 2 ) { - Maasha::Align::align_print_pairwise( $entries[ 0 ], $entries[ 1 ], $fh, $options->{ "wrap" } ); - } elsif ( scalar ( @entries ) > 2 ) { - Maasha::Align::align_print_multi( \@entries, $fh, $options->{ "wrap" }, $options->{ "no_ruler" }, $options->{ "no_consensus" } ); - } - - close $fh if $fh; -} - - -sub script_write_blast -{ - # Martin A. Hansen, November 2007. - - # Write data in blast table format (-m8 and 9). - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $fh, $record, $first ); - - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ) ; - - $first = 1; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "REC_TYPE" } eq "BLAST" ) - { - if ( $options->{ "comment" } and $first ) - { - print "# Fields: Query id, Subject id, % identity, alignment length, mismatches, gap openings, q. start, q. end, s. start, s. end, e-value, bit score\n"; - - $first = 0; - } - - if ( $record->{ "STRAND" } eq "-" ) { - ( $record->{ "S_BEG" }, $record->{ "S_END" } ) = ( $record->{ "S_END" }, $record->{ "S_BEG" } ); - } - - print $fh join( "\t", - $record->{ "Q_ID" }, - $record->{ "S_ID" }, - $record->{ "IDENT" }, - $record->{ "ALIGN_LEN" }, - $record->{ "MISMATCHES" }, - $record->{ "GAPS" }, - $record->{ "Q_BEG" } + 1, - $record->{ "Q_END" } + 1, - $record->{ "S_BEG" } + 1, - $record->{ "S_END" } + 1, - $record->{ "E_VAL" }, - $record->{ "BIT_SCORE" } - ), "\n"; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh; -} - - -sub script_write_tab -{ - # Martin A. Hansen, August 2007. - - # Write data as table. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $fh, $record, $key, @keys, @vals, $ok, %no_keys, $A, $B ); - - $options->{ "delimit" } ||= "\t"; - - map { $no_keys{ $_ } = 1 } @{ $options->{ "no_keys" } }; - - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); - - while ( $record = get_record( $in ) ) - { - undef @vals; - $ok = 1; - - if ( $options->{ "keys" } ) - { - map { $ok = 0 if not exists $record->{ $_ } } @{ $options->{ "keys" } }; - - if ( $ok ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( exists $record->{ $key } ) - { - push @keys, $key if $options->{ "comment" }; - push @vals, $record->{ $key }; - } - } - } - } - else - { - foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } ) - { - next if exists $no_keys{ $key }; - - push @keys, $key if $options->{ "comment" }; - push @vals, $record->{ $key }; - } - } - - if ( @keys and $options->{ "comment" } ) - { - print $fh "#", join( $options->{ "delimit" }, @keys ), "\n"; - - delete $options->{ "comment" }; - } - - print $fh join( $options->{ "delimit" }, @vals ), "\n" if @vals; - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh; -} - - -sub script_write_bed -{ - # Martin A. Hansen, August 2007. - - # Write BED format for the UCSC genome browser using records in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $cols, $fh, $record, $bed_entry, $new_record ); - - $cols = $options->{ 'cols' }->[ 0 ]; - - $fh = write_stream( $options->{ 'data_out' }, $options->{ 'compress' } ); - - while ( $record = get_record( $in ) ) - { - $record = Maasha::UCSC::psl2record( $record ) if $record->{ 'tBaseInsert' }; # Dirty addition to allow Affy data from MySQL to be dumped - - if ( $bed_entry = Maasha::UCSC::BED::biopiece2bed( $record, $cols ) ) { - Maasha::UCSC::BED::bed_entry_put( $bed_entry, $fh, $cols, $options->{ 'check' } ); - } - - put_record( $record, $out ) if not $options->{ 'no_stream' }; - } - - close $fh; -} - - -sub script_write_psl -{ - # Martin A. Hansen, August 2007. - - # Write PSL output from stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $fh, $record, @output, $first ); - - $first = 1; - - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); - - while ( $record = get_record( $in ) ) - { - put_record( $record, $out ) if not $options->{ "no_stream" }; - - if ( $record->{ "REC_TYPE" } and $record->{ "REC_TYPE" } eq "PSL" ) - { - Maasha::UCSC::psl_put_header( $fh ) if $first; - Maasha::UCSC::psl_put_entry( $record, $fh ); - $first = 0; - } - } - - close $fh; -} - - -sub script_write_fixedstep -{ - # Martin A. Hansen, Juli 2008. - - # Write fixedStep entries from recrods in the stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $fh, $record, $entry ); - - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); - - while ( $record = get_record( $in ) ) - { - if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) { - Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh ); - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh; -} - - -sub script_write_2bit -{ - # Martin A. Hansen, March 2008. - - # Write sequence entries from stream in 2bit format. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $mask, $tmp_file, $fh_tmp, $fh_in, $fh_out, $entry ); - - $mask = 1 if not $options->{ "no_mask" }; - - $tmp_file = "$BP_TMP/write_2bit.fna"; - $fh_tmp = Maasha::Common::write_open( $tmp_file ); - - $fh_out = write_stream( $options->{ "data_out" } ); - - while ( $record = get_record( $in ) ) - { - if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) { - Maasha::Fasta::put_entry( $entry, $fh_tmp ); - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh_tmp; - - $fh_in = Maasha::Common::read_open( $tmp_file ); - - Maasha::TwoBit::fasta2twobit( $fh_in, $fh_out, $mask ); - - close $fh_in; - close $fh_out; - - unlink $tmp_file; -} - - -sub script_write_solid -{ - # Martin A. Hansen, April 2008. - - # Write di-base encoded Solid sequence from entries in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $fh, $entry ); - - $fh = write_stream( $options->{ "data_out" }, $options->{ "compress" } ); - - while ( $record = get_record( $in ) ) - { - if ( $entry = Maasha::Fasta::biopiece2fasta( $record ) ) - { - $entry->[ SEQ ] = Maasha::Solid::seq2color_space( uc $entry->[ SEQ ] ); - - Maasha::Fasta::put_entry( $entry, $fh, $options->{ "wrap" } ); - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh; -} - - -sub script_write_ucsc_config -{ - # Martin A. Hansen, November 2008. - - # Write UCSC Genome Broser configuration (.ra file type) from - # records in the stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $fh ); - - $fh = write_stream( $options->{ "data_out" } ); - - while ( $record = get_record( $in ) ) - { - Maasha::UCSC::ucsc_config_entry_put( $record, $fh ) if $record->{ "REC_TYPE" } eq "UCSC Config"; - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - close $fh; -} - - -sub script_plot_seqlogo -{ - # Martin A. Hansen, August 2007. - - # Calculates and writes a sequence logo for alignments. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, @entries, $logo, $fh ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ_NAME" } and $record->{ "SEQ" } ) { - push @entries, [ $record->{ "SEQ_NAME" }, $record->{ "SEQ" } ]; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $logo = Maasha::Plot::seq_logo( \@entries ); - - $fh = write_stream( $options->{ "data_out" } ); - - print $fh $logo; - - close $fh; -} - - -sub script_plot_phastcons_profiles -{ - # Martin A. Hansen, January 2008. - - # Plots PhastCons profiles. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $phastcons_file, $phastcons_index, $index, $fh_phastcons, $record, $scores, $AoA, $plot, $fh ); - - $options->{ "title" } ||= "PhastCons Profiles"; - - $phastcons_file = Maasha::Config::genome_phastcons( $options->{ "genome" } ); - $phastcons_index = Maasha::Config::genome_phastcons_index( $options->{ "genome" } ); - - $index = Maasha::UCSC::fixedstep_index_retrieve( $phastcons_index ); - $fh_phastcons = Maasha::Common::read_open( $phastcons_file ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) - { - $scores = Maasha::UCSC::fixedstep_index_lookup( $index, $fh_phastcons, $record->{ "CHR" }, - $record->{ "CHR_BEG" }, - $record->{ "CHR_END" }, - $options->{ "flank" } ); - - push @{ $AoA }, [ @{ $scores } ]; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - Maasha::UCSC::phastcons_normalize( $AoA ); - - $AoA = [ [ Maasha::UCSC::phastcons_mean( $AoA ) ] ] if $options->{ "mean" }; - $AoA = [ [ Maasha::UCSC::phastcons_median( $AoA ) ] ] if $options->{ "median" }; - - $AoA = Maasha::Matrix::matrix_flip( $AoA ); - - $plot = Maasha::Plot::lineplot_simple( $AoA, $options, $BP_TMP ); - - $fh = write_stream( $options->{ "data_out" } ); - - print $fh "$_\n" foreach @{ $plot }; - - close $fh; -} - - -sub script_analyze_bed -{ - # Martin A. Hansen, March 2008. - - # Analyze BED entries in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record ); - - while ( $record = get_record( $in ) ) - { - $record = Maasha::UCSC::bed_analyze( $record ) if $record->{ "REC_TYPE" } eq "BED"; - - put_record( $record, $out ); - } -} - - -sub script_analyze_vals -{ - # Martin A. Hansen, August 2007. - - # Analyze values for given keys in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $key, @keys, %key_hash, $analysis, $len ); - - map { $key_hash{ $_ } = 1 } @{ $options->{ "keys" } }; - - while ( $record = get_record( $in ) ) - { - foreach $key ( keys %{ $record } ) - { - next if $options->{ "keys" } and not exists $key_hash{ $key }; - - $analysis->{ $key }->{ "COUNT" }++; - - if ( Maasha::Calc::is_a_number( $record->{ $key } ) ) - { - $analysis->{ $key }->{ "TYPE" } = "num"; - $analysis->{ $key }->{ "SUM" } += $record->{ $key }; - $analysis->{ $key }->{ "MAX" } = $record->{ $key } if $record->{ $key } > $analysis->{ $key }->{ "MAX" } or not $analysis->{ $key }->{ "MAX" }; - $analysis->{ $key }->{ "MIN" } = $record->{ $key } if $record->{ $key } < $analysis->{ $key }->{ "MIN" } or not $analysis->{ $key }->{ "MIN" }; - } - else - { - $len = length $record->{ $key }; - - $analysis->{ $key }->{ "TYPE" } = "alph"; - $analysis->{ $key }->{ "SUM" } += $len; - $analysis->{ $key }->{ "MAX" } = $len if $len > $analysis->{ $key }->{ "MAX" } or not $analysis->{ $key }->{ "MAX" }; - $analysis->{ $key }->{ "MIN" } = $len if $len < $analysis->{ $key }->{ "MIM" } or not $analysis->{ $key }->{ "MIN" }; - } - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - foreach $key ( keys %{ $analysis } ) - { - $analysis->{ $key }->{ "MEAN" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" } / $analysis->{ $key }->{ "COUNT" }; - $analysis->{ $key }->{ "SUM" } = sprintf "%.2f", $analysis->{ $key }->{ "SUM" }; - } - - my ( $keys, $types, $counts, $mins, $maxs, $sums, $means ); - - $keys = "KEY "; - $types = "TYPE "; - $counts = "COUNT"; - $mins = "MIN "; - $maxs = "MAX "; - $sums = "SUM "; - $means = "MEAN "; - - if ( $options->{ "keys" } ) { - @keys = @{ $options->{ "keys" } }; - } else { - @keys = keys %{ $analysis }; - } - - foreach $key ( @keys ) - { - $keys .= sprintf "% 15s", $key; - $types .= sprintf "% 15s", $analysis->{ $key }->{ "TYPE" }; - $counts .= sprintf "% 15s", $analysis->{ $key }->{ "COUNT" }; - $mins .= sprintf "% 15s", $analysis->{ $key }->{ "MIN" }; - $maxs .= sprintf "% 15s", $analysis->{ $key }->{ "MAX" }; - $sums .= sprintf "% 15s", $analysis->{ $key }->{ "SUM" }; - $means .= sprintf "% 15s", $analysis->{ $key }->{ "MEAN" }; - } - - print $out "$keys\n"; - print $out "$types\n"; - print $out "$counts\n"; - print $out "$mins\n"; - print $out "$maxs\n"; - print $out "$sums\n"; - print $out "$means\n"; -} - - -sub script_head_records -{ - # Martin A. Hansen, August 2007. - - # Display the first sequences in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $count ); - - $options->{ "num" } ||= 10; - - $count = 0; - - while ( $record = get_record( $in ) ) - { - $count++; - - put_record( $record, $out ); - - last if $count == $options->{ "num" }; - } -} - - -sub script_remove_keys -{ - # Martin A. Hansen, August 2007. - - # Remove keys from stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $new_record ); - - while ( $record = get_record( $in ) ) - { - if ( $options->{ "keys" } ) - { - map { delete $record->{ $_ } } @{ $options->{ "keys" } }; - } - elsif ( $options->{ "save_keys" } ) - { - map { $new_record->{ $_ } = $record->{ $_ } if exists $record->{ $_ } } @{ $options->{ "save_keys" } }; - - $record = $new_record; - } - - put_record( $record, $out ) if keys %{ $record }; - } -} - - -sub script_remove_adaptor -{ - # Martin A. Hansen, August 2008. - - # Find and remove adaptor from sequences in the stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $adaptor, $seq, $adaptor_len, $seq_len, $offset, $max_match, $max_mismatch, $pos ); - - $options->{ "remove" } ||= "after"; - - $max_mismatch = $options->{ "mismatches" } || 0; - $offset = $options->{ "offset" }; - - if ( not defined $offset ) { - $offset = 0; - } else { - $offset--; - } - - $adaptor = uc $options->{ "adaptor" }; - $adaptor_len = length $adaptor; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "SEQ" } ) - { - $seq = uc $record->{ "SEQ" }; - $seq_len = length $seq; - - $pos = Maasha::Common::index_m( $seq, $adaptor, $seq_len, $adaptor_len, $offset, $max_mismatch ); - - $record->{ "ADAPTOR_POS" } = $pos; - - if ( $pos >= 0 and $options->{ "remove" } ne "skip" ) - { - if ( $options->{ "remove" } eq "after" ) - { - $record->{ "SEQ" } = substr $record->{ "SEQ" }, 0, $pos; - $record->{ "SEQ_LEN" } = $pos; - } - else - { - $record->{ "SEQ" } = substr $record->{ "SEQ" }, $pos + $adaptor_len; - $record->{ "SEQ_LEN" } = length $record->{ "SEQ" }; - } - } - - put_record( $record, $out ); - } - else - { - put_record( $record, $out ); - } - } -} - - -sub script_remove_mysql_tables -{ - # Martin A. Hansen, November 2008. - - # Remove MySQL tables from values in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, %table_hash, $dbh, $table ); - - $options->{ "user" } ||= Maasha::UCSC::ucsc_get_user(); - $options->{ "password" } ||= Maasha::UCSC::ucsc_get_password(); - - map { $table_hash{ $_ } = 1 } @{ $options->{ 'tables' } }; - - while ( $record = get_record( $in ) ) - { - map { $table_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } }; - - put_record( $record, $out ) if not $options->{ 'no_stream' }; - } - - $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } ); - - foreach $table ( sort keys %table_hash ) - { - if ( Maasha::SQL::table_exists( $dbh, $table ) ) - { - print STDERR qq(Removing table "$table" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' }; - Maasha::SQL::delete_table( $dbh, $table ); - print STDERR "done.\n" if $options->{ 'verbose' }; - } - else - { - print STDERR qq(WARNING: table "$table" not found in database "$options->{ 'database' }\n"); - } - } - - Maasha::SQL::disconnect( $dbh ); -} - - -sub script_remove_ucsc_tracks -{ - # Martin A. Hansen, November 2008. - - # Remove track from MySQL tables and config file. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, %track_hash, $fh_in, $fh_out, $track, @tracks, @new_tracks, $dbh ); - - $options->{ 'user' } ||= Maasha::UCSC::ucsc_get_user(); - $options->{ 'password' } ||= Maasha::UCSC::ucsc_get_password(); - $options->{ 'config_file' } ||= "$ENV{ 'HOME' }/ucsc/my_tracks.ra"; - - map { $track_hash{ $_ } = 1 } @{ $options->{ 'tracks' } }; - - while ( $record = get_record( $in ) ) - { - map { $track_hash{ $record->{ $_ } } = 1 } @{ $options->{ 'keys' } }; - - put_record( $record, $out ) if not $options->{ 'no_stream' }; - } - - $fh_in = Maasha::Filesys::file_read_open( $options->{ 'config_file' } ); - - while ( $track = Maasha::UCSC::ucsc_config_entry_get( $fh_in ) ) { - push @tracks, $track; - } - - close $fh_in; - - foreach $track ( @tracks ) - { - if ( $track->{ 'database' } eq $options->{ 'database' } and exists $track_hash{ $track->{ 'track' } } ) { - print STDERR qq(Removing track "$track->{ 'track' }" from config file.\n) if $options->{ 'verbose' }; - } else { - push @new_tracks, $track; - } - } - - rename "$options->{ 'config_file' }", "$options->{ 'config_file' }~"; - - $fh_out = Maasha::Filesys::file_write_open( $options->{ 'config_file' } ); - - map { Maasha::UCSC::ucsc_config_entry_put( $_, $fh_out ) } @new_tracks; - - close $fh_out; - - # ---- locate track in database ---- - - $dbh = Maasha::SQL::connect( $options->{ "database" }, $options->{ "user" }, $options->{ "password" } ); - - foreach $track ( sort keys %track_hash ) - { - if ( Maasha::SQL::table_exists( $dbh, $track ) ) - { - print STDERR qq(Removing table "$track" from database "$options->{ 'database' }" ... ) if $options->{ 'verbose' }; - Maasha::SQL::delete_table( $dbh, $track ); - print STDERR "done.\n" if $options->{ 'verbose' }; - } - else - { - print STDERR qq(WARNING: table "$track" not found in database "$options->{ 'database' }\n"); - } - } - - Maasha::SQL::disconnect( $dbh ); - - Maasha::Common::run( "ucscMakeTracks.pl", "-b > /dev/null 2>&1" ); -} - - -sub script_rename_keys -{ - # Martin A. Hansen, August 2007. - - # Rename keys in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record ); - - while ( $record = get_record( $in ) ) - { - if ( exists $record->{ $options->{ "keys" }->[ 0 ] } ) - { - $record->{ $options->{ "keys" }->[ 1 ] } = $record->{ $options->{ "keys" }->[ 0 ] }; - - delete $record->{ $options->{ "keys" }->[ 0 ] }; - } - - put_record( $record, $out ); - } -} - - -sub script_uniq_vals -{ - # Martin A. Hansen, August 2007. - - # Find unique values in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( %hash, $record ); - - while ( $record = get_record( $in ) ) - { - if ( $record->{ $options->{ "key" } } ) - { - if ( not $hash{ $record->{ $options->{ "key" } } } and not $options->{ "invert" } ) - { - put_record( $record, $out ); - - $hash{ $record->{ $options->{ "key" } } } = 1; - } - elsif ( $hash{ $record->{ $options->{ "key" } } } and $options->{ "invert" } ) - { - put_record( $record, $out ); - } - else - { - $hash{ $record->{ $options->{ "key" } } } = 1; - } - } - else - { - put_record( $record, $out ); - } - } -} - - -sub script_merge_vals -{ - # Martin A. Hansen, August 2007. - - # Rename keys in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, @join, $i ); - - $options->{ "delimit" } ||= '_'; - - while ( $record = get_record( $in ) ) - { - if ( exists $record->{ $options->{ "keys" }->[ 0 ] } ) - { - @join = $record->{ $options->{ "keys" }->[ 0 ] }; - - for ( $i = 1; $i < @{ $options->{ "keys" } }; $i++ ) { - push @join, $record->{ $options->{ "keys" }->[ $i ] } if exists $record->{ $options->{ "keys" }->[ $i ] }; - } - - $record->{ $options->{ "keys" }->[ 0 ] } = join $options->{ "delimit" }, @join; - } - - put_record( $record, $out ); - } -} - - -sub script_merge_records -{ - # Martin A. Hansen, July 2008. - - # Merges records in the stream based on identical values of two given keys. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $merge, $record, $file1, $file2, $fh1, $fh2, $key1, $key2, @keys1, @keys2, @vals1, @vals2, - $num1, $num2, $num, $cmp, $i ); - - $merge = $options->{ "merge" } || "AandB"; - - $file1 = "$BP_TMP/merge_records1.tmp"; - $file2 = "$BP_TMP/merge_records2.tmp"; - - $fh1 = Maasha::Common::write_open( $file1 ); - $fh2 = Maasha::Common::write_open( $file2 ); - - $key1 = $options->{ "keys" }->[ 0 ]; - $key2 = $options->{ "keys" }->[ 1 ]; - - $num = $key2 =~ s/n$//; - $num1 = 0; - $num2 = 0; - - while ( $record = get_record( $in ) ) - { - if ( exists $record->{ $key1 } ) - { - @keys1 = $key1; - @vals1 = $record->{ $key1 }; - - delete $record->{ $key1 }; - - map { push @keys1, $_; push @vals1, $record->{ $_ } } keys %{ $record }; - - print $fh1 join( "\t", @vals1 ), "\n"; - - $num1++; - } - elsif ( exists $record->{ $key2 } ) - { - @keys2 = $key2; - @vals2 = $record->{ $key2 }; - - delete $record->{ $key2 }; - - map { push @keys2, $_; push @vals2, $record->{ $_ } } keys %{ $record }; - - print $fh2 join( "\t", @vals2 ), "\n"; - - $num2++; - } - } - - close $fh1; - close $fh2; - - if ( $num ) - { - Maasha::Common::run( "sort", "-k 1,1n $file1 > $file1.sort" ) and rename "$file1.sort", $file1; - Maasha::Common::run( "sort", "-k 1,1n $file2 > $file2.sort" ) and rename "$file2.sort", $file2; - } - else - { - Maasha::Common::run( "sort", "-k 1,1 $file1 > $file1.sort" ) and rename "$file1.sort", $file1; - Maasha::Common::run( "sort", "-k 1,1 $file2 > $file2.sort" ) and rename "$file2.sort", $file2; - } - - $fh1 = Maasha::Common::read_open( $file1 ); - $fh2 = Maasha::Common::read_open( $file2 ); - - @vals1 = Maasha::Common::get_fields( $fh1 ); - @vals2 = Maasha::Common::get_fields( $fh2 ); - - while ( $num1 > 0 and $num2 > 0 ) - { - undef $record; - - if ( $num ) { - $cmp = $vals1[ 0 ] <=> $vals2[ 0 ]; - } else { - $cmp = $vals1[ 0 ] cmp $vals2[ 0 ]; - } - - if ( $cmp < 0 ) - { - if ( $merge =~ /^(AorB|AnotB)$/ ) - { - for ( $i = 0; $i < @keys1; $i++ ) { - $record->{ $keys1[ $i ] } = $vals1[ $i ]; - } - - put_record( $record, $out ); - } - - @vals1 = Maasha::Common::get_fields( $fh1 ); - $num1--; - } - elsif ( $cmp > 0 ) - { - if ( $merge =~ /^(BorA|BnotA)$/ ) - { - for ( $i = 0; $i < @keys2; $i++ ) { - $record->{ $keys2[ $i ] } = $vals2[ $i ]; - } - - put_record( $record, $out ); - } - - @vals2 = Maasha::Common::get_fields( $fh2 ); - $num2--; - } - else - { - if ( $merge =~ /^(AandB|AorB|BorA)$/ ) - { - for ( $i = 0; $i < @keys1; $i++ ) { - $record->{ $keys1[ $i ] } = $vals1[ $i ]; - } - - for ( $i = 1; $i < @keys2; $i++ ) { - $record->{ $keys2[ $i ] } = $vals2[ $i ]; - } - - put_record( $record, $out ); - } - - @vals1 = Maasha::Common::get_fields( $fh1 ); - @vals2 = Maasha::Common::get_fields( $fh2 ); - $num1--; - $num2--; - } - } - - close $fh1; - close $fh2; - - unlink $file1; - unlink $file2; - - if ( $num1 > 0 and $merge =~ /^(AorB|AnotB)$/ ) - { - undef $record; - - for ( $i = 0; $i < @keys1; $i++ ) { - $record->{ $keys1[ $i ] } = $vals1[ $i ]; - } - - put_record( $record, $out ); - } - - if ( $num2 > 0 and $merge =~ /^(BorA|BnotA)$/ ) - { - undef $record; - - for ( $i = 0; $i < @keys2; $i++ ) { - $record->{ $keys2[ $i ] } = $vals2[ $i ]; - } - - put_record( $record, $out ); - } -} - - -sub script_grab -{ - # Martin A. Hansen, August 2007. - - # Grab for records in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $keys, $vals_only, $keys_only, $invert, $patterns, $pattern, $regex, $record, $key, $op, $val, %lookup_hash, $found ); - - $keys = $options->{ 'keys' }; - $vals_only = $options->{ 'vals_only' }; - $keys_only = $options->{ 'keys_only' }; - $invert = $options->{ 'invert' }; - - if ( $options->{ 'patterns' } ) - { - $patterns = [ split ",", $options->{ 'patterns' } ]; - } - elsif ( -f $options->{ 'patterns_in' } ) - { - $patterns = Maasha::Patscan::read_patterns( $options->{ 'patterns_in' } ); - } - elsif ( $options->{ 'regex' } ) - { - if ( $options->{ 'case_insensitive' } ) { - $regex = qr/$options->{ 'regex' }/i; - } else { - $regex = qr/$options->{ 'regex' }/; - } - } - elsif ( -f $options->{ 'exact_in' } ) - { - $patterns = Maasha::Patscan::read_patterns( $options->{ 'exact_in' } ); - - map { $lookup_hash{ $_ } = 1 } @{ $patterns }; - - undef $patterns; - } - elsif ( $options->{ 'eval' } ) - { - if ( $options->{ 'eval' } =~ /^([^><=! ]+)\s*(>=|<=|>|<|=|!=|eq|ne)\s*(.+)$/ ) - { - $key = $1; - $op = $2; - $val = $3; - } - } - - while ( $record = get_record( $in ) ) - { - $found = 0; - - if ( %lookup_hash ) { - $found = grab_lookup( \%lookup_hash, $record, $keys, $vals_only, $keys_only ); - } elsif ( $patterns ) { - $found = grab_patterns( $patterns, $record, $keys, $vals_only, $keys_only ); - } elsif ( $regex ) { - $found = grab_regex( $regex, $record, $keys, $vals_only, $keys_only ); - } elsif ( $op ) { - $found = grab_eval( $key, $op, $val, $record ); - } - - if ( $found and not $invert ) { - put_record( $record, $out ); - } elsif ( not $found and $invert ) { - put_record( $record, $out ); - } - } -} - - -sub script_compute -{ - # Martin A. Hansen, August 2007. - - # Evaluate extression for records in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $eval_key, @keys, $eval_val ); - - while ( $record = get_record( $in ) ) - { - if ( $options->{ "eval" } ) - { - if ( $options->{ "eval" } =~ /^(\S+)\s*=\s*(.+)$/ ) - { - $eval_key = $1; - $eval_val = $2; - - if ( not @keys ) - { - @keys = split /\s+|\+|-|\*|\/|\*\*/, $eval_val; - - @keys = grep { exists $record->{ $_ } } @keys; - } - - map { $eval_val =~ s/\Q$_\E/$record->{ $_ }/g } @keys; - - $record->{ $eval_key } = eval "$eval_val"; - Maasha::Common::error( qq(eval "$eval_key = $eval_val" failed -> $@) ) if $@; - } - else - { - Maasha::Common::error( qq(Bad compute expression: "$options->{ 'eval' }"\n) ); - } - } - - put_record( $record, $out ); - } -} - - -sub script_flip_tab -{ - # Martin A. Hansen, June 2008. - - # Flip a table. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $key, $A, $B, @rows, @matrix, $row, $i ); - - while ( $record = get_record( $in ) ) - { - undef @rows; - - foreach $key ( sort { $A = $a; $B = $b; $A =~ s/^V(\d+)$/$1/; $B =~ s/^V(\d+)$/$1/; $A <=> $B } keys %{ $record } ) - { - push @rows, $record->{ $key }; - - } - - push @matrix, [ @rows ]; - } - - undef $record; - - @matrix = Maasha::Matrix::matrix_flip( \@matrix ); - - foreach $row ( @matrix ) - { - for ( $i = 0; $i < @{ $row }; $i++ ) { - $record->{ "V$i" } = $row->[ $i ]; - } - - put_record( $record, $out ); - } -} - - -sub script_add_ident -{ - # Martin A. Hansen, May 2008. - - # Add a unique identifier to each record in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $key, $prefix, $i ); - - $key = $options->{ "key" } || "ID"; - $prefix = $options->{ "prefix" } || "ID"; - - $i = 0; - - while ( $record = get_record( $in ) ) - { - $record->{ $key } = sprintf( "$prefix%08d", $i ); - - put_record( $record, $out ); - - $i++; - } -} - - -sub script_count_records -{ - # Martin A. Hansen, August 2007. - - # Count records in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $count, $result, $fh, $line ); - - $count = 0; - - if ( $options->{ "no_stream" } ) - { - while ( $line = <$in> ) - { - chomp $line; - - $count++ if $line eq "---"; - } - } - else - { - while ( $record = get_record( $in ) ) - { - put_record( $record, $out ); - - $count++; - } - } - - $result = { "RECORDS_COUNT" => $count }; - - $fh = write_stream( $options->{ "data_out" } ); - - put_record( $result, $fh ); - - close $fh; -} - - -sub script_random_records -{ - # Martin A. Hansen, August 2007. - - # Pick a number or random records from stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $tmp_file, $fh_out, $fh_in, $count, $i, %rand_hash, $rand, $max ); - - $options->{ "num" } ||= 10; - - $tmp_file = "$BP_TMP/random_records.tmp"; - - $fh_out = Maasha::Common::write_open( $tmp_file ); - - $count = 0; - - while ( $record = get_record( $in ) ) - { - put_record( $record, $fh_out ); - - $count++; - } - - close $fh_out; - - $max = 0; - $i = 0; - - Maasha::Common::error( qq(Requested random records > records in stream) ) if $options->{ "num" } > $count; - - while ( $i < $options->{ "num" } ) - { - $rand = int( rand( $count ) ); - - if ( not exists $rand_hash{ $rand } ) - { - $rand_hash{ $rand } = 1; - - $max = $rand if $rand > $max; - - $i++; - } - } - - $fh_in = Maasha::Common::read_open( $tmp_file ); - - $count = 0; - - while ( $record = get_record( $fh_in ) ) - { - put_record( $record, $out ) if exists $rand_hash{ $count }; - - last if $count == $max; - - $count++; - } - - close $fh_in; - - unlink $tmp_file; -} - - -sub script_sort_records -{ - # Martin A. Hansen, August 2007. - - # Sort to sort records according to keys. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( @keys, $key, @sort_cmd, $sort_str, $sort_sub, @records, $record, $i ); - - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $key =~ s/n$// ) { - push @sort_cmd, qq(\$a->{ "$key" } <=> \$b->{ "$key" }); - } else { - push @sort_cmd, qq(\$a->{ "$key" } cmp \$b->{ "$key" }); - } - } - - $sort_str = join " or ", @sort_cmd; - $sort_sub = eval "sub { $sort_str }"; # NB security issue! - - while ( $record = get_record( $in ) ) { - push @records, $record; - } - - @records = sort $sort_sub @records; - - if ( $options->{ "reverse" } ) - { - for ( $i = scalar @records - 1; $i >= 0; $i-- ) { - put_record( $records[ $i ], $out ); - } - } - else - { - for ( $i = 0; $i < scalar @records; $i++ ) { - put_record( $records[ $i ], $out ); - } - } -} - - -sub script_count_vals -{ - # Martin A. Hansen, August 2007. - - # Count records in stream. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $num, $record, %count_hash, @records, $tmp_file, $fh_out, $fh_in, $cache ); - - $tmp_file = "$BP_TMP/count_cache.tmp"; - - $fh_out = Maasha::Common::write_open( $tmp_file ); - - $cache = 0; - $num = 0; - - while ( $record = get_record( $in ) ) - { - map { $count_hash{ $_ }{ $record->{ $_ } }++ if exists $record->{ $_ } } @{ $options->{ "keys" } }; - - push @records, $record; - - if ( scalar @records > 5_000_000 ) # too many records to hold in memory - use disk cache - { - map { put_record( $_, $fh_out ) } @records; - - undef @records; - - $cache = 1; - } - - print STDERR "verbose: records read $num\n" if ( $options->{ 'verbose' } and ( $num % 1_000_000 ) == 0 ); - - $num++; - } - - close $fh_out; - - if ( $cache ) - { - $num = 0; - - $fh_in = Maasha::Common::read_open( $tmp_file ); - - while ( $record = get_record( $fh_in ) ) - { - map { $record->{ $_ . "_COUNT" } = $count_hash{ $_ }{ $record->{ $_ } } if exists $record->{ $_ } } @{ $options->{ "keys" } }; - - put_record( $record, $out ); - - print STDERR "verbose: cache read $num\n" if ( $options->{ 'verbose' } and ( $num % 1_000_000 ) == 0 ); - - $num++; - } - - close $fh_in; - } - - foreach $record ( @records ) - { - map { $record->{ $_ . "_COUNT" } = $count_hash{ $_ }{ $record->{ $_ } } if exists $record->{ $_ } } @{ $options->{ "keys" } }; - - put_record( $record, $out ); - } - - unlink $tmp_file; -} - - -sub script_plot_histogram -{ - # Martin A. Hansen, September 2007. - - # Plot a simple histogram for a given key using GNU plot. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, %data_hash, $max, @data_list, $i, $result, $fh ); - - $options->{ "title" } ||= "Histogram"; - $options->{ "sort" } ||= "num"; - - while ( $record = get_record( $in ) ) - { - $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } }; - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - if ( $options->{ "sort" } eq "num" ) { - map { push @data_list, [ $_, $data_hash{ $_ } ] } sort { $a <=> $b } keys %data_hash; - } else { - map { push @data_list, [ $_, $data_hash{ $_ } ] } sort keys %data_hash; - } - - $result = Maasha::Plot::histogram_simple( \@data_list, $options ); - - $fh = write_stream( $options->{ "data_out" } ); - - print $fh "$_\n" foreach @{ $result }; - - close $fh; -} - - -sub script_plot_lendist -{ - # Martin A. Hansen, August 2007. - - # Plot length distribution using GNU plot. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, %data_hash, $max, @data_list, $i, $result, $fh ); - - $options->{ "title" } ||= "Length Distribution"; - - while ( $record = get_record( $in ) ) - { - $data_hash{ $record->{ $options->{ "key" } } }++ if defined $record->{ $options->{ "key" } }; - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $max = Maasha::Calc::list_max( [ keys %data_hash ] ); - - for ( $i = 0; $i < $max; $i++ ) { - push @data_list, [ $i, $data_hash{ $i } || 0 ]; - } - - $result = Maasha::Plot::histogram_lendist( \@data_list, $options ); - - $fh = write_stream( $options->{ "data_out" } ); - - print $fh "$_\n" foreach @{ $result }; - - close $fh; -} - - -sub script_plot_chrdist -{ - # Martin A. Hansen, August 2007. - - # Plot chromosome distribution using GNU plot. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, %data_hash, @data_list, $elem, $sort_key, $count, $result, $fh ); - - $options->{ "title" } ||= "Chromosome Distribution"; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "CHR" } ) { # generic - $data_hash{ $record->{ "CHR" } }++; - } elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "S_ID" } =~ /^chr/i ) { # patscan - $data_hash{ $record->{ "S_ID" } }++; - } elsif ( $record->{ "REC_TYPE" } eq "PSL" and $record->{ "S_ID" } =~ /^chr/i ) { # BLAT / PSL - $data_hash{ $record->{ "S_ID" } }++; - } elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/i ) { # BLAST - $data_hash{ $record->{ "S_ID" } }++; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - foreach $elem ( keys %data_hash ) - { - $sort_key = $elem; - - $sort_key =~ s/chr//i; - - $sort_key =~ s/^X(.*)/99$1/; - $sort_key =~ s/^Y(.*)/99$1/; - $sort_key =~ s/^Z(.*)/999$1/; - $sort_key =~ s/^M(.*)/9999$1/; - $sort_key =~ s/^U(.*)/99999$1/; - - $count = $sort_key =~ tr/_//; - - $sort_key =~ s/_.*/"999999" x $count/ex; - - push @data_list, [ $elem, $data_hash{ $elem }, $sort_key ]; - } - - @data_list = sort { $a->[ 2 ] <=> $b->[ 2 ] } @data_list; - - $result = Maasha::Plot::histogram_chrdist( \@data_list, $options ); - - $fh = write_stream( $options->{ "data_out" } ); - - print $fh "$_\n" foreach @{ $result }; - - close $fh; -} - - -sub script_plot_karyogram -{ - # Martin A. Hansen, August 2007. - - # Plot hits on karyogram. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( %options, $record, @data, $fh, $result, %data_hash ); - - $options->{ "genome" } ||= "human"; - $options->{ "feat_color" } ||= "black"; - - while ( $record = get_record( $in ) ) - { - if ( $record->{ "CHR" } and $record->{ "CHR_BEG" } and $record->{ "CHR_END" } ) - { - push @{ $data_hash{ $record->{ "CHR" } } }, [ $record->{ "CHR_BEG" }, $record->{ "CHR_END" }, $options->{ "feat_color" } ]; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $result = Maasha::Plot::karyogram( \%data_hash, \%options ); - - $fh = write_stream( $options->{ "data_out" } ); - - print $fh $result; - - close $fh; -} - - -sub script_plot_matches -{ - # Martin A. Hansen, August 2007. - - # Plot matches in 2D generating a dotplot. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, @data, $fh, $result, %data_hash ); - - $options->{ "direction" } ||= "both"; - - while ( $record = get_record( $in ) ) - { - if ( defined $record->{ "Q_BEG" } and defined $record->{ "S_BEG" } and $record->{ "Q_END" } and $record->{ "S_END" } ) { - push @data, $record; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $options->{ "title" } ||= "plot_matches"; - $options->{ "xlabel" } ||= $data[ 0 ]->{ "Q_ID" }; - $options->{ "ylabel" } ||= $data[ 0 ]->{ "S_ID" }; - - $result = Maasha::Plot::dotplot_matches( \@data, $options, $BP_TMP ); - - $fh = write_stream( $options->{ "data_out" } ); - - print $fh "$_\n" foreach @{ $result }; - - close $fh; -} - - -sub script_length_vals -{ - # Martin A. Hansen, August 2007. - - # Determine the length of the value for given keys. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $key ); - - while ( $record = get_record( $in ) ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $record->{ $key } ) { - $record->{ $key . "_LEN" } = length $record->{ $key }; - } - } - - put_record( $record, $out ); - } -} - - -sub script_sum_vals -{ - # Martin A. Hansen, August 2007. - - # Calculates the sums for values of given keys. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $key, %sum_hash, $fh ); - - while ( $record = get_record( $in ) ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $record->{ $key } ) { - $sum_hash{ $key } += $record->{ $key }; - } - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $fh = write_stream( $options->{ "data_out" } ); - - foreach $key ( @{ $options->{ "keys" } } ) { - put_record( { $key . "_SUM" => $sum_hash{ $key } || 0 } , $fh ); - } - - close $fh; -} - - -sub script_mean_vals -{ - # Martin A. Hansen, August 2007. - - # Calculate the mean of values of given keys. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $key, %sum_hash, %count_hash, $mean, $fh ); - - while ( $record = get_record( $in ) ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $record->{ $key } ) - { - $sum_hash{ $key } += $record->{ $key }; - $count_hash{ $key }++; - } - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $fh = write_stream( $options->{ "data_out" } ); - - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $count_hash{ $key } ) { - $mean = sprintf( "%.2f", ( $sum_hash{ $key } / $count_hash{ $key } ) ); - } else { - $mean = "N/A"; - } - - put_record( { $key . "_MEAN" => $mean } , $fh ); - } - - close $fh; -} - - -sub script_median_vals -{ - # Martin A. Hansen, March 2008. - - # Calculate the median values of given keys. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $key, %median_hash, $median, $fh ); - - while ( $record = get_record( $in ) ) - { - foreach $key ( @{ $options->{ "keys" } } ) { - push @{ $median_hash{ $key } }, $record->{ $key } if defined $record->{ $key }; - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $fh = write_stream( $options->{ "data_out" } ); - - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $median_hash{ $key } ) { - $median = Maasha::Calc::median( $median_hash{ $key } ); - } else { - $median = "N/A"; - } - - put_record( { $key . "_MEDIAN" => $median } , $fh ); - } - - close $fh; -} - - -sub script_max_vals -{ - # Martin A. Hansen, February 2008. - - # Determine the maximum values of given keys. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. - - my ( $record, $key, $fh, %max_hash, $max_record ); - - while ( $record = get_record( $in ) ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( $record->{ $key } ) - { - $max_hash{ $key } = $record->{ $key } if $record->{ $key } > $max_hash{ $key }; - } - } - - put_record( $record, $out ) if not $options->{ "no_stream" }; - } - - $fh = write_stream( $options->{ "data_out" } ); - - foreach $key ( @{ $options->{ "keys" } } ) - { - $max_record->{ $key . "_MAX" } = $max_hash{ $key }; - } - - put_record( $max_record, $fh ); - - close $fh; -} - - -sub script_min_vals -{ - # Martin A. Hansen, February 2008. - - # Determine the minimum values of given keys. - - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; - - # Returns nothing. +use vars qw( @ISA @EXPORT_OK ); - my ( $record, $key, $fh, %min_hash, $min_record ); +require Exporter; - while ( $record = get_record( $in ) ) - { - foreach $key ( @{ $options->{ "keys" } } ) - { - if ( defined $record->{ $key } ) - { - if ( exists $min_hash{ $key } ) { - $min_hash{ $key } = $record->{ $key } if $record->{ $key } < $min_hash{ $key }; - } else { - $min_hash{ $key } = $record->{ $key }; - } - } - } +@ISA = qw( Exporter ); - put_record( $record, $out ) if not $options->{ "no_stream" }; - } +@EXPORT_OK = qw( + read_stream + write_stream + get_record + put_record +); - $fh = write_stream( $options->{ "data_out" } ); - foreach $key ( @{ $options->{ "keys" } } ) - { - $min_record->{ $key . "_MIN" } = $min_hash{ $key }; - } +# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> SIGNAL HANDLER <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - put_record( $min_record, $fh ); - close $fh; -} +$SIG{ '__DIE__' } = \&sig_handler; +$SIG{ 'INT' } = \&sig_handler; +$SIG{ 'TERM' } = \&sig_handler; -sub script_upload_to_ucsc +sub log_biopiece { - # Martin A. Hansen, August 2007. - - # Calculate the mean of values of given keys. - - # This routine has developed into the most ugly hack. Do something! + # Martin A. Hansen, January 2008. - my ( $in, # handle to in stream - $out, # handle to out stream - $options, # options hash - ) = @_; + # Log messages to logfile. # Returns nothing. - my ( $record, $file, $wib_file, $wig_file, $wib_dir, $fh_out, $i, $first, $format, $type, $columns, $append, $entry ); - - $options->{ "short_label" } ||= $options->{ 'table' }; - $options->{ "long_label" } ||= $options->{ 'table' }; - $options->{ "group" } ||= $ENV{ "LOGNAME" }; - $options->{ "priority" } ||= 1; - $options->{ "visibility" } ||= "pack"; - $options->{ "color" } ||= join( ",", int( rand( 255 ) ), int( rand( 255 ) ), int( rand( 255 ) ) ); - $options->{ "chunk_size" } ||= 10_000_000_000; # Due to 32-bit UCSC compilation really large tables cannot be loaded in one go. - - $file = "$BP_TMP/ucsc_upload.tmp"; - $append = 0; - $first = 1; - $i = 0; - - $fh_out = Maasha::Common::write_open( $file ); - - while ( $record = get_record( $in ) ) - { - put_record( $record, $out ) if not $options->{ "no_stream" }; - - if ( $record->{ "REC_TYPE" } eq "fixed_step" ) - { - $format = "WIGGLE"; - - if ( $entry = Maasha::UCSC::Wiggle::biopiece2fixedstep( $record ) ) { - Maasha::UCSC::Wiggle::fixedstep_entry_put( $entry, $fh_out ); - } - } - elsif ( $record->{ "REC_TYPE" } eq "PSL" ) - { - $format = "PSL"; - - Maasha::UCSC::psl_put_header( $fh_out ) if $first; - Maasha::UCSC::psl_put_entry( $record, $fh_out ); - - $first = 0; - } - elsif ( $record->{ "REC_TYPE" } eq "BED" and $record->{ "SEC_STRUCT" } ) - { - # chrom chromStart chromEnd name score strand size secStr conf - - $format = "BED_SS"; - - print $fh_out join ( "\t", - $record->{ "CHR" }, - $record->{ "CHR_BEG" }, - $record->{ "CHR_END" } + 1, - $record->{ "Q_ID" }, - $record->{ "SCORE" }, - $record->{ "STRAND" }, - $record->{ "SIZE" }, - $record->{ "SEC_STRUCT" }, - $record->{ "CONF" }, - ), "\n"; - } - elsif ( $record->{ "REC_TYPE" } eq "BED" ) - { - $format = "BED"; - $columns = $record->{ "BED_COLS" }; - - if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { - Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } ); - } - } - elsif ( $record->{ "REC_TYPE" } eq "PATSCAN" and $record->{ "CHR" } ) - { - $format = "BED"; - $columns = 6; - - if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { - Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } ); - } - } - elsif ( $record->{ "REC_TYPE" } eq "BLAST" and $record->{ "S_ID" } =~ /^chr/ ) - { - $format = "BED"; - $columns = 6; - - $record->{ "SCORE" } = $record->{ "BIT_SCORE" } * 1000; - - if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { - Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } ); - } - } - elsif ( $record->{ "REC_TYPE" } eq "VMATCH" and $record->{ "S_ID" } =~ /^chr/i ) - { - $format = "BED"; - $columns = 6; - - if ( $entry = Maasha::UCSC::BED::biopiece2bed( $record, $columns ) ) { - Maasha::UCSC::BED::bed_entry_put( $entry, $fh_out, $columns, $options->{ 'check' } ); - } - } - - if ( $i == $options->{ "chunk_size" } ) - { - close $fh_out; - - if ( $format eq "BED" ) { - Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append ); - } elsif ( $format eq "PSL" ) { - Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append ); - } - - unlink $file; - - $first = 1; - - $append = 1; - - $fh_out = Maasha::Common::write_open( $file ); - } - - $i++; - } - - close $fh_out; - - if ( exists $options->{ "database" } and $options->{ "table" } ) - { - if ( $format eq "BED" ) - { - $type = "bed $columns"; - - Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append ); - } - elsif ( $format eq "BED_SS" ) - { - $type = "type bed 6 +"; - - Maasha::UCSC::bed_upload_to_ucsc( $BP_TMP, $file, $options, $append ); - } - elsif ( $format eq "PSL" ) - { - $type = "psl"; - - Maasha::UCSC::psl_upload_to_ucsc( $file, $options, $append ); - } - elsif ( $format eq "WIGGLE" ) - { - $options->{ "visibility" } = "full"; - - $wig_file = "$options->{ 'table' }.wig"; - $wib_file = "$options->{ 'table' }.wib"; - - $wib_dir = "$ENV{ 'HOME' }/ucsc/wib"; - - Maasha::Common::dir_create_if_not_exists( $wib_dir ); - - if ( $options->{ 'verbose' } ) { - `cd $BP_TMP && wigEncode $file $wig_file $wib_file`; - } else { - `cd $BP_TMP && wigEncode $file $wig_file $wib_file > /dev/null 2>&1`; - } - - Maasha::Common::run( "mv", "$BP_TMP/$wib_file $wib_dir" ); - - unlink $file; - - $file = $wig_file; - - $type = "wig 0"; - - Maasha::UCSC::wiggle_upload_to_ucsc( $BP_TMP, $wib_dir, $file, $options ); - } - - unlink $file; - - Maasha::UCSC::ucsc_update_config( $options, $type ); - } -} - - -# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - -sub grab_lookup -{ - # Martin A. Hansen, November 2009. - - # Uses keys from a lookup hash to search records. Optionally, a list of - # keys can be given so the lookup is limited to these, also, flags - # can be given to limit lookup to keys or vals only. Returns 1 if lookup - # succeeded, else 0. - - my ( $lookup_hash, # hashref with patterns - $record, # hashref - $keys, # list of keys - OPTIONAL - $vals_only, # only vals flag - OPTIONAL - $keys_only, # only keys flag - OPTIONAL - ) = @_; - - # Returns boolean. - - if ( $keys ) - { - map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } @{ $keys }; - } - else - { - if ( not $vals_only ) { - map { return 1 if exists $lookup_hash->{ $_ } } keys %{ $record }; - } - - if ( not $keys_only ) { - map { return 1 if exists $lookup_hash->{ $record->{ $_ } } } keys %{ $record }; - } - } - - return 0; -} - - -sub grab_patterns -{ - # Martin A. Hansen, November 2009. - - # Uses patterns to match records containing the pattern as a substring. - # Returns 1 if the record is matched, else 0. - - my ( $patterns, # list of patterns - $record, # hashref - $keys, # list of keys - OPTIONAL - $vals_only, # only vals flag - OPTIONAL - $keys_only, # only keys flag - OPTIONAL - ) = @_; - - # Returns boolean. - - my ( $pattern ); - - foreach $pattern ( @{ $patterns } ) - { - if ( $keys ) - { - map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } @{ $keys }; - } - else - { - if ( not $vals_only ) { - map { return 1 if index( $_, $pattern ) >= 0 } keys %{ $record }; - } - - if ( not $keys_only ) { - map { return 1 if index( $record->{ $_ }, $pattern ) >= 0 } keys %{ $record }; - } - } - } - - return 0; -} - - -sub grab_regex -{ - # Martin A. Hansen, November 2009. - - # Uses regex to match records. - # Returns 1 if the record is matched, else 0. - - my ( $regex, # regex to match - $record, # hashref - $keys, # list of keys - OPTIONAL - $vals_only, # only vals flag - OPTIONAL - $keys_only, # only keys flag - OPTIONAL - ) = @_; - - # Returns boolean. - - if ( $keys ) - { - map { return 1 if $record->{ $_ } =~ /$regex/ } @{ $keys }; - } - else - { - if ( not $vals_only ) { - map { return 1 if $_ =~ /$regex/ } keys %{ $record }; - } - - if ( not $keys_only ) { - map { return 1 if $record->{ $_ } =~ /$regex/ } keys %{ $record }; - } - } - - return 0; -} + my ( $time_stamp, $user, $script, $fh_global, $fh_local ); + $time_stamp = Maasha::Common::time_stamp(); + $user = Maasha::Common::get_user(); + $script = Maasha::Common::get_scriptname(); -sub grab_eval -{ - # Martin A. Hansen, November 2009. - - # Test if the value of a given record key evaluates according - # to a given operator. Returns 1 if eval is OK, else 0. + $fh_global = Maasha::Common::append_open( "$ENV{ 'BP_LOG' }/biopieces.log" ); + $fh_local = Maasha::Common::append_open( "$ENV{ 'HOME' }/.biopieces.log" ); - my ( $key, # record key - $op, # operator - $val, # value - $record, # hashref - ) = @_; - - # Returns boolean. + print $fh_global "$time_stamp\t$user\t$script ", join( " ", @ARGV ), "\n"; + print $fh_local "$time_stamp\t$user\t$script ", join( " ", @ARGV ), "\n"; - if ( defined $record->{ $key } ) - { - return 1 if ( $op eq "<" and $record->{ $key } < $val ); - return 1 if ( $op eq ">" and $record->{ $key } > $val ); - return 1 if ( $op eq ">=" and $record->{ $key } >= $val ); - return 1 if ( $op eq "<=" and $record->{ $key } <= $val ); - return 1 if ( $op eq "=" and $record->{ $key } == $val ); - return 1 if ( $op eq "!=" and $record->{ $key } != $val ); - return 1 if ( $op eq "eq" and $record->{ $key } eq $val ); - return 1 if ( $op eq "ne" and $record->{ $key } ne $val ); - } + $fh_global->autoflush( 1 ); + $fh_local->autoflush( 1 ); - return 0; + close $fh_global; + close $fh_local; } @@ -6729,6 +315,7 @@ END # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>><<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + 1; __END__ -- 2.39.5