From: martinahansen Date: Mon, 13 Dec 2010 09:26:34 +0000 (+0000) Subject: added QA_454_report.sh biopiece script X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=3de30d5afa0d1a33ea84c0db5849e979967902de;p=biopieces.git added QA_454_report.sh biopiece script git-svn-id: http://biopieces.googlecode.com/svn/trunk@1186 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_scripts/QA_454_report.sh b/bp_scripts/QA_454_report.sh new file mode 100755 index 0000000..33f0a42 --- /dev/null +++ b/bp_scripts/QA_454_report.sh @@ -0,0 +1,287 @@ +#!/bin/bash + +# Copyright (C) 2010 Martin A. Hansen (mail@maasha.dk). + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + +# http://www.gnu.org/copyleft/gpl.html + +sff_files=$@ + +if [ ! $1 ]; then + echo + echo "QA_454_report.sh generates a quality assurance report for" + echo "each of a given number of Roche/FLX 454 .sff files." + echo + echo "Usage: `basename $0` " + echo + exit +fi + +function puts +{ + msg=$1 + + echo $msg >> $out_file +} + +function pcat +{ + file=$1 + + cat $file >> $out_file +} + +for sff_file in $sff_files; do + base=`basename $sff_file .sff` + + tmp_dir="$HOME/QA_454_report_$base" + + if [ ! -d $tmp_dir ]; then + mkdir $tmp_dir + fi + + fq_file="$tmp_dir/$base.fq" + xml_file="$tmp_dir/$base.xml" + out_file="$tmp_dir/QA_454_report_$base.txt" + + if [ -f $out_file ]; then + mv $out_file "$out_file.bak" + fi + + analysis_vals="$tmp_dir/analysis_vals.txt" + analysis_seqs="$tmp_dir/analysis_seqs.txt" + plot_lendist_unclipped="$tmp_dir/plot_lendist_unclipped.txt" + plot_lendist_clipped="$tmp_dir/plot_lendist_clipped.txt" + plot_scores_unclipped="$tmp_dir/plot_scores_unclipped.txt" + plot_scores_clipped="$tmp_dir/plot_scores_clipped.txt" + plot_mean_scores="$tmp_dir/plot_mean_scores.txt" + count_score_mean="$tmp_dir/count_score_mean.txt" + table_mid="$tmp_dir/table_mid.tab" + table_mid_len="$tmp_dir/table_mid_len.tab" + table_mid_len_score="$tmp_dir/table_mid_len_score.tab" + table_mid_join="$tmp_dir/table_mid_join.tab" + table_freq="$tmp_dir/table_freq.tab" + + # sff_extract is a 3rd party tool from the MIRA package. + # http://sourceforge.net/projects/mira-assembler/files/ + echo -n "Converting sff file $sff_file to FASTQ format ... " + sff_extract --fastq $sff_file --seq_file $fq_file --xml_file $xml_file > /dev/null 2>&1 + echo "done." + + # Using Biopieces -> www.biopieces.org + + echo "" && echo "Running composition analysis on sequences ... " + read_fastq -i $fq_file | + progress_meter | + analyze_vals -k SEQ -o $analysis_vals | + analyze_seq | + mean_vals -k 'GC%,HARD_MASK%,SOFT_MASK%' | + grab -e 'REC_TYPE eq MEAN' | + write_tab -ck 'GC%_MEAN,HARD_MASK%_MEAN,SOFT_MASK%_MEAN' -o $analysis_seqs -x + + echo "" && echo "Plotting length distributions and scores before and after clipping ..." + read_fastq -i $fq_file | + progress_meter | + bin_vals -k SEQ_LEN -b 50 | + plot_lendist -k SEQ_LEN_BIN -T "Length Distribution - unclipped" -X "50 nucleotide bins" -Y "Count" -o $plot_lendist_unclipped | + plot_scores -o $plot_scores_unclipped -X "Sequence length" -Y "Score" | + clip_seq | + bin_vals -k SEQ_LEN -b 50 | + plot_lendist -k SEQ_LEN_BIN -T "Length Distribution - clipped" -X "50 nucleotide bins" -Y "Count" -o $plot_lendist_clipped | + plot_scores -o $plot_scores_clipped -X "Sequence length" -Y "Score" -x + + echo "" && echo "Plotting mean score bins and counting mean scores greater than 20 ... " + read_fastq -i $fq_file | + progress_meter | + mean_scores | + bin_vals -k SCORES_MEAN -b 5 | + plot_histogram -s num -k SCORES_MEAN_BIN -T "Mean score bins" -X "Bins (size 5)" -Y "Count" -o $plot_mean_scores | + grab -e 'SCORES_MEAN >= 20' | + count_records -o $count_score_mean -x + + echo "" && echo "Locating and counting MID tags ... " + read_fastq -i $fq_file | + progress_meter | + find_mids | + write_tab -o $table_mid -c -k MID_NUM,MID_SEQ,MID_COUNT -x + + echo "" && echo "Locating and counting MID tags for sequences longer than 250 ... " + read_fastq -i $fq_file | + progress_meter | + grab -e 'SEQ_LEN >= 250' | + find_mids | + write_tab -o $table_mid_len -c -k MID_NUM,MID_SEQ,MID_COUNT -x + + echo "" && echo "Locating and counting MID tags for sequences longer than 250 and mean score above 20 ... " + read_fastq -i $fq_file | + progress_meter | + grab -e 'SEQ_LEN >= 250' | + mean_scores | + grab -e 'SCORES_MEAN >= 20' | + find_mids | + write_tab -o $table_mid_len_score -c -k MID_NUM,MID_SEQ,MID_COUNT -x + + echo "" && echo -n "Joining MID tables ... " + read_tab -i $table_mid | + rename_keys -k MID_NUM,A | + rename_keys -k MID_COUNT,TOTAL | + read_tab -i $table_mid_len | + rename_keys -k MID_NUM,B | + rename_keys -k MID_COUNT,L250 | + merge_records -k A,B | + read_tab -i $table_mid_len_score | + rename_keys -k MID_NUM,C | + rename_keys -k MID_COUNT,L250_S20 | + merge_records -k A,C | + rename_keys -k A,MID_NUM | + sort_records -k MID_NUMn | + write_tab -o $table_mid_join -c -k MID_NUM,MID_SEQ,TOTAL,L250,L250_S20 -x + echo "done." + + echo "" && echo "Creating residue frequency table ... " + read_fastq -i $fq_file | + progress_meter | + extract_seq -l 50 | + uppercase_seq | + create_weight_matrix -p | + flip_tab | + write_tab -o $table_freq -x + + echo "" && echo -n "Generating report ... " + puts "" + puts "" + puts "" + puts "QA 454 Report" + puts "=============" + puts "" + puts "" + puts "" + puts "Date: `date`" + puts "" + puts "File: `pwd`/$sff_file" + puts "" + puts "" + puts "Sequence analysis" + puts "-----------------" + puts "" + puts "" + puts "The below table contains some basic info:" + puts "" + puts " COUNT is the number of sequences in the file." + puts " MIN is the minimum sequence length found." + puts " MAX is the maximum sequence length found." + puts " MEAN is the mean sequence length found." + puts " SUM is the total number of bases in the file." + puts "" + pcat $analysis_vals + puts "" + puts "" + puts "Sequence composition" + puts "--------------------" + puts "" + puts "" + puts "The below table contains composition analysis of the sequences:" + puts "" + puts " GC%_MEAN is the mean GC content." + puts " HARD_MASK%_MEAN is the mean of hard masked sequence (i.e. % of N's)." + puts " SOFT_MASK%_MEAN is the mean of soft masked sequence (i.e. lowercase residues = clipped sequence)." + puts "" + pcat $analysis_seqs + puts "" + puts "" + puts "Sequence length distribution" + puts "----------------------------" + puts "" + puts "" + puts "The length distribution of unclipped reads where the lengths are binned in buckets of size 50:" + puts "" + pcat $plot_lendist_unclipped + puts "" + puts "The length distribution of clipped reads where the lengths are binned in buckets of size 50:" + puts "" + pcat $plot_lendist_clipped + puts "" + puts "" + puts "Quality score means" + puts "-------------------" + puts "" + puts "" + puts "The mean scores of the unclipped sequences:" + puts "" + pcat $plot_scores_unclipped + puts "" + puts "The mean scores of the clipped sequences:" + puts "" + pcat $plot_scores_clipped + puts "" + puts "Histogram of bins with mean quality scores:" + puts "" + pcat $plot_mean_scores + puts "" + puts "Number of sequences with a mean score >= 20:" + puts "" + pcat $count_score_mean + puts "" + puts "" + puts "MID tag analysis" + puts "----------------" + puts "" + puts "" + puts "The below table contains the identified MID tags and the number of times they were found:" + puts "" + puts " MID_NUM is the MID tag identifier." + puts " MID_SEQ is the sequence of the MID tag." + puts " TOTAL is the number of times this MID tag was found." + puts " L250 is the a subset count of TOTAL af sequences longer than 250 bases" + puts " L250_S20 is a subset count of L250 af sequences with a mean score above 20" + puts "" + pcat $table_mid_join + puts "" + puts "" + puts "Residue frequency analysis" + puts "--------------------------" + puts "" + puts "" + puts "The below table contains the residue frequency (in percent) of the first 50 bases:" + puts "" + pcat $table_freq + puts "" + puts "end." + + rm $fq_file + rm $xml_file + rm $analysis_vals + rm $analysis_seqs + rm $plot_lendist_unclipped + rm $plot_lendist_clipped + rm $plot_scores_unclipped + rm $plot_scores_clipped + rm $plot_mean_scores + rm $count_score_mean + rm $table_mid + rm $table_mid_len + rm $table_mid_len_score + rm $table_mid_join + rm $table_freq + + echo "done." + + echo "" + echo "Report located here: $out_file" + echo "" +done + +echo "All done."