#!/bin/bash

# Provenance: bp_scripts/QA_Solexa_report.sh, added to biopieces.git by
# martinahansen on Mon, 13 Dec 2010 14:00:55 +0000
# (commit 15eff44d72eb5d662033a5c1de3b72b02cda1e55,
#  git-svn-id: http://biopieces.googlecode.com/svn/trunk@1187 74ccb610-7750-0410-82ae-013aeee3265d).

# Copyright (C) 2010 Martin A. Hansen (mail@maasha.dk).

# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

# http://www.gnu.org/copyleft/gpl.html

# QA_Solexa_report.sh
#
# Generates one plain-text quality assurance report per FASTQ file given on
# the command line. For every input file a directory
# $HOME/QA_Solexa_report_<timestamp> is created, a handful of intermediate
# files are produced in it by external pipeline commands (read_fastq,
# write_tab, plot_scores, ... -- these must be on PATH; their exact semantics
# are not visible from this script), the intermediate files are stitched
# into QA_Solexa_report_<timestamp>.txt, and the intermediates are deleted.
#
# Usage: QA_Solexa_report.sh <FASTQ file(s)>
#
# Exit status: 0 after printing usage or after processing all files;
#              1 if a report directory cannot be created.

# Keep the arguments as an array so that file names containing whitespace
# or glob characters survive intact.
fastq_files=("$@")

# A missing (or empty) first argument prints the usage text. The exit
# status of this path is 0, as it has always been.
if [[ -z "$1" ]]; then
    echo
    echo "QA_Solexa_report.sh generates a quality assurance report for"
    echo "each of a given number of Solexa/Illumina FASTQ files."
    echo
    echo "Usage: $(basename -- "$0") <FASTQ file(s)>"
    echo
    exit
fi

# Append one line of text to the current report.
# Globals:   out_file (read) - path of the report being written
# Arguments: $1 - the text; an empty string yields an empty line
function puts
{
    # printf instead of an unquoted echo: the text is written verbatim,
    # without word splitting (which collapsed runs of spaces) or globbing.
    printf '%s\n' "$1" >> "$out_file"
}

# Append the contents of a file to the current report.
# Globals:   out_file (read) - path of the report being written
# Arguments: $1 - path of the file to append
function pcat
{
    local file=$1

    cat -- "$file" >> "$out_file"
}

for fastq_file in "${fastq_files[@]}"; do
    # Timestamp with spaces and colons replaced so it is usable in a path.
    # NOTE(review): the resolution is whatever `date` prints (typically one
    # second); two files handled within the same timestamp share a directory
    # and their reports are appended to the same file.
    stamp=$(date | sed 's/[ :]/_/g')
    tmp_dir="$HOME/QA_Solexa_report_$stamp"
    out_file="$tmp_dir/QA_Solexa_report_$stamp.txt"
    plot_scores="$tmp_dir/plot_scores.txt"
    plot_lendist="$tmp_dir/plot_lendist.txt"
    freq_table="$tmp_dir/freq_table.txt"
    read_count="$tmp_dir/read_count.txt"
    read_length="$tmp_dir/read_length.txt"
    top30_reads="$tmp_dir/top30_reads.txt"

    # Everything below writes into this directory, so stop early with a
    # clear message instead of emitting one error per subsequent command.
    if ! mkdir -p -- "$tmp_dir"; then
        echo "QA_Solexa_report.sh: cannot create directory: $tmp_dir" >&2
        exit 1
    fi

    # SEQ_LEN of a single record (-n 1) goes to $read_length; it is shown
    # under "Read length:" in the report.
    read_fastq -n 1 -i "$fastq_file" |
    write_tab -k SEQ_LEN -o "$read_length" -x

    # One pass over the file produces the record count, the score plot and,
    # after trimming and binning SEQ_LEN, the length distribution plot.
    echo "" && echo "Plotting scores and length distributions ... "
    read_fastq -i "$fastq_file" |
    progress_meter -c 10000 |
    count_records -o "$read_count" |
    plot_scores -o "$plot_scores" -Y "Mean score" -X "Sequence length" |
    trim_seq |
    bin_vals -k SEQ_LEN |
    plot_lendist -k SEQ_LEN_BIN -o "$plot_lendist" -Y "Count" -X "Sequence Length (5nt bins)" -x

    # Residue frequency table, shown under "Residue frequency analysis".
    echo "" && echo "Running composition analysis on sequences ... "
    read_fastq -i "$fastq_file" |
    progress_meter -c 10000 |
    create_weight_matrix -p |
    flip_tab |
    write_tab -o "$freq_table" -x

    # Unique sequences with counts, sorted on SEQ_COUNT, first 30 kept.
    echo "" && echo "Locating top 30 unique reads ... "
    read_fastq -i "$fastq_file" |
    progress_meter -c 10000 |
    uniq_seq -c |
    sort_records -rk SEQ_COUNTn |
    head_records -n 30 |
    write_tab -ck SEQ_COUNT,SEQ -o "$top30_reads" -x

    # No trailing newline: "done." is printed on the same line further down.
    echo "" && echo -n "Generating report ... "
    puts ""
    puts ""
    puts ""
    puts "QA Solexa Report"
    puts "============="
    puts ""
    puts ""
    puts ""
    puts "Date: $(date)"
    puts ""
    # Only prefix the working directory when the path given is relative;
    # an absolute path is already the full location of the file.
    case "$fastq_file" in
        /*) puts "File: $fastq_file" ;;
        *)  puts "File: $(pwd)/$fastq_file" ;;
    esac
    puts ""
    puts ""
    puts "Sequence analysis"
    puts "-----------------"
    puts ""
    puts ""
    puts "Read length:"
    puts ""
    pcat "$read_length"
    puts ""
    puts "Read count:"
    puts ""
    pcat "$read_count"
    puts ""
    puts ""
    puts "Quality score means"
    puts "-------------------"
    puts ""
    puts ""
    puts "The mean scores of the untrimmed sequences:"
    puts ""
    pcat "$plot_scores"
    puts ""
    puts ""
    puts "Sequence length distribution"
    puts "----------------------------"
    puts ""
    puts ""
    puts "The length distribution of trimmed reads where the lengths are binned in buckets of size 5:"
    puts ""
    pcat "$plot_lendist"
    puts ""
    puts ""
    puts "Residue frequency analysis"
    puts "--------------------------"
    puts ""
    puts ""
    puts "The below table contains the residue frequency (in percent) of all base positions:"
    puts ""
    pcat "$freq_table"
    puts ""
    puts ""
    puts "Top 30 reads"
    puts "--------------------------"
    puts ""
    puts ""
    pcat "$top30_reads"
    puts ""
    puts ""
    puts "end."

    # The intermediates now live inside the report; only the report stays.
    rm -- "$plot_scores"
    rm -- "$plot_lendist"
    rm -- "$freq_table"
    rm -- "$read_count"
    rm -- "$read_length"
    rm -- "$top30_reads"

    echo "done."

    echo ""
    echo "Report located here: $out_file"
    echo ""
done

echo "All done."