3 # Copyright (C) 2010 Martin A. Hansen (mail@maasha.dk).
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
# Usage text: two lines describing what the script produces, followed by the
# expected invocation. The program name shown is the basename of $0, so it
# follows whatever name the script was started under.
25 echo "QA_Solexa_report.sh generates a quality assurance report for"
26 echo "each of a given number of Solexa/Illumina FASTQ files."
28 echo "Usage: `basename $0` <FASTQ file(s)>"
# Append the text held in $msg, plus a newline, to the report file $out_file.
# Presumably the body of the `puts` helper used when the report is assembled;
# its definition line is not visible here -- TODO confirm.
# NOTE(review): $msg is expanded unquoted, so the shell word-splits it and
# runs of whitespace inside the message collapse to single spaces.
37 echo $msg >> $out_file
# Append the full contents of the file named by $file to the report file
# $out_file. $out_file is opened in append mode, so earlier report content
# is preserved.
44 cat $file >> $out_file
# Handle each input file in turn. $fastq_files is expanded unquoted, so it is
# split on whitespace into the individual file names.
47 for fastq_file in $fastq_files; do
# File name without its directory part (not referenced in the lines visible
# here).
48 base=`basename $fastq_file`
# Timestamp used to name this run's directory and report: the output of
# `date` with every space and colon turned into an underscore, after which
# each resulting double underscore is reduced to a single one.
49 date=`date | sed 's/[ :]/_/g' | sed 's/__/_/g'`
# Per-run working directory under $HOME; every artifact below lives in it.
50 tmp_dir="$HOME/QA_Solexa_report_$date"
# Final text report.
51 out_file="$tmp_dir/QA_Solexa_report_$date.txt"
# Intermediate outputs written by the analysis pipelines further down.
52 plot_scores="$tmp_dir/plot_scores.txt"
53 plot_lendist="$tmp_dir/plot_lendist.txt"
54 freq_table="$tmp_dir/freq_table.txt"
55 read_count="$tmp_dir/read_count.txt"
56 read_length="$tmp_dir/read_length.txt"
57 top30_reads="$tmp_dir/top30_reads.txt"
# Branch taken when $tmp_dir is not an existing directory. The body of this
# conditional is not visible here; presumably it creates the directory --
# TODO confirm.
59 if [ ! -d $tmp_dir ]; then
# Feed $fastq_file through read_fastq into write_tab, which is asked for the
# SEQ_LEN key and given $read_length as its output file.
# NOTE(review): `-n 1` presumably limits read_fastq to the first record, and
# `-x` presumably stops records from being passed on to stdout -- confirm
# against the Biopieces documentation.
63 read_fastq -n 1 -i $fastq_file |
64 write_tab -k SEQ_LEN -o $read_length -x
# Print a blank line and a progress message, then run a single pass over
# $fastq_file that produces three outputs: the record count ($read_count),
# the mean-score plot ($plot_scores) and the length-distribution plot
# ($plot_lendist).
# NOTE(review): the embedded line numbers jump from 70 to 73, so one or more
# pipeline stages between plot_scores and plot_lendist are not shown here;
# presumably they create the SEQ_LEN_BIN key that plot_lendist reads --
# TODO confirm.
66 echo "" && echo "Plotting scores and length distributions ... "
67 read_fastq -i $fastq_file |
68 progress_meter -c 10000 |
69 count_records -o $read_count |
70 plot_scores -o $plot_scores -Y "Mean score" -X "Sequence length" |
73 plot_lendist -k SEQ_LEN_BIN -o $plot_lendist -Y "Count" -X "Sequence Length (5nt bins)" -x
# Print a blank line and a progress message, then pipe $fastq_file through
# create_weight_matrix and write the resulting table to $freq_table.
# NOTE(review): `-p` presumably selects percentages, which would match the
# "in percent" wording of the report text -- confirm. The embedded line
# numbers skip 79, so a stage between create_weight_matrix and write_tab is
# not shown here.
75 echo "" && echo "Running composition analysis on sequences ... "
76 read_fastq -i $fastq_file |
77 progress_meter -c 10000 |
78 create_weight_matrix -p |
80 write_tab -o $freq_table -x
# Print a blank line and a progress message, then pipe $fastq_file through
# sort_records and write the SEQ_COUNT and SEQ columns to $top30_reads.
# NOTE(review): `-rk SEQ_COUNTn` presumably means a reverse numeric sort on
# SEQ_COUNT -- confirm. The embedded line numbers skip 85 and 87; presumably
# the missing stages compute SEQ_COUNT per unique read and keep only the
# first 30 records -- TODO confirm.
82 echo "" && echo "Locating top 30 unique reads ... "
83 read_fastq -i $fastq_file |
84 progress_meter -c 10000 |
86 sort_records -rk SEQ_COUNTn |
88 write_tab -ck SEQ_COUNT,SEQ -o $top30_reads -x
# Announce report generation (echo -n leaves the cursor on the same line),
# then emit the report text: a title, the path of the analysed file (current
# working directory + $fastq_file), and headed sections for sequence
# analysis, quality score means, sequence length distribution and residue
# frequency analysis.
# `puts` is not defined in the lines visible here; presumably it is the
# helper that appends its argument to $out_file -- TODO confirm.
# NOTE(review): the score section says "untrimmed sequences" while the
# length-distribution section says "trimmed reads"; whether the data is
# trimmed depends on pipeline stages not shown here -- verify the wording.
# NOTE(review): the File: line prefixes `pwd` unconditionally, which yields a
# wrong path when $fastq_file is already absolute.
90 echo "" && echo -n "Generating report ... "
94 puts "QA Solexa Report"
101 puts "File: `pwd`/$fastq_file"
104 puts "Sequence analysis"
105 puts "-----------------"
117 puts "Quality score means"
118 puts "-------------------"
121 puts "The mean scores of the untrimmed sequences:"
126 puts "Sequence length distribution"
127 puts "----------------------------"
130 puts "The length distribution of trimmed reads where the lengths are binned in buckets of size 5:"
135 puts "Residue frequency analysis"
136 puts "--------------------------"
139 puts "The below table contains the residue frequency (in percent) of all base positions:"
145 puts "--------------------------"
# Tell the user the path of the report file.
163 echo "Report located here: $out_file"