3 # Copyright (C) 2010 Martin A. Hansen (mail@maasha.dk).
5 # This program is free software; you can redistribute it and/or
6 # modify it under the terms of the GNU General Public License
7 # as published by the Free Software Foundation; either version 2
8 # of the License, or (at your option) any later version.
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 # GNU General Public License for more details.
15 # You should have received a copy of the GNU General Public License
16 # along with this program; if not, write to the Free Software
17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
19 # http://www.gnu.org/copyleft/gpl.html
25 echo "QA_454_report.sh generates a quality assurance report for"
26 echo "each of a given number of Roche/FLX 454 .sff files."
28 echo "Usage: `basename $0` <sff file(s)>"
37 echo $msg >> $out_file
44 cat $file >> $out_file
47 for sff_file in $sff_files; do
48 base=`basename $sff_file .sff`
50 tmp_dir="$HOME/QA_454_report_$base"
52 if [ ! -d $tmp_dir ]; then
56 fq_file="$tmp_dir/$base.fq"
57 xml_file="$tmp_dir/$base.xml"
58 out_file="$tmp_dir/QA_454_report_$base.txt"
60 if [ -f $out_file ]; then
61 mv $out_file "$out_file.bak"
64 analysis_vals="$tmp_dir/analysis_vals.txt"
65 analysis_seqs="$tmp_dir/analysis_seqs.txt"
66 plot_lendist_unclipped="$tmp_dir/plot_lendist_unclipped.txt"
67 plot_lendist_clipped="$tmp_dir/plot_lendist_clipped.txt"
68 plot_scores_unclipped="$tmp_dir/plot_scores_unclipped.txt"
69 plot_scores_clipped="$tmp_dir/plot_scores_clipped.txt"
70 plot_mean_scores="$tmp_dir/plot_mean_scores.txt"
71 count_score_mean="$tmp_dir/count_score_mean.txt"
72 table_mid="$tmp_dir/table_mid.tab"
73 table_mid_len="$tmp_dir/table_mid_len.tab"
74 table_mid_len_score="$tmp_dir/table_mid_len_score.tab"
75 table_mid_join="$tmp_dir/table_mid_join.tab"
76 table_freq="$tmp_dir/table_freq.tab"
78 # sff_extract is a 3rd party tool from the MIRA package.
79 # http://sourceforge.net/projects/mira-assembler/files/
80 echo -n "Converting sff file $sff_file to FASTQ format ... "
81 sff_extract --fastq $sff_file --seq_file $fq_file --xml_file $xml_file > /dev/null 2>&1
84 # Using Biopieces -> www.biopieces.org
86 echo "" && echo "Running composition analysis on sequences ... "
87 read_fastq -i $fq_file |
89 analyze_vals -k SEQ -o $analysis_vals |
91 mean_vals -k 'GC%,HARD_MASK%,SOFT_MASK%' |
92 grab -e 'REC_TYPE eq MEAN' |
93 write_tab -ck 'GC%_MEAN,HARD_MASK%_MEAN,SOFT_MASK%_MEAN' -o $analysis_seqs -x
95 echo "" && echo "Plotting length distributions and scores before and after clipping ..."
96 read_fastq -i $fq_file |
98 bin_vals -k SEQ_LEN -b 50 |
99 plot_lendist -k SEQ_LEN_BIN -T "Length Distribution - unclipped" -X "50 nucleotide bins" -Y "Count" -o $plot_lendist_unclipped |
100 plot_scores -o $plot_scores_unclipped -X "Sequence length" -Y "Score" |
102 bin_vals -k SEQ_LEN -b 50 |
103 plot_lendist -k SEQ_LEN_BIN -T "Length Distribution - clipped" -X "50 nucleotide bins" -Y "Count" -o $plot_lendist_clipped |
104 plot_scores -o $plot_scores_clipped -X "Sequence length" -Y "Score" -x
106 echo "" && echo "Plotting mean score bins and counting mean scores greater than 20 ... "
107 read_fastq -i $fq_file |
110 bin_vals -k SCORES_MEAN -b 5 |
111 plot_histogram -s num -k SCORES_MEAN_BIN -T "Mean score bins" -X "Bins (size 5)" -Y "Count" -o $plot_mean_scores |
112 grab -e 'SCORES_MEAN >= 20' |
113 count_records -o $count_score_mean -x
115 echo "" && echo "Locating and counting MID tags ... "
116 read_fastq -i $fq_file |
120 write_tab -o $table_mid -c -k MID_NUM,MID_SEQ,MID_COUNT -x
122 echo "" && echo "Locating and counting MID tags for sequences longer than 250 ... "
123 read_fastq -i $fq_file |
126 grab -e 'SEQ_LEN >= 250' |
128 write_tab -o $table_mid_len -c -k MID_NUM,MID_SEQ,MID_COUNT -x
130 echo "" && echo "Locating and counting MID tags for sequences longer than 250 and mean score above 20 ... "
131 read_fastq -i $fq_file |
134 grab -e 'SEQ_LEN >= 250' |
136 grab -e 'SCORES_MEAN >= 20' |
138 write_tab -o $table_mid_len_score -c -k MID_NUM,MID_SEQ,MID_COUNT -x
140 echo "" && echo -n "Joining MID tables ... "
141 read_tab -i $table_mid |
142 rename_keys -k MID_NUM,A |
143 rename_keys -k MID_COUNT,TOTAL |
144 read_tab -i $table_mid_len |
145 rename_keys -k MID_NUM,B |
146 rename_keys -k MID_COUNT,L250 |
147 merge_records -k A,B |
148 read_tab -i $table_mid_len_score |
149 rename_keys -k MID_NUM,C |
150 rename_keys -k MID_COUNT,L250_S20 |
151 merge_records -k A,C |
152 rename_keys -k A,MID_NUM |
153 sort_records -k MID_NUMn |
154 write_tab -o $table_mid_join -c -k MID_NUM,MID_SEQ,TOTAL,L250,L250_S20 -x
157 echo "" && echo "Creating residue frequency table ... "
158 read_fastq -i $fq_file |
162 create_weight_matrix -p |
164 write_tab -o $table_freq -x
166 echo "" && echo -n "Generating report ... "
177 puts "File: `pwd`/$sff_file"
180 puts "Sequence analysis"
181 puts "-----------------"
184 puts "The below table contains some basic info:"
186 puts " COUNT is the number of sequences in the file."
187 puts " MIN is the minimum sequence length found."
188 puts " MAX is the maximum sequence length found."
189 puts " MEAN is the mean sequence length found."
190 puts " SUM is the total number of bases in the file."
195 puts "Sequence composition"
196 puts "--------------------"
199 puts "The below table contains composition analysis of the sequences:"
201 puts " GC%_MEAN is the mean GC content."
202 puts " HARD_MASK%_MEAN is the mean of hard masked sequence (i.e. % of N's)."
203 puts " SOFT_MASK%_MEAN is the mean of soft masked sequence (i.e. lowercase residues = clipped sequence)."
208 puts "Sequence length distribution"
209 puts "----------------------------"
212 puts "The length distribution of unclipped reads where the lengths are binned in buckets of size 50:"
214 pcat $plot_lendist_unclipped
216 puts "The length distribution of clipped reads where the lengths are binned in buckets of size 50:"
218 pcat $plot_lendist_clipped
221 puts "Quality score means"
222 puts "-------------------"
225 puts "The mean scores of the unclipped sequences:"
227 pcat $plot_scores_unclipped
229 puts "The mean scores of the clipped sequences:"
231 pcat $plot_scores_clipped
233 puts "Histogram of bins with mean quality scores:"
235 pcat $plot_mean_scores
237 puts "Number of sequences with a mean score >= 20:"
239 pcat $count_score_mean
242 puts "MID tag analysis"
243 puts "----------------"
246 puts "The below table contains the identified MID tags and the number of times they were found:"
248 puts " MID_NUM is the MID tag identifier."
249 puts " MID_SEQ is the sequence of the MID tag."
250 puts " TOTAL is the number of times this MID tag was found."
251 puts " L250 is the a subset count of TOTAL af sequences longer than 250 bases"
252 puts " L250_S20 is a subset count of L250 af sequences with a mean score above 20"
257 puts "Residue frequency analysis"
258 puts "--------------------------"
261 puts "The below table contains the residue frequency (in percent) of the first 50 bases:"
271 rm $plot_lendist_unclipped
272 rm $plot_lendist_clipped
273 rm $plot_scores_unclipped
274 rm $plot_scores_clipped
279 rm $table_mid_len_score
286 echo "Report located here: $out_file"