bp_scripts/QA_454_report.sh

   1 #!/bin/bash
   2
   3 # Copyright (C) 2010 Martin A. Hansen (mail@maasha.dk).
   4
   5 # This program is free software; you can redistribute it and/or
   6 # modify it under the terms of the GNU General Public License
   7 # as published by the Free Software Foundation; either version 2
   8 # of the License, or (at your option) any later version.
   9
  10 # This program is distributed in the hope that it will be useful,
  11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 # GNU General Public License for more details.
  14
  15 # You should have received a copy of the GNU General Public License
  16 # along with this program; if not, write to the Free Software
  17 # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
  18
  19 # http://www.gnu.org/copyleft/gpl.html
  20
  21 sff_files=$@
  22
  23 if [ ! $1 ]; then
  24     echo
  25     echo "QA_454_report.sh generates a quality assurance report for"
  26     echo "each of a given number of Roche/FLX 454 .sff files."
  27     echo
  28     echo "Usage: `basename $0` <sff file(s)>"
  29     echo
  30     exit
  31 fi
  32
  33 function puts
  34 {
  35     msg=$1
  36
  37     echo $msg >> $out_file
  38 }
  39
  40 function pcat
  41 {
  42     file=$1
  43
  44     cat $file >> $out_file
  45 }
  46
  47 for sff_file in $sff_files; do
  48     base=`basename $sff_file .sff`
  49
  50     tmp_dir="$HOME/QA_454_report_$base"
  51
  52     if [ ! -d $tmp_dir ]; then
  53         mkdir $tmp_dir
  54     fi
  55
  56     fq_file="$tmp_dir/$base.fq"
  57     xml_file="$tmp_dir/$base.xml"
  58     out_file="$tmp_dir/QA_454_report_$base.txt"
  59
  60     if [ -f $out_file ]; then
  61         mv $out_file "$out_file.bak"
  62     fi
  63
  64     analysis_vals="$tmp_dir/analysis_vals.txt"
  65     analysis_seqs="$tmp_dir/analysis_seqs.txt"
  66     plot_lendist_unclipped="$tmp_dir/plot_lendist_unclipped.txt"
  67     plot_lendist_clipped="$tmp_dir/plot_lendist_clipped.txt"
  68     plot_scores_unclipped="$tmp_dir/plot_scores_unclipped.txt"
  69     plot_scores_clipped="$tmp_dir/plot_scores_clipped.txt"
  70     plot_mean_scores="$tmp_dir/plot_mean_scores.txt"
  71     count_score_mean="$tmp_dir/count_score_mean.txt"
  72     table_mid="$tmp_dir/table_mid.tab"
  73     table_mid_len="$tmp_dir/table_mid_len.tab"
  74     table_mid_len_score="$tmp_dir/table_mid_len_score.tab"
  75     table_mid_join="$tmp_dir/table_mid_join.tab"
  76     table_freq="$tmp_dir/table_freq.tab"
  77
  78     # sff_extract is a 3rd party tool from the MIRA package.
  79     # http://sourceforge.net/projects/mira-assembler/files/
  80     echo -n "Converting sff file $sff_file to FASTQ format ... "
  81     sff_extract --fastq $sff_file --seq_file $fq_file --xml_file $xml_file > /dev/null 2>&1
  82     echo "done."
  83
  84     # Using Biopieces -> www.biopieces.org
  85
  86     echo "" && echo "Running composition analysis on sequences ... "
  87     read_fastq -i $fq_file |
  88     progress_meter |
  89     analyze_vals -k SEQ -o $analysis_vals |
  90     analyze_seq |
  91     mean_vals -k 'GC%,HARD_MASK%,SOFT_MASK%' |
  92     grab -e 'REC_TYPE eq MEAN' |
  93     write_tab -ck 'GC%_MEAN,HARD_MASK%_MEAN,SOFT_MASK%_MEAN' -o $analysis_seqs -x
  94
  95     echo "" && echo "Plotting length distributions and scores before and after clipping ..."
  96     read_fastq -i $fq_file |
  97     progress_meter |
  98     bin_vals -k SEQ_LEN -b 50 |
  99     plot_lendist -k SEQ_LEN_BIN -T "Length Distribution - unclipped" -X "50 nucleotide bins" -Y "Count" -o $plot_lendist_unclipped |
 100     plot_scores -o $plot_scores_unclipped -X "Sequence length" -Y "Score" |
 101     clip_seq |
 102     bin_vals -k SEQ_LEN -b 50 |
 103     plot_lendist -k SEQ_LEN_BIN -T "Length Distribution - clipped" -X "50 nucleotide bins" -Y "Count" -o $plot_lendist_clipped |
 104     plot_scores -o $plot_scores_clipped -X "Sequence length" -Y "Score" -x
 105
 106     echo "" && echo "Plotting mean score bins and counting mean scores greater than 20 ... "
 107     read_fastq -i $fq_file |
 108     progress_meter |
 109     mean_scores |
 110     bin_vals -k SCORES_MEAN -b 5 |
 111     plot_histogram -s num -k SCORES_MEAN_BIN -T "Mean score bins" -X "Bins (size 5)" -Y "Count" -o $plot_mean_scores |
 112     grab -e 'SCORES_MEAN >= 20' |
 113     count_records -o $count_score_mean -x
 114
 115     echo "" && echo "Locating and counting MID tags ... "
 116     read_fastq -i $fq_file |
 117     progress_meter |
 118     find_mids |
 119     write_tab -o $table_mid -c -k MID_NUM,MID_SEQ,MID_COUNT -x
 120
 121     echo "" && echo "Locating and counting MID tags for sequences longer than 250 ... "
 122     read_fastq -i $fq_file |
 123     progress_meter |
 124     grab -e 'SEQ_LEN >= 250' |
 125     find_mids |
 126     write_tab -o $table_mid_len -c -k MID_NUM,MID_SEQ,MID_COUNT -x
 127
 128     echo "" && echo "Locating and counting MID tags for sequences longer than 250 and mean score above 20 ... "
 129     read_fastq -i $fq_file |
 130     progress_meter |
 131     grab -e 'SEQ_LEN >= 250' |
 132     mean_scores |
 133     grab -e 'SCORES_MEAN >= 20' |
 134     find_mids |
 135     write_tab -o $table_mid_len_score -c -k MID_NUM,MID_SEQ,MID_COUNT -x
 136
 137     echo "" && echo -n "Joining MID tables ... "
 138     read_tab -i $table_mid |
 139     rename_keys -k MID_NUM,A |
 140     rename_keys -k MID_COUNT,TOTAL |
 141     read_tab -i $table_mid_len |
 142     rename_keys -k MID_NUM,B |
 143     rename_keys -k MID_COUNT,L250 |
 144     merge_records -k A,B |
 145     read_tab -i $table_mid_len_score |
 146     rename_keys -k MID_NUM,C |
 147     rename_keys -k MID_COUNT,L250_S20 |
 148     merge_records -k A,C |
 149     rename_keys -k A,MID_NUM |
 150     sort_records -k MID_NUMn |
 151     write_tab -o $table_mid_join -c -k MID_NUM,MID_SEQ,TOTAL,L250,L250_S20 -x
 152     echo "done."
 153
 154     echo "" && echo "Creating residue frequency table ... "
 155     read_fastq -i $fq_file |
 156     progress_meter |
 157     extract_seq -l 50 |
 158     uppercase_seq |
 159     create_weight_matrix -p |
 160     flip_tab |
 161     write_tab -o $table_freq -x
 162
 163     echo "" && echo -n "Generating report ... "
 164     puts ""
 165     puts ""
 166     puts ""
 167     puts "QA 454 Report"
 168     puts "============="
 169     puts ""
 170     puts ""
 171     puts ""
 172     puts "Date: `date`"
 173     puts ""
 174     puts "File: `pwd`/$sff_file"
 175     puts ""
 176     puts ""
 177     puts "Sequence analysis"
 178     puts "-----------------"
 179     puts ""
 180     puts ""
 181     puts "The below table contains some basic info:"
 182     puts ""
 183     puts "  COUNT is the number of sequences in the file."
 184     puts "  MIN   is the minimum sequence length found."
 185     puts "  MAX   is the maximum sequence length found."
 186     puts "  MEAN  is the mean    sequence length found."
 187     puts "  SUM   is the total number of bases in the file."
 188     puts ""
 189     pcat $analysis_vals
 190     puts ""
 191     puts ""
 192     puts "Sequence composition"
 193     puts "--------------------"
 194     puts ""
 195     puts ""
 196     puts "The below table contains composition analysis of the sequences:"
 197     puts ""
 198     puts "  GC%_MEAN is the mean GC content."
 199     puts "  HARD_MASK%_MEAN is the mean of hard masked sequence (i.e. % of N's)."
 200     puts "  SOFT_MASK%_MEAN is the mean of soft masked sequence (i.e. lowercase residues = clipped sequence)."
 201     puts ""
 202     pcat $analysis_seqs
 203     puts ""
 204     puts ""
 205     puts "Sequence length distribution"
 206     puts "----------------------------"
 207     puts ""
 208     puts ""
 209     puts "The length distribution of unclipped reads where the lengths are binned in buckets of size 50:"
 210     puts ""
 211     pcat $plot_lendist_unclipped
 212     puts ""
 213     puts "The length distribution of clipped reads where the lengths are binned in buckets of size 50:"
 214     puts ""
 215     pcat $plot_lendist_clipped
 216     puts ""
 217     puts ""
 218     puts "Quality score means"
 219     puts "-------------------"
 220     puts ""
 221     puts ""
 222     puts "The mean scores of the unclipped sequences:"
 223     puts ""
 224     pcat $plot_scores_unclipped
 225     puts ""
 226     puts "The mean scores of the clipped sequences:"
 227     puts ""
 228     pcat $plot_scores_clipped
 229     puts ""
 230     puts "Histogram of bins with mean quality scores:"
 231     puts ""
 232     pcat $plot_mean_scores
 233     puts ""
 234     puts "Number of sequences with a mean score >= 20:"
 235     puts ""
 236     pcat $count_score_mean
 237     puts ""
 238     puts ""
 239     puts "MID tag analysis"
 240     puts "----------------"
 241     puts ""
 242     puts ""
 243     puts "The below table contains the identified MID tags and the number of times they were found:"
 244     puts ""
 245     puts "  MID_NUM is the MID tag identifier."
 246     puts "  MID_SEQ is the sequence of the MID tag."
 247     puts "  TOTAL is the number of times this MID tag was found."
 248     puts "  L250 is the a subset count of TOTAL af sequences longer than 250 bases"
 249     puts "  L250_S20 is a subset count of L250 af sequences with a mean score above 20"
 250     puts ""
 251     pcat $table_mid_join
 252     puts ""
 253     puts ""
 254     puts "Residue frequency analysis"
 255     puts "--------------------------"
 256     puts ""
 257     puts ""
 258     puts "The below table contains the residue frequency (in percent) of the first 50 bases:"
 259     puts ""
 260     pcat $table_freq
 261     puts ""
 262     puts "end."
 263
 264     rm $fq_file
 265     rm $xml_file
 266     rm $analysis_vals
 267     rm $analysis_seqs
 268     rm $plot_lendist_unclipped
 269     rm $plot_lendist_clipped
 270     rm $plot_scores_unclipped
 271     rm $plot_scores_clipped
 272     rm $plot_mean_scores
 273     rm $count_score_mean
 274     rm $table_mid
 275     rm $table_mid_len
 276     rm $table_mid_len_score
 277     rm $table_mid_join
 278     rm $table_freq
 279
 280     echo "done."
 281
 282     echo ""
 283     echo "Report located here: $out_file"
 284     echo ""
 285 done
 286
 287 echo "All done."