+++ /dev/null
-#!/bin/bash
-
-# Copyright (C) 2010 Martin A. Hansen (mail@maasha.dk).
-
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version 2
-# of the License, or (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-
-# http://www.gnu.org/copyleft/gpl.html
-
-fastq_files=$@
-
-if [ ! $1 ]; then
- echo
- echo "QA_Solexa_report.sh generates a quality assurance report for"
- echo "each of a given number of Solexa/Illumina FASTQ files."
- echo
- echo "Usage: `basename $0` <FASTQ file(s)>"
- echo
- exit
-fi
-
-function puts
-{
- msg=$1
-
- echo $msg >> $out_file
-}
-
-function pcat
-{
- file=$1
-
- cat $file >> $out_file
-}
-
-for fastq_file in $fastq_files; do
- base=`basename $fastq_file`
- date=`date | sed 's/[ :]/_/g' | sed 's/__/_/g'`
- tmp_dir="$HOME/QA_Solexa_report_$date"
- out_file="$tmp_dir/QA_Solexa_report_$date.txt"
- plot_scores="$tmp_dir/plot_scores.txt"
- plot_lendist="$tmp_dir/plot_lendist.txt"
- freq_table="$tmp_dir/freq_table.txt"
- read_count="$tmp_dir/read_count.txt"
- read_length="$tmp_dir/read_length.txt"
- top30_reads="$tmp_dir/top30_reads.txt"
-
- if [ ! -d $tmp_dir ]; then
- mkdir $tmp_dir
- fi
-
- read_fastq -n 1 -i $fastq_file |
- write_tab -k SEQ_LEN -o $read_length -x
-
- echo "" && echo "Plotting scores and length distributions ... "
- read_fastq -i $fastq_file |
- progress_meter -c 10000 |
- count_records -o $read_count |
- plot_scores -o $plot_scores -Y "Mean score" -X "Sequence length" |
- trim_seq |
- bin_vals -k SEQ_LEN |
- plot_lendist -k SEQ_LEN_BIN -o $plot_lendist -Y "Count" -X "Sequence Length (5nt bins)" -x
-
- echo "" && echo "Running composition analysis on sequences ... "
- read_fastq -i $fastq_file |
- progress_meter -c 10000 |
- create_weight_matrix -p |
- flip_tab |
- write_tab -o $freq_table -x
-
- echo "" && echo "Locating top 30 unique reads ... "
- read_fastq -i $fastq_file |
- progress_meter -c 10000 |
- uniq_seq -c |
- sort_records -rk SEQ_COUNTn |
- head_records -n 30 |
- write_tab -ck SEQ_COUNT,SEQ -o $top30_reads -x
-
- echo "" && echo -n "Generating report ... "
- puts ""
- puts ""
- puts ""
- puts "QA Solexa Report"
- puts "============="
- puts ""
- puts ""
- puts ""
- puts "Date: `date`"
- puts ""
- puts "File: `pwd`/$fastq_file"
- puts ""
- puts ""
- puts "Sequence analysis"
- puts "-----------------"
- puts ""
- puts ""
- puts "Read length:"
- puts ""
- pcat $read_length
- puts ""
- puts "Read count:"
- puts ""
- pcat $read_count
- puts ""
- puts ""
- puts "Quality score means"
- puts "-------------------"
- puts ""
- puts ""
- puts "The mean scores of the untrimmed sequences:"
- puts ""
- pcat $plot_scores
- puts ""
- puts ""
- puts "Sequence length distribution"
- puts "----------------------------"
- puts ""
- puts ""
- puts "The length distribution of trimmed reads where the lengths are binned in buckets of size 5:"
- puts ""
- pcat $plot_lendist
- puts ""
- puts ""
- puts "Residue frequency analysis"
- puts "--------------------------"
- puts ""
- puts ""
- puts "The below table contains the residue frequency (in percent) of all base positions:"
- puts ""
- pcat $freq_table
- puts ""
- puts ""
- puts "Top 30 reads"
- puts "--------------------------"
- puts ""
- puts ""
- pcat $top30_reads
- puts ""
- puts ""
- puts "end."
-
- rm $plot_scores
- rm $plot_lendist
- rm $freq_table
- rm $read_count
- rm $read_length
- rm $top30_reads
-
- echo "done."
-
- echo ""
- echo "Report located here: $out_file"
- echo ""
-done
-
-echo "All done."