From: martinahansen Date: Mon, 26 Nov 2012 08:35:58 +0000 (+0000) Subject: polish of QA_Illumina_report.rb X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=846f7af09551a46b046d25ed01429f7c88ef6982;hp=725eb685e891858a443645f0bddc4eee3ac86a66;p=biopieces.git polish of QA_Illumina_report.rb git-svn-id: http://biopieces.googlecode.com/svn/trunk@1995 74ccb610-7750-0410-82ae-013aeee3265d --- diff --git a/bp_scripts/QA_Illumina_report.rb b/bp_scripts/QA_Illumina_report.rb index 7b54e4d..d82c0cf 100755 --- a/bp_scripts/QA_Illumina_report.rb +++ b/bp_scripts/QA_Illumina_report.rb @@ -74,8 +74,8 @@ system( plot_distribution -k SEQ_LEN -T 'Sequence length distribution' -X 'Sequence length' -t png -o #{lendist_file} | plot_scores -c -t png -o #{scores_file} | plot_nucleotide_distribution -t png -o #{nucdist_file} | - bin_vals -k SEQ_LEN -b 50 | - plot_distribution -T '50 bases bin sequence length distribution' -X 'Sequence length' -k SEQ_LEN_BIN -t png -o #{lendist_bin_file} | + bin_vals -k SEQ_LEN -b 25 | + plot_distribution -T '25 bases bin sequence length distribution' -X 'Sequence length' -k SEQ_LEN_BIN -t png -o #{lendist_bin_file} | mean_scores | bin_vals -k SCORES_MEAN -b 5 | plot_distribution -k SCORES_MEAN_BIN -T '5 bin mean score distribution' -X 'Mean scores' -t png -o #{scores_bin_file} -x" diff --git a/bp_scripts/QA_Solexa_report.sh b/bp_scripts/QA_Solexa_report.sh deleted file mode 100755 index e76e1d9..0000000 --- a/bp_scripts/QA_Solexa_report.sh +++ /dev/null @@ -1,167 +0,0 @@ -#!/bin/bash - -# Copyright (C) 2010 Martin A. Hansen (mail@maasha.dk). - -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - -# http://www.gnu.org/copyleft/gpl.html - -fastq_files=$@ - -if [ ! $1 ]; then - echo - echo "QA_Solexa_report.sh generates a quality assurance report for" - echo "each of a given number of Solexa/Illumina FASTQ files." - echo - echo "Usage: `basename $0` " - echo - exit -fi - -function puts -{ - msg=$1 - - echo $msg >> $out_file -} - -function pcat -{ - file=$1 - - cat $file >> $out_file -} - -for fastq_file in $fastq_files; do - base=`basename $fastq_file` - date=`date | sed 's/[ :]/_/g' | sed 's/__/_/g'` - tmp_dir="$HOME/QA_Solexa_report_$date" - out_file="$tmp_dir/QA_Solexa_report_$date.txt" - plot_scores="$tmp_dir/plot_scores.txt" - plot_lendist="$tmp_dir/plot_lendist.txt" - freq_table="$tmp_dir/freq_table.txt" - read_count="$tmp_dir/read_count.txt" - read_length="$tmp_dir/read_length.txt" - top30_reads="$tmp_dir/top30_reads.txt" - - if [ ! -d $tmp_dir ]; then - mkdir $tmp_dir - fi - - read_fastq -n 1 -i $fastq_file | - write_tab -k SEQ_LEN -o $read_length -x - - echo "" && echo "Plotting scores and length distributions ... " - read_fastq -i $fastq_file | - progress_meter -c 10000 | - count_records -o $read_count | - plot_scores -o $plot_scores -Y "Mean score" -X "Sequence length" | - trim_seq | - bin_vals -k SEQ_LEN | - plot_lendist -k SEQ_LEN_BIN -o $plot_lendist -Y "Count" -X "Sequence Length (5nt bins)" -x - - echo "" && echo "Running composition analysis on sequences ... " - read_fastq -i $fastq_file | - progress_meter -c 10000 | - create_weight_matrix -p | - flip_tab | - write_tab -o $freq_table -x - - echo "" && echo "Locating top 30 unique reads ... " - read_fastq -i $fastq_file | - progress_meter -c 10000 | - uniq_seq -c | - sort_records -rk SEQ_COUNTn | - head_records -n 30 | - write_tab -ck SEQ_COUNT,SEQ -o $top30_reads -x - - echo "" && echo -n "Generating report ... " - puts "" - puts "" - puts "" - puts "QA Solexa Report" - puts "=============" - puts "" - puts "" - puts "" - puts "Date: `date`" - puts "" - puts "File: `pwd`/$fastq_file" - puts "" - puts "" - puts "Sequence analysis" - puts "-----------------" - puts "" - puts "" - puts "Read length:" - puts "" - pcat $read_length - puts "" - puts "Read count:" - puts "" - pcat $read_count - puts "" - puts "" - puts "Quality score means" - puts "-------------------" - puts "" - puts "" - puts "The mean scores of the untrimmed sequences:" - puts "" - pcat $plot_scores - puts "" - puts "" - puts "Sequence length distribution" - puts "----------------------------" - puts "" - puts "" - puts "The length distribution of trimmed reads where the lengths are binned in buckets of size 5:" - puts "" - pcat $plot_lendist - puts "" - puts "" - puts "Residue frequency analysis" - puts "--------------------------" - puts "" - puts "" - puts "The below table contains the residue frequency (in percent) of all base positions:" - puts "" - pcat $freq_table - puts "" - puts "" - puts "Top 30 reads" - puts "--------------------------" - puts "" - puts "" - pcat $top30_reads - puts "" - puts "" - puts "end." - - rm $plot_scores - rm $plot_lendist - rm $freq_table - rm $read_count - rm $read_length - rm $top30_reads - - echo "done." - - echo "" - echo "Report located here: $out_file" - echo "" -done - -echo "All done."