From d326bd18caa7e4d597691267025d6937854e4ea4 Mon Sep 17 00:00:00 2001 From: martinahansen Date: Thu, 30 Aug 2012 12:10:27 +0000 Subject: [PATCH] deleted old QA_454_report script git-svn-id: http://biopieces.googlecode.com/svn/trunk@1906 74ccb610-7750-0410-82ae-013aeee3265d --- bp_scripts/QA_454_report.sh | 290 ------------------------------------ 1 file changed, 290 deletions(-) delete mode 100755 bp_scripts/QA_454_report.sh diff --git a/bp_scripts/QA_454_report.sh b/bp_scripts/QA_454_report.sh deleted file mode 100755 index 99f89e8..0000000 --- a/bp_scripts/QA_454_report.sh +++ /dev/null @@ -1,290 +0,0 @@ -#!/bin/bash - -# Copyright (C) 2010 Martin A. Hansen (mail@maasha.dk). - -# This program is free software; you can redistribute it and/or -# modify it under the terms of the GNU General Public License -# as published by the Free Software Foundation; either version 2 -# of the License, or (at your option) any later version. - -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. - -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - -# http://www.gnu.org/copyleft/gpl.html - -sff_files=$@ - -if [ ! $1 ]; then - echo - echo "QA_454_report.sh generates a quality assurance report for" - echo "each of a given number of Roche/FLX 454 .sff files." - echo - echo "Usage: `basename $0` " - echo - exit -fi - -function puts -{ - msg=$1 - - echo $msg >> $out_file -} - -function pcat -{ - file=$1 - - cat $file >> $out_file -} - -for sff_file in $sff_files; do - base=`basename $sff_file .sff` - - tmp_dir="$HOME/QA_454_report_$base" - - if [ ! -d $tmp_dir ]; then - mkdir $tmp_dir - fi - - fq_file="$tmp_dir/$base.fq" - xml_file="$tmp_dir/$base.xml" - out_file="$tmp_dir/QA_454_report_$base.txt" - - if [ -f $out_file ]; then - mv $out_file "$out_file.bak" - fi - - analysis_vals="$tmp_dir/analysis_vals.txt" - analysis_seqs="$tmp_dir/analysis_seqs.txt" - plot_lendist_unclipped="$tmp_dir/plot_lendist_unclipped.txt" - plot_lendist_clipped="$tmp_dir/plot_lendist_clipped.txt" - plot_scores_unclipped="$tmp_dir/plot_scores_unclipped.txt" - plot_scores_clipped="$tmp_dir/plot_scores_clipped.txt" - plot_mean_scores="$tmp_dir/plot_mean_scores.txt" - count_score_mean="$tmp_dir/count_score_mean.txt" - table_mid="$tmp_dir/table_mid.tab" - table_mid_len="$tmp_dir/table_mid_len.tab" - table_mid_len_score="$tmp_dir/table_mid_len_score.tab" - table_mid_join="$tmp_dir/table_mid_join.tab" - table_freq="$tmp_dir/table_freq.tab" - - # sff_extract is a 3rd party tool from the MIRA package. - # http://sourceforge.net/projects/mira-assembler/files/ - echo -n "Converting sff file $sff_file to FASTQ format ... " - sff_extract --fastq $sff_file --seq_file $fq_file --xml_file $xml_file > /dev/null 2>&1 - echo "done." - - # Using Biopieces -> www.biopieces.org - - echo "" && echo "Running composition analysis on sequences ... " - read_fastq -i $fq_file | - progress_meter | - analyze_vals -k SEQ -o $analysis_vals | - analyze_seq | - mean_vals -k 'GC%,HARD_MASK%,SOFT_MASK%' | - grab -e 'REC_TYPE eq MEAN' | - write_tab -ck 'GC%_MEAN,HARD_MASK%_MEAN,SOFT_MASK%_MEAN' -o $analysis_seqs -x - - echo "" && echo "Plotting length distributions and scores before and after clipping ..." - read_fastq -i $fq_file | - progress_meter | - bin_vals -k SEQ_LEN -b 50 | - plot_lendist -k SEQ_LEN_BIN -T "Length Distribution - unclipped" -X "50 nucleotide bins" -Y "Count" -o $plot_lendist_unclipped | - plot_scores -o $plot_scores_unclipped -X "Sequence length" -Y "Score" | - clip_seq | - bin_vals -k SEQ_LEN -b 50 | - plot_lendist -k SEQ_LEN_BIN -T "Length Distribution - clipped" -X "50 nucleotide bins" -Y "Count" -o $plot_lendist_clipped | - plot_scores -o $plot_scores_clipped -X "Sequence length" -Y "Score" -x - - echo "" && echo "Plotting mean score bins and counting mean scores greater than 20 ... " - read_fastq -i $fq_file | - progress_meter | - mean_scores | - bin_vals -k SCORES_MEAN -b 5 | - plot_histogram -s num -k SCORES_MEAN_BIN -T "Mean score bins" -X "Bins (size 5)" -Y "Count" -o $plot_mean_scores | - grab -e 'SCORES_MEAN >= 20' | - count_records -o $count_score_mean -x - - echo "" && echo "Locating and counting MID tags ... " - read_fastq -i $fq_file | - progress_meter | - clip_seq | - find_mids | - write_tab -o $table_mid -c -k MID_NUM,MID_SEQ,MID_COUNT -x - - echo "" && echo "Locating and counting MID tags for sequences longer than 250 ... " - read_fastq -i $fq_file | - progress_meter | - clip_seq | - grab -e 'SEQ_LEN >= 250' | - find_mids | - write_tab -o $table_mid_len -c -k MID_NUM,MID_SEQ,MID_COUNT -x - - echo "" && echo "Locating and counting MID tags for sequences longer than 250 and mean score above 20 ... " - read_fastq -i $fq_file | - progress_meter | - clip_seq | - grab -e 'SEQ_LEN >= 250' | - mean_scores | - grab -e 'SCORES_MEAN >= 20' | - find_mids | - write_tab -o $table_mid_len_score -c -k MID_NUM,MID_SEQ,MID_COUNT -x - - echo "" && echo -n "Joining MID tables ... " - read_tab -i $table_mid | - rename_keys -k MID_NUM,A | - rename_keys -k MID_COUNT,TOTAL | - read_tab -i $table_mid_len | - rename_keys -k MID_NUM,B | - rename_keys -k MID_COUNT,L250 | - merge_records -k A,B | - read_tab -i $table_mid_len_score | - rename_keys -k MID_NUM,C | - rename_keys -k MID_COUNT,L250_S20 | - merge_records -k A,C | - rename_keys -k A,MID_NUM | - sort_records -k MID_NUMn | - write_tab -o $table_mid_join -c -k MID_NUM,MID_SEQ,TOTAL,L250,L250_S20 -x - echo "done." - - echo "" && echo "Creating residue frequency table ... " - read_fastq -i $fq_file | - progress_meter | - extract_seq -l 50 | - uppercase_seq | - create_weight_matrix -p | - flip_tab | - write_tab -o $table_freq -x - - echo "" && echo -n "Generating report ... " - puts "" - puts "" - puts "" - puts "QA 454 Report" - puts "=============" - puts "" - puts "" - puts "" - puts "Date: `date`" - puts "" - puts "File: `pwd`/$sff_file" - puts "" - puts "" - puts "Sequence analysis" - puts "-----------------" - puts "" - puts "" - puts "The below table contains some basic info:" - puts "" - puts " COUNT is the number of sequences in the file." - puts " MIN is the minimum sequence length found." - puts " MAX is the maximum sequence length found." - puts " MEAN is the mean sequence length found." - puts " SUM is the total number of bases in the file." - puts "" - pcat $analysis_vals - puts "" - puts "" - puts "Sequence composition" - puts "--------------------" - puts "" - puts "" - puts "The below table contains composition analysis of the sequences:" - puts "" - puts " GC%_MEAN is the mean GC content." - puts " HARD_MASK%_MEAN is the mean of hard masked sequence (i.e. % of N's)." - puts " SOFT_MASK%_MEAN is the mean of soft masked sequence (i.e. lowercase residues = clipped sequence)." - puts "" - pcat $analysis_seqs - puts "" - puts "" - puts "Sequence length distribution" - puts "----------------------------" - puts "" - puts "" - puts "The length distribution of unclipped reads where the lengths are binned in buckets of size 50:" - puts "" - pcat $plot_lendist_unclipped - puts "" - puts "The length distribution of clipped reads where the lengths are binned in buckets of size 50:" - puts "" - pcat $plot_lendist_clipped - puts "" - puts "" - puts "Quality score means" - puts "-------------------" - puts "" - puts "" - puts "The mean scores of the unclipped sequences:" - puts "" - pcat $plot_scores_unclipped - puts "" - puts "The mean scores of the clipped sequences:" - puts "" - pcat $plot_scores_clipped - puts "" - puts "Histogram of bins with mean quality scores:" - puts "" - pcat $plot_mean_scores - puts "" - puts "Number of sequences with a mean score >= 20:" - puts "" - pcat $count_score_mean - puts "" - puts "" - puts "MID tag analysis" - puts "----------------" - puts "" - puts "" - puts "The below table contains the identified MID tags and the number of times they were found:" - puts "" - puts " MID_NUM is the MID tag identifier." - puts " MID_SEQ is the sequence of the MID tag." - puts " TOTAL is the number of times this MID tag was found." - puts " L250 is the a subset count of TOTAL af sequences longer than 250 bases" - puts " L250_S20 is a subset count of L250 af sequences with a mean score above 20" - puts "" - pcat $table_mid_join - puts "" - puts "" - puts "Residue frequency analysis" - puts "--------------------------" - puts "" - puts "" - puts "The below table contains the residue frequency (in percent) of the first 50 bases:" - puts "" - pcat $table_freq - puts "" - puts "end." - - rm $fq_file - rm $xml_file - rm $analysis_vals - rm $analysis_seqs - rm $plot_lendist_unclipped - rm $plot_lendist_clipped - rm $plot_scores_unclipped - rm $plot_scores_clipped - rm $plot_mean_scores - rm $count_score_mean - rm $table_mid - rm $table_mid_len - rm $table_mid_len_score - rm $table_mid_join - rm $table_freq - - echo "done." - - echo "" - echo "Report located here: $out_file" - echo "" -done - -echo "All done." -- 2.39.2