deleted old QA_454_report script

author martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>

Thu, 30 Aug 2012 12:10:27 +0000 (12:10 +0000)

committer martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>

Thu, 30 Aug 2012 12:10:27 +0000 (12:10 +0000)
author martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Thu, 30 Aug 2012 12:10:27 +0000 (12:10 +0000)
committer martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
Thu, 30 Aug 2012 12:10:27 +0000 (12:10 +0000)
diff --git a/bp_scripts/QA_454_report.sh b/bp_scripts/QA_454_report.sh

deleted file mode 100755 (executable)

index 99f89e8..0000000
--- a/bp_scripts/QA_454_report.sh
+++ /dev/null
@@ -1,290 +0,0 @@
-#!/bin/bash
-
-# Copyright (C) 2010 Martin A. Hansen (mail@maasha.dk).
-
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version 2
-# of the License, or (at your option) any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
-
-# http://www.gnu.org/copyleft/gpl.html
-
-sff_files=$@
-
-if [ ! $1 ]; then
-    echo
-    echo "QA_454_report.sh generates a quality assurance report for"
-    echo "each of a given number of Roche/FLX 454 .sff files."
-    echo
-    echo "Usage: `basename $0` <sff file(s)>"
-    echo
-    exit
-fi
-
-function puts
-{
-    msg=$1
-
-    echo $msg >> $out_file
-}
-
-function pcat
-{
-    file=$1
-
-    cat $file >> $out_file
-}
-
-for sff_file in $sff_files; do
-    base=`basename $sff_file .sff`
-
-    tmp_dir="$HOME/QA_454_report_$base"
-
-    if [ ! -d $tmp_dir ]; then
-        mkdir $tmp_dir
-    fi
-
-    fq_file="$tmp_dir/$base.fq"
-    xml_file="$tmp_dir/$base.xml"
-    out_file="$tmp_dir/QA_454_report_$base.txt"
-
-    if [ -f $out_file ]; then
-        mv $out_file "$out_file.bak"
-    fi
-
-    analysis_vals="$tmp_dir/analysis_vals.txt"
-    analysis_seqs="$tmp_dir/analysis_seqs.txt"
-    plot_lendist_unclipped="$tmp_dir/plot_lendist_unclipped.txt"
-    plot_lendist_clipped="$tmp_dir/plot_lendist_clipped.txt"
-    plot_scores_unclipped="$tmp_dir/plot_scores_unclipped.txt"
-    plot_scores_clipped="$tmp_dir/plot_scores_clipped.txt"
-    plot_mean_scores="$tmp_dir/plot_mean_scores.txt"
-    count_score_mean="$tmp_dir/count_score_mean.txt"
-    table_mid="$tmp_dir/table_mid.tab"
-    table_mid_len="$tmp_dir/table_mid_len.tab"
-    table_mid_len_score="$tmp_dir/table_mid_len_score.tab"
-    table_mid_join="$tmp_dir/table_mid_join.tab"
-    table_freq="$tmp_dir/table_freq.tab"
-
-    # sff_extract is a 3rd party tool from the MIRA package.
-    # http://sourceforge.net/projects/mira-assembler/files/
-    echo -n "Converting sff file $sff_file to FASTQ format ... "
-    sff_extract --fastq $sff_file --seq_file $fq_file --xml_file $xml_file > /dev/null 2>&1
-    echo "done."
-
-    # Using Biopieces -> www.biopieces.org
-
-    echo "" && echo "Running composition analysis on sequences ... "
-    read_fastq -i $fq_file |
-    progress_meter |
-    analyze_vals -k SEQ -o $analysis_vals |
-    analyze_seq |
-    mean_vals -k 'GC%,HARD_MASK%,SOFT_MASK%' |
-    grab -e 'REC_TYPE eq MEAN' |
-    write_tab -ck 'GC%_MEAN,HARD_MASK%_MEAN,SOFT_MASK%_MEAN' -o $analysis_seqs -x
-
-    echo "" && echo "Plotting length distributions and scores before and after clipping ..."
-    read_fastq -i $fq_file |
-    progress_meter |
-    bin_vals -k SEQ_LEN -b 50 |
-    plot_lendist -k SEQ_LEN_BIN -T "Length Distribution - unclipped" -X "50 nucleotide bins" -Y "Count" -o $plot_lendist_unclipped |
-    plot_scores -o $plot_scores_unclipped -X "Sequence length" -Y "Score" |
-    clip_seq |
-    bin_vals -k SEQ_LEN -b 50 |
-    plot_lendist -k SEQ_LEN_BIN -T "Length Distribution - clipped" -X "50 nucleotide bins" -Y "Count" -o $plot_lendist_clipped |
-    plot_scores -o $plot_scores_clipped -X "Sequence length" -Y "Score" -x
-
-    echo "" && echo "Plotting mean score bins and counting mean scores greater than 20 ... "
-    read_fastq -i $fq_file |
-    progress_meter |
-    mean_scores |
-    bin_vals -k SCORES_MEAN -b 5 |
-    plot_histogram -s num -k SCORES_MEAN_BIN -T "Mean score bins" -X "Bins (size 5)" -Y "Count" -o $plot_mean_scores |
-    grab -e 'SCORES_MEAN >= 20' |
-    count_records -o $count_score_mean -x
-
-    echo "" && echo "Locating and counting MID tags ... "
-    read_fastq -i $fq_file |
-    progress_meter |
-    clip_seq |
-    find_mids |
-    write_tab -o $table_mid -c -k MID_NUM,MID_SEQ,MID_COUNT -x
-
-    echo "" && echo "Locating and counting MID tags for sequences longer than 250 ... "
-    read_fastq -i $fq_file |
-    progress_meter |
-    clip_seq |
-    grab -e 'SEQ_LEN >= 250' |
-    find_mids |
-    write_tab -o $table_mid_len -c -k MID_NUM,MID_SEQ,MID_COUNT -x
-
-    echo "" && echo "Locating and counting MID tags for sequences longer than 250 and mean score above 20 ... "
-    read_fastq -i $fq_file |
-    progress_meter |
-    clip_seq |
-    grab -e 'SEQ_LEN >= 250' |
-    mean_scores |
-    grab -e 'SCORES_MEAN >= 20' |
-    find_mids |
-    write_tab -o $table_mid_len_score -c -k MID_NUM,MID_SEQ,MID_COUNT -x
-
-    echo "" && echo -n "Joining MID tables ... "
-    read_tab -i $table_mid |
-    rename_keys -k MID_NUM,A |
-    rename_keys -k MID_COUNT,TOTAL |
-    read_tab -i $table_mid_len |
-    rename_keys -k MID_NUM,B |
-    rename_keys -k MID_COUNT,L250 |
-    merge_records -k A,B |
-    read_tab -i $table_mid_len_score |
-    rename_keys -k MID_NUM,C |
-    rename_keys -k MID_COUNT,L250_S20 |
-    merge_records -k A,C |
-    rename_keys -k A,MID_NUM |
-    sort_records -k MID_NUMn |
-    write_tab -o $table_mid_join -c -k MID_NUM,MID_SEQ,TOTAL,L250,L250_S20 -x
-    echo "done."
-
-    echo "" && echo "Creating residue frequency table ... "
-    read_fastq -i $fq_file |
-    progress_meter |
-    extract_seq -l 50 |
-    uppercase_seq |
-    create_weight_matrix -p |
-    flip_tab |
-    write_tab -o $table_freq -x
-
-    echo "" && echo -n "Generating report ... "
-    puts ""
-    puts ""
-    puts ""
-    puts "QA 454 Report"
-    puts "============="
-    puts ""
-    puts ""
-    puts ""
-    puts "Date: `date`"
-    puts ""
-    puts "File: `pwd`/$sff_file"
-    puts ""
-    puts ""
-    puts "Sequence analysis"
-    puts "-----------------"
-    puts ""
-    puts ""
-    puts "The below table contains some basic info:"
-    puts ""
-    puts "  COUNT is the number of sequences in the file."
-    puts "  MIN   is the minimum sequence length found."
-    puts "  MAX   is the maximum sequence length found."
-    puts "  MEAN  is the mean    sequence length found."
-    puts "  SUM   is the total number of bases in the file."
-    puts ""
-    pcat $analysis_vals
-    puts ""
-    puts ""
-    puts "Sequence composition"
-    puts "--------------------"
-    puts ""
-    puts ""
-    puts "The below table contains composition analysis of the sequences:"
-    puts ""
-    puts "  GC%_MEAN is the mean GC content."
-    puts "  HARD_MASK%_MEAN is the mean of hard masked sequence (i.e. % of N's)."
-    puts "  SOFT_MASK%_MEAN is the mean of soft masked sequence (i.e. lowercase residues = clipped sequence)."
-    puts ""
-    pcat $analysis_seqs
-    puts ""
-    puts ""
-    puts "Sequence length distribution"
-    puts "----------------------------"
-    puts ""
-    puts ""
-    puts "The length distribution of unclipped reads where the lengths are binned in buckets of size 50:"
-    puts ""
-    pcat $plot_lendist_unclipped
-    puts ""
-    puts "The length distribution of clipped reads where the lengths are binned in buckets of size 50:"
-    puts ""
-    pcat $plot_lendist_clipped
-    puts ""
-    puts ""
-    puts "Quality score means"
-    puts "-------------------"
-    puts ""
-    puts ""
-    puts "The mean scores of the unclipped sequences:"
-    puts ""
-    pcat $plot_scores_unclipped
-    puts ""
-    puts "The mean scores of the clipped sequences:"
-    puts ""
-    pcat $plot_scores_clipped
-    puts ""
-    puts "Histogram of bins with mean quality scores:"
-    puts ""
-    pcat $plot_mean_scores
-    puts ""
-    puts "Number of sequences with a mean score >= 20:"
-    puts ""
-    pcat $count_score_mean
-    puts ""
-    puts ""
-    puts "MID tag analysis"
-    puts "----------------"
-    puts ""
-    puts ""
-    puts "The below table contains the identified MID tags and the number of times they were found:"
-    puts ""
-    puts "  MID_NUM is the MID tag identifier."
-    puts "  MID_SEQ is the sequence of the MID tag."
-    puts "  TOTAL is the number of times this MID tag was found."
-    puts "  L250 is the a subset count of TOTAL af sequences longer than 250 bases"
-    puts "  L250_S20 is a subset count of L250 af sequences with a mean score above 20"
-    puts ""
-    pcat $table_mid_join
-    puts ""
-    puts ""
-    puts "Residue frequency analysis"
-    puts "--------------------------"
-    puts ""
-    puts ""
-    puts "The below table contains the residue frequency (in percent) of the first 50 bases:"
-    puts ""
-    pcat $table_freq
-    puts ""
-    puts "end."
-
-    rm $fq_file
-    rm $xml_file
-    rm $analysis_vals
-    rm $analysis_seqs
-    rm $plot_lendist_unclipped
-    rm $plot_lendist_clipped
-    rm $plot_scores_unclipped
-    rm $plot_scores_clipped
-    rm $plot_mean_scores
-    rm $count_score_mean
-    rm $table_mid
-    rm $table_mid_len
-    rm $table_mid_len_score
-    rm $table_mid_join
-    rm $table_freq
-
-    echo "done."
-
-    echo ""
-    echo "Report located here: $out_file"
-    echo ""
-done
-
-echo "All done."
author	martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
	Thu, 30 Aug 2012 12:10:27 +0000 (12:10 +0000)
committer	martinahansen <martinahansen@74ccb610-7750-0410-82ae-013aeee3265d>
	Thu, 30 Aug 2012 12:10:27 +0000 (12:10 +0000)