Renamed --phred33-quals, --phred64-quals, and --solexa-quals to --bowtie-phred33...

[rsem.git] / rsem-calculate-expression
diff --git a/rsem-calculate-expression b/rsem-calculate-expression

index 9e6ef2e99e1d72e7db053401b764d62f6e7bd5f8..edb6fddf5458d3d80da38dc9996d586e902a8b9d 100755 (executable)
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -2,10 +2,12 @@
  
  use Getopt::Long;
  use Pod::Usage;
-use File::Basename;
-use Switch;
+use FindBin;
+use lib $FindBin::Bin;
  use strict;
  
+use rsem_perl_utils qw(runCommand collectResults showVersionInfo);
+
  #const
  my $BURNIN = 200;
  my $NCV = 1000;
@@ -59,6 +61,8 @@ my $keep_intermediate_files = 0;
  
  my $strand_specific = 0;
  
+my $version = 0;
+
  my $mTime = 0;
  my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0);
  
@@ -84,9 +88,9 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
            "bowtie-e=i" => \$E,
            "bowtie-m=i" => \$maxHits,
            "bowtie-chunkmbs=i" => \$chunkMbs,
-          "phred33-quals" => \$phred33,
-          "phred64-quals" => \$phred64, #solexa1.3-quals" => \$phred64,
-          "solexa-quals" => \$solexa,
+          "bowtie-phred33-quals" => \$phred33,
+          "bowtie-phred64-quals" => \$phred64, #solexa1.3-quals" => \$phred64,
+          "bowtie-solexa-quals" => \$solexa,
            "forward-prob=f" => \$probF,
            "fragment-length-min=i" => \$minL,
            "fragment-length-max=i" => \$maxL,
@@ -102,11 +106,14 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
            "calc-ci" => \$calcCI,
            "ci-memory=i" => \$NMB,
            "time" => \$mTime,
+          "version" => \$version,
            "q|quiet" => \$quiet,
            "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
  
-pod2usage(-verbose => 2) if ($help == 1);
+my $dir = "$FindBin::Bin/";
  
+pod2usage(-verbose => 2) if ($help == 1);
+&showVersionInfo($dir) if ($version == 1);
  
  #check parameters and options
  
@@ -183,7 +190,6 @@ my ($mate_minL, $mate_maxL) = (1, $maxL);
  
  if ($bowtie_path ne "") { $bowtie_path .= "/"; }
  
-my ($fn, $dir, $suf) = fileparse($0);
  my $command = "";
  
  if (!$is_sam && !$is_bam) {
@@ -242,12 +248,11 @@ if ($quiet) { $command .= " -q"; }
  &runCommand($command);
  
  $command = $dir."rsem-build-read-index $gap"; 
-switch($read_type) {
-    case 0  { $command .= " 0 $quiet $imdName\_alignable.fa"; }
-    case 1  { $command .= " 1 $quiet $imdName\_alignable.fq"; }
-    case 2  { $command .= " 0 $quiet $imdName\_alignable_1.fa $imdName\_alignable_2.fa"; }
-    case 3  { $command .= " 1 $quiet $imdName\_alignable_1.fq $imdName\_alignable_2.fq"; }
-}
+if ($read_type == 0) { $command .= " 0 $quiet $imdName\_alignable.fa"; }
+elsif ($read_type == 1) { $command .= " 1 $quiet $imdName\_alignable.fq"; }
+elsif ($read_type == 2) { $command .= " 0 $quiet $imdName\_alignable_1.fa $imdName\_alignable_2.fa"; }
+elsif ($read_type == 3) { $command .= " 1 $quiet $imdName\_alignable_1.fq $imdName\_alignable_2.fq"; }
+else { print "Impossible! read_type is not in [1,2,3,4]!\n"; exit(-1); }
  &runCommand($command);
  
  my $doesOpen = open(OUTPUT, ">$imdName.mparams");
@@ -273,8 +278,8 @@ if ($quiet) { $command .= " -q"; }
  
  &runCommand($command);
  
-&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
+&collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+&collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
  
  if ($genBamF) {
      $command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted";
@@ -307,8 +312,8 @@ if ($calcCI || $var_opt) {
  if ($calcCI) {
      system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
      system("mv $sampleName.genes.results $imdName.genes.results.bak1");
-    &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-    &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
+    &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+    &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
  
      $command = $dir."rsem-calculate-credibility-intervals $refName $imdName $statName $CONFIDENCE $NCV $NSPC $NMB";
      $command .= " -p $nThreads";
@@ -317,8 +322,8 @@ if ($calcCI) {
  
      system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2");
      system("mv $sampleName.genes.results $imdName.genes.results.bak2");
-    &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-    &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
+    &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+    &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
  }
  
  if ($mTime) { $time_end = time(); $time_ci = $time_end - $time_start; }
@@ -341,63 +346,6 @@ if ($mTime) {
      close(OUTPUT);
  }
  
-# command, {err_msg}
-sub runCommand {
-    print $_[0]."\n";
-    my $status = system($_[0]);
-    if ($status != 0) { 
-       my $errmsg;
-       if (scalar(@_) > 1) { $errmsg = $_[1]; }
-       else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the pipeline!"; }
-       print $errmsg."\n";
-       exit(-1);
-    }
-    print "\n";
-}
-
-# inpF, outF
-sub collectResults {
-    my $local_status;
-    my ($inpF, $outF);
-    my (@results, @ids) = ();
-    my $line;
-    my $cnt;
-
-    $inpF = $_[0];
-    $outF = $_[1];
-
-    $local_status = open(INPUT, $inpF);
-    if ($local_status == 0) { print "Fail to open file $inpF!\n"; exit(-1); }
-    
-    $cnt = 0;
-    @results = ();
-    
-    while ($line = <INPUT>) {
-       ++$cnt;
-       chomp($line);
-       my @local_arr = split(/\t/, $line);
-       if ($cnt == 4) { @ids = @local_arr; }
-       else { push(@results, \@local_arr); }
-    }
-    
-    push(@results, \@ids);
-    close(INPUT);
-
-    $local_status = open(OUTPUT, ">$outF");
-    if ($local_status == 0) { print "Fail to create file $outF!\n"; exit(-1); }
-
-    my $n = scalar(@results);
-    my $m = scalar(@{$results[0]});
-    for (my $i = 0; $i < $m; $i++) {
-       my @out_arr = ();
-       for (my $j = 0; $j < $n; $j++) { push(@out_arr, $results[$j][$i]); }
-       $" = "\t";
-       print OUTPUT "@out_arr\n"; 
-    }
-    close(OUTPUT);
-}
-
-
  __END__
  
  =head1 NAME
@@ -516,17 +464,17 @@ The path to the bowtie executables. (Default: the path to the bowtie executables
  
  (Bowtie parameter) memory allocated for best first alignment calculation (Default: 0 - use bowtie's default)
  
-=item B<--phred33-quals>
+=item B<--bowtie-phred33-quals>
  
-Input quality scores are encoded as Phred+33. (Default: on)
+(Bowtie parameter) Input quality scores are encoded as Phred+33. (Default: on)
  
-=item B<--phred64-quals>
+=item B<--bowtie-phred64-quals>
  
-Input quality scores are encoded as Phred+64 (default for GA Pipeline ver. >= 1.3). (Default: off)
+(Bowtie parameter) Input quality scores are encoded as Phred+64 (default for GA Pipeline ver. >= 1.3). (Default: off)
  
-=item B<--solexa-quals>
+=item B<--bowtie-solexa-quals>
  
-Input quality scores are solexa encoded (from GA Pipeline ver. < 1.3). (Default: off)
+(Bowtie parameter) Input quality scores are solexa encoded (from GA Pipeline ver. < 1.3). (Default: off)
  
  =item B<--forward-prob> <double>
  
@@ -608,33 +556,85 @@ With the '--calc-ci' option, 95% credibility intervals and posterior mean estima
  
  =over
  
-=item B<sample_name.genes.results> 
+=item B<sample_name.isoforms.results> 
  
-File containing gene level expression estimates. The format of each
-line in this file is:
+File containing isoform level expression estimates. The first line
+contains column names separated by the tab character. The format of
+each line in the rest of this file is:
  
-gene_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] transcript_id_list
+transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct [pme_expected_count pme_TPM pme_FPKM IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
  
  Fields are separated by the tab character. Fields within "[]" are only
-presented if '--calc-ci' is set. pme stands for posterior mean
-estimation. pmc stands for posterior mean counts. ci_lower_bound(l)
-means the lower bound of the credibility intervals, ci_upper_bound(u)
-means the upper bound of the credibility intervals. So the credibility
-interval is [l, u]. 'transcript_id_list' is a space-separated list of
-transcript_ids belonging to the gene. If no gene information is
-provided, this file has the same content as
-'sample_name.isoforms.results'.
+presented if '--calc-ci' is set.
  
-=item B<sample_name.isoforms.results> 
+'transcript_id' is the transcript name of this transcript. 'gene_id'
+is the gene name of the gene which this transcript belongs to (denote
+this gene as its parent gene). If no gene information is provided,
+'gene_id' and 'transcript_id' are the same.
+
+'length' is this transcript's sequence length (poly(A) tail is not
+counted). 'effective_length' counts only the positions that can
+generate a valid fragment. If no poly(A) tail is added,
+'effective_length' is equal to transcript length - mean fragment
+length + 1. If one transcript's effective length is less than 1, this
+transcript's both effective length and abundance estimates are set to
+0.
+
+'expected_count' is the sum of the posterior probability of each read
+comes from this transcript over all reads. Because 1) each read
+aligning to this transcript has a probability of being generated from
+background noise; 2) RSEM may filter some alignable low quality reads,
+the sum of expected counts for all transcript are generally less than
+the total number of reads aligned.
+
+'TPM' stands for Transcripts Per Million. It is a relative measure of
+transcript abundance. The sum of all transcripts' TPM is 1
+million. 'FPKM' stands for Fragments Per Kilobase of transcript per
+Million mapped reads. It is another relative measure of transcript
+abundance. If we define l_bar be the mean transcript length in a
+sample, which can be calculated as
+
+l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through every transcript), 
+
+the following equation is hold:
  
-File containing isoform level expression values. The format of each
-line in this file is:
+FPKM_i = 10^3 / l_bar * TPM_i.
+
+We can see that the sum of FPKM is not a constant across samples.
+
+'IsoPct' stands for isoform percentage. It is the percentage of this
+transcript's abandunce over its parent gene's abandunce. If its parent
+gene has only one isoform or the gene information is not provided,
+this field will be set to 100.
+
+'pme_expected_count', 'pme_TPM', 'pme_FPKM' are posterior mean
+estimates calculated by RSEM's Gibbs sampler. 'IsoPct_from_pme_TPM' is
+the isoform percentage calculated from 'pme_TPM' values.
+
+'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound' and
+'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95%
+credibility intervals for TPM and FPKM values. The bounds are
+inclusive (i.e. [l, u]).
+
+=item B<sample_name.genes.results>
+
+File containing gene level expression estimates. The first line
+contains column names separated by the tab character. The format of
+each line in the rest of this file is:
+
+gene_id transcript_id(s) length effective_length expected_count TPM FPKM [pme_expected_count pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
+
+Fields are separated by the tab character. Fields within "[]" are only
+presented if '--calc-ci' is set. 
  
-transcript_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] gene_id
+'transcript_id(s)' is a comma-separated list of transcript_ids
+belonging to this gene. If no gene information is provided, 'gene_id'
+and 'transcript_id(s)' are identical (the 'transcript_id').
  
-Fields are separated by the tab character. 'gene_id' is the gene_id of
-the gene which this transcript belongs to. If no gene information is
-provided, 'gene_id' and 'transcript_id' are the same.
+A gene's 'length' and 'effective_length' are
+defined as the weighted average of its transcripts' lengths and
+effective lengths (weighted by 'IsoPct'). A gene's abundance estimates
+are just the sum of its transcripts' abundance estimates.
  
  =item B<sample_name.transcript.bam, sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai>