Renamed --phred33-quals, --phred64-quals, and --solexa-quals to --bowtie-phred33...

[rsem.git] / rsem-calculate-expression
diff --git a/rsem-calculate-expression b/rsem-calculate-expression

index c01c70ab2e7067f2a4b2f3683215fd87314645e1..edb6fddf5458d3d80da38dc9996d586e902a8b9d 100755 (executable)
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -2,10 +2,12 @@
  
  use Getopt::Long;
  use Pod::Usage;
-use File::Basename;
-use Switch;
+use FindBin;
+use lib $FindBin::Bin;
  use strict;
  
+use rsem_perl_utils qw(runCommand collectResults showVersionInfo);
+
  #const
  my $BURNIN = 200;
  my $NCV = 1000;
@@ -49,6 +51,7 @@ my $genBamF = 1;  # default is generating transcript bam file
  my $genGenomeBamF = 0;
  my $sampling = 0;
  my $calcCI = 0;
+my $var_opt = 0; # temporarily, only for internal use
  my $quiet = 0;
  my $help = 0;
  
@@ -58,10 +61,20 @@ my $keep_intermediate_files = 0;
  
  my $strand_specific = 0;
  
+my $version = 0;
+
  my $mTime = 0;
  my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0);
  
+my $mate1_list = "";
+my $mate2_list = "";
+my $inpF = "";
+
+my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName, $statName) = ();
+my $gap = 32;
+
  GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
+          "temporary-folder=s" => \$temp_dir,
            "no-qualities" => \$no_qual,
            "paired-end" => \$paired_end,
            "strand-specific" => \$strand_specific,
@@ -75,9 +88,9 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
            "bowtie-e=i" => \$E,
            "bowtie-m=i" => \$maxHits,
            "bowtie-chunkmbs=i" => \$chunkMbs,
-          "phred33-quals" => \$phred33,
-          "phred64-quals" => \$phred64, #solexa1.3-quals" => \$phred64,
-          "solexa-quals" => \$solexa,
+          "bowtie-phred33-quals" => \$phred33,
+          "bowtie-phred64-quals" => \$phred64, #solexa1.3-quals" => \$phred64,
+          "bowtie-solexa-quals" => \$solexa,
            "forward-prob=f" => \$probF,
            "fragment-length-min=i" => \$minL,
            "fragment-length-max=i" => \$maxL,
@@ -89,14 +102,18 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
            "no-bam-output" => sub { $genBamF = 0; },
            "output-genome-bam" => \$genGenomeBamF,
            "sampling-for-bam" => \$sampling,
+          "var" => \$var_opt,
            "calc-ci" => \$calcCI,
            "ci-memory=i" => \$NMB,
            "time" => \$mTime,
+          "version" => \$version,
            "q|quiet" => \$quiet,
            "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
  
-pod2usage(-verbose => 2) if ($help == 1);
+my $dir = "$FindBin::Bin/";
  
+pod2usage(-verbose => 2) if ($help == 1);
+&showVersionInfo($dir) if ($version == 1);
  
  #check parameters and options
  
@@ -124,13 +141,6 @@ if ($L < 25) { print "Warning: the seed length set is less than 25! This is only
  
  if ($strand_specific) { $probF = 1.0; }
  
-my $mate1_list = "";
-my $mate2_list = "";
-my $inpF = "";
-
-my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName) = ();
-my $gap = 32;
-
  if ($paired_end) {
      if ($no_qual) { $read_type = 2; }
      else { $read_type = 3; }
@@ -165,13 +175,14 @@ my $pos = rindex($sampleName, '/');
  if ($pos < 0) { $sampleToken = $sampleName; }
  else { $sampleToken = substr($sampleName, $pos + 1); }
  
-$temp_dir = "$sampleName.temp";
+if ($temp_dir eq "") { $temp_dir = "$sampleName.temp"; }
  $stat_dir = "$sampleName.stat";
  
  if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); }
  if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); }
  
  $imdName = "$temp_dir/$sampleToken";
+$statName = "$stat_dir/$sampleToken";
  
  if (!$is_sam && !$is_bam && !$no_qual && ($phred33 + $phred64 + $solexa == 0)) { $phred33 = 1; }
  
@@ -179,7 +190,6 @@ my ($mate_minL, $mate_maxL) = (1, $maxL);
  
  if ($bowtie_path ne "") { $bowtie_path .= "/"; }
  
-my ($fn, $dir, $suf) = fileparse($0);
  my $command = "";
  
  if (!$is_sam && !$is_bam) {
@@ -209,7 +219,8 @@ if (!$is_sam && !$is_bam) {
         $command .= " -1 $mate1_list -2 $mate2_list";
      }
  
-    $command .= " | gzip > $sampleName.sam.gz";
+    # pipe to samtools to generate a BAM file
+    $command .= " | $dir\sam/samtools view -S -b -o $imdName.bam -";
  
      if ($mTime) { $time_start = time(); }
  
@@ -217,13 +228,13 @@ if (!$is_sam && !$is_bam) {
  
      if ($mTime) { $time_end = time(); $time_alignment = $time_end - $time_start; }
  
-    $inpF = "$sampleName.sam.gz";
-    $is_sam = 1; # output of bowtie is a sam file
+    $inpF = "$imdName.bam";
+    $is_bam = 1; # alignments are outputed as a BAM file
  }
  
  if ($mTime) { $time_start = time(); }
  
-$command = $dir."rsem-parse-alignments $refName $sampleName $sampleToken";
+$command = $dir."rsem-parse-alignments $refName $imdName $statName";
  
  my $samInpType;
  if ($is_sam) { $samInpType = "s"; } 
@@ -237,12 +248,11 @@ if ($quiet) { $command .= " -q"; }
  &runCommand($command);
  
  $command = $dir."rsem-build-read-index $gap"; 
-switch($read_type) {
-    case 0  { $command .= " 0 $quiet $imdName\_alignable.fa"; }
-    case 1  { $command .= " 1 $quiet $imdName\_alignable.fq"; }
-    case 2  { $command .= " 0 $quiet $imdName\_alignable_1.fa $imdName\_alignable_2.fa"; }
-    case 3  { $command .= " 1 $quiet $imdName\_alignable_1.fq $imdName\_alignable_2.fq"; }
-}
+if ($read_type == 0) { $command .= " 0 $quiet $imdName\_alignable.fa"; }
+elsif ($read_type == 1) { $command .= " 1 $quiet $imdName\_alignable.fq"; }
+elsif ($read_type == 2) { $command .= " 0 $quiet $imdName\_alignable_1.fa $imdName\_alignable_2.fa"; }
+elsif ($read_type == 3) { $command .= " 1 $quiet $imdName\_alignable_1.fq $imdName\_alignable_2.fq"; }
+else { print "Impossible! read_type is not in [1,2,3,4]!\n"; exit(-1); }
  &runCommand($command);
  
  my $doesOpen = open(OUTPUT, ">$imdName.mparams");
@@ -256,18 +266,21 @@ print OUTPUT "$mean $sd\n";
  print OUTPUT "$L\n";
  close(OUTPUT);  
  
-$command = $dir."rsem-run-em $refName $read_type $sampleName $sampleToken -p $nThreads";
+$command = $dir."rsem-run-em $refName $read_type $sampleName $imdName $statName -p $nThreads";
  if ($genBamF) { 
      $command .= " -b $samInpType $inpF";
      if ($fn_list ne "") { $command .= " 1 $fn_list"; }
      else { $command .= " 0"; }
      if ($sampling) { $command .= " --sampling"; }
  }
-if ($calcCI) { $command .= " --gibbs-out"; }
+if ($calcCI || $var_opt) { $command .= " --gibbs-out"; }
  if ($quiet) { $command .= " -q"; }
  
  &runCommand($command);
  
+&collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+&collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
+
  if ($genBamF) {
      $command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted";
      &runCommand($command);
@@ -284,33 +297,33 @@ if ($genBamF) {
      }
  }
  
-&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
-
  if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; }
  
  if ($mTime) { $time_start = time(); }
  
-if ($calcCI) {
-    $command = $dir."rsem-run-gibbs $refName $sampleName $sampleToken $BURNIN $NCV $SAMPLEGAP";
+if ($calcCI || $var_opt) {
+    $command = $dir."rsem-run-gibbs $refName $imdName $statName $BURNIN $NCV $SAMPLEGAP";
      $command .= " -p $nThreads";
+    if ($var_opt) { $command .= " --var"; }
      if ($quiet) { $command .= " -q"; }
      &runCommand($command);
+}
  
+if ($calcCI) {
      system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
      system("mv $sampleName.genes.results $imdName.genes.results.bak1");
-    &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-    &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
+    &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+    &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
  
-    $command = $dir."rsem-calculate-credibility-intervals $refName $sampleName $sampleToken $CONFIDENCE $NCV $NSPC $NMB";
+    $command = $dir."rsem-calculate-credibility-intervals $refName $imdName $statName $CONFIDENCE $NCV $NSPC $NMB";
      $command .= " -p $nThreads";
      if ($quiet) { $command .= " -q"; }
      &runCommand($command);
  
      system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2");
      system("mv $sampleName.genes.results $imdName.genes.results.bak2");
-    &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-    &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
+    &collectResults("isoform", "$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+    &collectResults("gene", "$imdName.gene_res", "$sampleName.genes.results"); # gene level
  }
  
  if ($mTime) { $time_end = time(); $time_ci = $time_end - $time_start; }
@@ -333,63 +346,6 @@ if ($mTime) {
      close(OUTPUT);
  }
  
-# command, {err_msg}
-sub runCommand {
-    print $_[0]."\n";
-    my $status = system($_[0]);
-    if ($status != 0) { 
-       my $errmsg;
-       if (scalar(@_) > 1) { $errmsg = $_[1]; }
-       else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the pipeline!"; }
-       print $errmsg."\n";
-       exit(-1);
-    }
-    print "\n";
-}
-
-# inpF, outF
-sub collectResults {
-    my $local_status;
-    my ($inpF, $outF);
-    my (@results, @ids) = ();
-    my $line;
-    my $cnt;
-
-    $inpF = $_[0];
-    $outF = $_[1];
-
-    $local_status = open(INPUT, $inpF);
-    if ($local_status == 0) { print "Fail to open file $inpF!\n"; exit(-1); }
-    
-    $cnt = 0;
-    @results = ();
-    
-    while ($line = <INPUT>) {
-       ++$cnt;
-       chomp($line);
-       my @local_arr = split(/\t/, $line);
-       if ($cnt == 4) { @ids = @local_arr; }
-       else { push(@results, \@local_arr); }
-    }
-    
-    push(@results, \@ids);
-    close(INPUT);
-
-    $local_status = open(OUTPUT, ">$outF");
-    if ($local_status == 0) { print "Fail to create file $outF!\n"; exit(-1); }
-
-    my $n = scalar(@results);
-    my $m = scalar(@{$results[0]});
-    for (my $i = 0; $i < $m; $i++) {
-       my @out_arr = ();
-       for (my $j = 0; $j < $n; $j++) { push(@out_arr, $results[$j][$i]); }
-       $" = "\t";
-       print OUTPUT "@out_arr\n"; 
-    }
-    close(OUTPUT);
-}
-
-
  __END__
  
  =head1 NAME
@@ -474,7 +430,7 @@ Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic
  
  =item B<--sampling-for-bam>
  
-When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled and outputed according to the posterior probabilities. If the sampling result is that the read comes from the "noise" transcript, nothing is outputed. (Default: off)
+When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled according to the posterior probabilities. The sampling procedure includes the alignment to the "noise" transcript, which does not appear in the BAM file. Only the sampled alignment has a weight of 1. All other alignments have weight 0. If the "noise" transcript is sampled, all alignments appeared in the BAM file should have weight 0. (Default: off)
  
  =item B<--calc-ci>
  
@@ -508,17 +464,17 @@ The path to the bowtie executables. (Default: the path to the bowtie executables
  
  (Bowtie parameter) memory allocated for best first alignment calculation (Default: 0 - use bowtie's default)
  
-=item B<--phred33-quals>
+=item B<--bowtie-phred33-quals>
  
-Input quality scores are encoded as Phred+33. (Default: on)
+(Bowtie parameter) Input quality scores are encoded as Phred+33. (Default: on)
  
-=item B<--phred64-quals>
+=item B<--bowtie-phred64-quals>
  
-Input quality scores are encoded as Phred+64 (default for GA Pipeline ver. >= 1.3). (Default: off)
+(Bowtie parameter) Input quality scores are encoded as Phred+64 (default for GA Pipeline ver. >= 1.3). (Default: off)
  
-=item B<--solexa-quals>
+=item B<--bowtie-solexa-quals>
  
-Input quality scores are solexa encoded (from GA Pipeline ver. < 1.3). (Default: off)
+(Bowtie parameter) Input quality scores are solexa encoded (from GA Pipeline ver. < 1.3). (Default: off)
  
  =item B<--forward-prob> <double>
  
@@ -556,6 +512,10 @@ Maximum size (in memory, MB) of the auxiliary buffer used for computing credibil
  
  Keep temporary files generated by RSEM.  RSEM creates a temporary directory, 'sample_name.temp', into which it puts all intermediate output files. If this directory already exists, RSEM overwrites all files generated by previous RSEM runs inside of it. By default, after RSEM finishes, the temporary directory is deleted.  Set this option to prevent the deletion of this directory and the intermediate files inside of it. (Default: off)
  
+=item B<--temporary-folder> <string>
+
+Set where to put the temporary files generated by RSEM. If the folder specified does not exist, RSEM will try to create it. (Default: sample_name.temp)
+
  =item B<--time>
  
  Output time consumed by each step of RSEM to 'sample_name.time'. (Default: off)
@@ -596,33 +556,85 @@ With the '--calc-ci' option, 95% credibility intervals and posterior mean estima
  
  =over
  
-=item B<sample_name.genes.results> 
+=item B<sample_name.isoforms.results> 
  
-File containing gene level expression estimates. The format of each
-line in this file is:
+File containing isoform level expression estimates. The first line
+contains column names separated by the tab character. The format of
+each line in the rest of this file is:
  
-gene_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] transcript_id_list
+transcript_id gene_id length effective_length expected_count TPM FPKM IsoPct [pme_expected_count pme_TPM pme_FPKM IsoPct_from_pme_TPM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
  
  Fields are separated by the tab character. Fields within "[]" are only
-presented if '--calc-ci' is set. pme stands for posterior mean
-estimation. pmc stands for posterior mean counts. ci_lower_bound(l)
-means the lower bound of the credibility intervals, ci_upper_bound(u)
-means the upper bound of the credibility intervals. So the credibility
-interval is [l, u]. 'transcript_id_list' is a space-separated list of
-transcript_ids belonging to the gene. If no gene information is
-provided, this file has the same content as
-'sample_name.isoforms.results'.
+presented if '--calc-ci' is set.
  
-=item B<sample_name.isoforms.results> 
+'transcript_id' is the transcript name of this transcript. 'gene_id'
+is the gene name of the gene which this transcript belongs to (denote
+this gene as its parent gene). If no gene information is provided,
+'gene_id' and 'transcript_id' are the same.
+
+'length' is this transcript's sequence length (poly(A) tail is not
+counted). 'effective_length' counts only the positions that can
+generate a valid fragment. If no poly(A) tail is added,
+'effective_length' is equal to transcript length - mean fragment
+length + 1. If one transcript's effective length is less than 1, this
+transcript's both effective length and abundance estimates are set to
+0.
+
+'expected_count' is the sum of the posterior probability of each read
+comes from this transcript over all reads. Because 1) each read
+aligning to this transcript has a probability of being generated from
+background noise; 2) RSEM may filter some alignable low quality reads,
+the sum of expected counts for all transcript are generally less than
+the total number of reads aligned.
  
-File containing isoform level expression values. The format of each
-line in this file is:
+'TPM' stands for Transcripts Per Million. It is a relative measure of
+transcript abundance. The sum of all transcripts' TPM is 1
+million. 'FPKM' stands for Fragments Per Kilobase of transcript per
+Million mapped reads. It is another relative measure of transcript
+abundance. If we define l_bar be the mean transcript length in a
+sample, which can be calculated as
  
-transcript_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] gene_id
+l_bar = \sum_i TPM_i / 10^6 * effective_length_i (i goes through every transcript), 
  
-Fields are separated by the tab character. 'gene_id' is the gene_id of
-the gene which this transcript belongs to. If no gene information is
-provided, 'gene_id' and 'transcript_id' are the same.
+the following equation is hold:
+
+FPKM_i = 10^3 / l_bar * TPM_i.
+
+We can see that the sum of FPKM is not a constant across samples.
+
+'IsoPct' stands for isoform percentage. It is the percentage of this
+transcript's abandunce over its parent gene's abandunce. If its parent
+gene has only one isoform or the gene information is not provided,
+this field will be set to 100.
+
+'pme_expected_count', 'pme_TPM', 'pme_FPKM' are posterior mean
+estimates calculated by RSEM's Gibbs sampler. 'IsoPct_from_pme_TPM' is
+the isoform percentage calculated from 'pme_TPM' values.
+
+'TPM_ci_lower_bound', 'TPM_ci_upper_bound', 'FPKM_ci_lower_bound' and
+'FPKM_ci_upper_bound' are lower(l) and upper(u) bounds of 95%
+credibility intervals for TPM and FPKM values. The bounds are
+inclusive (i.e. [l, u]).
+
+=item B<sample_name.genes.results>
+
+File containing gene level expression estimates. The first line
+contains column names separated by the tab character. The format of
+each line in the rest of this file is:
+
+gene_id transcript_id(s) length effective_length expected_count TPM FPKM [pme_expected_count pme_TPM pme_FPKM TPM_ci_lower_bound TPM_ci_upper_bound FPKM_ci_lower_bound FPKM_ci_upper_bound]
+
+Fields are separated by the tab character. Fields within "[]" are only
+presented if '--calc-ci' is set. 
+
+'transcript_id(s)' is a comma-separated list of transcript_ids
+belonging to this gene. If no gene information is provided, 'gene_id'
+and 'transcript_id(s)' are identical (the 'transcript_id').
+
+A gene's 'length' and 'effective_length' are
+defined as the weighted average of its transcripts' lengths and
+effective lengths (weighted by 'IsoPct'). A gene's abundance estimates
+are just the sum of its transcripts' abundance estimates.
  
  =item B<sample_name.transcript.bam, sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai>
  
@@ -634,7 +646,12 @@ is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), where w is the
  posterior probability of that alignment being the true mapping of a
  read.  In addition, RSEM pads a new tag ZW:f:value, where value is a
  single precision floating number representing the posterior
-probability.
+probability. Because this file contains all alignment lines produced
+by bowtie or user-specified aligners, it can also be used as a
+replacement of the aligner generated BAM/SAM file. For paired-end
+reads, if one mate has alignments but the other does not, this file
+marks the alignable mate as "unmappable" (flag bit 0x4) and appends an
+optional field "Z0:A:!".
  
  'sample_name.transcript.sorted.bam' and
  'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and
@@ -659,12 +676,6 @@ indicating the strand of the transcript it aligns to.
  'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' are the
  sorted BAM file and indices generated by samtools (included in RSEM package).
  
-=item B<sample_name.sam.gz>
-
-Only generated when the input files are raw reads instead of SAM/BAM format files
-
-It is the gzipped SAM output produced by bowtie aligner.
-
  =item B<sample_name.time>
  
  Only generated when --time is specified.