rsem v1.1.16

[rsem.git] / rsem-calculate-expression
diff --git a/rsem-calculate-expression b/rsem-calculate-expression

index cbfef53fd7da453c07c29a5830691933aa8be1fc..2902239eff0513a186c2fc562d8541c99b578a62 100755 (executable)
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -8,7 +8,7 @@ use strict;
  
  #const
  my $BURNIN = 200;
-my $CHAINLEN = 1000;
+my $NCV = 1000;
  my $SAMPLEGAP = 1;
  my $CONFIDENCE = 0.95;
  my $NSPC = 50;
@@ -24,6 +24,7 @@ my $C = 2;
  my $E = 99999999;
  my $L = 25;
  my $maxHits = 200;
+my $chunkMbs = 0;      # 0 = use bowtie default
  my $phred33 = 0;
  my $phred64 = 0;
  my $solexa = 0;
@@ -73,6 +74,7 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
            "bowtie-n=i" => \$C,
            "bowtie-e=i" => \$E,
            "bowtie-m=i" => \$maxHits,
+          "bowtie-chunkmbs=i" => \$chunkMbs,
            "phred33-quals" => \$phred33,
            "phred64-quals" => \$phred64, #solexa1.3-quals" => \$phred64,
            "solexa-quals" => \$solexa,
@@ -189,15 +191,15 @@ if (!$is_sam && !$is_bam) {
      else { print "Oh, no!!!"; exit(2); }
      
      $command .= " -n $C -e $E -l $L";
-    
      if ($read_type == 2 || $read_type == 3) { $command .= " -I $minL -X $maxL"; }
+    if ($chunkMbs > 0) { $command .= " --chunkmbs $chunkMbs"; }
      
      if ($strand_specific || $probF == 1.0) { $command .= " --norc"; }
      elsif ($probF == 0.0) { $command .= " --nofw"; }
  
      $command .= " -p $nThreads -a -m $maxHits -S";
-    if ($quiet) { $command .= " --quiet"; }
-    
+    if ($quiet) { $command .= " --quiet"; }    
+
      $command .= " $refName";
      if ($read_type == 0 || $read_type == 1) {
         $command .= " $mate1_list"; 
@@ -206,7 +208,7 @@ if (!$is_sam && !$is_bam) {
         $command .= " -1 $mate1_list -2 $mate2_list";
      }
  
-    $command .= " | gzip > $imdName.sam.gz";
+    $command .= " | gzip > $sampleName.sam.gz";
  
      if ($mTime) { $time_start = time(); }
  
@@ -214,7 +216,7 @@ if (!$is_sam && !$is_bam) {
  
      if ($mTime) { $time_end = time(); $time_alignment = $time_end - $time_start; }
  
-    $inpF = "$imdName.sam.gz";
+    $inpF = "$sampleName.sam.gz";
      $is_sam = 1; # output of bowtie is a sam file
  }
  
@@ -289,8 +291,8 @@ if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; }
  if ($mTime) { $time_start = time(); }
  
  if ($calcCI) {
-    $command = $dir."rsem-run-gibbs $refName $sampleName $sampleToken $BURNIN $CHAINLEN $SAMPLEGAP";
-#    $command .= " -p $nThreads";
+    $command = $dir."rsem-run-gibbs $refName $sampleName $sampleToken $BURNIN $NCV $SAMPLEGAP";
+    $command .= " -p $nThreads";
      if ($quiet) { $command .= " -q"; }
      &runCommand($command);
  
@@ -299,7 +301,8 @@ if ($calcCI) {
      &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
      &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
  
-    $command = $dir."rsem-calculate-credibility-intervals $refName $sampleName $sampleToken $CONFIDENCE $NSPC $NMB";
+    $command = $dir."rsem-calculate-credibility-intervals $refName $sampleName $sampleToken $CONFIDENCE $NCV $NSPC $NMB";
+    $command .= " -p $nThreads";
      if ($quiet) { $command .= " -q"; }
      &runCommand($command);
  
@@ -321,11 +324,11 @@ if ($mTime) { $time_end = time(); }
  
  if ($mTime) { 
      open(OUTPUT, ">$sampleName.time");
-    print OUTPUT "Alignment: $time_alignment s.\n";
-    print OUTPUT "RSEM: $time_rsem s.\n";
-    print OUTPUT "CI: $time_ci s.\n";
+    print OUTPUT "Aligning reads: $time_alignment s.\n";
+    print OUTPUT "Estimating expression levels: $time_rsem s.\n";
+    print OUTPUT "Calculating credibility intervals: $time_ci s.\n";
      my $time_del = $time_end - $time_start;
-    print OUTPUT "Delete: $time_del s.\n";
+#    print OUTPUT "Delete: $time_del s.\n";
      close(OUTPUT);
  }
  
@@ -467,7 +470,7 @@ Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic
  =item B<--sampling-for-bam>
  
  When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled and outputed according to the posterior probabilities. If the sampling result is that the read comes from the "noise" transcript, nothing is outputed. (Default: off)
- 
+
  =item B<--calc-ci>
  
  Calculate 95% credibility intervals and posterior mean estimates.  (Default: off)
@@ -496,6 +499,10 @@ The path to the bowtie executables. (Default: the path to the bowtie executables
  
  (Bowtie parameter) suppress all alignments for a read if > <int> valid alignments exist. (Default: 200)
  
+=item B<--bowtie-chunkmbs> <int>
+
+(Bowtie parameter) memory (in megabytes) allocated for the best-first alignment calculation. (Default: 0 - use bowtie's default)
+
  =item B<--phred33-quals>
  
  Input quality scores are encoded as Phred+33. (Default: on)
@@ -544,6 +551,10 @@ Amount of memory (in MB) RSEM is allowed to use for computing credibility interv
  
  Keep temporary files generated by RSEM.  RSEM creates a temporary directory, 'sample_name.temp', into which it puts all intermediate output files. If this directory already exists, RSEM overwrites all files generated by previous RSEM runs inside of it. By default, after RSEM finishes, the temporary directory is deleted.  Set this option to prevent the deletion of this directory and the intermediate files inside of it. (Default: off)
  
+=item B<--time>
+
+Output time consumed by each step of RSEM to 'sample_name.time'. (Default: off)
+
  =item B<-q/--quiet>
  
  Suppress the output of logging information. (Default: off)
@@ -558,11 +569,13 @@ Show help information.
  
  In its default mode, this program aligns input reads against a reference transcriptome with Bowtie and calculates expression values using the alignments.  RSEM assumes the data are single-end reads with quality scores, unless the '--paired-end' or '--no-qualities' options are specified.  Users may use an alternative aligner by specifying one of the --sam and --bam options, and providing an alignment file in the specified format. However, users should make sure that they align against the indices generated by 'rsem-prepare-reference' and the alignment file satisfies the requirements mentioned in ARGUMENTS section. 
  
-One simple way to make the alignment file (e.g. input.sam) satisfying RSEM's requirements (assuming the aligner used put mates in a paired-end read adjacent) is to use the following command:
+One simple way to make the alignment file satisfy RSEM's requirements (assuming the aligner used puts the mates of a paired-end read adjacent to each other) is to use the 'convert-sam-for-rsem' script. This script only accepts SAM format files as input. If you have a BAM format file, please use samtools to convert it to a SAM file first. For example, if '/ref/mouse_125' is the 'reference_name' and the SAM file is named 'input.sam', you can run the following command:
  
-  sort -k 1,1 -s input.sam > input.sorted.sam
+  convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam  
  
-The SAM/BAM format RSEM uses is v1.4. However, it is compatible with old SAM/BAM format. However, RSEM cannot recognize 0x100 in the FLAG field. In addition, RSEM requires SEQ and QUAL not be '*'. 
+For details, please refer to the documentation page of 'convert-sam-for-rsem'.
+
+The SAM/BAM format RSEM uses is v1.4. It is compatible with the old SAM/BAM format; however, RSEM cannot recognize 0x100 in the FLAG field. In addition, RSEM requires that SEQ and QUAL are not '*'.
  
  The user must run 'rsem-prepare-reference' with the appropriate reference before using this program.
  
@@ -572,7 +585,7 @@ Please note that some of the default values for the Bowtie parameters are not th
  
  The temporary directory and all intermediate files will be removed when RSEM finishes unless '--keep-intermediate-files' is specified.
  
-With the "--calc-ci" option, 95% credibility intervals and posterior mean estimates will be calculated in addition to maximum likelihood estimates.
+With the '--calc-ci' option, 95% credibility intervals and posterior mean estimates will be calculated in addition to maximum likelihood estimates.
  
  =head1 OUTPUT
  
@@ -639,6 +652,18 @@ indicating the strand of the transcript it aligns to.
  'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' are the
  sorted BAM file and indices generated by samtools (included in RSEM package).
  
+=item B<sample_name.sam.gz>
+
+Only generated when the input files are raw reads instead of SAM/BAM format files.
+
+It is the gzipped SAM output produced by the bowtie aligner.
+
+=item B<sample_name.time>
+
+Only generated when --time is specified.
+
+It contains the time (in seconds) consumed by aligning reads, estimating expression levels, and calculating credibility intervals.
+
  =item B<sample_name.stat>
  
  This is a folder instead of a file. All model related statistics are stored in this folder. Use 'rsem-plot-model' can generate plots using this folder.
@@ -647,7 +672,7 @@ This is a folder instead of a file. All model related statistics are stored in t
  
  =head1 EXAMPLES
  
-Assume the path to the bowtie executables is in the user's PATH environment variable. Reference files are under '/ref' with name 'mm9'. 
+Assume the path to the bowtie executables is in the user's PATH environment variable. Reference files are under '/ref' with name 'mouse_125'. 
  
  1) '/data/mmliver.fq', single-end reads with quality scores. Quality scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 threads and generate a genome BAM file:
  
@@ -655,7 +680,7 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
                             -p 8 \
                             --output-genome-bam \
                             /data/mmliver.fq \
-                           /ref/mm9 \
+                           /ref/mouse_125 \
                             mmliver_single_quals
  
  2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', paired-end reads with quality scores. Quality scores are in SANGER format. We want to use 8 threads and do not generate a genome BAM file:
@@ -664,7 +689,7 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
                             --paired-end \
                             /data/mmliver_1.fq \
                             /data/mmliver_2.fq \
-                           /ref/mm9 \
+                           /ref/mouse_125 \
                             mmliver_paired_end_quals
  
  3) '/data/mmliver.fa', single-end reads without quality scores. We want to use 8 threads:
@@ -672,7 +697,7 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
   rsem-calculate-expression -p 8 \
                             --no-qualities \
                             /data/mmliver.fa \
-                           /ref/mm9 \
+                           /ref/mouse_125 \
                             mmliver_single_without_quals
  
  4) Data are the same as 1). We want to take a fragment length distribution into consideration. We set the fragment length mean to 150 and the standard deviation to 35. In addition to a BAM file, we also want to generate credibility intervals.  We allow RSEM to use 1GB of memory for CI calculation:
@@ -686,7 +711,7 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
                             --calc-ci \
                             --ci-memory 1024 \
                             /data/mmliver.fq \
-                           /ref/mm9 \
+                           /ref/mouse_125 \
                             mmliver_single_quals
  
  5) '/data/mmliver_paired_end_quals.bam', paired-end reads with quality scores.  We want to use 8 threads:
@@ -695,7 +720,7 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
                             --bam \
                             -p 8 \
                             /data/mmliver_paired_end_quals.bam \
-                           /ref/mm9 \
+                           /ref/mouse_125 \
                             mmliver_paired_end_quals
  
  =cut