Fix a bug on convert-sam-for-rsem for the case that a paired-end read can align to...

[rsem.git] / rsem-calculate-expression
diff --git a/rsem-calculate-expression b/rsem-calculate-expression

index 6f9706e9e55e6c487c2aeed023fc4a52d31795c1..9e6ef2e99e1d72e7db053401b764d62f6e7bd5f8 100755 (executable)
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -8,7 +8,7 @@ use strict;
  
  #const
  my $BURNIN = 200;
  
  #const
  my $BURNIN = 200;
-my $CHAINLEN = 1000;
+my $NCV = 1000;
  my $SAMPLEGAP = 1;
  my $CONFIDENCE = 0.95;
  my $NSPC = 50;
  my $SAMPLEGAP = 1;
  my $CONFIDENCE = 0.95;
  my $NSPC = 50;
@@ -24,6 +24,7 @@ my $C = 2;
  my $E = 99999999;
  my $L = 25;
  my $maxHits = 200;
  my $E = 99999999;
  my $L = 25;
  my $maxHits = 200;
+my $chunkMbs = 0;      # 0 = use bowtie default
  my $phred33 = 0;
  my $phred64 = 0;
  my $solexa = 0;
  my $phred33 = 0;
  my $phred64 = 0;
  my $solexa = 0;
@@ -48,6 +49,7 @@ my $genBamF = 1;  # default is generating transcript bam file
  my $genGenomeBamF = 0;
  my $sampling = 0;
  my $calcCI = 0;
  my $genGenomeBamF = 0;
  my $sampling = 0;
  my $calcCI = 0;
+my $var_opt = 0; # temporarily, only for internal use
  my $quiet = 0;
  my $help = 0;
  
  my $quiet = 0;
  my $help = 0;
  
@@ -60,7 +62,15 @@ my $strand_specific = 0;
  my $mTime = 0;
  my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0);
  
  my $mTime = 0;
  my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0);
  
+my $mate1_list = "";
+my $mate2_list = "";
+my $inpF = "";
+
+my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName, $statName) = ();
+my $gap = 32;
+
  GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
  GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
+          "temporary-folder=s" => \$temp_dir,
            "no-qualities" => \$no_qual,
            "paired-end" => \$paired_end,
            "strand-specific" => \$strand_specific,
            "no-qualities" => \$no_qual,
            "paired-end" => \$paired_end,
            "strand-specific" => \$strand_specific,
@@ -73,6 +83,7 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
            "bowtie-n=i" => \$C,
            "bowtie-e=i" => \$E,
            "bowtie-m=i" => \$maxHits,
            "bowtie-n=i" => \$C,
            "bowtie-e=i" => \$E,
            "bowtie-m=i" => \$maxHits,
+          "bowtie-chunkmbs=i" => \$chunkMbs,
            "phred33-quals" => \$phred33,
            "phred64-quals" => \$phred64, #solexa1.3-quals" => \$phred64,
            "solexa-quals" => \$solexa,
            "phred33-quals" => \$phred33,
            "phred64-quals" => \$phred64, #solexa1.3-quals" => \$phred64,
            "solexa-quals" => \$solexa,
@@ -84,8 +95,10 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
            "estimate-rspd" => \$estRSPD,
            "num-rspd-bins=i" => \$B,
            "p|num-threads=i" => \$nThreads,
            "estimate-rspd" => \$estRSPD,
            "num-rspd-bins=i" => \$B,
            "p|num-threads=i" => \$nThreads,
+          "no-bam-output" => sub { $genBamF = 0; },
            "output-genome-bam" => \$genGenomeBamF,
            "sampling-for-bam" => \$sampling,
            "output-genome-bam" => \$genGenomeBamF,
            "sampling-for-bam" => \$sampling,
+          "var" => \$var_opt,
            "calc-ci" => \$calcCI,
            "ci-memory=i" => \$NMB,
            "time" => \$mTime,
            "calc-ci" => \$calcCI,
            "ci-memory=i" => \$NMB,
            "time" => \$mTime,
@@ -113,17 +126,13 @@ pod2usage(-msg => "Min fragment length should be at least 1!", -exitval => 2, -v
  pod2usage(-msg => "Min fragment length should be smaller or equal to max fragment length!", -exitval => 2, -verbose => 2) if ($minL > $maxL);
  pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1);
  pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1);
  pod2usage(-msg => "Min fragment length should be smaller or equal to max fragment length!", -exitval => 2, -verbose => 2) if ($minL > $maxL);
  pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1);
  pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1);
-pod2usage(-msg => "Seed length should be at least 25!\n", -exitval => 2, -verbose => 2) if ($L < 25);
-pod2usage(-msg => "--sampling-for-bam cannot be specified if --out-bam is not specified!\n", -exitval => 2, -verbose => 2) if ($sampling && !$genBamF);
-
-if ($strand_specific) { $probF = 1.0; }
+pod2usage(-msg => "Seed length should be at least 5!\n", -exitval => 2, -verbose => 2) if ($L < 5);
+pod2usage(-msg => "--sampling-for-bam cannot be specified if --no-bam-output is specified!\n", -exitval => 2, -verbose => 2) if ($sampling && !$genBamF);
+pod2usage(-msg => "--output-genome-bam cannot be specified if --no-bam-output is specified!\n", -exitval => 2, -verbose => 2) if ($genGenomeBamF && !$genBamF);
  
  
-my $mate1_list = "";
-my $mate2_list = "";
-my $inpF = "";
+if ($L < 25) { print "Warning: the seed length set is less than 25! This is only allowed if the references are not added poly(A) tails.\n"; }
  
  
-my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName) = ();
-my $gap = 32;
+if ($strand_specific) { $probF = 1.0; }
  
  if ($paired_end) {
      if ($no_qual) { $read_type = 2; }
  
  if ($paired_end) {
      if ($no_qual) { $read_type = 2; }
@@ -159,15 +168,16 @@ my $pos = rindex($sampleName, '/');
  if ($pos < 0) { $sampleToken = $sampleName; }
  else { $sampleToken = substr($sampleName, $pos + 1); }
  
  if ($pos < 0) { $sampleToken = $sampleName; }
  else { $sampleToken = substr($sampleName, $pos + 1); }
  
-$temp_dir = "$sampleName.temp";
+if ($temp_dir eq "") { $temp_dir = "$sampleName.temp"; }
  $stat_dir = "$sampleName.stat";
  
  if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); }
  if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); }
  
  $imdName = "$temp_dir/$sampleToken";
  $stat_dir = "$sampleName.stat";
  
  if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); }
  if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); }
  
  $imdName = "$temp_dir/$sampleToken";
+$statName = "$stat_dir/$sampleToken";
  
  
-if (!$is_sam && !$is_bam && $phred33 + $phred64 + $solexa == 0) { $phred33 = 1; }
+if (!$is_sam && !$is_bam && !$no_qual && ($phred33 + $phred64 + $solexa == 0)) { $phred33 = 1; }
  
  my ($mate_minL, $mate_maxL) = (1, $maxL);
  
  
  my ($mate_minL, $mate_maxL) = (1, $maxL);
  
@@ -178,24 +188,23 @@ my $command = "";
  
  if (!$is_sam && !$is_bam) {
      $command = $bowtie_path."bowtie";
  
  if (!$is_sam && !$is_bam) {
      $command = $bowtie_path."bowtie";
-    if ($read_type == 0 || $read_type == 2) { $command .= " -f"; }
+    if ($no_qual) { $command .= " -f"; }
      else { $command .= " -q"; }
      
      if ($phred33) { $command .= " --phred33-quals"; }
      elsif ($phred64) { $command .= " --phred64-quals"; }
      elsif ($solexa) { $command .= " --solexa-quals"; }
      else { $command .= " -q"; }
      
      if ($phred33) { $command .= " --phred33-quals"; }
      elsif ($phred64) { $command .= " --phred64-quals"; }
      elsif ($solexa) { $command .= " --solexa-quals"; }
-    else { print "Oh, no!!!"; exit(2); }
      
      $command .= " -n $C -e $E -l $L";
      
      $command .= " -n $C -e $E -l $L";
-    
      if ($read_type == 2 || $read_type == 3) { $command .= " -I $minL -X $maxL"; }
      if ($read_type == 2 || $read_type == 3) { $command .= " -I $minL -X $maxL"; }
+    if ($chunkMbs > 0) { $command .= " --chunkmbs $chunkMbs"; }
      
      if ($strand_specific || $probF == 1.0) { $command .= " --norc"; }
      elsif ($probF == 0.0) { $command .= " --nofw"; }
  
      $command .= " -p $nThreads -a -m $maxHits -S";
      
      if ($strand_specific || $probF == 1.0) { $command .= " --norc"; }
      elsif ($probF == 0.0) { $command .= " --nofw"; }
  
      $command .= " -p $nThreads -a -m $maxHits -S";
-    if ($quiet) { $command .= " --quiet"; }
-    
+    if ($quiet) { $command .= " --quiet"; }    
+
      $command .= " $refName";
      if ($read_type == 0 || $read_type == 1) {
         $command .= " $mate1_list"; 
      $command .= " $refName";
      if ($read_type == 0 || $read_type == 1) {
         $command .= " $mate1_list"; 
@@ -204,7 +213,8 @@ if (!$is_sam && !$is_bam) {
         $command .= " -1 $mate1_list -2 $mate2_list";
      }
  
         $command .= " -1 $mate1_list -2 $mate2_list";
      }
  
-    $command .= " | gzip > $imdName.sam.gz";
+    # pipe to samtools to generate a BAM file
+    $command .= " | $dir\sam/samtools view -S -b -o $imdName.bam -";
  
      if ($mTime) { $time_start = time(); }
  
  
      if ($mTime) { $time_start = time(); }
  
@@ -212,13 +222,13 @@ if (!$is_sam && !$is_bam) {
  
      if ($mTime) { $time_end = time(); $time_alignment = $time_end - $time_start; }
  
  
      if ($mTime) { $time_end = time(); $time_alignment = $time_end - $time_start; }
  
-    $inpF = "$imdName.sam.gz";
-    $is_sam = 1; # output of bowtie is a sam file
+    $inpF = "$imdName.bam";
+    $is_bam = 1; # alignments are outputed as a BAM file
  }
  
  if ($mTime) { $time_start = time(); }
  
  }
  
  if ($mTime) { $time_start = time(); }
  
-$command = $dir."rsem-parse-alignments $refName $sampleName $sampleToken";
+$command = $dir."rsem-parse-alignments $refName $imdName $statName";
  
  my $samInpType;
  if ($is_sam) { $samInpType = "s"; } 
  
  my $samInpType;
  if ($is_sam) { $samInpType = "s"; } 
@@ -251,18 +261,21 @@ print OUTPUT "$mean $sd\n";
  print OUTPUT "$L\n";
  close(OUTPUT);  
  
  print OUTPUT "$L\n";
  close(OUTPUT);  
  
-$command = $dir."rsem-run-em $refName $read_type $sampleName $sampleToken -p $nThreads";
+$command = $dir."rsem-run-em $refName $read_type $sampleName $imdName $statName -p $nThreads";
  if ($genBamF) { 
      $command .= " -b $samInpType $inpF";
      if ($fn_list ne "") { $command .= " 1 $fn_list"; }
      else { $command .= " 0"; }
      if ($sampling) { $command .= " --sampling"; }
  }
  if ($genBamF) { 
      $command .= " -b $samInpType $inpF";
      if ($fn_list ne "") { $command .= " 1 $fn_list"; }
      else { $command .= " 0"; }
      if ($sampling) { $command .= " --sampling"; }
  }
-if ($calcCI) { $command .= " --gibbs-out"; }
+if ($calcCI || $var_opt) { $command .= " --gibbs-out"; }
  if ($quiet) { $command .= " -q"; }
  
  &runCommand($command);
  
  if ($quiet) { $command .= " -q"; }
  
  &runCommand($command);
  
+&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
+
  if ($genBamF) {
      $command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted";
      &runCommand($command);
  if ($genBamF) {
      $command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted";
      &runCommand($command);
@@ -279,25 +292,26 @@ if ($genBamF) {
      }
  }
  
      }
  }
  
-&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
-
  if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; }
  
  if ($mTime) { $time_start = time(); }
  
  if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; }
  
  if ($mTime) { $time_start = time(); }
  
-if ($calcCI) {
-    $command = $dir."rsem-run-gibbs $refName $sampleName $sampleToken $BURNIN $CHAINLEN $SAMPLEGAP";
-#    $command .= " -p $nThreads";
+if ($calcCI || $var_opt) {
+    $command = $dir."rsem-run-gibbs $refName $imdName $statName $BURNIN $NCV $SAMPLEGAP";
+    $command .= " -p $nThreads";
+    if ($var_opt) { $command .= " --var"; }
      if ($quiet) { $command .= " -q"; }
      &runCommand($command);
      if ($quiet) { $command .= " -q"; }
      &runCommand($command);
+}
  
  
+if ($calcCI) {
      system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
      system("mv $sampleName.genes.results $imdName.genes.results.bak1");
      &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
      &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
  
      system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
      system("mv $sampleName.genes.results $imdName.genes.results.bak1");
      &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
      &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
  
-    $command = $dir."rsem-calculate-credibility-intervals $refName $sampleName $sampleToken $CONFIDENCE $NSPC $NMB";
+    $command = $dir."rsem-calculate-credibility-intervals $refName $imdName $statName $CONFIDENCE $NCV $NSPC $NMB";
+    $command .= " -p $nThreads";
      if ($quiet) { $command .= " -q"; }
      &runCommand($command);
  
      if ($quiet) { $command .= " -q"; }
      &runCommand($command);
  
@@ -319,11 +333,11 @@ if ($mTime) { $time_end = time(); }
  
  if ($mTime) { 
      open(OUTPUT, ">$sampleName.time");
  
  if ($mTime) { 
      open(OUTPUT, ">$sampleName.time");
-    print OUTPUT "Alignment: $time_alignment s.\n";
-    print OUTPUT "RSEM: $time_rsem s.\n";
-    print OUTPUT "CI: $time_ci s.\n";
+    print OUTPUT "Aligning reads: $time_alignment s.\n";
+    print OUTPUT "Estimating expression levels: $time_rsem s.\n";
+    print OUTPUT "Calculating credibility intervals: $time_ci s.\n";
      my $time_del = $time_end - $time_start;
      my $time_del = $time_end - $time_start;
-    print OUTPUT "Delete: $time_del s.\n";
+#    print OUTPUT "Delete: $time_del s.\n";
      close(OUTPUT);
  }
  
      close(OUTPUT);
  }
  
@@ -334,7 +348,7 @@ sub runCommand {
      if ($status != 0) { 
         my $errmsg;
         if (scalar(@_) > 1) { $errmsg = $_[1]; }
      if ($status != 0) { 
         my $errmsg;
         if (scalar(@_) > 1) { $errmsg = $_[1]; }
-       else { $errmsg = "$command failed! Plase check if you provide correct parameters/options for the pipeline!"; }
+       else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the pipeline!"; }
         print $errmsg."\n";
         exit(-1);
      }
         print $errmsg."\n";
         exit(-1);
      }
@@ -345,7 +359,7 @@ sub runCommand {
  sub collectResults {
      my $local_status;
      my ($inpF, $outF);
  sub collectResults {
      my $local_status;
      my ($inpF, $outF);
-    my (@results, @comment) = ();
+    my (@results, @ids) = ();
      my $line;
      my $cnt;
  
      my $line;
      my $cnt;
  
@@ -362,11 +376,11 @@ sub collectResults {
         ++$cnt;
         chomp($line);
         my @local_arr = split(/\t/, $line);
         ++$cnt;
         chomp($line);
         my @local_arr = split(/\t/, $line);
-       if ($cnt == 4) { @comment = @local_arr; }
+       if ($cnt == 4) { @ids = @local_arr; }
         else { push(@results, \@local_arr); }
      }
      
         else { push(@results, \@local_arr); }
      }
      
-    push(@results, \@comment);
+    push(@results, \@ids);
      close(INPUT);
  
      $local_status = open(OUTPUT, ">$outF");
      close(INPUT);
  
      $local_status = open(OUTPUT, ">$outF");
@@ -458,21 +472,25 @@ RSEM reads header information from input by default. If this option is on, heade
  
  Number of threads to use. Both Bowtie and expression estimation will use this many threads. (Default: 1)
  
  
  Number of threads to use. Both Bowtie and expression estimation will use this many threads. (Default: 1)
  
+=item B<--no-bam-output>
+
+Do not output any BAM file. (Default: off)
+
  =item B<--output-genome-bam>
  
  Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic coordinates and annotated with their posterior probabilities. In addition, RSEM will call samtools (included in RSEM package) to sort and index the bam file. 'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' will be generated. (Default: off)
  
  =item B<--sampling-for-bam>
  
  =item B<--output-genome-bam>
  
  Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic coordinates and annotated with their posterior probabilities. In addition, RSEM will call samtools (included in RSEM package) to sort and index the bam file. 'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' will be generated. (Default: off)
  
  =item B<--sampling-for-bam>
  
-When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled and outputed according to the posterior probabilities. If the sampling result is that the read comes from the "noise" transcript, nothing is outputed. (Default: off)
- 
+When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled according to the posterior probabilities. The sampling procedure includes the alignment to the "noise" transcript, which does not appear in the BAM file. Only the sampled alignment has a weight of 1. All other alignments have weight 0. If the "noise" transcript is sampled, all alignments appeared in the BAM file should have weight 0. (Default: off)
+
  =item B<--calc-ci>
  
  Calculate 95% credibility intervals and posterior mean estimates.  (Default: off)
  
  =item B<--seed-length> <int>
  
  =item B<--calc-ci>
  
  Calculate 95% credibility intervals and posterior mean estimates.  (Default: off)
  
  =item B<--seed-length> <int>
  
-Seed length used by the read aligner.  Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads.  If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. The minimum value is 25. Any read with its or at least one of its mates' (for paired-end reads) length less than 25 will be ignored. (Default: 25)
+Seed length used by the read aligner.  Providing the correct value is important for RSEM. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. Any read with its or at least one of its mates' (for paired-end reads) length less than this value will be ignored. If the references are not added poly(A) tails, the minimum allowed value is 5, otherwise, the minimum allowed value is 25. Note that this script will only check if the value >= 5 and give a warning message if the value < 25 but >= 5. (Default: 25)
  
  =item B<--tag> <string>
  
  
  =item B<--tag> <string>
  
@@ -494,6 +512,10 @@ The path to the bowtie executables. (Default: the path to the bowtie executables
  
  (Bowtie parameter) suppress all alignments for a read if > <int> valid alignments exist. (Default: 200)
  
  
  (Bowtie parameter) suppress all alignments for a read if > <int> valid alignments exist. (Default: 200)
  
+=item B<--bowtie-chunkmbs> <int>
+
+(Bowtie parameter) memory allocated for best first alignment calculation (Default: 0 - use bowtie's default)
+
  =item B<--phred33-quals>
  
  Input quality scores are encoded as Phred+33. (Default: on)
  =item B<--phred33-quals>
  
  Input quality scores are encoded as Phred+33. (Default: on)
@@ -536,12 +558,20 @@ Number of bins in the RSPD. Only relevant when '--estimate-rspd' is specified.
  
  =item B<--ci-memory> <int>
  
  
  =item B<--ci-memory> <int>
  
-Amount of memory (in MB) RSEM is allowed to use for computing credibility intervals. (Default: 1024)
+Maximum size (in memory, MB) of the auxiliary buffer used for computing credibility intervals (CI). Set it larger for a faster CI calculation. However, leaving 2 GB memory free for other usage is recommended. (Default: 1024)
  
  =item B<--keep-intermediate-files>
  
  Keep temporary files generated by RSEM.  RSEM creates a temporary directory, 'sample_name.temp', into which it puts all intermediate output files. If this directory already exists, RSEM overwrites all files generated by previous RSEM runs inside of it. By default, after RSEM finishes, the temporary directory is deleted.  Set this option to prevent the deletion of this directory and the intermediate files inside of it. (Default: off)
  
  
  =item B<--keep-intermediate-files>
  
  Keep temporary files generated by RSEM.  RSEM creates a temporary directory, 'sample_name.temp', into which it puts all intermediate output files. If this directory already exists, RSEM overwrites all files generated by previous RSEM runs inside of it. By default, after RSEM finishes, the temporary directory is deleted.  Set this option to prevent the deletion of this directory and the intermediate files inside of it. (Default: off)
  
+=item B<--temporary-folder> <string>
+
+Set where to put the temporary files generated by RSEM. If the folder specified does not exist, RSEM will try to create it. (Default: sample_name.temp)
+
+=item B<--time>
+
+Output time consumed by each step of RSEM to 'sample_name.time'. (Default: off)
+
  =item B<-q/--quiet>
  
  Suppress the output of logging information. (Default: off)
  =item B<-q/--quiet>
  
  Suppress the output of logging information. (Default: off)
@@ -556,11 +586,13 @@ Show help information.
  
  In its default mode, this program aligns input reads against a reference transcriptome with Bowtie and calculates expression values using the alignments.  RSEM assumes the data are single-end reads with quality scores, unless the '--paired-end' or '--no-qualities' options are specified.  Users may use an alternative aligner by specifying one of the --sam and --bam options, and providing an alignment file in the specified format. However, users should make sure that they align against the indices generated by 'rsem-prepare-reference' and the alignment file satisfies the requirements mentioned in ARGUMENTS section. 
  
  
  In its default mode, this program aligns input reads against a reference transcriptome with Bowtie and calculates expression values using the alignments.  RSEM assumes the data are single-end reads with quality scores, unless the '--paired-end' or '--no-qualities' options are specified.  Users may use an alternative aligner by specifying one of the --sam and --bam options, and providing an alignment file in the specified format. However, users should make sure that they align against the indices generated by 'rsem-prepare-reference' and the alignment file satisfies the requirements mentioned in ARGUMENTS section. 
  
-One simple way to make the alignment file (e.g. input.sam) satisfying RSEM's requirements (assuming the aligner used put mates in a paired-end read adjacent) is to use the following command:
+One simple way to make the alignment file satisfying RSEM's requirements (assuming the aligner used put mates in a paired-end read adjacent) is to use 'convert-sam-for-rsem' script. This script only accept SAM format files as input. If a BAM format file is obtained, please use samtools to convert it to a SAM file first. For example, if '/ref/mouse_125' is the 'reference_name' and the SAM file is named 'input.sam', you can run the following command: 
+
+  convert-sam-for-rsem /ref/mouse_125 input.sam -o input_for_rsem.sam  
  
  
-  sort -k 1,1 -s input.sam > input.sorted.sam
+For details, please refer to 'convert-sam-for-rsem's documentation page.
  
  
-The SAM/BAM format RSEM uses is v1.4. However, it is compatible with old SAM/BAM format. 
+The SAM/BAM format RSEM uses is v1.4. However, it is compatible with old SAM/BAM format. However, RSEM cannot recognize 0x100 in the FLAG field. In addition, RSEM requires SEQ and QUAL are not '*'. 
  
  The user must run 'rsem-prepare-reference' with the appropriate reference before using this program.
  
  
  The user must run 'rsem-prepare-reference' with the appropriate reference before using this program.
  
@@ -570,7 +602,7 @@ Please note that some of the default values for the Bowtie parameters are not th
  
  The temporary directory and all intermediate files will be removed when RSEM finishes unless '--keep-intermediate-files' is specified.
  
  
  The temporary directory and all intermediate files will be removed when RSEM finishes unless '--keep-intermediate-files' is specified.
  
-With the "--calc-ci" option, 95% credibility intervals and posterior mean estimates will be calculated in addition to maximum likelihood estimates.
+With the '--calc-ci' option, 95% credibility intervals and posterior mean estimates will be calculated in addition to maximum likelihood estimates.
  
  =head1 OUTPUT
  
  
  =head1 OUTPUT
  
@@ -589,30 +621,37 @@ estimation. pmc stands for posterior mean counts. ci_lower_bound(l)
  means the lower bound of the credibility intervals, ci_upper_bound(u)
  means the upper bound of the credibility intervals. So the credibility
  interval is [l, u]. 'transcript_id_list' is a space-separated list of
  means the lower bound of the credibility intervals, ci_upper_bound(u)
  means the upper bound of the credibility intervals. So the credibility
  interval is [l, u]. 'transcript_id_list' is a space-separated list of
-transcript_ids belonging to the gene.
+transcript_ids belonging to the gene. If no gene information is
+provided, this file has the same content as
+'sample_name.isoforms.results'.
  
  =item B<sample_name.isoforms.results> 
  
  File containing isoform level expression values. The format of each
  line in this file is:
  
  
  =item B<sample_name.isoforms.results> 
  
  File containing isoform level expression values. The format of each
  line in this file is:
  
-transcript_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] other_attributes
+transcript_id expected_counts tau_value [pmc_value tau_pme_value tau_ci_lower_bound tau_ci_upper_bound] gene_id
  
  
-Fields are separated by the tab character. 'other_attributes' are all
-other attributes after attribute 'transcript_id' field in the GTF
-file. If no other attributes are given or no GTF file is provided in
-'rsem-prepare-reference', there will be no tab after the
-tau_value field.
+Fields are separated by the tab character. 'gene_id' is the gene_id of
+the gene which this transcript belongs to. If no gene information is
+provided, 'gene_id' and 'transcript_id' are the same.
  
  =item B<sample_name.transcript.bam, sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai>
  
  
  =item B<sample_name.transcript.bam, sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai>
  
+Only generated when --no-bam-output is not specified.
+
  'sample_name.transcript.bam' is a BAM-formatted file of read
  alignments in transcript coordinates. The MAPQ field of each alignment
  is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), where w is the
  posterior probability of that alignment being the true mapping of a
  read.  In addition, RSEM pads a new tag ZW:f:value, where value is a
  single precision floating number representing the posterior
  'sample_name.transcript.bam' is a BAM-formatted file of read
  alignments in transcript coordinates. The MAPQ field of each alignment
  is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), where w is the
  posterior probability of that alignment being the true mapping of a
  read.  In addition, RSEM pads a new tag ZW:f:value, where value is a
  single precision floating number representing the posterior
-probability.
+probability. Because this file contains all alignment lines produced
+by bowtie or user-specified aligners, it can also be used as a
+replacement of the aligner generated BAM/SAM file. For paired-end
+reads, if one mate has alignments but the other does not, this file
+marks the alignable mate as "unmappable" (flag bit 0x4) and appends an
+optional field "Z0:A:!".
  
  'sample_name.transcript.sorted.bam' and
  'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and
  
  'sample_name.transcript.sorted.bam' and
  'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and
@@ -620,7 +659,7 @@ indices generated by samtools (included in RSEM package).
  
  =item B<sample_name.genome.bam, sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai>
  
  
  =item B<sample_name.genome.bam, sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai>
  
-Only generated when --output-genome-bam is specified.
+Only generated when --no-bam-output is not specified and --output-genome-bam is specified.
  
  'sample_name.genome.bam' is a BAM-formatted file of read alignments in
  genomic coordinates. Alignments of reads that have identical genomic
  
  'sample_name.genome.bam' is a BAM-formatted file of read alignments in
  genomic coordinates. Alignments of reads that have identical genomic
@@ -630,11 +669,19 @@ of each alignment is set to min(100, floor(-10 * log10(1.0 - w) +
  0.5)), where w is the posterior probability of that alignment being
  the true mapping of a read.  In addition, RSEM pads a new tag
  ZW:f:value, where value is a single precision floating number
  0.5)), where w is the posterior probability of that alignment being
  the true mapping of a read.  In addition, RSEM pads a new tag
  ZW:f:value, where value is a single precision floating number
-representing the posterior probability.
+representing the posterior probability. If an alignment is spliced, a
+XS:A:value tag is also added, where value is either '+' or '-'
+indicating the strand of the transcript it aligns to.
  
  'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' are the
  sorted BAM file and indices generated by samtools (included in RSEM package).
  
  
  'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' are the
  sorted BAM file and indices generated by samtools (included in RSEM package).
  
+=item B<sample_name.time>
+
+Only generated when --time is specified.
+
+It contains time (in seconds) consumed by aligning reads, estimating expression levels and calculating credibility intervals.
+
  =item B<sample_name.stat>
  
  This is a folder instead of a file. All model related statistics are stored in this folder. Use 'rsem-plot-model' can generate plots using this folder.
  =item B<sample_name.stat>
  
  This is a folder instead of a file. All model related statistics are stored in this folder. Use 'rsem-plot-model' can generate plots using this folder.
@@ -643,7 +690,7 @@ This is a folder instead of a file. All model related statistics are stored in t
  
  =head1 EXAMPLES
  
  
  =head1 EXAMPLES
  
-Assume the path to the bowtie executables is in the user's PATH environment variable. Reference files are under '/ref' with name 'mm9'. 
+Assume the path to the bowtie executables is in the user's PATH environment variable. Reference files are under '/ref' with name 'mouse_125'. 
  
  1) '/data/mmliver.fq', single-end reads with quality scores. Quality scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 threads and generate a genome BAM file:
  
  
  1) '/data/mmliver.fq', single-end reads with quality scores. Quality scores are encoded as for 'GA pipeline version >= 1.3'. We want to use 8 threads and generate a genome BAM file:
  
@@ -651,7 +698,7 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
                             -p 8 \
                             --output-genome-bam \
                             /data/mmliver.fq \
                             -p 8 \
                             --output-genome-bam \
                             /data/mmliver.fq \
-                           /ref/mm9 \
+                           /ref/mouse_125 \
                             mmliver_single_quals
  
  2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', paired-end reads with quality scores. Quality scores are in SANGER format. We want to use 8 threads and do not generate a genome BAM file:
                             mmliver_single_quals
  
  2) '/data/mmliver_1.fq' and '/data/mmliver_2.fq', paired-end reads with quality scores. Quality scores are in SANGER format. We want to use 8 threads and do not generate a genome BAM file:
@@ -660,7 +707,7 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
                             --paired-end \
                             /data/mmliver_1.fq \
                             /data/mmliver_2.fq \
                             --paired-end \
                             /data/mmliver_1.fq \
                             /data/mmliver_2.fq \
-                           /ref/mm9 \
+                           /ref/mouse_125 \
                             mmliver_paired_end_quals
  
  3) '/data/mmliver.fa', single-end reads without quality scores. We want to use 8 threads:
                             mmliver_paired_end_quals
  
  3) '/data/mmliver.fa', single-end reads without quality scores. We want to use 8 threads:
@@ -668,7 +715,7 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
   rsem-calculate-expression -p 8 \
                             --no-qualities \
                             /data/mmliver.fa \
   rsem-calculate-expression -p 8 \
                             --no-qualities \
                             /data/mmliver.fa \
-                           /ref/mm9 \
+                           /ref/mouse_125 \
                             mmliver_single_without_quals
  
  4) Data are the same as 1). We want to take a fragment length distribution into consideration. We set the fragment length mean to 150 and the standard deviation to 35. In addition to a BAM file, we also want to generate credibility intervals.  We allow RSEM to use 1GB of memory for CI calculation:
                             mmliver_single_without_quals
  
  4) Data are the same as 1). We want to take a fragment length distribution into consideration. We set the fragment length mean to 150 and the standard deviation to 35. In addition to a BAM file, we also want to generate credibility intervals.  We allow RSEM to use 1GB of memory for CI calculation:
@@ -682,7 +729,7 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
                             --calc-ci \
                             --ci-memory 1024 \
                             /data/mmliver.fq \
                             --calc-ci \
                             --ci-memory 1024 \
                             /data/mmliver.fq \
-                           /ref/mm9 \
+                           /ref/mouse_125 \
                             mmliver_single_quals
  
  5) '/data/mmliver_paired_end_quals.bam', paired-end reads with quality scores.  We want to use 8 threads:
                             mmliver_single_quals
  
  5) '/data/mmliver_paired_end_quals.bam', paired-end reads with quality scores.  We want to use 8 threads:
@@ -691,8 +738,7 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
                             --bam \
                             -p 8 \
                             /data/mmliver_paired_end_quals.bam \
                             --bam \
                             -p 8 \
                             /data/mmliver_paired_end_quals.bam \
-                           /ref/mm9 \
+                           /ref/mouse_125 \
                             mmliver_paired_end_quals
  
  =cut
                             mmliver_paired_end_quals
  
  =cut
-