Added a user-friendly error message for rsem-sam-validator

[rsem.git] / rsem-calculate-expression
diff --git a/rsem-calculate-expression b/rsem-calculate-expression

index 2902239eff0513a186c2fc562d8541c99b578a62..684e33c2c3eb713536e3f1db6a82d44acda0bbe2 100755 (executable)
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -49,6 +49,7 @@ my $genBamF = 1;  # default is generating transcript bam file
  my $genGenomeBamF = 0;
  my $sampling = 0;
  my $calcCI = 0;
  my $genGenomeBamF = 0;
  my $sampling = 0;
  my $calcCI = 0;
+my $var_opt = 0; # temporarily, only for internal use
  my $quiet = 0;
  my $help = 0;
  
  my $quiet = 0;
  my $help = 0;
  
@@ -61,7 +62,15 @@ my $strand_specific = 0;
  my $mTime = 0;
  my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0);
  
  my $mTime = 0;
  my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0);
  
+my $mate1_list = "";
+my $mate2_list = "";
+my $inpF = "";
+
+my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName, $statName) = ();
+my $gap = 32;
+
  GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
  GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
+          "temporary-folder=s" => \$temp_dir,
            "no-qualities" => \$no_qual,
            "paired-end" => \$paired_end,
            "strand-specific" => \$strand_specific,
            "no-qualities" => \$no_qual,
            "paired-end" => \$paired_end,
            "strand-specific" => \$strand_specific,
@@ -86,8 +95,10 @@ GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
            "estimate-rspd" => \$estRSPD,
            "num-rspd-bins=i" => \$B,
            "p|num-threads=i" => \$nThreads,
            "estimate-rspd" => \$estRSPD,
            "num-rspd-bins=i" => \$B,
            "p|num-threads=i" => \$nThreads,
+          "no-bam-output" => sub { $genBamF = 0; },
            "output-genome-bam" => \$genGenomeBamF,
            "sampling-for-bam" => \$sampling,
            "output-genome-bam" => \$genGenomeBamF,
            "sampling-for-bam" => \$sampling,
+          "var" => \$var_opt,
            "calc-ci" => \$calcCI,
            "ci-memory=i" => \$NMB,
            "time" => \$mTime,
            "calc-ci" => \$calcCI,
            "ci-memory=i" => \$NMB,
            "time" => \$mTime,
@@ -116,19 +127,13 @@ pod2usage(-msg => "Min fragment length should be smaller or equal to max fragmen
  pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1);
  pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1);
  pod2usage(-msg => "Seed length should be at least 5!\n", -exitval => 2, -verbose => 2) if ($L < 5);
  pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1);
  pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1);
  pod2usage(-msg => "Seed length should be at least 5!\n", -exitval => 2, -verbose => 2) if ($L < 5);
-pod2usage(-msg => "--sampling-for-bam cannot be specified if --out-bam is not specified!\n", -exitval => 2, -verbose => 2) if ($sampling && !$genBamF);
+pod2usage(-msg => "--sampling-for-bam cannot be specified if --no-bam-output is specified!\n", -exitval => 2, -verbose => 2) if ($sampling && !$genBamF);
+pod2usage(-msg => "--output-genome-bam cannot be specified if --no-bam-output is specified!\n", -exitval => 2, -verbose => 2) if ($genGenomeBamF && !$genBamF);
  
  if ($L < 25) { print "Warning: the seed length set is less than 25! This is only allowed if the references are not added poly(A) tails.\n"; }
  
  if ($strand_specific) { $probF = 1.0; }
  
  
  if ($L < 25) { print "Warning: the seed length set is less than 25! This is only allowed if the references are not added poly(A) tails.\n"; }
  
  if ($strand_specific) { $probF = 1.0; }
  
-my $mate1_list = "";
-my $mate2_list = "";
-my $inpF = "";
-
-my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName) = ();
-my $gap = 32;
-
  if ($paired_end) {
      if ($no_qual) { $read_type = 2; }
      else { $read_type = 3; }
  if ($paired_end) {
      if ($no_qual) { $read_type = 2; }
      else { $read_type = 3; }
@@ -163,15 +168,16 @@ my $pos = rindex($sampleName, '/');
  if ($pos < 0) { $sampleToken = $sampleName; }
  else { $sampleToken = substr($sampleName, $pos + 1); }
  
  if ($pos < 0) { $sampleToken = $sampleName; }
  else { $sampleToken = substr($sampleName, $pos + 1); }
  
-$temp_dir = "$sampleName.temp";
+if ($temp_dir eq "") { $temp_dir = "$sampleName.temp"; }
  $stat_dir = "$sampleName.stat";
  
  if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); }
  if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); }
  
  $imdName = "$temp_dir/$sampleToken";
  $stat_dir = "$sampleName.stat";
  
  if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); }
  if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); }
  
  $imdName = "$temp_dir/$sampleToken";
+$statName = "$stat_dir/$sampleToken";
  
  
-if (!$is_sam && !$is_bam && $phred33 + $phred64 + $solexa == 0) { $phred33 = 1; }
+if (!$is_sam && !$is_bam && !$no_qual && ($phred33 + $phred64 + $solexa == 0)) { $phred33 = 1; }
  
  my ($mate_minL, $mate_maxL) = (1, $maxL);
  
  
  my ($mate_minL, $mate_maxL) = (1, $maxL);
  
@@ -182,13 +188,12 @@ my $command = "";
  
  if (!$is_sam && !$is_bam) {
      $command = $bowtie_path."bowtie";
  
  if (!$is_sam && !$is_bam) {
      $command = $bowtie_path."bowtie";
-    if ($read_type == 0 || $read_type == 2) { $command .= " -f"; }
+    if ($no_qual) { $command .= " -f"; }
      else { $command .= " -q"; }
      
      if ($phred33) { $command .= " --phred33-quals"; }
      elsif ($phred64) { $command .= " --phred64-quals"; }
      elsif ($solexa) { $command .= " --solexa-quals"; }
      else { $command .= " -q"; }
      
      if ($phred33) { $command .= " --phred33-quals"; }
      elsif ($phred64) { $command .= " --phred64-quals"; }
      elsif ($solexa) { $command .= " --solexa-quals"; }
-    else { print "Oh, no!!!"; exit(2); }
      
      $command .= " -n $C -e $E -l $L";
      if ($read_type == 2 || $read_type == 3) { $command .= " -I $minL -X $maxL"; }
      
      $command .= " -n $C -e $E -l $L";
      if ($read_type == 2 || $read_type == 3) { $command .= " -I $minL -X $maxL"; }
@@ -208,7 +213,8 @@ if (!$is_sam && !$is_bam) {
         $command .= " -1 $mate1_list -2 $mate2_list";
      }
  
         $command .= " -1 $mate1_list -2 $mate2_list";
      }
  
-    $command .= " | gzip > $sampleName.sam.gz";
+    # pipe to samtools to generate a BAM file
+    $command .= " | $dir\sam/samtools view -S -b -o $imdName.bam -";
  
      if ($mTime) { $time_start = time(); }
  
  
      if ($mTime) { $time_start = time(); }
  
@@ -216,13 +222,13 @@ if (!$is_sam && !$is_bam) {
  
      if ($mTime) { $time_end = time(); $time_alignment = $time_end - $time_start; }
  
  
      if ($mTime) { $time_end = time(); $time_alignment = $time_end - $time_start; }
  
-    $inpF = "$sampleName.sam.gz";
-    $is_sam = 1; # output of bowtie is a sam file
+    $inpF = "$imdName.bam";
+    $is_bam = 1; # alignments are outputed as a BAM file
  }
  
  if ($mTime) { $time_start = time(); }
  
  }
  
  if ($mTime) { $time_start = time(); }
  
-$command = $dir."rsem-parse-alignments $refName $sampleName $sampleToken";
+$command = $dir."rsem-parse-alignments $refName $imdName $statName";
  
  my $samInpType;
  if ($is_sam) { $samInpType = "s"; } 
  
  my $samInpType;
  if ($is_sam) { $samInpType = "s"; } 
@@ -255,18 +261,21 @@ print OUTPUT "$mean $sd\n";
  print OUTPUT "$L\n";
  close(OUTPUT);  
  
  print OUTPUT "$L\n";
  close(OUTPUT);  
  
-$command = $dir."rsem-run-em $refName $read_type $sampleName $sampleToken -p $nThreads";
+$command = $dir."rsem-run-em $refName $read_type $sampleName $imdName $statName -p $nThreads";
  if ($genBamF) { 
      $command .= " -b $samInpType $inpF";
      if ($fn_list ne "") { $command .= " 1 $fn_list"; }
      else { $command .= " 0"; }
      if ($sampling) { $command .= " --sampling"; }
  }
  if ($genBamF) { 
      $command .= " -b $samInpType $inpF";
      if ($fn_list ne "") { $command .= " 1 $fn_list"; }
      else { $command .= " 0"; }
      if ($sampling) { $command .= " --sampling"; }
  }
-if ($calcCI) { $command .= " --gibbs-out"; }
+if ($calcCI || $var_opt) { $command .= " --gibbs-out"; }
  if ($quiet) { $command .= " -q"; }
  
  &runCommand($command);
  
  if ($quiet) { $command .= " -q"; }
  
  &runCommand($command);
  
+&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
+
  if ($genBamF) {
      $command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted";
      &runCommand($command);
  if ($genBamF) {
      $command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted";
      &runCommand($command);
@@ -283,25 +292,25 @@ if ($genBamF) {
      }
  }
  
      }
  }
  
-&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
-
  if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; }
  
  if ($mTime) { $time_start = time(); }
  
  if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; }
  
  if ($mTime) { $time_start = time(); }
  
-if ($calcCI) {
-    $command = $dir."rsem-run-gibbs $refName $sampleName $sampleToken $BURNIN $NCV $SAMPLEGAP";
+if ($calcCI || $var_opt) {
+    $command = $dir."rsem-run-gibbs $refName $imdName $statName $BURNIN $NCV $SAMPLEGAP";
      $command .= " -p $nThreads";
      $command .= " -p $nThreads";
+    if ($var_opt) { $command .= " --var"; }
      if ($quiet) { $command .= " -q"; }
      &runCommand($command);
      if ($quiet) { $command .= " -q"; }
      &runCommand($command);
+}
  
  
+if ($calcCI) {
      system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
      system("mv $sampleName.genes.results $imdName.genes.results.bak1");
      &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
      &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
  
      system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
      system("mv $sampleName.genes.results $imdName.genes.results.bak1");
      &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
      &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
  
-    $command = $dir."rsem-calculate-credibility-intervals $refName $sampleName $sampleToken $CONFIDENCE $NCV $NSPC $NMB";
+    $command = $dir."rsem-calculate-credibility-intervals $refName $imdName $statName $CONFIDENCE $NCV $NSPC $NMB";
      $command .= " -p $nThreads";
      if ($quiet) { $command .= " -q"; }
      &runCommand($command);
      $command .= " -p $nThreads";
      if ($quiet) { $command .= " -q"; }
      &runCommand($command);
@@ -336,12 +345,12 @@ if ($mTime) {
  sub runCommand {
      print $_[0]."\n";
      my $status = system($_[0]);
  sub runCommand {
      print $_[0]."\n";
      my $status = system($_[0]);
-    if ($status != 0) { 
-       my $errmsg;
-       if (scalar(@_) > 1) { $errmsg = $_[1]; }
-       else { $errmsg = "\"$command\" failed! Plase check if you provide correct parameters/options for the pipeline!"; }
-       print $errmsg."\n";
-       exit(-1);
+    if ($status != 0) {
+        my $errmsg = "";
+        if (scalar(@_) > 1) { $errmsg .= $_[1]."\n"; }
+        $errmsg .= "\"$_[0]\" failed! Plase check if you provide correct parameters/options for the pipeline!\n";
+        print $errmsg;
+        exit(-1);
      }
      print "\n";
  }
      }
      print "\n";
  }
@@ -463,13 +472,17 @@ RSEM reads header information from input by default. If this option is on, heade
  
  Number of threads to use. Both Bowtie and expression estimation will use this many threads. (Default: 1)
  
  
  Number of threads to use. Both Bowtie and expression estimation will use this many threads. (Default: 1)
  
+=item B<--no-bam-output>
+
+Do not output any BAM file. (Default: off)
+
  =item B<--output-genome-bam>
  
  Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic coordinates and annotated with their posterior probabilities. In addition, RSEM will call samtools (included in RSEM package) to sort and index the bam file. 'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' will be generated. (Default: off)
  
  =item B<--sampling-for-bam>
  
  =item B<--output-genome-bam>
  
  Generate a BAM file, 'sample_name.genome.bam', with alignments mapped to genomic coordinates and annotated with their posterior probabilities. In addition, RSEM will call samtools (included in RSEM package) to sort and index the bam file. 'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' will be generated. (Default: off)
  
  =item B<--sampling-for-bam>
  
-When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled and outputed according to the posterior probabilities. If the sampling result is that the read comes from the "noise" transcript, nothing is outputed. (Default: off)
+When RSEM generates a BAM file, instead of outputing all alignments a read has with their posterior probabilities, one alignment is sampled according to the posterior probabilities. The sampling procedure includes the alignment to the "noise" transcript, which does not appear in the BAM file. Only the sampled alignment has a weight of 1. All other alignments have weight 0. If the "noise" transcript is sampled, all alignments appeared in the BAM file should have weight 0. (Default: off)
  
  =item B<--calc-ci>
  
  
  =item B<--calc-ci>
  
@@ -545,12 +558,16 @@ Number of bins in the RSPD. Only relevant when '--estimate-rspd' is specified.
  
  =item B<--ci-memory> <int>
  
  
  =item B<--ci-memory> <int>
  
-Amount of memory (in MB) RSEM is allowed to use for computing credibility intervals. (Default: 1024)
+Maximum size (in memory, MB) of the auxiliary buffer used for computing credibility intervals (CI). Set it larger for a faster CI calculation. However, leaving 2 GB memory free for other usage is recommended. (Default: 1024)
  
  =item B<--keep-intermediate-files>
  
  Keep temporary files generated by RSEM.  RSEM creates a temporary directory, 'sample_name.temp', into which it puts all intermediate output files. If this directory already exists, RSEM overwrites all files generated by previous RSEM runs inside of it. By default, after RSEM finishes, the temporary directory is deleted.  Set this option to prevent the deletion of this directory and the intermediate files inside of it. (Default: off)
  
  
  =item B<--keep-intermediate-files>
  
  Keep temporary files generated by RSEM.  RSEM creates a temporary directory, 'sample_name.temp', into which it puts all intermediate output files. If this directory already exists, RSEM overwrites all files generated by previous RSEM runs inside of it. By default, after RSEM finishes, the temporary directory is deleted.  Set this option to prevent the deletion of this directory and the intermediate files inside of it. (Default: off)
  
+=item B<--temporary-folder> <string>
+
+Set where to put the temporary files generated by RSEM. If the folder specified does not exist, RSEM will try to create it. (Default: sample_name.temp)
+
  =item B<--time>
  
  Output time consumed by each step of RSEM to 'sample_name.time'. (Default: off)
  =item B<--time>
  
  Output time consumed by each step of RSEM to 'sample_name.time'. (Default: off)
@@ -621,13 +638,20 @@ provided, 'gene_id' and 'transcript_id' are the same.
  
  =item B<sample_name.transcript.bam, sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai>
  
  
  =item B<sample_name.transcript.bam, sample_name.transcript.sorted.bam and sample_name.transcript.sorted.bam.bai>
  
+Only generated when --no-bam-output is not specified.
+
  'sample_name.transcript.bam' is a BAM-formatted file of read
  alignments in transcript coordinates. The MAPQ field of each alignment
  is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), where w is the
  posterior probability of that alignment being the true mapping of a
  read.  In addition, RSEM pads a new tag ZW:f:value, where value is a
  single precision floating number representing the posterior
  'sample_name.transcript.bam' is a BAM-formatted file of read
  alignments in transcript coordinates. The MAPQ field of each alignment
  is set to min(100, floor(-10 * log10(1.0 - w) + 0.5)), where w is the
  posterior probability of that alignment being the true mapping of a
  read.  In addition, RSEM pads a new tag ZW:f:value, where value is a
  single precision floating number representing the posterior
-probability.
+probability. Because this file contains all alignment lines produced
+by bowtie or user-specified aligners, it can also be used as a
+replacement of the aligner generated BAM/SAM file. For paired-end
+reads, if one mate has alignments but the other does not, this file
+marks the alignable mate as "unmappable" (flag bit 0x4) and appends an
+optional field "Z0:A:!".
  
  'sample_name.transcript.sorted.bam' and
  'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and
  
  'sample_name.transcript.sorted.bam' and
  'sample_name.transcript.sorted.bam.bai' are the sorted BAM file and
@@ -635,7 +659,7 @@ indices generated by samtools (included in RSEM package).
  
  =item B<sample_name.genome.bam, sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai>
  
  
  =item B<sample_name.genome.bam, sample_name.genome.sorted.bam and sample_name.genome.sorted.bam.bai>
  
-Only generated when --output-genome-bam is specified.
+Only generated when --no-bam-output is not specified and --output-genome-bam is specified.
  
  'sample_name.genome.bam' is a BAM-formatted file of read alignments in
  genomic coordinates. Alignments of reads that have identical genomic
  
  'sample_name.genome.bam' is a BAM-formatted file of read alignments in
  genomic coordinates. Alignments of reads that have identical genomic
@@ -652,12 +676,6 @@ indicating the strand of the transcript it aligns to.
  'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' are the
  sorted BAM file and indices generated by samtools (included in RSEM package).
  
  'sample_name.genome.sorted.bam' and 'sample_name.genome.sorted.bam.bai' are the
  sorted BAM file and indices generated by samtools (included in RSEM package).
  
-=item B<sample_name.sam.gz>
-
-Only generated when the input files are raw reads instead of SAM/BAM format files
-
-It is the gzipped SAM output produced by bowtie aligner.
-
  =item B<sample_name.time>
  
  Only generated when --time is specified.
  =item B<sample_name.time>
  
  Only generated when --time is specified.