Made the BAM output of RSEM have all information contained in the input alignments...

author Bo Li <bli@cs.wisc.edu>

Tue, 5 Jun 2012 21:25:13 +0000 (16:25 -0500)

committer Bo Li <bli@cs.wisc.edu>

Tue, 5 Jun 2012 21:25:13 +0000 (16:25 -0500)
author Bo Li <bli@cs.wisc.edu>
Tue, 5 Jun 2012 21:25:13 +0000 (16:25 -0500)
committer Bo Li <bli@cs.wisc.edu>
Tue, 5 Jun 2012 21:25:13 +0000 (16:25 -0500)
diff --git a/.gitignore b/.gitignore

index 0479fb5530a93270cb0a15b970263d34ed0cb10c..02f7995ec1454489d0eb45dffbc1fda69ede78fe 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ rsem-synthesis-reference-transcripts
  rsem-tbam2gbam
  rsem-sam-validator
  rsem-scan-for-paired-end-reads
+rsem-for-ebseq-calculate-clustering-info
  sam/samtools
  .project
  .cproject
diff --git a/BamConverter.h b/BamConverter.h

index bd81127d5fcb4728ad4e43efd9a0d9f723e4f78f..13b8557f5bc727916ba31857cfdd39ac839d39d0 100644 (file)
--- a/BamConverter.h
+++ b/BamConverter.h
@@ -85,32 +85,51 @@ void BamConverter::process() {
                 ++cnt;
                 isPaired = (b->core.flag & 0x0001) > 0;
                 if (isPaired) {
-                       assert(samread(in, b2) >= 0 && (b2->core.flag & 0x0001) && b->core.tid == b2->core.tid);
-                       assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)); // for collapsing
-                       ++cnt;
+                       assert(samread(in, b2) >= 0 && (b2->core.flag & 0x0001));
+                       assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
+                       assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080) || (b->core.flag & 0x0080) && (b2->core.flag & 0x0040));
+                       ++cnt;
                 }
  
                 if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
  
                 // at least one segment is not properly mapped
-               if ((b->core.flag & 0x0004) || (isPaired && (b2->core.flag & 0x0004))) continue;
+               bool notgood = (b->core.flag & 0x0004) || (isPaired && (b2->core.flag & 0x0004));
+               
  
-               const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
+               if (isPaired && notgood) assert((b->core.flag & 0x0004) && (b2->core.flag & 0x0004));
  
-               convert(b, transcript);
-               if (isPaired) {
-                       convert(b2, transcript);
-                       b->core.mpos = b2->core.pos;
-                       b2->core.mpos = b->core.pos;
+
+               if (!notgood) {
+                 assert((b->core.tid == b2->core.tid) && (b->core.flag & 0x0040) && (b2->core.flag & 0x0080)); // for collapsing
+
+                 const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
+
+                 convert(b, transcript);
+                 if (isPaired) {
+                   convert(b2, transcript);
+                   b->core.mpos = b2->core.pos;
+                   b2->core.mpos = b->core.pos;
+                 }
+
+                 if (cqname != bam1_qname(b)) {
+                   writeCollapsedLines();
+                   cqname = bam1_qname(b);
+                   collapseMap.init(isPaired);
+                 }
+                 collapseMap.insert(b, b2, bam_aux2f(bam_aux_get(b, "ZW")));
                 }
+               else {
+                 assert(cqname != bam1_qname(b));
+
+                 writeCollapsedLines();
+                 cqname = bam1_qname(b);
+                 collapseMap.init(isPaired);
  
-               if (cqname != bam1_qname(b)) {
-                       writeCollapsedLines();
-                       cqname = bam1_qname(b);
-                       collapseMap.init(isPaired);
+                 samwrite(out, b);
+                 if (isPaired) samwrite(out, b2);
                 }
  
-               collapseMap.insert(b, b2, bam_aux2f(bam_aux_get(b, "ZW")));
         }
  
         writeCollapsedLines();
@@ -173,17 +192,14 @@ inline void BamConverter::writeCollapsedLines() {
                 while (collapseMap.next(tmp_b, tmp_b2, prb)) {
                         memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
                         tmp_b->core.qual = getMAPQ(prb);
-                       if (tmp_b->core.qual > 0) {
-                               samwrite(out, tmp_b);
-                               if (isPaired) {
-                                       memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
-                                       tmp_b2->core.qual = tmp_b->core.qual;
-                                       samwrite(out, tmp_b2);
-                               }
+                       samwrite(out, tmp_b);
+                       if (isPaired) {
+                         memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
+                         tmp_b2->core.qual = tmp_b->core.qual;
+                         samwrite(out, tmp_b2);
                         }
                         bam_destroy1(tmp_b);
                         if (isPaired) bam_destroy1(tmp_b2);
-
                 }
         }
  }
diff --git a/BamWriter.h b/BamWriter.h

index a73e3da4dc180668b2ffac4845172ed061bf11c0..782f5bdeb6705748a8919265e854b54d1f418133 100644 (file)
--- a/BamWriter.h
+++ b/BamWriter.h
@@ -85,16 +85,18 @@ void BamWriter::work(HitWrapper<SingleHit> wrapper) {
  
         while (samread(in, b) >= 0) {
                 ++cnt;
-               if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< "alignment lines are loaded!"<< std::endl; }
+               if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " alignment lines are loaded!"<< std::endl; }
  
-               if (b->core.flag & 0x0004) continue;
+               if (!(b->core.flag & 0x0004)) {
+                 hit = wrapper.getNextHit();
+                 assert(hit != NULL);
  
-               hit = wrapper.getNextHit();
-               assert(hit != NULL);
+                 assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
+                 convert(b, hit->getConPrb());
+               }
  
-               assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
-               convert(b, hit->getConPrb());
-               if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
+               //if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
+               samwrite(out, b);
         }
  
         assert(wrapper.getNextHit() == NULL);
@@ -115,34 +117,49 @@ void BamWriter::work(HitWrapper<PairedEndHit> wrapper) {
         while (samread(in, b) >= 0 && samread(in, b2) >= 0) {
                 cnt += 2;
                 if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< "alignment lines are loaded!"<< std::endl; }
-               //mate info is not complete, skip
-               if (!(((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)))) continue;
-               //unalignable reads, skip
-               if ((b->core.flag & 0x0004) || (b2->core.flag & 0x0004)) continue;
-
-               //swap if b is mate 2
-               if (b->core.flag & 0x0080) {
-                       assert(b2->core.flag & 0x0040);
-                       bam1_t *tmp = b;
-                       b = b2; b2 = tmp;
-               }
  
-               hit = wrapper.getNextHit();
-               assert(hit != NULL);
+               assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
+               assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080) || (b->core.flag & 0x0080) && (b2->core.flag & 0x0040));
+
+               //unalignable reads, skip               
+               bool notgood = (b->core.flag & 0x0004) || (b2->core.flag & 0x0004);
  
-               assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
-               assert(transcripts.getInternalSid(b2->core.tid + 1) == hit->getSid());
+               if (!notgood) {
+                 //swap if b is mate 2
+                 if (b->core.flag & 0x0080) {
+                   assert(b2->core.flag & 0x0040);
+                   bam1_t *tmp = b;
+                   b = b2; b2 = tmp;
+                 }
  
-               convert(b, hit->getConPrb());
-               convert(b2, hit->getConPrb());
+                 hit = wrapper.getNextHit();
+                 assert(hit != NULL);
  
-               b->core.mpos = b2->core.pos;
-               b2->core.mpos = b->core.pos;
+                 assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
+                 assert(transcripts.getInternalSid(b2->core.tid + 1) == hit->getSid());
  
-               if (b->core.qual > 0) {
-                       samwrite(out, b);
-                       samwrite(out, b2);
+                 convert(b, hit->getConPrb());
+                 convert(b2, hit->getConPrb());
+
+                 b->core.mpos = b2->core.pos;
+                 b2->core.mpos = b->core.pos;
+               }
+               else {
+                 // if only one mate can be aligned, mask it as unaligned and put an additional tag Z0:A:!
+                 char exclamation = '!';
+                 if (!(b->core.flag & 0x0004)) { 
+                   b->core.flag |= 0x0004;
+                   bam_aux_append(b, "Z0", 'A', bam_aux_type2size('A'), (uint8_t*)&exclamation);
+                 }
+                 if (!(b2->core.flag & 0x0004)) {
+                   b2->core.flag |= 0x0004;
+                   bam_aux_append(b2, "Z0", 'A', bam_aux_type2size('A'), (uint8_t*)&exclamation);
+                 }
                 }
+
+
+               samwrite(out, b);
+               samwrite(out, b2);
         }
  
         assert(wrapper.getNextHit() == NULL);
diff --git a/EM.cpp b/EM.cpp

index 2ce946fd6e97b3e7deb0b33b7ab42b143efe3046..0d3872836784bcf45c9def4ae8ea09db6c4a7ef7 100644 (file)
--- a/EM.cpp
+++ b/EM.cpp
@@ -652,8 +652,8 @@ int main(int argc, char* argv[]) {
         ifstream fin;
         bool quiet = false;
  
-       if (argc < 5) {
-               printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n");
+       if (argc < 6) {
+               printf("Usage : rsem-run-em refName read_type sampleName imdName statName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n");
                 printf("  refName: reference name\n");
                 printf("  read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
                 printf("  sampleName: sample's name, including the path\n");
@@ -672,8 +672,8 @@ int main(int argc, char* argv[]) {
         strcpy(refName, argv[1]);
         read_type = atoi(argv[2]);
         strcpy(outName, argv[3]);
-       sprintf(imdName, "%s.temp/%s", argv[3], argv[4]);
-       sprintf(statName, "%s.stat/%s", argv[3], argv[4]);
+       strcpy(imdName, argv[4]);
+       strcpy(statName, argv[5]);
  
         nThreads = 1;
  
@@ -682,7 +682,7 @@ int main(int argc, char* argv[]) {
         genGibbsOut = false;
         pt_fn_list = pt_chr_list = NULL;
  
-       for (int i = 5; i < argc; i++) {
+       for (int i = 6; i < argc; i++) {
                 if (!strcmp(argv[i], "-p")) { nThreads = atoi(argv[i + 1]); }
                 if (!strcmp(argv[i], "-b")) {
                         genBamF = true;
diff --git a/Gibbs.cpp b/Gibbs.cpp

index c1f4fe276b2100ca6e66a151303cb062e0ec0d0e..63d35a829c059e12ae79bd8f3246fcec5b50438b 100644 (file)
--- a/Gibbs.cpp
+++ b/Gibbs.cpp
@@ -404,15 +404,17 @@ void writeEstimatedParameters(char* modelF, char* imdName) {
  
  int main(int argc, char* argv[]) {
         if (argc < 7) {
-               printf("Usage: rsem-run-gibbs-multi reference_name sample_name sampleToken BURNIN NSAMPLES GAP [-p #Threads] [--var] [-q]\n");
+               printf("Usage: rsem-run-gibbs reference_name imdName statName BURNIN NSAMPLES GAP [-p #Threads] [--var] [-q]\n");
                 exit(-1);
         }
  
+       strcpy(imdName, argv[2]);
+       strcpy(statName, argv[3]);
+
         BURNIN = atoi(argv[4]);
         NSAMPLES = atoi(argv[5]);
         GAP = atoi(argv[6]);
-       sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
-       sprintf(statName, "%s.stat/%s", argv[2], argv[3]);
+
         load_data(argv[1], statName, imdName);
  
         nThreads = 1;
diff --git a/README.md b/README.md

index e85d25001d634b91efb2b9c696474865d5c3ee7d..ed5acc050230fd50a1803e31d24790a8919ce8fd 100644 (file)
--- a/README.md
+++ b/README.md
@@ -298,8 +298,8 @@ named 'EBSeq'.
  
  For more information about EBSeq (including the paper describing their
  method), please visit <a
-href="http://www.biostat.wisc.edu/~ningleng/EBSeq_Package">here</a>. You
-can also find a local version of vignette under
+href="http://www.biostat.wisc.edu/~ningleng/EBSeq_Package">EBSeq
+website</a>. You can also find a local version of vignette under
  'EBSeq/inst/doc/EBSeq_Vignette.pdf'.
  
  EBSeq requires gene-isoform relationship for its isoform DE
diff --git a/calcCI.cpp b/calcCI.cpp

index 89414f9b02f00ba641b8f987845aea8e977ef1b1..99ce14858825c989216dbd6ab309562218a8ae62 100644 (file)
--- a/calcCI.cpp
+++ b/calcCI.cpp
@@ -397,10 +397,13 @@ void calculate_credibility_intervals(char* imdName) {
  
  int main(int argc, char* argv[]) {
         if (argc < 8) {
-               printf("Usage: rsem-calculate-credibility-intervals reference_name sample_name sampleToken confidence nCV nSpC nMB [-p #Threads] [-q]\n");
+               printf("Usage: rsem-calculate-credibility-intervals reference_name imdName statName confidence nCV nSpC nMB [-p #Threads] [-q]\n");
                 exit(-1);
         }
  
+       strcpy(imdName, argv[2]);
+       strcpy(statName, argv[3]);
+
         confidence = atof(argv[4]);
         nCV = atoi(argv[5]);
         nSpC = atoi(argv[6]);
@@ -425,8 +428,6 @@ int main(int argc, char* argv[]) {
         cvlen = M + 1;
         assert(nSamples > 0 && cvlen > 1); // for Buffter.h: (bufsize_type)nSamples
  
-       sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
-       sprintf(statName, "%s.stat/%s", argv[2], argv[3]);
         sprintf(tmpF, "%s.tmp", imdName);
         sprintf(cvsF, "%s.countvectors", imdName);
  
diff --git a/parseIt.cpp b/parseIt.cpp

index c760c79f879775450e4677951d3ee25774a7ffcc..3373ef280979428230b74d77439d61bff3a96d47 100644 (file)
--- a/parseIt.cpp
+++ b/parseIt.cpp
@@ -33,7 +33,6 @@ HIT_INT_TYPE nHits; // # of hits
  READ_INT_TYPE nUnique, nMulti, nIsoMulti;
  char fn_list[STRLEN];
  char groupF[STRLEN], tiF[STRLEN];
-char imdName[STRLEN];
  char datF[STRLEN], cntF[STRLEN];
  
  GroupInfo gi;
@@ -167,7 +166,7 @@ int main(int argc, char* argv[]) {
         bool quiet = false;
  
         if (argc < 6) {
-               printf("Usage : rsem-parse-alignments refName sampleName sampleToken alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
+               printf("Usage : rsem-parse-alignments refName imdName statName alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
                 exit(-1);
         }
  
@@ -195,11 +194,10 @@ int main(int argc, char* argv[]) {
         sprintf(tiF, "%s.ti", argv[1]);
         transcripts.readFrom(tiF);
  
-       sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
-       sprintf(datF, "%s.dat", imdName);
-       sprintf(cntF, "%s.stat/%s.cnt", argv[2], argv[3]);
+       sprintf(datF, "%s.dat", argv[2]);
+       sprintf(cntF, "%s.cnt", argv[3]);
  
-       init(imdName, argv[4][0], argv[5]);
+       init(argv[2], argv[4][0], argv[5]);
  
         hit_out.open(datF);
  
diff --git a/rsem-calculate-expression b/rsem-calculate-expression

index 074d895ecf173cad4d5b0866c42b463655365747..457f1771e5314c2d29b46a3ab16a2a9e3622e49e 100755 (executable)
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -62,7 +62,15 @@ my $strand_specific = 0;
  my $mTime = 0;
  my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0);
  
+my $mate1_list = "";
+my $mate2_list = "";
+my $inpF = "";
+
+my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName, $statName) = ();
+my $gap = 32;
+
  GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
+          "temporary-folder=s" => \$temp_dir,
            "no-qualities" => \$no_qual,
            "paired-end" => \$paired_end,
            "strand-specific" => \$strand_specific,
@@ -126,13 +134,6 @@ if ($L < 25) { print "Warning: the seed length set is less than 25! This is only
  
  if ($strand_specific) { $probF = 1.0; }
  
-my $mate1_list = "";
-my $mate2_list = "";
-my $inpF = "";
-
-my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName) = ();
-my $gap = 32;
-
  if ($paired_end) {
      if ($no_qual) { $read_type = 2; }
      else { $read_type = 3; }
@@ -167,13 +168,14 @@ my $pos = rindex($sampleName, '/');
  if ($pos < 0) { $sampleToken = $sampleName; }
  else { $sampleToken = substr($sampleName, $pos + 1); }
  
-$temp_dir = "$sampleName.temp";
+if ($temp_dir eq "") { $temp_dir = "$sampleName.temp"; }
  $stat_dir = "$sampleName.stat";
  
  if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); }
  if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); }
  
  $imdName = "$temp_dir/$sampleToken";
+$statName = "$stat_dir/$sampleToken";
  
  if (!$is_sam && !$is_bam && !$no_qual && ($phred33 + $phred64 + $solexa == 0)) { $phred33 = 1; }
  
@@ -211,7 +213,8 @@ if (!$is_sam && !$is_bam) {
         $command .= " -1 $mate1_list -2 $mate2_list";
      }
  
-    $command .= " | gzip > $sampleName.sam.gz";
+    # pipe to samtools to generate a BAM file
+    $command .= " | $dir\sam/samtools view -S -b -o $imdName.bam -";
  
      if ($mTime) { $time_start = time(); }
  
@@ -219,13 +222,13 @@ if (!$is_sam && !$is_bam) {
  
      if ($mTime) { $time_end = time(); $time_alignment = $time_end - $time_start; }
  
-    $inpF = "$sampleName.sam.gz";
-    $is_sam = 1; # output of bowtie is a sam file
+    $inpF = "$imdName.bam";
+    $is_bam = 1; # alignments are outputed as a BAM file
  }
  
  if ($mTime) { $time_start = time(); }
  
-$command = $dir."rsem-parse-alignments $refName $sampleName $sampleToken";
+$command = $dir."rsem-parse-alignments $refName $imdName $statName";
  
  my $samInpType;
  if ($is_sam) { $samInpType = "s"; } 
@@ -258,7 +261,7 @@ print OUTPUT "$mean $sd\n";
  print OUTPUT "$L\n";
  close(OUTPUT);  
  
-$command = $dir."rsem-run-em $refName $read_type $sampleName $sampleToken -p $nThreads";
+$command = $dir."rsem-run-em $refName $read_type $sampleName $imdName $statName -p $nThreads";
  if ($genBamF) { 
      $command .= " -b $samInpType $inpF";
      if ($fn_list ne "") { $command .= " 1 $fn_list"; }
@@ -270,6 +273,9 @@ if ($quiet) { $command .= " -q"; }
  
  &runCommand($command);
  
+&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
+
  if ($genBamF) {
      $command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted";
      &runCommand($command);
@@ -286,15 +292,12 @@ if ($genBamF) {
      }
  }
  
-&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
-
  if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; }
  
  if ($mTime) { $time_start = time(); }
  
  if ($calcCI || $var_opt) {
-    $command = $dir."rsem-run-gibbs $refName $sampleName $sampleToken $BURNIN $NCV $SAMPLEGAP";
+    $command = $dir."rsem-run-gibbs $refName $imdName $statName $BURNIN $NCV $SAMPLEGAP";
      $command .= " -p $nThreads";
      if ($var_opt) { $command .= " --var"; }
      if ($quiet) { $command .= " -q"; }
@@ -561,6 +564,10 @@ Maximum size (in memory, MB) of the auxiliary buffer used for computing credibil
  
  Keep temporary files generated by RSEM.  RSEM creates a temporary directory, 'sample_name.temp', into which it puts all intermediate output files. If this directory already exists, RSEM overwrites all files generated by previous RSEM runs inside of it. By default, after RSEM finishes, the temporary directory is deleted.  Set this option to prevent the deletion of this directory and the intermediate files inside of it. (Default: off)
  
+=item B<--temporary-folder> <string>
+
+Set where to put the temporary files generated by RSEM. If the folder specified does not exist, RSEM will try to create it. (Default: sample_name.temp)
+
  =item B<--time>
  
  Output time consumed by each step of RSEM to 'sample_name.time'. (Default: off)
author	Bo Li <bli@cs.wisc.edu>
	Tue, 5 Jun 2012 21:25:13 +0000 (16:25 -0500)
committer	Bo Li <bli@cs.wisc.edu>
	Tue, 5 Jun 2012 21:25:13 +0000 (16:25 -0500)
.gitignore		patch \| blob \| history
BamConverter.h		patch \| blob \| history
BamWriter.h		patch \| blob \| history
EM.cpp		patch \| blob \| history
Gibbs.cpp		patch \| blob \| history
README.md		patch \| blob \| history
calcCI.cpp		patch \| blob \| history
parseIt.cpp		patch \| blob \| history
rsem-calculate-expression		patch \| blob \| history