From 509ffe2d71cf6a6f5cca8c39909f8ee10b8db899 Mon Sep 17 00:00:00 2001
From: Bo Li <bli@cs.wisc.edu>
Date: Tue, 5 Jun 2012 16:25:13 -0500
Subject: [PATCH] Made the BAM output of RSEM have all information contained in
 the input alignments file

---
 .gitignore                |  1 +
 BamConverter.h            | 62 ++++++++++++++++++++------------
 BamWriter.h               | 75 ++++++++++++++++++++++++---------------
 EM.cpp                    | 10 +++---
 Gibbs.cpp                 |  8 +++--
 README.md                 |  4 +--
 calcCI.cpp                |  7 ++--
 parseIt.cpp               | 10 +++---
 rsem-calculate-expression | 41 ++++++++++++---------
 9 files changed, 130 insertions(+), 88 deletions(-)

diff --git a/.gitignore b/.gitignore
index 0479fb5..02f7995 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ rsem-synthesis-reference-transcripts
 rsem-tbam2gbam
 rsem-sam-validator
 rsem-scan-for-paired-end-reads
+rsem-for-ebseq-calculate-clustering-info
 sam/samtools
 .project
 .cproject
diff --git a/BamConverter.h b/BamConverter.h
index bd81127..13b8557 100644
--- a/BamConverter.h
+++ b/BamConverter.h
@@ -85,32 +85,51 @@ void BamConverter::process() {
 		++cnt;
 		isPaired = (b->core.flag & 0x0001) > 0;
 		if (isPaired) {
-			assert(samread(in, b2) >= 0 && (b2->core.flag & 0x0001) && b->core.tid == b2->core.tid);
-			assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)); // for collapsing
-			++cnt;
+			assert(samread(in, b2) >= 0 && (b2->core.flag & 0x0001));
+			assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
+			assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080) || (b->core.flag & 0x0080) && (b2->core.flag & 0x0040));
+ 			++cnt;
 		}
 
 		if (cnt % 1000000 == 0) { printf("."); fflush(stdout); }
 
 		// at least one segment is not properly mapped
-		if ((b->core.flag & 0x0004) || (isPaired && (b2->core.flag & 0x0004))) continue;
+		bool notgood = (b->core.flag & 0x0004) || (isPaired && (b2->core.flag & 0x0004));
+		
 
-		const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
+		if (isPaired && notgood) assert((b->core.flag & 0x0004) && (b2->core.flag & 0x0004));
 
-		convert(b, transcript);
-		if (isPaired) {
-			convert(b2, transcript);
-			b->core.mpos = b2->core.pos;
-			b2->core.mpos = b->core.pos;
+
+		if (!notgood) {
+		  assert((b->core.tid == b2->core.tid) && (b->core.flag & 0x0040) && (b2->core.flag & 0x0080)); // for collapsing
+
+		  const Transcript& transcript = transcripts.getTranscriptViaEid(b->core.tid + 1);
+
+		  convert(b, transcript);
+		  if (isPaired) {
+		    convert(b2, transcript);
+		    b->core.mpos = b2->core.pos;
+		    b2->core.mpos = b->core.pos;
+		  }
+
+		  if (cqname != bam1_qname(b)) {
+		    writeCollapsedLines();
+		    cqname = bam1_qname(b);
+		    collapseMap.init(isPaired);
+		  }
+		  collapseMap.insert(b, b2, bam_aux2f(bam_aux_get(b, "ZW")));
 		}
+		else {
+		  assert(cqname != bam1_qname(b));
+
+		  writeCollapsedLines();
+		  cqname = bam1_qname(b);
+		  collapseMap.init(isPaired);
 
-		if (cqname != bam1_qname(b)) {
-			writeCollapsedLines();
-			cqname = bam1_qname(b);
-			collapseMap.init(isPaired);
+		  samwrite(out, b);
+		  if (isPaired) samwrite(out, b2);
 		}
 
-		collapseMap.insert(b, b2, bam_aux2f(bam_aux_get(b, "ZW")));
 	}
 
 	writeCollapsedLines();
@@ -173,17 +192,14 @@ inline void BamConverter::writeCollapsedLines() {
 		while (collapseMap.next(tmp_b, tmp_b2, prb)) {
 			memcpy(bam_aux_get(tmp_b, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
 			tmp_b->core.qual = getMAPQ(prb);
-			if (tmp_b->core.qual > 0) {
-				samwrite(out, tmp_b);
-				if (isPaired) {
-					memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
-					tmp_b2->core.qual = tmp_b->core.qual;
-					samwrite(out, tmp_b2);
-				}
+			samwrite(out, tmp_b);
+			if (isPaired) {
+			  memcpy(bam_aux_get(tmp_b2, "ZW") + 1, (uint8_t*)&(prb), bam_aux_type2size('f'));
+			  tmp_b2->core.qual = tmp_b->core.qual;
+			  samwrite(out, tmp_b2);
 			}
 			bam_destroy1(tmp_b);
 			if (isPaired) bam_destroy1(tmp_b2);
-
 		}
 	}
 }
diff --git a/BamWriter.h b/BamWriter.h
index a73e3da..782f5bd 100644
--- a/BamWriter.h
+++ b/BamWriter.h
@@ -85,16 +85,18 @@ void BamWriter::work(HitWrapper<SingleHit> wrapper) {
 
 	while (samread(in, b) >= 0) {
 		++cnt;
-		if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< "alignment lines are loaded!"<< std::endl; }
+		if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " alignment lines are loaded!"<< std::endl; }
 
-		if (b->core.flag & 0x0004) continue;
+		if (!(b->core.flag & 0x0004)) {
+		  hit = wrapper.getNextHit();
+		  assert(hit != NULL);
 
-		hit = wrapper.getNextHit();
-		assert(hit != NULL);
+		  assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
+		  convert(b, hit->getConPrb());
+		}
 
-		assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
-		convert(b, hit->getConPrb());
-		if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
+		//if (b->core.qual > 0) samwrite(out, b); // output only when MAPQ > 0
+		samwrite(out, b);
 	}
 
 	assert(wrapper.getNextHit() == NULL);
@@ -115,34 +117,49 @@ void BamWriter::work(HitWrapper<PairedEndHit> wrapper) {
 	while (samread(in, b) >= 0 && samread(in, b2) >= 0) {
 		cnt += 2;
 		if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< "alignment lines are loaded!"<< std::endl; }
-		//mate info is not complete, skip
-		if (!(((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)))) continue;
-		//unalignable reads, skip
-		if ((b->core.flag & 0x0004) || (b2->core.flag & 0x0004)) continue;
-
-		//swap if b is mate 2
-		if (b->core.flag & 0x0080) {
-			assert(b2->core.flag & 0x0040);
-			bam1_t *tmp = b;
-			b = b2; b2 = tmp;
-		}
 
-		hit = wrapper.getNextHit();
-		assert(hit != NULL);
+		assert((b->core.flag & 0x0001) && (b2->core.flag & 0x0001));
+		assert((b->core.flag & 0x0040) && (b2->core.flag & 0x0080) || (b->core.flag & 0x0080) && (b2->core.flag & 0x0040));
+
+		//unalignable reads, skip		
+		bool notgood = (b->core.flag & 0x0004) || (b2->core.flag & 0x0004);
 
-		assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
-		assert(transcripts.getInternalSid(b2->core.tid + 1) == hit->getSid());
+		if (!notgood) {
+		  //swap if b is mate 2
+		  if (b->core.flag & 0x0080) {
+		    assert(b2->core.flag & 0x0040);
+		    bam1_t *tmp = b;
+		    b = b2; b2 = tmp;
+		  }
 
-		convert(b, hit->getConPrb());
-		convert(b2, hit->getConPrb());
+		  hit = wrapper.getNextHit();
+		  assert(hit != NULL);
 
-		b->core.mpos = b2->core.pos;
-		b2->core.mpos = b->core.pos;
+		  assert(transcripts.getInternalSid(b->core.tid + 1) == hit->getSid());
+		  assert(transcripts.getInternalSid(b2->core.tid + 1) == hit->getSid());
 
-		if (b->core.qual > 0) {
-			samwrite(out, b);
-			samwrite(out, b2);
+		  convert(b, hit->getConPrb());
+		  convert(b2, hit->getConPrb());
+
+		  b->core.mpos = b2->core.pos;
+		  b2->core.mpos = b->core.pos;
+		}
+		else {
+		  // if only one mate can be aligned, mask it as unaligned and put an additional tag Z0:A:!
+		  char exclamation = '!';
+		  if (!(b->core.flag & 0x0004)) { 
+		    b->core.flag |= 0x0004;
+		    bam_aux_append(b, "Z0", 'A', bam_aux_type2size('A'), (uint8_t*)&exclamation);
+		  }
+		  if (!(b2->core.flag & 0x0004)) {
+		    b2->core.flag |= 0x0004;
+		    bam_aux_append(b2, "Z0", 'A', bam_aux_type2size('A'), (uint8_t*)&exclamation);
+		  }
 		}
+
+
+		samwrite(out, b);
+		samwrite(out, b2);
 	}
 
 	assert(wrapper.getNextHit() == NULL);
diff --git a/EM.cpp b/EM.cpp
index 2ce946f..0d38728 100644
--- a/EM.cpp
+++ b/EM.cpp
@@ -652,8 +652,8 @@ int main(int argc, char* argv[]) {
 	ifstream fin;
 	bool quiet = false;
 
-	if (argc < 5) {
-		printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n");
+	if (argc < 6) {
+		printf("Usage : rsem-run-em refName read_type sampleName imdName statName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out] [--sampling]\n\n");
 		printf("  refName: reference name\n");
 		printf("  read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
 		printf("  sampleName: sample's name, including the path\n");
@@ -672,8 +672,8 @@ int main(int argc, char* argv[]) {
 	strcpy(refName, argv[1]);
 	read_type = atoi(argv[2]);
 	strcpy(outName, argv[3]);
-	sprintf(imdName, "%s.temp/%s", argv[3], argv[4]);
-	sprintf(statName, "%s.stat/%s", argv[3], argv[4]);
+	strcpy(imdName, argv[4]);
+	strcpy(statName, argv[5]);
 
 	nThreads = 1;
 
@@ -682,7 +682,7 @@ int main(int argc, char* argv[]) {
 	genGibbsOut = false;
 	pt_fn_list = pt_chr_list = NULL;
 
-	for (int i = 5; i < argc; i++) {
+	for (int i = 6; i < argc; i++) {
 		if (!strcmp(argv[i], "-p")) { nThreads = atoi(argv[i + 1]); }
 		if (!strcmp(argv[i], "-b")) {
 			genBamF = true;
diff --git a/Gibbs.cpp b/Gibbs.cpp
index c1f4fe2..63d35a8 100644
--- a/Gibbs.cpp
+++ b/Gibbs.cpp
@@ -404,15 +404,17 @@ void writeEstimatedParameters(char* modelF, char* imdName) {
 
 int main(int argc, char* argv[]) {
 	if (argc < 7) {
-		printf("Usage: rsem-run-gibbs-multi reference_name sample_name sampleToken BURNIN NSAMPLES GAP [-p #Threads] [--var] [-q]\n");
+		printf("Usage: rsem-run-gibbs reference_name imdName statName BURNIN NSAMPLES GAP [-p #Threads] [--var] [-q]\n");
 		exit(-1);
 	}
 
+	strcpy(imdName, argv[2]);
+	strcpy(statName, argv[3]);
+
 	BURNIN = atoi(argv[4]);
 	NSAMPLES = atoi(argv[5]);
 	GAP = atoi(argv[6]);
-	sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
-	sprintf(statName, "%s.stat/%s", argv[2], argv[3]);
+
 	load_data(argv[1], statName, imdName);
 
 	nThreads = 1;
diff --git a/README.md b/README.md
index e85d250..ed5acc0 100644
--- a/README.md
+++ b/README.md
@@ -298,8 +298,8 @@ named 'EBSeq'.
 
 For more information about EBSeq (including the paper describing their
 method), please visit <a
-href="http://www.biostat.wisc.edu/~ningleng/EBSeq_Package">here</a>. You
-can also find a local version of vignette under
+href="http://www.biostat.wisc.edu/~ningleng/EBSeq_Package">EBSeq
+website</a>. You can also find a local version of vignette under
 'EBSeq/inst/doc/EBSeq_Vignette.pdf'.
 
 EBSeq requires gene-isoform relationship for its isoform DE
diff --git a/calcCI.cpp b/calcCI.cpp
index 89414f9..99ce148 100644
--- a/calcCI.cpp
+++ b/calcCI.cpp
@@ -397,10 +397,13 @@ void calculate_credibility_intervals(char* imdName) {
 
 int main(int argc, char* argv[]) {
 	if (argc < 8) {
-		printf("Usage: rsem-calculate-credibility-intervals reference_name sample_name sampleToken confidence nCV nSpC nMB [-p #Threads] [-q]\n");
+		printf("Usage: rsem-calculate-credibility-intervals reference_name imdName statName confidence nCV nSpC nMB [-p #Threads] [-q]\n");
 		exit(-1);
 	}
 
+	strcpy(imdName, argv[2]);
+	strcpy(statName, argv[3]);
+
 	confidence = atof(argv[4]);
 	nCV = atoi(argv[5]);
 	nSpC = atoi(argv[6]);
@@ -425,8 +428,6 @@ int main(int argc, char* argv[]) {
 	cvlen = M + 1;
 	assert(nSamples > 0 && cvlen > 1); // for Buffter.h: (bufsize_type)nSamples
 
-	sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
-	sprintf(statName, "%s.stat/%s", argv[2], argv[3]);
 	sprintf(tmpF, "%s.tmp", imdName);
 	sprintf(cvsF, "%s.countvectors", imdName);
 
diff --git a/parseIt.cpp b/parseIt.cpp
index c760c79..3373ef2 100644
--- a/parseIt.cpp
+++ b/parseIt.cpp
@@ -33,7 +33,6 @@ HIT_INT_TYPE nHits; // # of hits
 READ_INT_TYPE nUnique, nMulti, nIsoMulti;
 char fn_list[STRLEN];
 char groupF[STRLEN], tiF[STRLEN];
-char imdName[STRLEN];
 char datF[STRLEN], cntF[STRLEN];
 
 GroupInfo gi;
@@ -167,7 +166,7 @@ int main(int argc, char* argv[]) {
 	bool quiet = false;
 
 	if (argc < 6) {
-		printf("Usage : rsem-parse-alignments refName sampleName sampleToken alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
+		printf("Usage : rsem-parse-alignments refName imdName statName alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
 		exit(-1);
 	}
 
@@ -195,11 +194,10 @@ int main(int argc, char* argv[]) {
 	sprintf(tiF, "%s.ti", argv[1]);
 	transcripts.readFrom(tiF);
 
-	sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
-	sprintf(datF, "%s.dat", imdName);
-	sprintf(cntF, "%s.stat/%s.cnt", argv[2], argv[3]);
+	sprintf(datF, "%s.dat", argv[2]);
+	sprintf(cntF, "%s.cnt", argv[3]);
 
-	init(imdName, argv[4][0], argv[5]);
+	init(argv[2], argv[4][0], argv[5]);
 
 	hit_out.open(datF);
 
diff --git a/rsem-calculate-expression b/rsem-calculate-expression
index 074d895..457f177 100755
--- a/rsem-calculate-expression
+++ b/rsem-calculate-expression
@@ -62,7 +62,15 @@ my $strand_specific = 0;
 my $mTime = 0;
 my ($time_start, $time_end, $time_alignment, $time_rsem, $time_ci) = (0, 0, 0, 0, 0);
 
+my $mate1_list = "";
+my $mate2_list = "";
+my $inpF = "";
+
+my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName, $statName) = ();
+my $gap = 32;
+
 GetOptions("keep-intermediate-files" => \$keep_intermediate_files,
+	   "temporary-folder=s" => \$temp_dir,
 	   "no-qualities" => \$no_qual,
 	   "paired-end" => \$paired_end,
 	   "strand-specific" => \$strand_specific,
@@ -126,13 +134,6 @@ if ($L < 25) { print "Warning: the seed length set is less than 25! This is only
 
 if ($strand_specific) { $probF = 1.0; }
 
-my $mate1_list = "";
-my $mate2_list = "";
-my $inpF = "";
-
-my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName) = ();
-my $gap = 32;
-
 if ($paired_end) {
     if ($no_qual) { $read_type = 2; }
     else { $read_type = 3; }
@@ -167,13 +168,14 @@ my $pos = rindex($sampleName, '/');
 if ($pos < 0) { $sampleToken = $sampleName; }
 else { $sampleToken = substr($sampleName, $pos + 1); }
 
-$temp_dir = "$sampleName.temp";
+if ($temp_dir eq "") { $temp_dir = "$sampleName.temp"; }
 $stat_dir = "$sampleName.stat";
 
 if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); }
 if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); }
 
 $imdName = "$temp_dir/$sampleToken";
+$statName = "$stat_dir/$sampleToken";
 
 if (!$is_sam && !$is_bam && !$no_qual && ($phred33 + $phred64 + $solexa == 0)) { $phred33 = 1; }
 
@@ -211,7 +213,8 @@ if (!$is_sam && !$is_bam) {
 	$command .= " -1 $mate1_list -2 $mate2_list";
     }
 
-    $command .= " | gzip > $sampleName.sam.gz";
+    # pipe to samtools to generate a BAM file
+    $command .= " | $dir\sam/samtools view -S -b -o $imdName.bam -";
 
     if ($mTime) { $time_start = time(); }
 
@@ -219,13 +222,13 @@ if (!$is_sam && !$is_bam) {
 
     if ($mTime) { $time_end = time(); $time_alignment = $time_end - $time_start; }
 
-    $inpF = "$sampleName.sam.gz";
-    $is_sam = 1; # output of bowtie is a sam file
+    $inpF = "$imdName.bam";
+    $is_bam = 1; # alignments are outputed as a BAM file
 }
 
 if ($mTime) { $time_start = time(); }
 
-$command = $dir."rsem-parse-alignments $refName $sampleName $sampleToken";
+$command = $dir."rsem-parse-alignments $refName $imdName $statName";
 
 my $samInpType;
 if ($is_sam) { $samInpType = "s"; } 
@@ -258,7 +261,7 @@ print OUTPUT "$mean $sd\n";
 print OUTPUT "$L\n";
 close(OUTPUT);  
 
-$command = $dir."rsem-run-em $refName $read_type $sampleName $sampleToken -p $nThreads";
+$command = $dir."rsem-run-em $refName $read_type $sampleName $imdName $statName -p $nThreads";
 if ($genBamF) { 
     $command .= " -b $samInpType $inpF";
     if ($fn_list ne "") { $command .= " 1 $fn_list"; }
@@ -270,6 +273,9 @@ if ($quiet) { $command .= " -q"; }
 
 &runCommand($command);
 
+&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
+
 if ($genBamF) {
     $command = $dir."sam/samtools sort $sampleName.transcript.bam $sampleName.transcript.sorted";
     &runCommand($command);
@@ -286,15 +292,12 @@ if ($genBamF) {
     }
 }
 
-&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
-&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
-
 if ($mTime) { $time_end = time(); $time_rsem = $time_end - $time_start; }
 
 if ($mTime) { $time_start = time(); }
 
 if ($calcCI || $var_opt) {
-    $command = $dir."rsem-run-gibbs $refName $sampleName $sampleToken $BURNIN $NCV $SAMPLEGAP";
+    $command = $dir."rsem-run-gibbs $refName $imdName $statName $BURNIN $NCV $SAMPLEGAP";
     $command .= " -p $nThreads";
     if ($var_opt) { $command .= " --var"; }
     if ($quiet) { $command .= " -q"; }
@@ -561,6 +564,10 @@ Maximum size (in memory, MB) of the auxiliary buffer used for computing credibil
 
 Keep temporary files generated by RSEM.  RSEM creates a temporary directory, 'sample_name.temp', into which it puts all intermediate output files. If this directory already exists, RSEM overwrites all files generated by previous RSEM runs inside of it. By default, after RSEM finishes, the temporary directory is deleted.  Set this option to prevent the deletion of this directory and the intermediate files inside of it. (Default: off)
 
+=item B<--temporary-folder> <string>
+
+Set where to put the temporary files generated by RSEM. If the folder specified does not exist, RSEM will try to create it. (Default: sample_name.temp)
+
 =item B<--time>
 
 Output time consumed by each step of RSEM to 'sample_name.time'. (Default: off)
-- 
2.39.2