From: Bo Li <bli@cs.wisc.edu>
Date: Sun, 11 Mar 2012 04:35:10 +0000 (-0600)
Subject: Add a validator for input SAM/BAM file
X-Git-Url: https://git.donarmstrong.com/?a=commitdiff_plain;h=fb2aa1ca9d00710943155ef3abbbdd87df116e4a;p=rsem.git

Add a validator for input SAM/BAM file
---

diff --git a/.gitignore b/.gitignore
index 5c72f2e..0f2be17 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 *.o
 *.a
+*~
 rsem-bam2readdepth
 rsem-bam2wig
 rsem-build-read-index
@@ -13,4 +14,7 @@ rsem-run-gibbs
 rsem-simulate-reads
 rsem-synthesis-reference-transcripts
 rsem-tbam2gbam
+rsem-sam-validator
 sam/samtools
+.project
+.cproject
diff --git a/BamWriter.h b/BamWriter.h
index e014f8d..12ab087 100644
--- a/BamWriter.h
+++ b/BamWriter.h
@@ -113,6 +113,9 @@ void BamWriter::work(HitWrapper<PairedEndHit> wrapper) {
 		cnt += 2;
 		if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); }
 
+		//mate info is not complete, skip
+		if (!(((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)))) continue;
+		//unalignable reads, skip
 		if ((b->core.flag & 0x0004) || (b2->core.flag & 0x0004)) continue;
 
 		//swap if b is mate 2
diff --git a/EM.cpp b/EM.cpp
index dfdd0e4..f884979 100644
--- a/EM.cpp
+++ b/EM.cpp
@@ -503,7 +503,7 @@ void EM() {
 
 		if (verbose) printf("ROUND = %d, SUM = %.15g, bChange = %f, totNum = %d\n", ROUND, sum, bChange, totNum);
 	} while (ROUND < MIN_ROUND || (totNum > 0 && ROUND < MAX_ROUND));
-	  //while (ROUND < MAX_ROUND);
+//	} while (ROUND < 1);
 
 	if (totNum > 0) fprintf(stderr, "Warning: RSEM reaches %d iterations before meeting the convergence criteria.\n", MAX_ROUND);
 
diff --git a/SamParser.h b/SamParser.h
index 9d62eff..513536f 100644
--- a/SamParser.h
+++ b/SamParser.h
@@ -178,13 +178,17 @@ int SamParser::parseNext(PairedEndRead& read, PairedEndHit& hit) {
 
 	bam1_t *mp1 = NULL, *mp2 = NULL;
 
-	if ((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) {
+	// If lose mate info, discard. is it necessary?
+	if (!((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) && !((b->core.flag & 0x0080) && (b2->core.flag & 0x0040))) return 4;
+	// If only one mate is mapped, discard
+	if (((b->core.flag & 0x0004) && !(b2->core.flag & 0x0004)) || (!(b->core.flag & 0x0004) && (b2->core.flag & 0x0004))) return 4;
+
+	if (b->core.flag & 0x0040) {
 		mp1 = b; mp2 = b2;
 	}
-	else if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) {
+	else  {
 		mp1 = b2; mp2 = b;
 	}
-	else return 4; // If lose mate info, discard. is it necessary?
 
 	int readType = getReadType(mp1, mp2);
 	std::string name = getName(mp1);
@@ -226,13 +230,17 @@ int SamParser::parseNext(PairedEndReadQ& read, PairedEndHit& hit) {
 
 	bam1_t *mp1 = NULL, *mp2 = NULL;
 
-	if ((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) {
+	// If lose mate info, discard. is it necessary?
+	if (!((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) && !((b->core.flag & 0x0080) && (b2->core.flag & 0x0040))) return 4;
+	// If only one mate is mapped, discard
+	if (((b->core.flag & 0x0004) && !(b2->core.flag & 0x0004)) || (!(b->core.flag & 0x0004) && (b2->core.flag & 0x0004))) return 4;
+
+	if (b->core.flag & 0x0040) {
 		mp1 = b; mp2 = b2;
 	}
-	else if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) {
+	else  {
 		mp1 = b2; mp2 = b;
 	}
-	else return 4;
 
 	int readType = getReadType(mp1, mp2);
 	std::string name = getName(mp1);
diff --git a/makefile b/makefile
index a6d863c..44d7369 100644
--- a/makefile
+++ b/makefile
@@ -1,7 +1,7 @@
 CC = g++
 CFLAGS = -Wall -c -I.
 COFLAGS = -Wall -O3 -ffast-math -c -I.
-PROGRAMS = rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-preref rsem-parse-alignments rsem-build-read-index rsem-run-em rsem-tbam2gbam rsem-run-gibbs rsem-calculate-credibility-intervals rsem-simulate-reads rsem-bam2wig rsem-get-unique rsem-bam2readdepth
+PROGRAMS = rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-preref rsem-parse-alignments rsem-build-read-index rsem-run-em rsem-tbam2gbam rsem-run-gibbs rsem-calculate-credibility-intervals rsem-simulate-reads rsem-bam2wig rsem-get-unique rsem-bam2readdepth rsem-sam-validator
 
 
 all : $(PROGRAMS)
@@ -65,13 +65,13 @@ simul.h : boost/random.hpp
 
 ReadReader.h : SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h ReadIndex.h
 
-SingleModel.h : utils.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h SingleHit.h ReadReader.h simul.h
+SingleModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h SingleHit.h ReadReader.h simul.h
 
-SingleQModel.h : utils.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h Refs.h SingleReadQ.h SingleHit.h ReadReader.h simul.h
+SingleQModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h Refs.h SingleReadQ.h SingleHit.h ReadReader.h simul.h
 
-PairedEndModel.h : utils.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h PairedEndRead.h PairedEndHit.h ReadReader.h simul.h 
+PairedEndModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h PairedEndRead.h PairedEndHit.h ReadReader.h simul.h 
 
-PairedEndQModel.h : utils.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h Refs.h SingleReadQ.h PairedEndReadQ.h PairedEndHit.h ReadReader.h simul.h
+PairedEndQModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h Refs.h SingleReadQ.h PairedEndReadQ.h PairedEndHit.h ReadReader.h simul.h
 
 HitWrapper.h : HitContainer.h
 
@@ -130,6 +130,9 @@ calcCI.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h
 rsem-get-unique : sam/bam.h sam/sam.h getUnique.cpp sam/libbam.a
 	$(CC) -O3 -Wall getUnique.cpp sam/libbam.a -lz -o $@
 
+rsem-sam-validator : sam/bam.h sam/sam.h my_assert.h samValidator.cpp sam/libbam.a
+	$(CC) -O3 -Wall samValidator.cpp sam/libbam.a -lz -o $@
+	
 clean:
 	rm -f *.o *~ $(PROGRAMS)
 	cd sam ; ${MAKE} clean
diff --git a/samValidator.cpp b/samValidator.cpp
new file mode 100644
index 0000000..06e91d4
--- /dev/null
+++ b/samValidator.cpp
@@ -0,0 +1,106 @@
+#include<cstdio>
+#include<cstring>
+#include<cstdlib>
+#include<cassert>
+#include<string>
+#include<set>
+
+#include <stdint.h>
+#include "sam/bam.h"
+#include "sam/sam.h"
+
+#include "my_assert.h"
+
+using namespace std;
+
+samfile_t *in;
+bam1_t *b, *b2;
+
+set<string> used;
+
+bool isValid;
+
+int main(int argc, char* argv[]) {
+	if (argc != 2) {
+		printf("Usage: rsem-sam-validator <input.sam/input.bam>\n");
+		exit(-1);
+	}
+
+	string input_file(argv[1]);
+	size_t pos = input_file.find_last_of('.');
+	general_assert(pos != string::npos, "Input file does not have a suffix!");
+	++pos;
+
+	string suffix = input_file.substr(pos);
+	size_t len = suffix.length();
+	for (size_t i = 0; i < len; i++) suffix[i] = tolower(suffix[i]);
+
+	general_assert(suffix == "sam" || suffix == "bam", "Cannot recognize input file's file type! The file suffix is neither sam or bam.");
+
+	in = (suffix == "sam" ? samopen(argv[1], "r", NULL) : samopen(argv[1], "rb", NULL));
+	general_assert(in != 0, "Cannot open input file!");
+
+	used.clear();
+	b = bam_init1(); b2 = bam_init1();
+
+	isValid = true;
+
+	long cnt = 0;
+	string cqname(""), qname;
+	uint64_t creadlen = 0, readlen;
+	bool cispaired = false, ispaired;
+
+	printf(".");
+	do {
+		if (samread(in, b) < 0) break;
+		assert(b->core.l_qseq > 0);
+
+		qname.assign(bam1_qname(b));
+
+		// if this is a paired-end read
+		ispaired = b->core.flag & 0x0001;
+		if (ispaired) {
+			isValid = (samread(in, b2) >= 0) && (qname == bam1_qname(b2)) && (b2->core.flag & 0x0001);
+			if (!isValid) { printf("\nOnly find one mate for paired-end read %s!\n", qname.c_str()); continue; }
+			assert(b2->core.l_qseq > 0);
+			isValid = !(((b->core.flag & 0x0040) && (b2->core.flag & 0x0040)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0080)));
+			if (!isValid) { printf("\nThe two mates of paired-end read %s are not adjacent!\n", qname.c_str()); continue; }
+			// both mates are mapped
+			if (!(b->core.flag & 0x0004) && !(b2->core.flag & 0x0004)) {
+				isValid = (b->core.tid == b2->core.tid) && (b->core.pos == b2->core.mpos) && (b2->core.pos == b->core.mpos);
+				if (!isValid) { printf("\nOne of paired-end read %s's alignment does not have two mates adjacent to each other!\n", qname.c_str()); continue; }
+			}
+
+			readlen = ((b->core.flag & 0x0040) ? (uint64_t(b->core.l_qseq) << 32) + b2->core.l_qseq : (uint64_t(b2->core.l_qseq) << 32) + b->core.l_qseq);
+		}
+		else readlen = b->core.l_qseq;
+
+		if (cqname != qname) {
+			isValid = used.find(qname) == used.end();
+			if (!isValid) { printf("\nThe alignments of read %s are not grouped together!", qname.c_str()); continue; }
+			used.insert(cqname);
+			cqname = qname;
+			creadlen = readlen;
+			cispaired = ispaired;
+		}
+		else {
+			assert(cqname != "");
+			isValid = (creadlen == readlen);
+			if (!isValid) { printf("\nRead %s have different read/mate lengths!\n", qname.c_str()); continue; }
+			isValid = (cispaired == ispaired);
+			if (!isValid) { printf("\nRead %s is detected as both single-end read and paired-end read!\n", qname.c_str()); continue; }
+		}
+
+		++cnt;
+		if (cnt % 1000000 == 0) printf(".");
+
+	} while(isValid);
+
+	bam_destroy1(b); bam_destroy1(b2);
+	samclose(in);
+
+	if (isValid) printf("\nThe input file is valid!\n");
+	else printf("The input file is not valid!\n");
+
+	return 0;
+}