From fb2aa1ca9d00710943155ef3abbbdd87df116e4a Mon Sep 17 00:00:00 2001 From: Bo Li Date: Sat, 10 Mar 2012 22:35:10 -0600 Subject: [PATCH] Add a validator for input SAM/BAM file --- .gitignore | 4 ++ BamWriter.h | 3 ++ EM.cpp | 2 +- SamParser.h | 20 ++++++--- makefile | 13 +++--- samValidator.cpp | 106 +++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 136 insertions(+), 12 deletions(-) create mode 100644 samValidator.cpp diff --git a/.gitignore b/.gitignore index 5c72f2e..0f2be17 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *.o *.a +*~ rsem-bam2readdepth rsem-bam2wig rsem-build-read-index @@ -13,4 +14,7 @@ rsem-run-gibbs rsem-simulate-reads rsem-synthesis-reference-transcripts rsem-tbam2gbam +rsem-sam-validator sam/samtools +.project +.cproject diff --git a/BamWriter.h b/BamWriter.h index e014f8d..12ab087 100644 --- a/BamWriter.h +++ b/BamWriter.h @@ -113,6 +113,9 @@ void BamWriter::work(HitWrapper wrapper) { cnt += 2; if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); } + //mate info is not complete, skip + if (!(((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)))) continue; + //unalignable reads, skip if ((b->core.flag & 0x0004) || (b2->core.flag & 0x0004)) continue; //swap if b is mate 2 diff --git a/EM.cpp b/EM.cpp index dfdd0e4..f884979 100644 --- a/EM.cpp +++ b/EM.cpp @@ -503,7 +503,7 @@ void EM() { if (verbose) printf("ROUND = %d, SUM = %.15g, bChange = %f, totNum = %d\n", ROUND, sum, bChange, totNum); } while (ROUND < MIN_ROUND || (totNum > 0 && ROUND < MAX_ROUND)); - //while (ROUND < MAX_ROUND); +// } while (ROUND < 1); if (totNum > 0) fprintf(stderr, "Warning: RSEM reaches %d iterations before meeting the convergence criteria.\n", MAX_ROUND); diff --git a/SamParser.h b/SamParser.h index 9d62eff..513536f 100644 --- a/SamParser.h +++ b/SamParser.h @@ -178,13 +178,17 @@ int SamParser::parseNext(PairedEndRead& read, PairedEndHit& hit) { bam1_t *mp1 = NULL, *mp2 = NULL; - if ((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) { + // If lose mate info, discard. is it necessary? + if (!((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) && !((b->core.flag & 0x0080) && (b2->core.flag & 0x0040))) return 4; + // If only one mate is mapped, discard + if (((b->core.flag & 0x0004) && !(b2->core.flag & 0x0004)) || (!(b->core.flag & 0x0004) && (b2->core.flag & 0x0004))) return 4; + + if (b->core.flag & 0x0040) { mp1 = b; mp2 = b2; } - else if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) { + else { mp1 = b2; mp2 = b; } - else return 4; // If lose mate info, discard. is it necessary? int readType = getReadType(mp1, mp2); std::string name = getName(mp1); @@ -226,13 +230,17 @@ int SamParser::parseNext(PairedEndReadQ& read, PairedEndHit& hit) { bam1_t *mp1 = NULL, *mp2 = NULL; - if ((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) { + // If lose mate info, discard. is it necessary? + if (!((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) && !((b->core.flag & 0x0080) && (b2->core.flag & 0x0040))) return 4; + // If only one mate is mapped, discard + if (((b->core.flag & 0x0004) && !(b2->core.flag & 0x0004)) || (!(b->core.flag & 0x0004) && (b2->core.flag & 0x0004))) return 4; + + if (b->core.flag & 0x0040) { mp1 = b; mp2 = b2; } - else if ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)) { + else { mp1 = b2; mp2 = b; } - else return 4; int readType = getReadType(mp1, mp2); std::string name = getName(mp1); diff --git a/makefile b/makefile index a6d863c..44d7369 100644 --- a/makefile +++ b/makefile @@ -1,7 +1,7 @@ CC = g++ CFLAGS = -Wall -c -I. COFLAGS = -Wall -O3 -ffast-math -c -I. -PROGRAMS = rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-preref rsem-parse-alignments rsem-build-read-index rsem-run-em rsem-tbam2gbam rsem-run-gibbs rsem-calculate-credibility-intervals rsem-simulate-reads rsem-bam2wig rsem-get-unique rsem-bam2readdepth +PROGRAMS = rsem-extract-reference-transcripts rsem-synthesis-reference-transcripts rsem-preref rsem-parse-alignments rsem-build-read-index rsem-run-em rsem-tbam2gbam rsem-run-gibbs rsem-calculate-credibility-intervals rsem-simulate-reads rsem-bam2wig rsem-get-unique rsem-bam2readdepth rsem-sam-validator all : $(PROGRAMS) @@ -65,13 +65,13 @@ simul.h : boost/random.hpp ReadReader.h : SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h ReadIndex.h -SingleModel.h : utils.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h SingleHit.h ReadReader.h simul.h +SingleModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h SingleHit.h ReadReader.h simul.h -SingleQModel.h : utils.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h Refs.h SingleReadQ.h SingleHit.h ReadReader.h simul.h +SingleQModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h Refs.h SingleReadQ.h SingleHit.h ReadReader.h simul.h -PairedEndModel.h : utils.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h PairedEndRead.h PairedEndHit.h ReadReader.h simul.h +PairedEndModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h Profile.h NoiseProfile.h ModelParams.h RefSeq.h Refs.h SingleRead.h PairedEndRead.h PairedEndHit.h ReadReader.h simul.h -PairedEndQModel.h : utils.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h Refs.h SingleReadQ.h PairedEndReadQ.h PairedEndHit.h ReadReader.h simul.h +PairedEndQModel.h : utils.h my_assert.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h ModelParams.h RefSeq.h Refs.h SingleReadQ.h PairedEndReadQ.h PairedEndHit.h ReadReader.h simul.h HitWrapper.h : HitContainer.h @@ -130,6 +130,9 @@ calcCI.o : utils.h my_assert.h boost/random.hpp sampling.h Model.h SingleModel.h rsem-get-unique : sam/bam.h sam/sam.h getUnique.cpp sam/libbam.a $(CC) -O3 -Wall getUnique.cpp sam/libbam.a -lz -o $@ +rsem-sam-validator : sam/bam.h sam/sam.h my_assert.h samValidator.cpp sam/libbam.a + $(CC) -O3 -Wall samValidator.cpp sam/libbam.a -lz -o $@ + clean: rm -f *.o *~ $(PROGRAMS) cd sam ; ${MAKE} clean diff --git a/samValidator.cpp b/samValidator.cpp new file mode 100644 index 0000000..06e91d4 --- /dev/null +++ b/samValidator.cpp @@ -0,0 +1,106 @@ +#include +#include +#include +#include +#include +#include + +#include +#include "sam/bam.h" +#include "sam/sam.h" + +#include "my_assert.h" + +using namespace std; + +samfile_t *in; +bam1_t *b, *b2; + +set used; + +bool isValid; + +int main(int argc, char* argv[]) { + if (argc != 2) { + printf("Usage: rsem-sam-validator \n"); + exit(-1); + } + + string input_file(argv[1]); + size_t pos = input_file.find_last_of('.'); + general_assert(pos != string::npos, "Input file does not have a suffix!"); + ++pos; + + string suffix = input_file.substr(pos); + size_t len = suffix.length(); + for (size_t i = 0; i < len; i++) suffix[i] = tolower(suffix[i]); + + general_assert(suffix == "sam" || suffix == "bam", "Cannot recognize input file's file type! The file suffix is neither sam or bam."); + + in = (suffix == "sam" ? samopen(argv[1], "r", NULL) : samopen(argv[1], "rb", NULL)); + general_assert(in != 0, "Cannot open input file!"); + + used.clear(); + b = bam_init1(); b2 = bam_init1(); + + isValid = true; + + long cnt = 0; + string cqname(""), qname; + uint64_t creadlen = 0, readlen; + bool cispaired = false, ispaired; + + printf("."); + do { + if (samread(in, b) < 0) break; + assert(b->core.l_qseq > 0); + + qname.assign(bam1_qname(b)); + + // if this is a paired-end read + ispaired = b->core.flag & 0x0001; + if (ispaired) { + isValid = (samread(in, b2) >= 0) && (qname == bam1_qname(b2)) && (b2->core.flag & 0x0001); + if (!isValid) { printf("\nOnly find one mate for paired-end read %s!\n", qname.c_str()); continue; } + assert(b2->core.l_qseq > 0); + isValid = !(((b->core.flag & 0x0040) && (b2->core.flag & 0x0040)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0080))); + if (!isValid) { printf("\nThe two mates of paired-end read %s are not adjacent!\n", qname.c_str()); continue; } + // both mates are mapped + if (!(b->core.flag & 0x0004) && !(b2->core.flag & 0x0004)) { + isValid = (b->core.tid == b2->core.tid) && (b->core.pos == b2->core.mpos) && (b2->core.pos == b->core.mpos); + if (!isValid) { printf("\nOne of paired-end read %s's alignment does not have two mates adjacent to each other!\n", qname.c_str()); continue; } + } + + readlen = ((b->core.flag & 0x0040) ? (uint64_t(b->core.l_qseq) << 32) + b2->core.l_qseq : (uint64_t(b2->core.l_qseq) << 32) + b->core.l_qseq); + } + else readlen = b->core.l_qseq; + + if (cqname != qname) { + isValid = used.find(qname) == used.end(); + if (!isValid) { printf("\nThe alignments of read %s are not grouped together!", qname.c_str()); continue; } + used.insert(cqname); + cqname = qname; + creadlen = readlen; + cispaired = ispaired; + } + else { + assert(cqname != ""); + isValid = (creadlen == readlen); + if (!isValid) { printf("\nRead %s have different read/mate lengths!\n", qname.c_str()); continue; } + isValid = (cispaired == ispaired); + if (!isValid) { printf("\nRead %s is detected as both single-end read and paired-end read!\n", qname.c_str()); continue; } + } + + ++cnt; + if (cnt % 1000000 == 0) printf("."); + + } while(isValid); + + bam_destroy1(b); bam_destroy1(b2); + samclose(in); + + if (isValid) printf("\nThe input file is valid!\n"); + else printf("The input file is not valid!\n"); + + return 0; +} -- 2.39.2