From 58f823f5be6dfbe00896fc44ac3aac5e881e9c5c Mon Sep 17 00:00:00 2001 From: Bo Li Date: Sun, 17 Apr 2011 09:57:59 -0500 Subject: [PATCH] lots of changes --- EM.cpp | 26 ++--- Gibbs.cpp | 32 +++--- SamParser.h | 38 ++++++- calcCI.cpp | 32 +++--- makefile | 21 ++-- mersenne.cpp | 215 -------------------------------------- parseIt.cpp | 55 ++++++++-- randomc.h | 193 ---------------------------------- rsem-calculate-expression | 67 ++++++------ rsem-plot-model | 17 ++- simul.h | 9 +- simul_mersenne.h | 20 ---- simulation.cpp | 4 +- 13 files changed, 191 insertions(+), 538 deletions(-) delete mode 100644 mersenne.cpp delete mode 100644 randomc.h delete mode 100644 simul_mersenne.h diff --git a/EM.cpp b/EM.cpp index aa7ebdb..ff25768 100644 --- a/EM.cpp +++ b/EM.cpp @@ -61,7 +61,8 @@ bool genBamF; // If user wants to generate bam file, true; otherwise, false. bool updateModel, calcExpectedWeights; bool genGibbsOut; // generate file for Gibbs sampler -char refName[STRLEN], imdName[STRLEN], outName[STRLEN]; +char refName[STRLEN], outName[STRLEN]; +char imdName[STRLEN], statName[STRLEN]; char refF[STRLEN], groupF[STRLEN], cntF[STRLEN], tiF[STRLEN]; char mparamsF[STRLEN], bmparamsF[STRLEN]; char modelF[STRLEN], thetaF[STRLEN]; @@ -305,7 +306,7 @@ void writeResults(ModelType& model, double* counts) { char outF[STRLEN]; FILE *fo; - sprintf(modelF, "%s.model", outName); + sprintf(modelF, "%s.model", statName); model.write(modelF); //calculate tau values @@ -547,15 +548,9 @@ void EM() { } } fclose(fo); - - char scoreF[STRLEN]; - sprintf(scoreF, "%s.ns", imdName); - fo = fopen(scoreF, "w"); - fprintf(fo, "%.15g\n", model.getLogP()); - fclose(fo); } - sprintf(thetaF, "%s.theta", outName); + sprintf(thetaF, "%s.theta", statName); fo = fopen(thetaF, "w"); fprintf(fo, "%d\n", M + 1); @@ -635,11 +630,11 @@ int main(int argc, char* argv[]) { bool quiet = false; if (argc < 5) { - printf("Usage : rsem-run-em refName read_type imdName outName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out]\n\n"); + printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out]\n\n"); printf(" refName: reference name\n"); printf(" read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n"); - printf(" imdName: name for all upstream/downstream user-unseen files. (different files have different suffices)\n"); - printf(" outName: name for all output files. (different files have different suffices)\n"); + printf(" samplePath: sample path.\n"); + printf(" sampleName: sample name.\n"); printf(" -p: number of threads which user wants to use. (default: 1)\n"); printf(" -b: produce bam format output file. (default: off)\n"); printf(" -q: set it quiet\n"); @@ -652,8 +647,9 @@ int main(int argc, char* argv[]) { strcpy(refName, argv[1]); read_type = atoi(argv[2]); - strcpy(imdName, argv[3]); - strcpy(outName, argv[4]); + strcpy(outName, argv[3]); + sprintf(imdName, "%s.temp/%s", argv[3], argv[4]); + sprintf(statName, "%s.stat/%s", argv[3], argv[4]); nThreads = 1; @@ -691,7 +687,7 @@ int main(int argc, char* argv[]) { sprintf(tiF, "%s.ti", refName); transcripts.readFrom(tiF); - sprintf(cntF, "%s.cnt", imdName); + sprintf(cntF, "%s.cnt", statName); fin.open(cntF); if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", cntF); exit(-1); } fin>>N0>>N1>>N2>>N_tot; diff --git a/Gibbs.cpp b/Gibbs.cpp index 7727f37..5bdfb24 100644 --- a/Gibbs.cpp +++ b/Gibbs.cpp @@ -7,7 +7,8 @@ #include #include -#include "randomc.h" +#include "boost/random.hpp" + #include "utils.h" #include "Model.h" @@ -35,6 +36,7 @@ int model_type; int m, M, N0, N1, nHits; double totc; int BURNIN, CHAINLEN, GAP; +char imdName[STRLEN], statName[STRLEN]; char thetaF[STRLEN], ofgF[STRLEN], groupF[STRLEN], refF[STRLEN], modelF[STRLEN]; char cvsF[STRLEN]; @@ -50,9 +52,11 @@ vector counts; bool quiet; vector arr; -CRandomMersenne rg(time(NULL)); -void load_data(char* reference_name, char* sample_name, char* imdName) { +boost::mt19937 rng(time(NULL)); +boost::uniform_01 rg(rng); + +void load_data(char* reference_name, char* statName, char* imdName) { ifstream fin; string line; int tmpVal; @@ -68,7 +72,7 @@ void load_data(char* reference_name, char* sample_name, char* imdName) { m = gi.getm(); //load thetaF - sprintf(thetaF, "%s.theta",sample_name); + sprintf(thetaF, "%s.theta",statName); fin.open(thetaF); if (!fin.is_open()) { fprintf(stderr, "Cannot open %s!\n", thetaF); @@ -123,7 +127,7 @@ void load_data(char* reference_name, char* sample_name, char* imdName) { // If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0...len-1 int sample(vector& arr, int len) { int l, r, mid; - double prb = rg.Random() * arr[len - 1]; + double prb = rg() * arr[len - 1]; l = 0; r = len - 1; while (l <= r) { @@ -334,14 +338,16 @@ void writeEstimatedParameters(char* modelF, char* imdName) { int main(int argc, char* argv[]) { if (argc < 7) { - printf("Usage: rsem-run-gibbs reference_name sample_name imdName BURNIN CHAINLEN GAP [-q]\n"); + printf("Usage: rsem-run-gibbs reference_name sample_name sampleToken BURNIN CHAINLEN GAP [-q]\n"); exit(-1); } BURNIN = atoi(argv[4]); CHAINLEN = atoi(argv[5]); GAP = atoi(argv[6]); - load_data(argv[1], argv[2], argv[3]); + sprintf(imdName, "%s.temp/%s", argv[2], argv[3]); + sprintf(statName, "%s.stat/%s", argv[2], argv[3]); + load_data(argv[1], statName, imdName); quiet = false; if (argc > 7 && !strcmp(argv[7], "-q")) { @@ -350,19 +356,19 @@ int main(int argc, char* argv[]) { verbose = !quiet; init(); - Gibbs(argv[3]); + Gibbs(imdName); - sprintf(modelF, "%s.model", argv[2]); + sprintf(modelF, "%s.model", statName); FILE *fi = fopen(modelF, "r"); if (fi == NULL) { fprintf(stderr, "Cannot open %s!\n", modelF); exit(-1); } fscanf(fi, "%d", &model_type); fclose(fi); switch(model_type) { - case 0 : writeEstimatedParameters(modelF, argv[3]); break; - case 1 : writeEstimatedParameters(modelF, argv[3]); break; - case 2 : writeEstimatedParameters(modelF, argv[3]); break; - case 3 : writeEstimatedParameters(modelF, argv[3]); break; + case 0 : writeEstimatedParameters(modelF, imdName); break; + case 1 : writeEstimatedParameters(modelF, imdName); break; + case 2 : writeEstimatedParameters(modelF, imdName); break; + case 3 : writeEstimatedParameters(modelF, imdName); break; } return 0; diff --git a/SamParser.h b/SamParser.h index a36af3a..335984e 100644 --- a/SamParser.h +++ b/SamParser.h @@ -1,4 +1,4 @@ -/* ReadType here means if the read is unalignable, alignable or aligned too much. It is NOT single read or paired-end read */ +/* ReadType here means if the read is unalignable, alignable or aligned too much. It is NOT siheaderngle read or paired-end read */ #ifndef SAMPARSER_H_ #define SAMPARSER_H_ @@ -10,7 +10,12 @@ #include "sam/bam.h" #include "sam/sam.h" + #include "utils.h" + +#include "Transcript.h" +#include "Transcripts.h" + #include "SingleRead.h" #include "SingleReadQ.h" #include "PairedEndRead.h" @@ -20,7 +25,7 @@ class SamParser { public: - SamParser(char, const char*, const char* = 0); + SamParser(char, const char*, Transcripts&, const char* = 0); ~SamParser(); /** @@ -46,6 +51,7 @@ private: bam_header_t *header; bam1_t *b, *b2; + //tag used by aligner static char rtTag[STRLEN]; @@ -64,12 +70,16 @@ private: //0 ~ N0 1 ~ N1 2 ~ N2 int getReadType(const bam1_t*); int getReadType(const bam1_t*, const bam1_t*); // for paired-end reads + + bool check(bam1_t *b) { + return (b->core.n_cigar == 1) && ((*bam1_cigar(b) & BAM_CIGAR_MASK) == BAM_CMATCH) && (b->core.l_qseq == (int32_t)(*bam1_cigar(b) >> BAM_CIGAR_SHIFT)); + } }; char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads // aux, if not 0, points to the file name of fn_list -SamParser::SamParser(char inpType, const char* inpF, const char* aux) { +SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, const char* aux) { switch(inpType) { case 'b': sam_in = samopen(inpF, "rb", aux); break; case 's': sam_in = samopen(inpF, "r", aux); break; @@ -80,6 +90,20 @@ SamParser::SamParser(char inpType, const char* inpF, const char* aux) { header = sam_in->header; if (header == 0) { fprintf(stderr, "Fail to parse sam header!\n"); exit(-1); } + // Check if the reference used for aligner is the transcript set RSEM generated + if (transcripts.getM() != header->n_targets) { + fprintf(stderr, "Number of transcripts does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n"); + exit(-1); + } + for (int i = 0; i < header->n_targets; i++) { + const Transcript& transcript = transcripts.getTranscriptAt(i + 1); + // If update int to long, chance the (int) conversion + if (transcript.getTranscriptID().compare(header->target_name[i]) != 0 || transcript.getLength() != (int)header->target_len[i]) { + fprintf(stderr, "Transcript information does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n"); + exit(-1); + } + } + b = bam_init1(); b2 = bam_init1(); } @@ -110,6 +134,8 @@ int SamParser::parseNext(SingleRead& read, SingleHit& hit) { else val = 5; if (readType == 1) { + if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); } + if (getDir(b) > 0) { hit = SingleHit(b->core.tid + 1, b->core.pos); } @@ -139,6 +165,8 @@ int SamParser::parseNext(SingleReadQ& read, SingleHit& hit) { else val = 5; if (readType == 1) { + if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); } + if (getDir(b) > 0) { hit = SingleHit(b->core.tid + 1, b->core.pos); } @@ -184,6 +212,8 @@ int SamParser::parseNext(PairedEndRead& read, PairedEndHit& hit) { else val = 5; if (readType == 1) { + if (!check(mp1) || !check(mp2)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); } + if (mp1->core.tid != mp2->core.tid) { fprintf(stderr, "The two reads do not come from the same pair!"); exit(-1); @@ -233,6 +263,8 @@ int SamParser::parseNext(PairedEndReadQ& read, PairedEndHit& hit) { else val = 5; if (readType == 1) { + if (!check(mp1) || !check(mp2)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); } + if (mp1->core.tid != mp2->core.tid) { fprintf(stderr, "The two reads do not come from the same pair!"); exit(-1); diff --git a/calcCI.cpp b/calcCI.cpp index 321cd4a..c9ccce5 100644 --- a/calcCI.cpp +++ b/calcCI.cpp @@ -59,6 +59,7 @@ generator_type **rgs; int M, m; Refs refs; GroupInfo gi; +char imdName[STRLEN], statName[STRLEN]; char modelF[STRLEN], groupF[STRLEN], refF[STRLEN]; vector eel; //expected effective lengths @@ -327,17 +328,17 @@ void generateResults(char* imdName) { if (verbose) { printf("All credibility intervals are calculated!\n"); } - sprintf(outF, "%s.tau_denoms", imdName); - fo = fopen(outF, "w"); - fprintf(fo, "%d\n", nSamples); - for (int i = 0; i < nSamples; i++) fprintf(fo, "%.15g ", tau_denoms[i]); - fprintf(fo, "\n"); - fclose(fo); + sprintf(outF, "%s.tau_denoms", imdName); + fo = fopen(outF, "w"); + fprintf(fo, "%d\n", nSamples); + for (int i = 0; i < nSamples; i++) fprintf(fo, "%.15g ", tau_denoms[i]); + fprintf(fo, "\n"); + fclose(fo); } int main(int argc, char* argv[]) { if (argc < 7) { - printf("Usage: rsem-calculate-credibility-intervals reference_name sample_name imdName confidence nSpC nMB[-q]\n"); + printf("Usage: rsem-calculate-credibility-intervals reference_name sample_name sampleToken confidence nSpC nMB[-q]\n"); exit(-1); } @@ -351,7 +352,10 @@ int main(int argc, char* argv[]) { } verbose = !quiet; - sprintf(modelF, "%s.model", argv[2]); + sprintf(imdName, "%s.temp/%s", argv[2], argv[3]); + sprintf(statName, "%s.stat/%s", argv[2], argv[3]); + + sprintf(modelF, "%s.model", statName); FILE *fi = fopen(modelF, "r"); if (fi == NULL) { fprintf(stderr, "Cannot open %s!\n", modelF); exit(-1); } fscanf(fi, "%d", &model_type); @@ -364,8 +368,8 @@ int main(int argc, char* argv[]) { gi.load(groupF); m = gi.getm(); - sprintf(tmpF, "%s.tmp", argv[3]); - sprintf(cvsF, "%s.countvectors", argv[3]); + sprintf(tmpF, "%s.tmp", imdName); + sprintf(cvsF, "%s.countvectors", imdName); switch(model_type) { case 0 : sampling(); break; @@ -374,15 +378,15 @@ int main(int argc, char* argv[]) { case 3 : sampling(); break; } - generateResults(argv[3]); + generateResults(imdName); delete[] tau_denoms; - sprintf(command, "rm -f %s", tmpF); + sprintf(command, "rm -f %s", tmpF); int status = system(command); if (status != 0) { - fprintf(stderr, "Cannot delete %s!\n", tmpF); - exit(-1); + fprintf(stderr, "Cannot delete %s!\n", tmpF); + exit(-1); } return 0; diff --git a/makefile b/makefile index e789a1e..df8d2a1 100644 --- a/makefile +++ b/makefile @@ -48,13 +48,13 @@ PairedEndHit.h : SingleHit.h HitContainer.h : GroupInfo.h -SamParser.h : sam/sam.h sam/bam.h utils.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h +SamParser.h : sam/sam.h sam/bam.h utils.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Transcript.h Transcripts.h rsem-parse-alignments : parseIt.o sam/libbam.a $(CC) -o rsem-parse-alignments parseIt.o sam/libbam.a -lz -parseIt.o : utils.h GroupInfo.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h parseIt.cpp +parseIt.o : utils.h GroupInfo.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h Transcript.h Transcripts.h sam/sam.h sam/bam.h parseIt.cpp $(CC) $(COFLAGS) parseIt.cpp @@ -85,22 +85,17 @@ EM.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ. rsem-bam2wig : sam/bam.h sam/sam.h sam/libbam.a bam2wig.cpp $(CC) -O3 -Wall bam2wig.cpp sam/libbam.a -lz -o rsem-bam2wig -mersenne.o : mersenne.cpp randomc.h - $(CC) $(COFLAGS) mersenne.cpp +rsem-simulate-reads : simulation.o + $(CC) -o rsem-simulate-reads simulation.o -simul_mersenne.h : simul.h randomc.h - -rsem-simulate-reads : simulation.o mersenne.o - $(CC) -o rsem-simulate-reads simulation.o mersenne.o - -simulation.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h simul_mersenne.h randomc.h simulation.cpp +simulation.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h boost/random.hpp simulation.cpp $(CC) $(COFLAGS) simulation.cpp -rsem-run-gibbs : Gibbs.o mersenne.o - $(CC) -o rsem-run-gibbs Gibbs.o mersenne.o +rsem-run-gibbs : Gibbs.o + $(CC) -o rsem-run-gibbs Gibbs.o #some header files are omitted -Gibbs.o : randomc.h utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h Gibbs.cpp +Gibbs.o : utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h boost/random.hpp Gibbs.cpp $(CC) $(COFLAGS) Gibbs.cpp rsem-calculate-credibility-intervals : calcCI.o diff --git a/mersenne.cpp b/mersenne.cpp deleted file mode 100644 index d9be809..0000000 --- a/mersenne.cpp +++ /dev/null @@ -1,215 +0,0 @@ -/************************** MERSENNE.CPP ******************** AgF 2007-08-01 * -* Random Number generator 'Mersenne Twister' * -* * -* This random number generator is described in the article by * -* M. Matsumoto & T. Nishimura, in: * -* ACM Transactions on Modeling and Computer Simulation, * -* vol. 8, no. 1, 1998, pp. 3-30. * -* Details on the initialization scheme can be found at * -* http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html * -* * -* Experts consider this an excellent random number generator. * -* * -* © 2001 - 2007 A. Fog. Published under the GNU General Public License * -* (www.gnu.org/copyleft/gpl.html) with the further restriction that it * -* cannot be used for gambling applications. * -*****************************************************************************/ - -#include "randomc.h" - -void CRandomMersenne::Init0(uint32 seed) { - // Detect computer architecture - union {double f; uint32 i[2];} convert; - convert.f = 1.0; - if (convert.i[1] == 0x3FF00000) Architecture = LITTLE_ENDIAN1; - else if (convert.i[0] == 0x3FF00000) Architecture = BIG_ENDIAN1; - else Architecture = NONIEEE; - - // Seed generator - mt[0]= seed; - for (mti=1; mti < MERS_N; mti++) { - mt[mti] = (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti); - } -} - -void CRandomMersenne::RandomInit(uint32 seed) { - // Initialize and seed - Init0(seed); - - // Randomize some more - for (int i = 0; i < 37; i++) BRandom(); -} - - -void CRandomMersenne::RandomInitByArray(uint32 seeds[], int length) { - // Seed by more than 32 bits - int i, j, k; - - // Initialize - Init0(19650218); - - if (length <= 0) return; - - // Randomize mt[] using whole seeds[] array - i = 1; j = 0; - k = (MERS_N > length ? MERS_N : length); - for (; k; k--) { - mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL)) + seeds[j] + j; - i++; j++; - if (i >= MERS_N) {mt[0] = mt[MERS_N-1]; i=1;} - if (j >= length) j=0;} - for (k = MERS_N-1; k; k--) { - mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL)) - i; - if (++i >= MERS_N) {mt[0] = mt[MERS_N-1]; i=1;}} - mt[0] = 0x80000000UL; // MSB is 1; assuring non-zero initial array - - // Randomize some more - mti = 0; - for (int i = 0; i <= MERS_N; i++) BRandom(); -} - - -uint32 CRandomMersenne::BRandom() { - // Generate 32 random bits - uint32 y; - - if (mti >= MERS_N) { - // Generate MERS_N words at one time - const uint32 LOWER_MASK = (1LU << MERS_R) - 1; // Lower MERS_R bits - const uint32 UPPER_MASK = 0xFFFFFFFF << MERS_R; // Upper (32 - MERS_R) bits - static const uint32 mag01[2] = {0, MERS_A}; - - int kk; - for (kk=0; kk < MERS_N-MERS_M; kk++) { - y = (mt[kk] & UPPER_MASK) | (mt[kk+1] & LOWER_MASK); - mt[kk] = mt[kk+MERS_M] ^ (y >> 1) ^ mag01[y & 1];} - - for (; kk < MERS_N-1; kk++) { - y = (mt[kk] & UPPER_MASK) | (mt[kk+1] & LOWER_MASK); - mt[kk] = mt[kk+(MERS_M-MERS_N)] ^ (y >> 1) ^ mag01[y & 1];} - - y = (mt[MERS_N-1] & UPPER_MASK) | (mt[0] & LOWER_MASK); - mt[MERS_N-1] = mt[MERS_M-1] ^ (y >> 1) ^ mag01[y & 1]; - mti = 0; - } - - y = mt[mti++]; - -#if 1 - // Tempering (May be omitted): - y ^= y >> MERS_U; - y ^= (y << MERS_S) & MERS_B; - y ^= (y << MERS_T) & MERS_C; - y ^= y >> MERS_L; -#endif - - return y; -} - - -double CRandomMersenne::Random() { - // Output random float number in the interval 0 <= x < 1 - union {double f; uint32 i[2];} convert; - uint32 r = BRandom(); // Get 32 random bits - // The fastest way to convert random bits to floating point is as follows: - // Set the binary exponent of a floating point number to 1+bias and set - // the mantissa to random bits. This will give a random number in the - // interval [1,2). Then subtract 1.0 to get a random number in the interval - // [0,1). This procedure requires that we know how floating point numbers - // are stored. The storing method is tested in function RandomInit and saved - // in the variable Architecture. - - // This shortcut allows the compiler to optimize away the following switch - // statement for the most common architectures: -#if defined(_M_IX86) || defined(_M_X64) || defined(__LITTLE_ENDIAN__) - Architecture = LITTLE_ENDIAN1; -#elif defined(__BIG_ENDIAN__) - Architecture = BIG_ENDIAN1; -#endif - - switch (Architecture) { - case LITTLE_ENDIAN1: - convert.i[0] = r << 20; - convert.i[1] = (r >> 12) | 0x3FF00000; - return convert.f - 1.0; - case BIG_ENDIAN1: - convert.i[1] = r << 20; - convert.i[0] = (r >> 12) | 0x3FF00000; - return convert.f - 1.0; - case NONIEEE: default: ; - } - // This somewhat slower method works for all architectures, including - // non-IEEE floating point representation: - return (double)r * (1./((double)(uint32)(-1L)+1.)); -} - - -int CRandomMersenne::IRandom(int min, int max) { - // Output random integer in the interval min <= x <= max - // Relative error on frequencies < 2^-32 - if (max <= min) { - if (max == min) return min; else return 0x80000000; - } - // Multiply interval with random and truncate - int r = int((max - min + 1) * Random()) + min; - if (r > max) r = max; - return r; -} - - -int CRandomMersenne::IRandomX(int min, int max) { - // Output random integer in the interval min <= x <= max - // Each output value has exactly the same probability. - // This is obtained by rejecting certain bit values so that the number - // of possible bit values is divisible by the interval length - if (max <= min) { - if (max == min) return min; else return 0x80000000; - } -#ifdef INT64_DEFINED - // 64 bit integers available. Use multiply and shift method - uint32 interval; // Length of interval - uint64 longran; // Random bits * interval - uint32 iran; // Longran / 2^32 - uint32 remainder; // Longran % 2^32 - - interval = uint32(max - min + 1); - if (interval != LastInterval) { - // Interval length has changed. Must calculate rejection limit - // Reject when remainder = 2^32 / interval * interval - // RLimit will be 0 if interval is a power of 2. No rejection then - RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1; - LastInterval = interval; - } - do { // Rejection loop - longran = (uint64)BRandom() * interval; - iran = (uint32)(longran >> 32); - remainder = (uint32)longran; - } while (remainder > RLimit); - // Convert back to signed and return result - return (int32)iran + min; - -#else - // 64 bit integers not available. Use modulo method - uint32 interval; // Length of interval - uint32 bran; // Random bits - uint32 iran; // bran / interval - uint32 remainder; // bran % interval - - interval = uint32(max - min + 1); - if (interval != LastInterval) { - // Interval length has changed. Must calculate rejection limit - // Reject when iran = 2^32 / interval - // We can't make 2^32 so we use 2^32-1 and correct afterwards - RLimit = (uint32)0xFFFFFFFF / interval; - if ((uint32)0xFFFFFFFF % interval == interval - 1) RLimit++; - } - do { // Rejection loop - bran = BRandom(); - iran = bran / interval; - remainder = bran % interval; - } while (iran >= RLimit); - // Convert back to signed and return result - return (int32)remainder + min; - -#endif -} diff --git a/parseIt.cpp b/parseIt.cpp index c266715..55dd954 100644 --- a/parseIt.cpp +++ b/parseIt.cpp @@ -8,11 +8,15 @@ #include #include #include +#include #include "utils.h" #include "GroupInfo.h" +#include "Transcript.h" +#include "Transcripts.h" + #include "SingleRead.h" #include "SingleReadQ.h" #include "PairedEndRead.h" @@ -30,10 +34,12 @@ int N[3]; // note, N = N0 + N1 + N2 , but may not be equal to the total number o int nHits; // # of hits int nUnique, nMulti, nIsoMulti; char fn_list[STRLEN]; -char groupF[STRLEN]; +char groupF[STRLEN], tiF[STRLEN]; +char imdName[STRLEN]; char datF[STRLEN], cntF[STRLEN]; GroupInfo gi; +Transcripts transcripts; SamParser *parser; ofstream hit_out; @@ -42,14 +48,14 @@ int n_os; // number of ostreams ostream *cat[3][2]; // cat : category 1-dim 0 N0 1 N1 2 N2; 2-dim 0 mate1 1 mate2 char readOutFs[3][2][STRLEN]; -void init(const char* imdName, char alignFType, const char* alignF) { +map counter; +map::iterator iter; - sprintf(datF, "%s.dat", imdName); - sprintf(cntF, "%s.cnt", imdName); +void init(const char* imdName, char alignFType, const char* alignF) { char* aux = 0; if (strcmp(fn_list, "")) aux = fn_list; - parser = new SamParser(alignFType, alignF, aux); + parser = new SamParser(alignFType, alignF, transcripts, aux); memset(cat, 0, sizeof(cat)); memset(readOutFs, 0, sizeof(readOutFs)); @@ -64,6 +70,8 @@ void init(const char* imdName, char alignFType, const char* alignF) { for (int j = 0; j < n_os; j++) cat[i][j] = new ofstream(readOutFs[i][j]); } + + counter.clear(); } //Do not allow duplicate for unalignable reads and supressed reads in SAM input @@ -93,6 +101,14 @@ void parseIt(SamParser *parser) { nMulti += hits.calcNumGeneMultiReads(gi); nIsoMulti += hits.calcNumIsoformMultiReads(); hits.write(hit_out); + + iter = counter.find(hits.getNHits()); + if (iter != counter.end()) { + iter->second++; + } + else { + counter[hits.getNHits()] = 1; + } } hits.clear(); @@ -116,6 +132,14 @@ void parseIt(SamParser *parser) { nMulti += hits.calcNumGeneMultiReads(gi); nIsoMulti += hits.calcNumIsoformMultiReads(); hits.write(hit_out); + + iter = counter.find(hits.getNHits()); + if (iter != counter.end()) { + iter->second++; + } + else { + counter[hits.getNHits()] = 1; + } } nUnique = N[1] - nMulti; @@ -138,15 +162,15 @@ void release() { int main(int argc, char* argv[]) { bool quiet = false; - if (argc < 5) { - printf("Usage : rsem-parse-alignments refName imdName alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n"); + if (argc < 6) { + printf("Usage : rsem-parse-alignments refName sampleName sampleToken alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n"); exit(-1); } strcpy(fn_list, ""); read_type = 0; - if (argc > 5) { - for (int i = 5; i < argc; i++) { + if (argc > 6) { + for (int i = 6; i < argc; i++) { if (!strcmp(argv[i], "-t")) { read_type = atoi(argv[i + 1]); } @@ -162,10 +186,16 @@ int main(int argc, char* argv[]) { verbose = !quiet; - init(argv[2], argv[3][0], argv[4]); - sprintf(groupF, "%s.grp", argv[1]); gi.load(groupF); + sprintf(tiF, "%s.ti", argv[1]); + transcripts.readFrom(tiF); + + sprintf(imdName, "%s.temp/%s", argv[2], argv[3]); + sprintf(datF, "%s.dat", imdName); + sprintf(cntF, "%s.stat/%s.cnt", argv[2], argv[3]); + + init(imdName, argv[4][0], argv[5]); hit_out.open(datF); @@ -190,6 +220,9 @@ int main(int argc, char* argv[]) { fout<first<<'\t'<second< and B - -Output files used by RSEM internally for tasks like simulation, -compute credibility intervals etc. - =item B Only generated when --out-bam is specified. @@ -604,6 +600,9 @@ representing the posterior probability. 'sample_name.sorted.bam' and 'sample_name.sorted.bam.bai' are the sorted BAM file and indices generated by samtools (included in RSEM package). +=item B + +This is a folder instead of a file. All model related statistics are stored in this folder. Use 'rsem-plot-model' can generate plots using this folder. =back diff --git a/rsem-plot-model b/rsem-plot-model index e9225ed..f7b84fc 100755 --- a/rsem-plot-model +++ b/rsem-plot-model @@ -2,13 +2,20 @@ argv <- commandArgs(TRUE) if (length(argv) != 2) { - cat("Usage: rsem-plot-model modelF outF\n") + cat("Usage: rsem-plot-model sample_name outF\n") q(status = 1) } -con <- file(argv[1], open = "r") +strvec <- strsplit(argv[1], split = "/")[[1]] +token <- strvec[length(strvec)] + +modelF <- paste(argv[1], ".stat/", token, ".model") +cntF <- paste(argv[1], ".stat/", token, ".cnt") + pdf(argv[2]) +con <- file(modelF, open = "r") + # model type and forward probability model_type <- as.numeric(readLines(con, n = 4)[1]) @@ -108,5 +115,9 @@ if (model_type == 1 || model_type == 3) { legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3, col = 1:4) } -dev.off() close(con) + +pair <- read.table(file = cntF, skip = 3, sep = "\t") +plot(pair[,1], pair[,2], xlab = "Number of Alignments", ylab = "Number of Reads", main = "Among alignable reads, distribution of # of alignments") + +dev.off() diff --git a/simul.h b/simul.h index 2719b3c..ba105db 100644 --- a/simul.h +++ b/simul.h @@ -1,13 +1,17 @@ #ifndef SIMUL_H_ #define SIMUL_H_ +#include #include +#include "boost/random.hpp" class simul { public: - virtual ~simul() {} + simul() : rg(boost::mt19937(time(NULL))) { + } + // interval : [,) // random number should be in [0, arr[len - 1]) // If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0 ... len - 1 @@ -29,9 +33,10 @@ public: return l; } - virtual double random() { return 0.0; }; + double random() { return rg(); }; private: + boost::uniform_01 rg; }; #endif /* SIMUL_H_ */ diff --git a/simul_mersenne.h b/simul_mersenne.h deleted file mode 100644 index 1a476d1..0000000 --- a/simul_mersenne.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef SIMUL_MERSENNE_H_ -#define SIMUL_MERSENNE_H_ - -#include - -#include "randomc.h" -#include "simul.h" - -class simul_mersenne : public simul { -public: - simul_mersenne() : rg(time(NULL)) { } - - virtual ~simul_mersenne() {} - virtual double random() { return rg.Random(); } - -private: - CRandomMersenne rg; -}; - -#endif /* SIMUL_MERSENNE_H_ */ diff --git a/simulation.cpp b/simulation.cpp index ecc0a11..2511f33 100644 --- a/simulation.cpp +++ b/simulation.cpp @@ -27,7 +27,7 @@ #include "Transcript.h" #include "Transcripts.h" -#include "simul_mersenne.h" +#include "simul.h" using namespace std; @@ -48,7 +48,7 @@ char outReadF[2][STRLEN]; char refF[STRLEN], groupF[STRLEN], tiF[STRLEN]; char geneResF[STRLEN], isoResF[STRLEN]; -simul_mersenne sampler; +simul sampler; void genOutReadStreams(int type, char *outFN) { switch(type) { -- 2.39.2