bool updateModel, calcExpectedWeights;
bool genGibbsOut; // generate file for Gibbs sampler
-char refName[STRLEN], imdName[STRLEN], outName[STRLEN];
+char refName[STRLEN], outName[STRLEN];
+char imdName[STRLEN], statName[STRLEN];
char refF[STRLEN], groupF[STRLEN], cntF[STRLEN], tiF[STRLEN];
char mparamsF[STRLEN], bmparamsF[STRLEN];
char modelF[STRLEN], thetaF[STRLEN];
char outF[STRLEN];
FILE *fo;
- sprintf(modelF, "%s.model", outName);
+ sprintf(modelF, "%s.model", statName);
model.write(modelF);
//calculate tau values
}
}
fclose(fo);
-
- char scoreF[STRLEN];
- sprintf(scoreF, "%s.ns", imdName);
- fo = fopen(scoreF, "w");
- fprintf(fo, "%.15g\n", model.getLogP());
- fclose(fo);
}
- sprintf(thetaF, "%s.theta", outName);
+ sprintf(thetaF, "%s.theta", statName);
fo = fopen(thetaF, "w");
fprintf(fo, "%d\n", M + 1);
bool quiet = false;
if (argc < 5) {
- printf("Usage : rsem-run-em refName read_type imdName outName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out]\n\n");
+ printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out]\n\n");
printf(" refName: reference name\n");
printf(" read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n");
- printf(" imdName: name for all upstream/downstream user-unseen files. (different files have different suffices)\n");
- printf(" outName: name for all output files. (different files have different suffices)\n");
+ printf(" samplePath: sample path.\n");
+ printf(" sampleName: sample name.\n");
printf(" -p: number of threads which user wants to use. (default: 1)\n");
printf(" -b: produce bam format output file. (default: off)\n");
printf(" -q: set it quiet\n");
strcpy(refName, argv[1]);
read_type = atoi(argv[2]);
- strcpy(imdName, argv[3]);
- strcpy(outName, argv[4]);
+ strcpy(outName, argv[3]);
+ sprintf(imdName, "%s.temp/%s", argv[3], argv[4]);
+ sprintf(statName, "%s.stat/%s", argv[3], argv[4]);
nThreads = 1;
sprintf(tiF, "%s.ti", refName);
transcripts.readFrom(tiF);
- sprintf(cntF, "%s.cnt", imdName);
+ sprintf(cntF, "%s.cnt", statName);
fin.open(cntF);
if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", cntF); exit(-1); }
fin>>N0>>N1>>N2>>N_tot;
#include<sstream>
#include<vector>
-#include "randomc.h"
+#include "boost/random.hpp"
+
#include "utils.h"
#include "Model.h"
int m, M, N0, N1, nHits;
double totc;
int BURNIN, CHAINLEN, GAP;
+char imdName[STRLEN], statName[STRLEN];
char thetaF[STRLEN], ofgF[STRLEN], groupF[STRLEN], refF[STRLEN], modelF[STRLEN];
char cvsF[STRLEN];
bool quiet;
vector<double> arr;
-CRandomMersenne rg(time(NULL));
-void load_data(char* reference_name, char* sample_name, char* imdName) {
+boost::mt19937 rng(time(NULL));
+boost::uniform_01<boost::mt19937> rg(rng);
+
+void load_data(char* reference_name, char* statName, char* imdName) {
ifstream fin;
string line;
int tmpVal;
m = gi.getm();
//load thetaF
- sprintf(thetaF, "%s.theta",sample_name);
+ sprintf(thetaF, "%s.theta",statName);
fin.open(thetaF);
if (!fin.is_open()) {
fprintf(stderr, "Cannot open %s!\n", thetaF);
// If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0...len-1
int sample(vector<double>& arr, int len) {
int l, r, mid;
- double prb = rg.Random() * arr[len - 1];
+ double prb = rg() * arr[len - 1];
l = 0; r = len - 1;
while (l <= r) {
int main(int argc, char* argv[]) {
if (argc < 7) {
- printf("Usage: rsem-run-gibbs reference_name sample_name imdName BURNIN CHAINLEN GAP [-q]\n");
+ printf("Usage: rsem-run-gibbs reference_name sample_name sampleToken BURNIN CHAINLEN GAP [-q]\n");
exit(-1);
}
BURNIN = atoi(argv[4]);
CHAINLEN = atoi(argv[5]);
GAP = atoi(argv[6]);
- load_data(argv[1], argv[2], argv[3]);
+ sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
+ sprintf(statName, "%s.stat/%s", argv[2], argv[3]);
+ load_data(argv[1], statName, imdName);
quiet = false;
if (argc > 7 && !strcmp(argv[7], "-q")) {
verbose = !quiet;
init();
- Gibbs(argv[3]);
+ Gibbs(imdName);
- sprintf(modelF, "%s.model", argv[2]);
+ sprintf(modelF, "%s.model", statName);
FILE *fi = fopen(modelF, "r");
if (fi == NULL) { fprintf(stderr, "Cannot open %s!\n", modelF); exit(-1); }
fscanf(fi, "%d", &model_type);
fclose(fi);
switch(model_type) {
- case 0 : writeEstimatedParameters<SingleModel>(modelF, argv[3]); break;
- case 1 : writeEstimatedParameters<SingleQModel>(modelF, argv[3]); break;
- case 2 : writeEstimatedParameters<PairedEndModel>(modelF, argv[3]); break;
- case 3 : writeEstimatedParameters<PairedEndQModel>(modelF, argv[3]); break;
+ case 0 : writeEstimatedParameters<SingleModel>(modelF, imdName); break;
+ case 1 : writeEstimatedParameters<SingleQModel>(modelF, imdName); break;
+ case 2 : writeEstimatedParameters<PairedEndModel>(modelF, imdName); break;
+ case 3 : writeEstimatedParameters<PairedEndQModel>(modelF, imdName); break;
}
return 0;
-/* ReadType here means if the read is unalignable, alignable or aligned too much. It is NOT single read or paired-end read */
+/* ReadType here means if the read is unalignable, alignable or aligned too much. It is NOT siheaderngle read or paired-end read */
#ifndef SAMPARSER_H_
#define SAMPARSER_H_
#include "sam/bam.h"
#include "sam/sam.h"
+
#include "utils.h"
+
+#include "Transcript.h"
+#include "Transcripts.h"
+
#include "SingleRead.h"
#include "SingleReadQ.h"
#include "PairedEndRead.h"
class SamParser {
public:
- SamParser(char, const char*, const char* = 0);
+ SamParser(char, const char*, Transcripts&, const char* = 0);
~SamParser();
/**
bam_header_t *header;
bam1_t *b, *b2;
+
//tag used by aligner
static char rtTag[STRLEN];
//0 ~ N0 1 ~ N1 2 ~ N2
int getReadType(const bam1_t*);
int getReadType(const bam1_t*, const bam1_t*); // for paired-end reads
+
+ bool check(bam1_t *b) {
+ return (b->core.n_cigar == 1) && ((*bam1_cigar(b) & BAM_CIGAR_MASK) == BAM_CMATCH) && (b->core.l_qseq == (int32_t)(*bam1_cigar(b) >> BAM_CIGAR_SHIFT));
+ }
};
char SamParser::rtTag[STRLEN] = ""; // default : no tag, thus no Type 2 reads
// aux, if not 0, points to the file name of fn_list
-SamParser::SamParser(char inpType, const char* inpF, const char* aux) {
+SamParser::SamParser(char inpType, const char* inpF, Transcripts& transcripts, const char* aux) {
switch(inpType) {
case 'b': sam_in = samopen(inpF, "rb", aux); break;
case 's': sam_in = samopen(inpF, "r", aux); break;
header = sam_in->header;
if (header == 0) { fprintf(stderr, "Fail to parse sam header!\n"); exit(-1); }
+ // Check if the reference used for aligner is the transcript set RSEM generated
+ if (transcripts.getM() != header->n_targets) {
+ fprintf(stderr, "Number of transcripts does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n");
+ exit(-1);
+ }
+ for (int i = 0; i < header->n_targets; i++) {
+ const Transcript& transcript = transcripts.getTranscriptAt(i + 1);
+ // If update int to long, chance the (int) conversion
+ if (transcript.getTranscriptID().compare(header->target_name[i]) != 0 || transcript.getLength() != (int)header->target_len[i]) {
+ fprintf(stderr, "Transcript information does not match! Please align reads against the transcript set and use RSEM generated reference for your aligner!\n");
+ exit(-1);
+ }
+ }
+
b = bam_init1();
b2 = bam_init1();
}
else val = 5;
if (readType == 1) {
+ if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
+
if (getDir(b) > 0) {
hit = SingleHit(b->core.tid + 1, b->core.pos);
}
else val = 5;
if (readType == 1) {
+ if (!check(b)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
+
if (getDir(b) > 0) {
hit = SingleHit(b->core.tid + 1, b->core.pos);
}
else val = 5;
if (readType == 1) {
+ if (!check(mp1) || !check(mp2)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
+
if (mp1->core.tid != mp2->core.tid) {
fprintf(stderr, "The two reads do not come from the same pair!");
exit(-1);
else val = 5;
if (readType == 1) {
+ if (!check(mp1) || !check(mp2)) { fprintf(stderr, "RSEM does not support gapped alignments, sorry!\n"); exit(-1); }
+
if (mp1->core.tid != mp2->core.tid) {
fprintf(stderr, "The two reads do not come from the same pair!");
exit(-1);
int M, m;
Refs refs;
GroupInfo gi;
+char imdName[STRLEN], statName[STRLEN];
char modelF[STRLEN], groupF[STRLEN], refF[STRLEN];
vector<double> eel; //expected effective lengths
if (verbose) { printf("All credibility intervals are calculated!\n"); }
- sprintf(outF, "%s.tau_denoms", imdName);
- fo = fopen(outF, "w");
- fprintf(fo, "%d\n", nSamples);
- for (int i = 0; i < nSamples; i++) fprintf(fo, "%.15g ", tau_denoms[i]);
- fprintf(fo, "\n");
- fclose(fo);
+ sprintf(outF, "%s.tau_denoms", imdName);
+ fo = fopen(outF, "w");
+ fprintf(fo, "%d\n", nSamples);
+ for (int i = 0; i < nSamples; i++) fprintf(fo, "%.15g ", tau_denoms[i]);
+ fprintf(fo, "\n");
+ fclose(fo);
}
int main(int argc, char* argv[]) {
if (argc < 7) {
- printf("Usage: rsem-calculate-credibility-intervals reference_name sample_name imdName confidence nSpC nMB[-q]\n");
+ printf("Usage: rsem-calculate-credibility-intervals reference_name sample_name sampleToken confidence nSpC nMB[-q]\n");
exit(-1);
}
}
verbose = !quiet;
- sprintf(modelF, "%s.model", argv[2]);
+ sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
+ sprintf(statName, "%s.stat/%s", argv[2], argv[3]);
+
+ sprintf(modelF, "%s.model", statName);
FILE *fi = fopen(modelF, "r");
if (fi == NULL) { fprintf(stderr, "Cannot open %s!\n", modelF); exit(-1); }
fscanf(fi, "%d", &model_type);
gi.load(groupF);
m = gi.getm();
- sprintf(tmpF, "%s.tmp", argv[3]);
- sprintf(cvsF, "%s.countvectors", argv[3]);
+ sprintf(tmpF, "%s.tmp", imdName);
+ sprintf(cvsF, "%s.countvectors", imdName);
switch(model_type) {
case 0 : sampling<SingleModel>(); break;
case 3 : sampling<PairedEndQModel>(); break;
}
- generateResults(argv[3]);
+ generateResults(imdName);
delete[] tau_denoms;
- sprintf(command, "rm -f %s", tmpF);
+ sprintf(command, "rm -f %s", tmpF);
int status = system(command);
if (status != 0) {
- fprintf(stderr, "Cannot delete %s!\n", tmpF);
- exit(-1);
+ fprintf(stderr, "Cannot delete %s!\n", tmpF);
+ exit(-1);
}
return 0;
HitContainer.h : GroupInfo.h
-SamParser.h : sam/sam.h sam/bam.h utils.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h
+SamParser.h : sam/sam.h sam/bam.h utils.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h Transcript.h Transcripts.h
rsem-parse-alignments : parseIt.o sam/libbam.a
$(CC) -o rsem-parse-alignments parseIt.o sam/libbam.a -lz
-parseIt.o : utils.h GroupInfo.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h parseIt.cpp
+parseIt.o : utils.h GroupInfo.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h SingleHit.h PairedEndHit.h HitContainer.h SamParser.h Transcript.h Transcripts.h sam/sam.h sam/bam.h parseIt.cpp
$(CC) $(COFLAGS) parseIt.cpp
rsem-bam2wig : sam/bam.h sam/sam.h sam/libbam.a bam2wig.cpp
$(CC) -O3 -Wall bam2wig.cpp sam/libbam.a -lz -o rsem-bam2wig
-mersenne.o : mersenne.cpp randomc.h
- $(CC) $(COFLAGS) mersenne.cpp
+rsem-simulate-reads : simulation.o
+ $(CC) -o rsem-simulate-reads simulation.o
-simul_mersenne.h : simul.h randomc.h
-
-rsem-simulate-reads : simulation.o mersenne.o
- $(CC) -o rsem-simulate-reads simulation.o mersenne.o
-
-simulation.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h simul_mersenne.h randomc.h simulation.cpp
+simulation.o : utils.h Read.h SingleRead.h SingleReadQ.h PairedEndRead.h PairedEndReadQ.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h Refs.h RefSeq.h GroupInfo.h Transcript.h Transcripts.h Orientation.h LenDist.h RSPD.h QualDist.h QProfile.h NoiseQProfile.h Profile.h NoiseProfile.h simul.h boost/random.hpp simulation.cpp
$(CC) $(COFLAGS) simulation.cpp
-rsem-run-gibbs : Gibbs.o mersenne.o
- $(CC) -o rsem-run-gibbs Gibbs.o mersenne.o
+rsem-run-gibbs : Gibbs.o
+ $(CC) -o rsem-run-gibbs Gibbs.o
#some header files are omitted
-Gibbs.o : randomc.h utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h Gibbs.cpp
+Gibbs.o : utils.h Model.h SingleModel.h SingleQModel.h PairedEndModel.h PairedEndQModel.h RefSeq.h RefSeqPolicy.h PolyARules.h Refs.h GroupInfo.h boost/random.hpp Gibbs.cpp
$(CC) $(COFLAGS) Gibbs.cpp
rsem-calculate-credibility-intervals : calcCI.o
+++ /dev/null
-/************************** MERSENNE.CPP ******************** AgF 2007-08-01 *\r
-* Random Number generator 'Mersenne Twister' *\r
-* *\r
-* This random number generator is described in the article by *\r
-* M. Matsumoto & T. Nishimura, in: *\r
-* ACM Transactions on Modeling and Computer Simulation, *\r
-* vol. 8, no. 1, 1998, pp. 3-30. *\r
-* Details on the initialization scheme can be found at *\r
-* http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html *\r
-* *\r
-* Experts consider this an excellent random number generator. *\r
-* *\r
-* © 2001 - 2007 A. Fog. Published under the GNU General Public License *\r
-* (www.gnu.org/copyleft/gpl.html) with the further restriction that it *\r
-* cannot be used for gambling applications. *\r
-*****************************************************************************/\r
-\r
-#include "randomc.h"\r
-\r
-void CRandomMersenne::Init0(uint32 seed) {\r
- // Detect computer architecture\r
- union {double f; uint32 i[2];} convert;\r
- convert.f = 1.0;\r
- if (convert.i[1] == 0x3FF00000) Architecture = LITTLE_ENDIAN1;\r
- else if (convert.i[0] == 0x3FF00000) Architecture = BIG_ENDIAN1;\r
- else Architecture = NONIEEE;\r
-\r
- // Seed generator\r
- mt[0]= seed;\r
- for (mti=1; mti < MERS_N; mti++) {\r
- mt[mti] = (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);\r
- }\r
-}\r
-\r
-void CRandomMersenne::RandomInit(uint32 seed) {\r
- // Initialize and seed\r
- Init0(seed);\r
-\r
- // Randomize some more\r
- for (int i = 0; i < 37; i++) BRandom();\r
-}\r
-\r
-\r
-void CRandomMersenne::RandomInitByArray(uint32 seeds[], int length) {\r
- // Seed by more than 32 bits\r
- int i, j, k;\r
-\r
- // Initialize\r
- Init0(19650218);\r
-\r
- if (length <= 0) return;\r
-\r
- // Randomize mt[] using whole seeds[] array\r
- i = 1; j = 0;\r
- k = (MERS_N > length ? MERS_N : length);\r
- for (; k; k--) {\r
- mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL)) + seeds[j] + j;\r
- i++; j++;\r
- if (i >= MERS_N) {mt[0] = mt[MERS_N-1]; i=1;}\r
- if (j >= length) j=0;}\r
- for (k = MERS_N-1; k; k--) {\r
- mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL)) - i;\r
- if (++i >= MERS_N) {mt[0] = mt[MERS_N-1]; i=1;}}\r
- mt[0] = 0x80000000UL; // MSB is 1; assuring non-zero initial array\r
-\r
- // Randomize some more\r
- mti = 0;\r
- for (int i = 0; i <= MERS_N; i++) BRandom();\r
-}\r
-\r
-\r
-uint32 CRandomMersenne::BRandom() {\r
- // Generate 32 random bits\r
- uint32 y;\r
-\r
- if (mti >= MERS_N) {\r
- // Generate MERS_N words at one time\r
- const uint32 LOWER_MASK = (1LU << MERS_R) - 1; // Lower MERS_R bits\r
- const uint32 UPPER_MASK = 0xFFFFFFFF << MERS_R; // Upper (32 - MERS_R) bits\r
- static const uint32 mag01[2] = {0, MERS_A};\r
-\r
- int kk;\r
- for (kk=0; kk < MERS_N-MERS_M; kk++) { \r
- y = (mt[kk] & UPPER_MASK) | (mt[kk+1] & LOWER_MASK);\r
- mt[kk] = mt[kk+MERS_M] ^ (y >> 1) ^ mag01[y & 1];}\r
-\r
- for (; kk < MERS_N-1; kk++) { \r
- y = (mt[kk] & UPPER_MASK) | (mt[kk+1] & LOWER_MASK);\r
- mt[kk] = mt[kk+(MERS_M-MERS_N)] ^ (y >> 1) ^ mag01[y & 1];} \r
-\r
- y = (mt[MERS_N-1] & UPPER_MASK) | (mt[0] & LOWER_MASK);\r
- mt[MERS_N-1] = mt[MERS_M-1] ^ (y >> 1) ^ mag01[y & 1];\r
- mti = 0;\r
- }\r
-\r
- y = mt[mti++];\r
-\r
-#if 1\r
- // Tempering (May be omitted):\r
- y ^= y >> MERS_U;\r
- y ^= (y << MERS_S) & MERS_B;\r
- y ^= (y << MERS_T) & MERS_C;\r
- y ^= y >> MERS_L;\r
-#endif\r
-\r
- return y;\r
-}\r
-\r
-\r
-double CRandomMersenne::Random() {\r
- // Output random float number in the interval 0 <= x < 1\r
- union {double f; uint32 i[2];} convert;\r
- uint32 r = BRandom(); // Get 32 random bits\r
- // The fastest way to convert random bits to floating point is as follows:\r
- // Set the binary exponent of a floating point number to 1+bias and set\r
- // the mantissa to random bits. This will give a random number in the \r
- // interval [1,2). Then subtract 1.0 to get a random number in the interval\r
- // [0,1). This procedure requires that we know how floating point numbers\r
- // are stored. The storing method is tested in function RandomInit and saved \r
- // in the variable Architecture.\r
-\r
- // This shortcut allows the compiler to optimize away the following switch\r
- // statement for the most common architectures:\r
-#if defined(_M_IX86) || defined(_M_X64) || defined(__LITTLE_ENDIAN__)\r
- Architecture = LITTLE_ENDIAN1;\r
-#elif defined(__BIG_ENDIAN__)\r
- Architecture = BIG_ENDIAN1;\r
-#endif\r
-\r
- switch (Architecture) {\r
- case LITTLE_ENDIAN1:\r
- convert.i[0] = r << 20;\r
- convert.i[1] = (r >> 12) | 0x3FF00000;\r
- return convert.f - 1.0;\r
- case BIG_ENDIAN1:\r
- convert.i[1] = r << 20;\r
- convert.i[0] = (r >> 12) | 0x3FF00000;\r
- return convert.f - 1.0;\r
- case NONIEEE: default: ;\r
- } \r
- // This somewhat slower method works for all architectures, including \r
- // non-IEEE floating point representation:\r
- return (double)r * (1./((double)(uint32)(-1L)+1.));\r
-}\r
-\r
-\r
-int CRandomMersenne::IRandom(int min, int max) {\r
- // Output random integer in the interval min <= x <= max\r
- // Relative error on frequencies < 2^-32\r
- if (max <= min) {\r
- if (max == min) return min; else return 0x80000000;\r
- }\r
- // Multiply interval with random and truncate\r
- int r = int((max - min + 1) * Random()) + min; \r
- if (r > max) r = max;\r
- return r;\r
-}\r
-\r
-\r
-int CRandomMersenne::IRandomX(int min, int max) {\r
- // Output random integer in the interval min <= x <= max\r
- // Each output value has exactly the same probability.\r
- // This is obtained by rejecting certain bit values so that the number\r
- // of possible bit values is divisible by the interval length\r
- if (max <= min) {\r
- if (max == min) return min; else return 0x80000000;\r
- }\r
-#ifdef INT64_DEFINED\r
- // 64 bit integers available. Use multiply and shift method\r
- uint32 interval; // Length of interval\r
- uint64 longran; // Random bits * interval\r
- uint32 iran; // Longran / 2^32\r
- uint32 remainder; // Longran % 2^32\r
-\r
- interval = uint32(max - min + 1);\r
- if (interval != LastInterval) {\r
- // Interval length has changed. Must calculate rejection limit\r
- // Reject when remainder = 2^32 / interval * interval\r
- // RLimit will be 0 if interval is a power of 2. No rejection then\r
- RLimit = uint32(((uint64)1 << 32) / interval) * interval - 1;\r
- LastInterval = interval;\r
- }\r
- do { // Rejection loop\r
- longran = (uint64)BRandom() * interval;\r
- iran = (uint32)(longran >> 32);\r
- remainder = (uint32)longran;\r
- } while (remainder > RLimit);\r
- // Convert back to signed and return result\r
- return (int32)iran + min;\r
-\r
-#else\r
- // 64 bit integers not available. Use modulo method\r
- uint32 interval; // Length of interval\r
- uint32 bran; // Random bits\r
- uint32 iran; // bran / interval\r
- uint32 remainder; // bran % interval\r
-\r
- interval = uint32(max - min + 1);\r
- if (interval != LastInterval) {\r
- // Interval length has changed. Must calculate rejection limit\r
- // Reject when iran = 2^32 / interval\r
- // We can't make 2^32 so we use 2^32-1 and correct afterwards\r
- RLimit = (uint32)0xFFFFFFFF / interval;\r
- if ((uint32)0xFFFFFFFF % interval == interval - 1) RLimit++;\r
- }\r
- do { // Rejection loop\r
- bran = BRandom();\r
- iran = bran / interval;\r
- remainder = bran % interval;\r
- } while (iran >= RLimit);\r
- // Convert back to signed and return result\r
- return (int32)remainder + min;\r
-\r
-#endif\r
-}\r
#include<iostream>
#include<fstream>
#include<string>
+#include<map>
#include "utils.h"
#include "GroupInfo.h"
+#include "Transcript.h"
+#include "Transcripts.h"
+
#include "SingleRead.h"
#include "SingleReadQ.h"
#include "PairedEndRead.h"
int nHits; // # of hits
int nUnique, nMulti, nIsoMulti;
char fn_list[STRLEN];
-char groupF[STRLEN];
+char groupF[STRLEN], tiF[STRLEN];
+char imdName[STRLEN];
char datF[STRLEN], cntF[STRLEN];
GroupInfo gi;
+Transcripts transcripts;
SamParser *parser;
ofstream hit_out;
ostream *cat[3][2]; // cat : category 1-dim 0 N0 1 N1 2 N2; 2-dim 0 mate1 1 mate2
char readOutFs[3][2][STRLEN];
-void init(const char* imdName, char alignFType, const char* alignF) {
+map<int, int> counter;
+map<int, int>::iterator iter;
- sprintf(datF, "%s.dat", imdName);
- sprintf(cntF, "%s.cnt", imdName);
+void init(const char* imdName, char alignFType, const char* alignF) {
char* aux = 0;
if (strcmp(fn_list, "")) aux = fn_list;
- parser = new SamParser(alignFType, alignF, aux);
+ parser = new SamParser(alignFType, alignF, transcripts, aux);
memset(cat, 0, sizeof(cat));
memset(readOutFs, 0, sizeof(readOutFs));
for (int j = 0; j < n_os; j++)
cat[i][j] = new ofstream(readOutFs[i][j]);
}
+
+ counter.clear();
}
//Do not allow duplicate for unalignable reads and supressed reads in SAM input
nMulti += hits.calcNumGeneMultiReads(gi);
nIsoMulti += hits.calcNumIsoformMultiReads();
hits.write(hit_out);
+
+ iter = counter.find(hits.getNHits());
+ if (iter != counter.end()) {
+ iter->second++;
+ }
+ else {
+ counter[hits.getNHits()] = 1;
+ }
}
hits.clear();
nMulti += hits.calcNumGeneMultiReads(gi);
nIsoMulti += hits.calcNumIsoformMultiReads();
hits.write(hit_out);
+
+ iter = counter.find(hits.getNHits());
+ if (iter != counter.end()) {
+ iter->second++;
+ }
+ else {
+ counter[hits.getNHits()] = 1;
+ }
}
nUnique = N[1] - nMulti;
int main(int argc, char* argv[]) {
bool quiet = false;
- if (argc < 5) {
- printf("Usage : rsem-parse-alignments refName imdName alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
+ if (argc < 6) {
+ printf("Usage : rsem-parse-alignments refName sampleName sampleToken alignFType('s' for sam, 'b' for bam) alignF [-t Type] [-l fn_list] [-tag tagName] [-q]\n");
exit(-1);
}
strcpy(fn_list, "");
read_type = 0;
- if (argc > 5) {
- for (int i = 5; i < argc; i++) {
+ if (argc > 6) {
+ for (int i = 6; i < argc; i++) {
if (!strcmp(argv[i], "-t")) {
read_type = atoi(argv[i + 1]);
}
verbose = !quiet;
- init(argv[2], argv[3][0], argv[4]);
-
sprintf(groupF, "%s.grp", argv[1]);
gi.load(groupF);
+ sprintf(tiF, "%s.ti", argv[1]);
+ transcripts.readFrom(tiF);
+
+ sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
+ sprintf(datF, "%s.dat", imdName);
+ sprintf(cntF, "%s.stat/%s.cnt", argv[2], argv[3]);
+
+ init(imdName, argv[4][0], argv[5]);
hit_out.open(datF);
fout<<N[0]<<" "<<N[1]<<" "<<N[2]<<" "<<(N[0] + N[1] + N[2])<<endl;
fout<<nUnique<<" "<<nMulti<<" "<<nIsoMulti<<endl;
fout<<nHits<<" "<<read_type<<endl;
+ for (iter = counter.begin(); iter != counter.end(); iter++) {
+ fout<<iter->first<<'\t'<<iter->second<<endl;
+ }
fout.close();
release();
+++ /dev/null
-/*************************** RANDOMC.H ***************** 2007-09-22 Agner Fog *\r
-*\r
-* This file contains class declarations and other definitions for the C++ \r
-* library of uniform random number generators.\r
-*\r
-* Overview of classes:\r
-* ====================\r
-*\r
-* class CRandomMersenne:\r
-* Random number generator of type Mersenne twister.\r
-* Source file mersenne.cpp\r
-*\r
-* class CRandomMother:\r
-* Random number generator of type Mother-of-All (Multiply with carry).\r
-* Source file mother.cpp\r
-*\r
-*\r
-* Member functions (methods):\r
-* ===========================\r
-*\r
-* All these classes have identical member functions:\r
-*\r
-* Constructor(uint32 seed):\r
-* The seed can be any integer. Usually the time is used as seed.\r
-* Executing a program twice with the same seed will give the same sequence of\r
-* random numbers. A different seed will give a different sequence.\r
-*\r
-* void RandomInit(uint32 seed);\r
-* Re-initializes the random number generator with a new seed.\r
-*\r
-* void RandomInitByArray(uint32 seeds[], int length);\r
-* In CRandomMersenne only: Use this function if you want to initialize with\r
-* a seed with more than 32 bits. All bits in the seeds[] array will influence\r
-* the sequence of random numbers generated. length is the number of entries\r
-* in the seeds[] array.\r
-*\r
-* double Random();\r
-* Gives a floating point random number in the interval 0 <= x < 1.\r
-* The resolution is 32 bits in CRandomMother and CRandomMersenne.\r
-*\r
-* int IRandom(int min, int max);\r
-* Gives an integer random number in the interval min <= x <= max.\r
-* (max-min < MAXINT).\r
-* The precision is 2^-32 (defined as the difference in frequency between \r
-* possible output values). The frequencies are exact if max-min+1 is a\r
-* power of 2.\r
-*\r
-* int IRandomX(int min, int max);\r
-* Same as IRandom, but exact. In CRandomMersenne only.\r
-* The frequencies of all output values are exactly the same for an \r
-* infinitely long sequence. (Only relevant for extremely long sequences).\r
-*\r
-* uint32 BRandom();\r
-* Gives 32 random bits. \r
-*\r
-*\r
-* Example:\r
-* ========\r
-* The file EX-RAN.CPP contains an example of how to generate random numbers.\r
-*\r
-*\r
-* Optimized version:\r
-* ==================\r
-* Faster versions of these random number generators are provided as function\r
-* libraries in asmlib.zip. These function libraries are coded in assembly\r
-* language and support only x86 platforms, including 32-bit and 64-bit\r
-* Windows, Linux, BSD, Mac OS-X (Intel based). Use asmlibran.h from asmlib.zip\r
-*\r
-*\r
-* Non-uniform random number generators:\r
-* =====================================\r
-* Random number generators with various non-uniform distributions are available\r
-* in stocc.zip (www.agner.org/random).\r
-*\r
-*\r
-* Further documentation:\r
-* ======================\r
-* The file randomc.htm contains further documentation on these random number\r
-* generators.\r
-*\r
-*\r
-* Copyright:\r
-============\r
-* © 1997 - 2007 Agner Fog. All software in this library is published under the\r
-* GNU General Public License with the further restriction that it cannot be \r
-* used for gambling applications. See licence.htm\r
-*******************************************************************************/\r
-\r
-#ifndef RANDOMC_H\r
-#define RANDOMC_H\r
-\r
-\r
-// Define 32 bit signed and unsigned integers.\r
-// Change these definitions, if necessary, to match a particular platform\r
-#if defined(_WIN16) || defined(__MSDOS__) || defined(_MSDOS) \r
- // 16 bit systems use long int for 32 bit integer\r
- typedef long int int32; // 32 bit signed integer\r
- typedef unsigned long int uint32; // 32 bit unsigned integer\r
-#else\r
- // Most other systems use int for 32 bit integer\r
- typedef int int32; // 32 bit signed integer\r
- typedef unsigned int uint32; // 32 bit unsigned integer\r
-#endif\r
-\r
-// Define 64 bit signed and unsigned integers, if possible\r
-#if (defined(__WINDOWS__) || defined(_WIN32)) && (defined(_MSC_VER) || defined(__INTEL_COMPILER))\r
- // Microsoft and other compilers under Windows use __int64\r
- typedef __int64 int64; // 64 bit signed integer\r
- typedef unsigned __int64 uint64; // 64 bit unsigned integer\r
- #define INT64_DEFINED // Remember that int64 is defined\r
-#elif defined(__unix__) && (defined(_M_IX86) || defined(_M_X64))\r
- // Gnu and other compilers under Linux etc. use long long\r
- typedef long long int64; // 64 bit signed integer\r
- typedef unsigned long long uint64; // 64 bit unsigned integer\r
- #define INT64_DEFINED // Remember that int64 is defined\r
-#else\r
- // 64 bit integers not defined\r
- // You may include definitions for other platforms here\r
-#endif\r
-\r
-\r
-/***********************************************************************\r
-System-specific user interface functions\r
-***********************************************************************/\r
-\r
-void EndOfProgram(void); // System-specific exit code (userintf.cpp)\r
-\r
-void FatalError(char * ErrorText); // System-specific error reporting (userintf.cpp)\r
-\r
-\r
-/***********************************************************************\r
-Define random number generator classes\r
-***********************************************************************/\r
-\r
-class CRandomMersenne { // Encapsulate random number generator\r
-#if 0\r
- // Define constants for type MT11213A:\r
-#define MERS_N 351\r
-#define MERS_M 175\r
-#define MERS_R 19\r
-#define MERS_U 11\r
-#define MERS_S 7\r
-#define MERS_T 15\r
-#define MERS_L 17\r
-#define MERS_A 0xE4BD75F5\r
-#define MERS_B 0x655E5280\r
-#define MERS_C 0xFFD58000\r
-#else \r
- // or constants for type MT19937:\r
-#define MERS_N 624\r
-#define MERS_M 397\r
-#define MERS_R 31\r
-#define MERS_U 11\r
-#define MERS_S 7\r
-#define MERS_T 15\r
-#define MERS_L 18\r
-#define MERS_A 0x9908B0DF\r
-#define MERS_B 0x9D2C5680\r
-#define MERS_C 0xEFC60000\r
-#endif\r
-public:\r
- CRandomMersenne(uint32 seed) { // Constructor\r
- RandomInit(seed); LastInterval = 0;}\r
- void RandomInit(uint32 seed); // Re-seed\r
- void RandomInitByArray(uint32 seeds[], int length); // Seed by more than 32 bits\r
- int IRandom (int min, int max); // Output random integer\r
- int IRandomX(int min, int max); // Output random integer, exact\r
- double Random(); // Output random float\r
- uint32 BRandom(); // Output random bits\r
-private:\r
- void Init0(uint32 seed); // Basic initialization procedure\r
- uint32 mt[MERS_N]; // State vector\r
- int mti; // Index into mt\r
- uint32 LastInterval; // Last interval length for IRandomX\r
- uint32 RLimit; // Rejection limit used by IRandomX\r
- enum TArch {LITTLE_ENDIAN1, BIG_ENDIAN1, NONIEEE}; // Definition of architecture\r
- TArch Architecture; // Conversion to float depends on architecture\r
-}; \r
-\r
-\r
-class CRandomMother { // Encapsulate random number generator\r
-public:\r
- void RandomInit(uint32 seed); // Initialization\r
- int IRandom(int min, int max); // Get integer random number in desired interval\r
- double Random(); // Get floating point random number\r
- uint32 BRandom(); // Output random bits\r
- CRandomMother(uint32 seed) { // Constructor\r
- RandomInit(seed);}\r
-protected:\r
- uint32 x[5]; // History buffer\r
-};\r
-\r
-#endif\r
my $mate2_list = "";
my $inpF = "";
-my ($refName, $taskName, $tmp_dir, $imdName) = ();
+my ($refName, $sampleName, $sampleToken, $temp_dir, $stat_dir, $imdName) = ();
my $gap = 32;
if ($paired_end) {
if ($is_sam || $is_bam) { $inpF = $ARGV[0]; }
else {$mate1_list = $ARGV[0]; }
$refName = $ARGV[1];
- $taskName = $ARGV[2];
+ $sampleName = $ARGV[2];
}
else {
$mate1_list = $ARGV[0];
$mate2_list = $ARGV[1];
$refName = $ARGV[2];
- $taskName = $ARGV[3];
+ $sampleName = $ARGV[3];
}
-$tmp_dir = $taskName.".temp";
-my $pos = rindex($taskName, '/');
-if ($pos < 0) {
- $imdName = "$tmp_dir/$taskName";
-}
-else {
- $imdName = $tmp_dir."/".substr($taskName, $pos + 1);
-}
+my $pos = rindex($sampleName, '/');
+if ($pos < 0) { $sampleToken = $sampleName; }
+else { $sampleToken = substr($sampleName, $pos + 1); }
+
+$temp_dir = "$sampleName.temp";
+$stat_dir = "$sampleName.stat";
+
+if (!(-d $temp_dir) && !mkdir($temp_dir)) { print "Fail to create folder $temp_dir.\n"; exit(-1); }
+if (!(-d $stat_dir) && !mkdir($stat_dir)) { print "Fail to create folder $stat_dir.\n"; exit(-1); }
+
+$imdName = "$temp_dir/$sampleToken";
if (!$is_sam && !$is_bam && $phred33 + $phred64 + $solexa == 0) { $phred33 = 1; }
my ($mate_minL, $mate_maxL) = (1, $maxL);
-if (!(-d $tmp_dir) && !mkdir($tmp_dir)) { print "Fail to create the directory.\n"; exit(-1); }
-
if ($bowtie_path ne "") { $bowtie_path .= "/"; }
my ($fn, $dir, $suf) = fileparse($0);
$is_sam = 1; # output of bowtie is a sam file
}
-$command = $dir."rsem-parse-alignments $refName $imdName";
+$command = $dir."rsem-parse-alignments $refName $sampleName $sampleToken";
my $samInpType;
if ($is_sam) { $samInpType = "s"; }
print OUTPUT "$L\n";
close(OUTPUT);
-$command = $dir."rsem-run-em $refName $read_type $imdName $taskName -p $nThreads";
+$command = $dir."rsem-run-em $refName $read_type $sampleName $sampleToken -p $nThreads";
if ($genBamF) {
$command .= " -b $samInpType $inpF";
if ($fn_list ne "") { $command .= " 1 $fn_list"; }
print "\n";
if ($genBamF) {
- $command = $dir."sam/samtools sort $taskName.bam $taskName.sorted";
+ $command = $dir."sam/samtools sort $sampleName.bam $sampleName.sorted";
print "$command\n";
$status = system($command);
if ($status != 0) {
exit(-1);
}
print "\n";
- $command = $dir."sam/samtools index $taskName.sorted.bam";
+ $command = $dir."sam/samtools index $sampleName.sorted.bam";
print "$command\n";
$status = system($command);
if ($status != 0) {
print "\n";
}
-&collectResults("$imdName.iso_res", "$taskName.isoforms.results"); # isoform level
-&collectResults("$imdName.gene_res", "$taskName.genes.results"); # gene level
+&collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+&collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
if ($calcCI) {
- $command = $dir."rsem-run-gibbs $refName $taskName $imdName $BURNIN $CHAINLEN $SAMPLEGAP";
+ $command = $dir."rsem-run-gibbs $refName $sampleName $sampleToken $BURNIN $CHAINLEN $SAMPLEGAP";
if ($quiet) { $command .= " -q"; }
print "$command\n";
$status = system($command);
}
print "\n";
- system("mv $taskName.isoforms.results $imdName.isoforms.results.bak1");
- system("mv $taskName.genes.results $imdName.genes.results.bak1");
- &collectResults("$imdName.iso_res", "$taskName.isoforms.results"); # isoform level
- &collectResults("$imdName.gene_res", "$taskName.genes.results"); # gene level
+ system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak1");
+ system("mv $sampleName.genes.results $imdName.genes.results.bak1");
+ &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+ &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
- $command = $dir."rsem-calculate-credibility-intervals $refName $taskName $imdName $CONFIDENCE $NSPC $NMB";
+ $command = $dir."rsem-calculate-credibility-intervals $refName $sampleName $sampleToken $CONFIDENCE $NSPC $NMB";
if ($quiet) { $command .= " -q"; }
print "$command\n";
$status = system($command);
}
print "\n";
- system("mv $taskName.isoforms.results $imdName.isoforms.results.bak2");
- system("mv $taskName.genes.results $imdName.genes.results.bak2");
- &collectResults("$imdName.iso_res", "$taskName.isoforms.results"); # isoform level
- &collectResults("$imdName.gene_res", "$taskName.genes.results"); # gene level
+ system("mv $sampleName.isoforms.results $imdName.isoforms.results.bak2");
+ system("mv $sampleName.genes.results $imdName.genes.results.bak2");
+ &collectResults("$imdName.iso_res", "$sampleName.isoforms.results"); # isoform level
+ &collectResults("$imdName.gene_res", "$sampleName.genes.results"); # gene level
}
if (!$keep_intermediate_files) {
'rsem-prepare-reference', there will be no tab after the
tau_value field.
-=item B<sample_name.model> and B<sample_name.theta>
-
-Output files used by RSEM internally for tasks like simulation,
-compute credibility intervals etc.
-
=item B<sample_name.bam, sample_name.sorted.bam and sample_name.sorted.bam.bai>
Only generated when --out-bam is specified.
'sample_name.sorted.bam' and 'sample_name.sorted.bam.bai' are the
sorted BAM file and indices generated by samtools (included in RSEM package).
+=item B<sample_name.stat>
+
+This is a folder instead of a file. All model related statistics are stored in this folder. Use 'rsem-plot-model' can generate plots using this folder.
=back
argv <- commandArgs(TRUE)
if (length(argv) != 2) {
- cat("Usage: rsem-plot-model modelF outF\n")
+ cat("Usage: rsem-plot-model sample_name outF\n")
q(status = 1)
}
-con <- file(argv[1], open = "r")
+strvec <- strsplit(argv[1], split = "/")[[1]]
+token <- strvec[length(strvec)]
+
+modelF <- paste(argv[1], ".stat/", token, ".model")
+cntF <- paste(argv[1], ".stat/", token, ".cnt")
+
pdf(argv[2])
+con <- file(modelF, open = "r")
+
# model type and forward probability
model_type <- as.numeric(readLines(con, n = 4)[1])
legend("topleft", c("A", "C", "G", "T"), lty = 1:4, pch = 0:3, col = 1:4)
}
-dev.off()
close(con)
+
+pair <- read.table(file = cntF, skip = 3, sep = "\t")
+plot(pair[,1], pair[,2], xlab = "Number of Alignments", ylab = "Number of Reads", main = "Among alignable reads, distribution of # of alignments")
+
+dev.off()
#ifndef SIMUL_H_
#define SIMUL_H_
+#include<ctime>
#include<cassert>
+#include "boost/random.hpp"
class simul {
public:
- virtual ~simul() {}
+ simul() : rg(boost::mt19937(time(NULL))) {
+ }
+
// interval : [,)
// random number should be in [0, arr[len - 1])
// If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0 ... len - 1
return l;
}
- virtual double random() { return 0.0; };
+ double random() { return rg(); };
private:
+ boost::uniform_01<boost::mt19937> rg;
};
#endif /* SIMUL_H_ */
+++ /dev/null
-#ifndef SIMUL_MERSENNE_H_
-#define SIMUL_MERSENNE_H_
-
-#include<ctime>
-
-#include "randomc.h"
-#include "simul.h"
-
-class simul_mersenne : public simul {
-public:
- simul_mersenne() : rg(time(NULL)) { }
-
- virtual ~simul_mersenne() {}
- virtual double random() { return rg.Random(); }
-
-private:
- CRandomMersenne rg;
-};
-
-#endif /* SIMUL_MERSENNE_H_ */
#include "Transcript.h"
#include "Transcripts.h"
-#include "simul_mersenne.h"
+#include "simul.h"
using namespace std;
char refF[STRLEN], groupF[STRLEN], tiF[STRLEN];
char geneResF[STRLEN], isoResF[STRLEN];
-simul_mersenne sampler;
+simul sampler;
void genOutReadStreams(int type, char *outFN) {
switch(type) {