12 #include "my_assert.h"
16 #include "SingleModel.h"
17 #include "SingleQModel.h"
18 #include "PairedEndModel.h"
19 #include "PairedEndQModel.h"
22 #include "GroupInfo.h"
23 #include "WriteResults.h"
38 int start_gene_id, end_gene_id;
42 float lb, ub; // the interval is [lb, ub]
44 CIType() { lb = ub = 0.0; }
51 int nCV, nSpC, nSamples; // nCV: number of count vectors; nSpC: number of theta vectors sampled per count vector; nSamples: nCV * nSpC
56 char cvsF[STRLEN], tmpF[STRLEN], command[STRLEN];
59 CIType *iso_tpm = NULL, *iso_fpkm = NULL;
60 CIType *gene_tpm, *gene_fpkm;
65 char refName[STRLEN], imdName[STRLEN], statName[STRLEN];
66 char modelF[STRLEN], groupF[STRLEN], refF[STRLEN];
72 vector<double> eel; //expected effective lengths
86 CIParams *ciParamsArray;
88 void* sample_theta_from_c(void* arg) {
93 gamma_generator **rgs;
95 Params *params = (Params*)arg;
96 FILE *fi = params->fi;
97 const double *mw = params->mw;
99 cvec = new int[M + 1];
100 theta = new double[M + 1];
101 gammas = new gamma_dist*[M + 1];
102 rgs = new gamma_generator*[M + 1];
103 tpm = new float[M + 1];
104 float l_bar; // the mean transcript length over the sample
107 while (fscanf(fi, "%d", &cvec[0]) == 1) {
108 for (int j = 1; j <= M; j++) assert(fscanf(fi, "%d", &cvec[j]) == 1);
113 for (int j = 0; j <= M; j++) {
114 gammas[j] = NULL; rgs[j] = NULL;
116 gammas[j] = new gamma_dist(cvec[j]);
117 rgs[j] = new gamma_generator(*(params->engine), *gammas[j]);
121 for (int i = 0; i < nSpC; i++) {
123 for (int j = 0; j <= M; j++) {
124 theta[j] = ((j == 0 || (cvec[j] > 0 && eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0);
127 assert(sum >= EPSILON);
128 for (int j = 0; j <= M; j++) theta[j] /= sum;
132 for (int j = 1; j <= M; j++)
133 if (eel[j] >= EPSILON) {
134 tpm[j] = theta[j] / eel[j];
137 else assert(theta[j] < EPSILON);
138 assert(sum >= EPSILON);
139 l_bar = 0.0; // store mean effective length of the sample
140 for (int j = 1; j <= M; j++) { tpm[j] /= sum; l_bar += tpm[j] * eel[j]; tpm[j] *= 1e6; }
141 buffer->write(l_bar, tpm + 1); // ommit the first element in tpm
144 for (int j = 0; j <= M; j++) {
145 if (gammas[j] != NULL) delete gammas[j];
146 if (rgs[j] != NULL) delete rgs[j];
149 if (verbose && cnt % 100 == 0) { printf("Thread %d, %d count vectors are processed!\n", params->no, cnt); }
161 template<class ModelType>
162 void sample_theta_vectors_from_count_vectors() {
165 calcExpectedEffectiveLengths<ModelType>(M, refs, model, eel);
167 int num_threads = min(nThreads, nCV);
169 buffer = new Buffer(nMB, nSamples, M, l_bars, tmpF);
171 paramsArray = new Params[num_threads];
172 threads = new pthread_t[num_threads];
175 hasSeed ? engineFactory::init(seed) : engineFactory::init();
176 for (int i = 0; i < num_threads; i++) {
177 paramsArray[i].no = i;
178 sprintf(inpF, "%s%d", cvsF, i);
179 paramsArray[i].fi = fopen(inpF, "r");
180 paramsArray[i].engine = engineFactory::new_engine();
181 paramsArray[i].mw = model.getMW();
183 engineFactory::finish();
185 /* set thread attribute to be joinable */
186 pthread_attr_init(&attr);
187 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
189 for (int i = 0; i < num_threads; i++) {
190 rc = pthread_create(&threads[i], &attr, &sample_theta_from_c, (void*)(¶msArray[i]));
191 pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0) in sample_theta_vectors_from_count_vectors!");
193 for (int i = 0; i < num_threads; i++) {
194 rc = pthread_join(threads[i], NULL);
195 pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0) in sample_theta_vectors_from_count_vectors!");
198 /* destroy attribute */
199 pthread_attr_destroy(&attr);
202 for (int i = 0; i < num_threads; i++) {
203 fclose(paramsArray[i].fi);
204 delete paramsArray[i].engine;
206 delete[] paramsArray;
208 delete buffer; // Must delete here, force the content left in the buffer be written into the disk
210 if (verbose) { printf("Sampling is finished!\n"); }
213 void calcCI(int nSamples, float *samples, float &lb, float &ub) {
214 int p, q; // p pointer for lb, q pointer for ub;
216 int threshold = nSamples - (int(confidence * nSamples - 1e-8) + 1);
219 sort(samples, samples + nSamples);
221 p = 0; q = nSamples - 1;
225 while (newq > 0 && samples[newq - 1] == samples[newq]) newq--;
227 } while (newq >= 0 && nSamples - (newq + 1) <= threshold);
229 nOutside = nSamples - (q + 1);
231 lb = -1e30; ub = 1e30;
233 if (samples[q] - samples[p] < ub - lb) {
239 while (newp < nSamples - 1 && samples[newp] == samples[newp + 1]) newp++;
241 if (newp <= threshold) {
242 nOutside += newp - p;
244 while (nOutside > threshold && q < nSamples - 1) {
246 while (newq < nSamples - 1 && samples[newq] == samples[newq + 1]) newq++;
247 nOutside -= newq - q;
250 assert(nOutside <= threshold);
253 } while (p <= threshold);
256 void* calcCI_batch(void* arg) {
257 float *tsamples, *fsamples;
258 float *itsamples = NULL, *ifsamples = NULL, *gtsamples, *gfsamples;
260 CIParams *ciParams = (CIParams*)arg;
261 int curtid, curaid, tid;
263 tsamples = new float[nSamples];
264 fsamples = new float[nSamples];
266 itsamples = new float[nSamples];
267 ifsamples = new float[nSamples];
269 gtsamples = new float[nSamples];
270 gfsamples = new float[nSamples];
272 fin.open(tmpF, ios::binary);
273 // minus 1 here for that theta0 is not written!
274 streampos pos = streampos(gi.spAt(ciParams->start_gene_id) - 1) * nSamples * FLOATSIZE;
275 fin.seekg(pos, ios::beg);
279 curtid = curaid = -1;
280 memset(itsamples, 0, FLOATSIZE * nSamples);
281 memset(ifsamples, 0, FLOATSIZE * nSamples);
283 for (int i = ciParams->start_gene_id; i < ciParams->end_gene_id; i++) {
284 int b = gi.spAt(i), e = gi.spAt(i + 1);
285 memset(gtsamples, 0, FLOATSIZE * nSamples);
286 memset(gfsamples, 0, FLOATSIZE * nSamples);
287 for (int j = b; j < e; j++) {
292 if (j - curaid > 1) {
293 calcCI(nSamples, itsamples, iso_tpm[curtid].lb, iso_tpm[curtid].ub);
294 calcCI(nSamples, ifsamples, iso_fpkm[curtid].lb, iso_fpkm[curtid].ub);
297 iso_tpm[curtid] = tpm[curaid];
298 iso_fpkm[curtid] = fpkm[curaid];
306 for (int k = 0; k < nSamples; k++) {
307 fin.read((char*)(&tsamples[k]), FLOATSIZE);
308 fsamples[k] = 1e3 / l_bars[k] * tsamples[k];
310 itsamples[k] += tsamples[k];
311 ifsamples[k] += fsamples[k];
313 gtsamples[k] += tsamples[k];
314 gfsamples[k] += fsamples[k];
316 calcCI(nSamples, tsamples, tpm[j].lb, tpm[j].ub);
317 calcCI(nSamples, fsamples, fpkm[j].lb, fpkm[j].ub);
321 calcCI(nSamples, gtsamples, gene_tpm[i].lb, gene_tpm[i].ub);
322 calcCI(nSamples, gfsamples, gene_fpkm[i].lb, gene_fpkm[i].ub);
325 gene_tpm[i] = tpm[b];
326 gene_fpkm[i] = fpkm[b];
330 if (verbose && cnt % 1000 == 0) { printf("In thread %d, %d genes are processed for CI calculation!\n", ciParams->no, cnt); }
334 if (alleleS && (curtid >= 0)) {
335 if (gi.spAt(ciParams->end_gene_id) - curaid > 1) {
336 calcCI(nSamples, itsamples, iso_tpm[curtid].lb, iso_tpm[curtid].ub);
337 calcCI(nSamples, ifsamples, iso_fpkm[curtid].lb, iso_fpkm[curtid].ub);
340 iso_tpm[curtid] = tpm[curaid];
341 iso_fpkm[curtid] = fpkm[curaid];
357 void calculate_credibility_intervals(char* imdName) {
360 int num_threads = nThreads;
362 tpm = new CIType[M + 1];
363 fpkm = new CIType[M + 1];
365 iso_tpm = new CIType[m_trans];
366 iso_fpkm = new CIType[m_trans];
368 gene_tpm = new CIType[m];
369 gene_fpkm = new CIType[m];
372 int quotient = M / num_threads;
373 if (quotient < 1) { num_threads = M; quotient = 1; }
375 int num_isoforms = 0;
377 // A just so so strategy for paralleling
378 ciParamsArray = new CIParams[num_threads];
379 for (int i = 0; i < num_threads; i++) {
380 ciParamsArray[i].no = i;
381 ciParamsArray[i].start_gene_id = cur_gene_id;
384 while ((m - cur_gene_id > num_threads - i - 1) && (i == num_threads - 1 || num_isoforms < quotient)) {
385 num_isoforms += gi.spAt(cur_gene_id + 1) - gi.spAt(cur_gene_id);
389 ciParamsArray[i].end_gene_id = cur_gene_id;
392 threads = new pthread_t[num_threads];
394 /* set thread attribute to be joinable */
395 pthread_attr_init(&attr);
396 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
399 for (int i = 0; i < num_threads; i++) {
400 rc = pthread_create(&threads[i], &attr, &calcCI_batch, (void*)(&ciParamsArray[i]));
401 pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0) in calculate_credibility_intervals!");
403 for (int i = 0; i < num_threads; i++) {
404 rc = pthread_join(threads[i], NULL);
405 pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0) in calculate_credibility_intervals!");
408 // releasing resources
410 /* destroy attribute */
411 pthread_attr_destroy(&attr);
414 delete[] ciParamsArray;
416 alleleS ? sprintf(outF, "%s.allele_res", imdName) : sprintf(outF, "%s.iso_res", imdName);
417 fo = fopen(outF, "a");
418 for (int i = 1; i <= M; i++)
419 fprintf(fo, "%.6g%c", tpm[i].lb, (i < M ? '\t' : '\n'));
420 for (int i = 1; i <= M; i++)
421 fprintf(fo, "%.6g%c", tpm[i].ub, (i < M ? '\t' : '\n'));
422 for (int i = 1; i <= M; i++)
423 fprintf(fo, "%.6g%c", fpkm[i].lb, (i < M ? '\t' : '\n'));
424 for (int i = 1; i <= M; i++)
425 fprintf(fo, "%.6g%c", fpkm[i].ub, (i < M ? '\t' : '\n'));
429 //isoform level results
430 sprintf(outF, "%s.iso_res", imdName);
431 fo = fopen(outF, "a");
432 for (int i = 0; i < m_trans; i++)
433 fprintf(fo, "%.6g%c", iso_tpm[i].lb, (i < m_trans - 1 ? '\t' : '\n'));
434 for (int i = 0; i < m_trans; i++)
435 fprintf(fo, "%.6g%c", iso_tpm[i].ub, (i < m_trans - 1 ? '\t' : '\n'));
436 for (int i = 0; i < m_trans; i++)
437 fprintf(fo, "%.6g%c", iso_fpkm[i].lb, (i < m_trans - 1 ? '\t' : '\n'));
438 for (int i = 0; i < m_trans; i++)
439 fprintf(fo, "%.6g%c", iso_fpkm[i].ub, (i < m_trans - 1 ? '\t' : '\n'));
444 sprintf(outF, "%s.gene_res", imdName);
445 fo = fopen(outF, "a");
446 for (int i = 0; i < m; i++)
447 fprintf(fo, "%.6g%c", gene_tpm[i].lb, (i < m - 1 ? '\t' : '\n'));
448 for (int i = 0; i < m; i++)
449 fprintf(fo, "%.6g%c", gene_tpm[i].ub, (i < m - 1 ? '\t' : '\n'));
450 for (int i = 0; i < m; i++)
451 fprintf(fo, "%.6g%c", gene_fpkm[i].lb, (i < m - 1 ? '\t' : '\n'));
452 for (int i = 0; i < m; i++)
453 fprintf(fo, "%.6g%c", gene_fpkm[i].ub, (i < m - 1 ? '\t' : '\n'));
465 if (verbose) { printf("All credibility intervals are calculated!\n"); }
468 int main(int argc, char* argv[]) {
470 printf("Usage: rsem-calculate-credibility-intervals reference_name imdName statName confidence nCV nSpC nMB [-p #Threads] [--seed seed] [-q]\n");
474 strcpy(refName, argv[1]);
475 strcpy(imdName, argv[2]);
476 strcpy(statName, argv[3]);
478 confidence = atof(argv[4]);
480 nSpC = atoi(argv[6]);
486 for (int i = 8; i < argc; i++) {
487 if (!strcmp(argv[i], "-p")) nThreads = atoi(argv[i + 1]);
488 if (!strcmp(argv[i], "--seed")) {
490 int len = strlen(argv[i + 1]);
492 for (int k = 0; k < len; k++) seed = seed * 10 + (argv[i + 1][k] - '0');
494 if (!strcmp(argv[i], "-q")) quiet = true;
498 sprintf(refF, "%s.seq", refName);
499 refs.loadRefs(refF, 1);
502 sprintf(groupF, "%s.grp", refName);
507 alleleS = isAlleleSpecific(refName, NULL, &ta);
508 if (alleleS) m_trans = ta.getm();
510 nSamples = nCV * nSpC;
511 assert(nSamples > 0 && M > 0); // for Buffter.h: (bufsize_type)nSamples
512 l_bars = new float[nSamples];
514 sprintf(tmpF, "%s.tmp", imdName);
515 sprintf(cvsF, "%s.countvectors", imdName);
517 sprintf(modelF, "%s.model", statName);
518 FILE *fi = fopen(modelF, "r");
519 general_assert(fi != NULL, "Cannot open " + cstrtos(modelF) + "!");
520 assert(fscanf(fi, "%d", &model_type) == 1);
525 case 0 : sample_theta_vectors_from_count_vectors<SingleModel>(); break;
526 case 1 : sample_theta_vectors_from_count_vectors<SingleQModel>(); break;
527 case 2 : sample_theta_vectors_from_count_vectors<PairedEndModel>(); break;
528 case 3 : sample_theta_vectors_from_count_vectors<PairedEndQModel>(); break;
532 calculate_credibility_intervals(imdName);