12 #include "my_assert.h"
16 #include "SingleModel.h"
17 #include "SingleQModel.h"
18 #include "PairedEndModel.h"
19 #include "PairedEndQModel.h"
22 #include "GroupInfo.h"
36 float lb, ub; // the interval is [lb, ub]
38 CIType() { lb = ub = 0.0; }
43 int start_gene_id, end_gene_id;
50 int nCV, nSpC, nSamples; // nCV: number of count vectors; nSpC: number of theta vectors sampled per count vector; nSamples: nCV * nSpC
55 char cvsF[STRLEN], tmpF[STRLEN], command[STRLEN];
57 CIType *iso_tpm, *gene_tpm, *iso_fpkm, *gene_fpkm;
62 char imdName[STRLEN], statName[STRLEN];
63 char modelF[STRLEN], groupF[STRLEN], refF[STRLEN];
65 vector<double> eel; //expected effective lengths
76 CIParams *ciParamsArray;
78 template<class ModelType>
79 void calcExpectedEffectiveLengths(ModelType& model) {
81 double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = \sigma_{j=1}^{i}pdf[i]*(lb+i)
83 model.getGLD().copyTo(pdf, cdf, lb, ub, span);
84 clen = new double[span + 1];
86 for (int i = 1; i <= span; i++) {
87 clen[i] = clen[i - 1] + pdf[i] * (lb + i);
90 eel.assign(M + 1, 0.0);
91 for (int i = 1; i <= M; i++) {
92 int totLen = refs.getRef(i).getTotLen();
93 int fullLen = refs.getRef(i).getFullLen();
94 int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
95 int pos2 = max(min(totLen, ub) - lb, 0);
97 if (pos2 == 0) { eel[i] = 0.0; continue; }
99 eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
101 if (eel[i] < MINEEL) { eel[i] = 0.0; }
109 void* sample_theta_from_c(void* arg) {
114 gamma_generator **rgs;
116 Params *params = (Params*)arg;
117 FILE *fi = params->fi;
118 const double *mw = params->mw;
120 cvec = new int[M + 1];
121 theta = new double[M + 1];
122 gammas = new gamma_dist*[M + 1];
123 rgs = new gamma_generator*[M + 1];
124 tpm = new float[M + 1];
125 float l_bar; // the mean transcript length over the sample
128 while (fscanf(fi, "%d", &cvec[0]) == 1) {
129 for (int j = 1; j <= M; j++) assert(fscanf(fi, "%d", &cvec[j]) == 1);
133 for (int j = 0; j <= M; j++) {
134 gammas[j] = new gamma_dist(cvec[j]);
135 rgs[j] = new gamma_generator(*(params->engine), *gammas[j]);
138 for (int i = 0; i < nSpC; i++) {
140 for (int j = 0; j <= M; j++) {
141 theta[j] = ((j == 0 || (eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0);
144 assert(sum >= EPSILON);
145 for (int j = 0; j <= M; j++) theta[j] /= sum;
149 for (int j = 1; j <= M; j++)
150 if (eel[j] >= EPSILON) {
151 tpm[j] = theta[j] / eel[j];
154 else assert(theta[j] < EPSILON);
155 assert(sum >= EPSILON);
156 l_bar = 0.0; // store mean effective length of the sample
157 for (int j = 1; j <= M; j++) { tpm[j] /= sum; l_bar += tpm[j] * eel[j]; tpm[j] *= 1e6; }
158 buffer->write(l_bar, tpm + 1); // ommit the first element in tpm
161 for (int j = 0; j <= M; j++) {
166 if (verbose && cnt % 100 == 0) { printf("Thread %d, %d count vectors are processed!\n", params->no, cnt); }
178 template<class ModelType>
179 void sample_theta_vectors_from_count_vectors() {
182 calcExpectedEffectiveLengths<ModelType>(model);
184 int num_threads = min(nThreads, nCV);
186 buffer = new Buffer(nMB, nSamples, M, l_bars, tmpF);
188 paramsArray = new Params[num_threads];
189 threads = new pthread_t[num_threads];
192 for (int i = 0; i < num_threads; i++) {
193 paramsArray[i].no = i;
194 sprintf(inpF, "%s%d", cvsF, i);
195 paramsArray[i].fi = fopen(inpF, "r");
196 paramsArray[i].engine = engineFactory::new_engine();
197 paramsArray[i].mw = model.getMW();
200 /* set thread attribute to be joinable */
201 pthread_attr_init(&attr);
202 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
204 for (int i = 0; i < num_threads; i++) {
205 rc = pthread_create(&threads[i], &attr, &sample_theta_from_c, (void*)(¶msArray[i]));
206 pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0) in sample_theta_vectors_from_count_vectors!");
208 for (int i = 0; i < num_threads; i++) {
209 rc = pthread_join(threads[i], NULL);
210 pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0) in sample_theta_vectors_from_count_vectors!");
213 /* destroy attribute */
214 pthread_attr_destroy(&attr);
217 for (int i = 0; i < num_threads; i++) {
218 fclose(paramsArray[i].fi);
219 delete paramsArray[i].engine;
221 delete[] paramsArray;
223 delete buffer; // Must delete here, force the content left in the buffer be written into the disk
225 if (verbose) { printf("Sampling is finished!\n"); }
228 void calcCI(int nSamples, float *samples, float &lb, float &ub) {
229 int p, q; // p pointer for lb, q pointer for ub;
231 int threshold = nSamples - (int(confidence * nSamples - 1e-8) + 1);
234 sort(samples, samples + nSamples);
236 p = 0; q = nSamples - 1;
240 while (newq > 0 && samples[newq - 1] == samples[newq]) newq--;
242 } while (newq >= 0 && nSamples - (newq + 1) <= threshold);
244 nOutside = nSamples - (q + 1);
246 lb = -1e30; ub = 1e30;
248 if (samples[q] - samples[p] < ub - lb) {
254 while (newp < nSamples - 1 && samples[newp] == samples[newp + 1]) newp++;
256 if (newp <= threshold) {
257 nOutside += newp - p;
259 while (nOutside > threshold && q < nSamples - 1) {
261 while (newq < nSamples - 1 && samples[newq] == samples[newq + 1]) newq++;
262 nOutside -= newq - q;
265 assert(nOutside <= threshold);
268 } while (p <= threshold);
271 void* calcCI_batch(void* arg) {
272 float *itsamples, *gtsamples, *ifsamples, *gfsamples;
274 CIParams *ciParams = (CIParams*)arg;
276 itsamples = new float[nSamples];
277 gtsamples = new float[nSamples];
278 ifsamples = new float[nSamples];
279 gfsamples = new float[nSamples];
281 fin.open(tmpF, ios::binary);
282 // minus 1 here for that theta0 is not written!
283 streampos pos = streampos(gi.spAt(ciParams->start_gene_id) - 1) * nSamples * FLOATSIZE;
284 fin.seekg(pos, ios::beg);
287 for (int i = ciParams->start_gene_id; i < ciParams->end_gene_id; i++) {
288 int b = gi.spAt(i), e = gi.spAt(i + 1);
289 memset(gtsamples, 0, FLOATSIZE * nSamples);
290 memset(gfsamples, 0, FLOATSIZE * nSamples);
291 for (int j = b; j < e; j++) {
292 for (int k = 0; k < nSamples; k++) {
293 fin.read((char*)(&itsamples[k]), FLOATSIZE);
294 gtsamples[k] += itsamples[k];
295 ifsamples[k] = 1e3 / l_bars[k] * itsamples[k];
296 gfsamples[k] += ifsamples[k];
298 calcCI(nSamples, itsamples, iso_tpm[j].lb, iso_tpm[j].ub);
299 calcCI(nSamples, ifsamples, iso_fpkm[j].lb, iso_fpkm[j].ub);
303 calcCI(nSamples, gtsamples, gene_tpm[i].lb, gene_tpm[i].ub);
304 calcCI(nSamples, gfsamples, gene_fpkm[i].lb, gene_fpkm[i].ub);
307 gene_tpm[i].lb = iso_tpm[b].lb; gene_tpm[i].ub = iso_tpm[b].ub;
308 gene_fpkm[i].lb = iso_fpkm[b].lb; gene_fpkm[i].ub = iso_fpkm[b].ub;
312 if (verbose && cnt % 1000 == 0) { printf("In thread %d, %d genes are processed for CI calculation!\n", ciParams->no, cnt); }
323 void calculate_credibility_intervals(char* imdName) {
326 int num_threads = nThreads;
328 iso_tpm = new CIType[M + 1];
329 gene_tpm = new CIType[m];
330 iso_fpkm = new CIType[M + 1];
331 gene_fpkm = new CIType[m];
334 int quotient = M / num_threads;
335 if (quotient < 1) { num_threads = M; quotient = 1; }
337 int num_isoforms = 0;
339 // A just so so strategy for paralleling
340 ciParamsArray = new CIParams[num_threads];
341 for (int i = 0; i < num_threads; i++) {
342 ciParamsArray[i].no = i;
343 ciParamsArray[i].start_gene_id = cur_gene_id;
346 while ((m - cur_gene_id > num_threads - i - 1) && (i == num_threads - 1 || num_isoforms < quotient)) {
347 num_isoforms += gi.spAt(cur_gene_id + 1) - gi.spAt(cur_gene_id);
351 ciParamsArray[i].end_gene_id = cur_gene_id;
354 threads = new pthread_t[num_threads];
356 /* set thread attribute to be joinable */
357 pthread_attr_init(&attr);
358 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
361 for (int i = 0; i < num_threads; i++) {
362 rc = pthread_create(&threads[i], &attr, &calcCI_batch, (void*)(&ciParamsArray[i]));
363 pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0) in calculate_credibility_intervals!");
365 for (int i = 0; i < num_threads; i++) {
366 rc = pthread_join(threads[i], NULL);
367 pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0) in calculate_credibility_intervals!");
370 // releasing resources
372 /* destroy attribute */
373 pthread_attr_destroy(&attr);
376 delete[] ciParamsArray;
378 //isoform level results
379 sprintf(outF, "%s.iso_res", imdName);
380 fo = fopen(outF, "a");
381 for (int i = 1; i <= M; i++)
382 fprintf(fo, "%.6g%c", iso_tpm[i].lb, (i < M ? '\t' : '\n'));
383 for (int i = 1; i <= M; i++)
384 fprintf(fo, "%.6g%c", iso_tpm[i].ub, (i < M ? '\t' : '\n'));
385 for (int i = 1; i <= M; i++)
386 fprintf(fo, "%.6g%c", iso_fpkm[i].lb, (i < M ? '\t' : '\n'));
387 for (int i = 1; i <= M; i++)
388 fprintf(fo, "%.6g%c", iso_fpkm[i].ub, (i < M ? '\t' : '\n'));
392 sprintf(outF, "%s.gene_res", imdName);
393 fo = fopen(outF, "a");
394 for (int i = 0; i < m; i++)
395 fprintf(fo, "%.6g%c", gene_tpm[i].lb, (i < m - 1 ? '\t' : '\n'));
396 for (int i = 0; i < m; i++)
397 fprintf(fo, "%.6g%c", gene_tpm[i].ub, (i < m - 1 ? '\t' : '\n'));
398 for (int i = 0; i < m; i++)
399 fprintf(fo, "%.6g%c", gene_fpkm[i].lb, (i < m - 1 ? '\t' : '\n'));
400 for (int i = 0; i < m; i++)
401 fprintf(fo, "%.6g%c", gene_fpkm[i].ub, (i < m - 1 ? '\t' : '\n'));
409 if (verbose) { printf("All credibility intervals are calculated!\n"); }
412 int main(int argc, char* argv[]) {
414 printf("Usage: rsem-calculate-credibility-intervals reference_name imdName statName confidence nCV nSpC nMB [-p #Threads] [-q]\n");
418 strcpy(imdName, argv[2]);
419 strcpy(statName, argv[3]);
421 confidence = atof(argv[4]);
423 nSpC = atoi(argv[6]);
428 for (int i = 8; i < argc; i++) {
429 if (!strcmp(argv[i], "-p")) nThreads = atoi(argv[i + 1]);
430 if (!strcmp(argv[i], "-q")) quiet = true;
434 sprintf(refF, "%s.seq", argv[1]);
435 refs.loadRefs(refF, 1);
437 sprintf(groupF, "%s.grp", argv[1]);
441 nSamples = nCV * nSpC;
442 assert(nSamples > 0 && M > 0); // for Buffter.h: (bufsize_type)nSamples
443 l_bars = new float[nSamples];
445 sprintf(tmpF, "%s.tmp", imdName);
446 sprintf(cvsF, "%s.countvectors", imdName);
448 sprintf(modelF, "%s.model", statName);
449 FILE *fi = fopen(modelF, "r");
450 general_assert(fi != NULL, "Cannot open " + cstrtos(modelF) + "!");
451 assert(fscanf(fi, "%d", &model_type) == 1);
456 case 0 : sample_theta_vectors_from_count_vectors<SingleModel>(); break;
457 case 1 : sample_theta_vectors_from_count_vectors<SingleQModel>(); break;
458 case 2 : sample_theta_vectors_from_count_vectors<PairedEndModel>(); break;
459 case 3 : sample_theta_vectors_from_count_vectors<PairedEndQModel>(); break;
463 calculate_credibility_intervals(imdName);
467 sprintf(command, "rm -f %s", tmpF);
468 int status = system(command);
470 fprintf(stderr, "Cannot delete %s!\n", tmpF);