11 #include "my_assert.h"
15 #include "SingleModel.h"
16 #include "SingleQModel.h"
17 #include "PairedEndModel.h"
18 #include "PairedEndQModel.h"
21 #include "GroupInfo.h"
29 double *pme_c, *pve_c; //posterior mean and variance vectors on counts
30 double *pme_tpm, *pme_fpkm;
38 Item(int sid, double conprb) {
40 this->conprb = conprb;
51 int BURNIN, NSAMPLES, GAP;
52 char imdName[STRLEN], statName[STRLEN];
53 char thetaF[STRLEN], ofgF[STRLEN], groupF[STRLEN], refF[STRLEN], modelF[STRLEN];
59 vector<HIT_INT_TYPE> s;
65 vector<double> pme_c, pve_c; //global posterior mean and variance vectors on counts
66 vector<double> pme_tpm, pme_fpkm;
76 void load_data(char* reference_name, char* statName, char* imdName) {
82 sprintf(refF, "%s.seq", reference_name);
83 refs.loadRefs(refF, 1);
87 sprintf(groupF, "%s.grp", reference_name);
92 sprintf(ofgF, "%s.ofg", imdName);
94 general_assert(fin.is_open(), "Cannot open " + cstrtos(ofgF) + "!");
96 general_assert(tmpVal == M, "M in " + cstrtos(ofgF) + " is not consistent with " + cstrtos(refF) + "!");
99 s.clear(); hits.clear();
101 while (getline(fin, line)) {
102 istringstream strin(line);
106 while (strin>>sid>>conprb) {
107 hits.push_back(Item(sid, conprb));
109 s.push_back(hits.size());
116 totc = N0 + N1 + (M + 1);
118 if (verbose) { printf("Loading Data is finished!\n"); }
121 template<class ModelType>
122 void calcExpectedEffectiveLengths(ModelType& model) {
124 double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
126 model.getGLD().copyTo(pdf, cdf, lb, ub, span);
127 clen = new double[span + 1];
129 for (int i = 1; i <= span; i++) {
130 clen[i] = clen[i - 1] + pdf[i] * (lb + i);
133 eel.assign(M + 1, 0.0);
134 for (int i = 1; i <= M; i++) {
135 int totLen = refs.getRef(i).getTotLen();
136 int fullLen = refs.getRef(i).getFullLen();
137 int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
138 int pos2 = max(min(totLen, ub) - lb, 0);
140 if (pos2 == 0) { eel[i] = 0.0; continue; }
142 eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
144 if (eel[i] < MINEEL) { eel[i] = 0.0; }
152 template<class ModelType>
153 void init_model_related(char* modelF) {
157 calcExpectedEffectiveLengths<ModelType>(model);
158 memcpy(mw, model.getMW(), sizeof(double) * (M + 1)); // otherwise, after exiting this procedure, mw becomes undefined
166 quotient = NSAMPLES / nThreads;
167 left = NSAMPLES % nThreads;
169 sprintf(cvsF, "%s.countvectors", imdName);
170 paramsArray = new Params[nThreads];
171 threads = new pthread_t[nThreads];
173 for (int i = 0; i < nThreads; i++) {
174 paramsArray[i].no = i;
176 paramsArray[i].nsamples = quotient;
177 if (i < left) paramsArray[i].nsamples++;
179 sprintf(outF, "%s%d", cvsF, i);
180 paramsArray[i].fo = fopen(outF, "w");
182 paramsArray[i].engine = engineFactory::new_engine();
183 paramsArray[i].pme_c = new double[M + 1];
184 memset(paramsArray[i].pme_c, 0, sizeof(double) * (M + 1));
185 paramsArray[i].pve_c = new double[M + 1];
186 memset(paramsArray[i].pve_c, 0, sizeof(double) * (M + 1));
187 paramsArray[i].pme_tpm = new double[M + 1];
188 memset(paramsArray[i].pme_tpm, 0, sizeof(double) * (M + 1));
189 paramsArray[i].pme_fpkm = new double[M + 1];
190 memset(paramsArray[i].pme_fpkm, 0, sizeof(double) * (M + 1));
193 /* set thread attribute to be joinable */
194 pthread_attr_init(&attr);
195 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
197 if (verbose) { printf("Initialization finished!\n"); }
200 //sample theta from Dir(1)
201 void sampleTheta(engine_type& engine, vector<double>& theta) {
203 gamma_generator gmg(engine, gm);
206 theta.assign(M + 1, 0);
208 for (int i = 0; i <= M; i++) {
212 assert(denom > EPSILON);
213 for (int i = 0; i <= M; i++) theta[i] /= denom;
216 void writeCountVector(FILE* fo, vector<int>& counts) {
217 for (int i = 0; i < M; i++) {
218 fprintf(fo, "%d ", counts[i]);
220 fprintf(fo, "%d\n", counts[M]);
223 void polishTheta(vector<double>& theta, const vector<double>& eel, const double* mw) {
226 /* The reason that for noise gene, mw value is 1 is :
227 * currently, all masked positions are for poly(A) sites, which in theory should be filtered out.
228 * So the theta0 does not containing reads from any masked position
231 for (int i = 0; i <= M; i++) {
232 // i == 0, mw[i] == 1
233 if (i > 0 && (mw[i] < EPSILON || eel[i] < EPSILON)) {
237 theta[i] = theta[i] / mw[i];
240 // currently is OK, since no transcript should be masked totally, only the poly(A) tail related part will be masked
241 general_assert(sum >= EPSILON, "No effective length is no less than" + ftos(MINEEL, 6) + " !");
242 for (int i = 0; i <= M; i++) theta[i] /= sum;
245 void calcExpressionValues(const vector<double>& theta, const vector<double>& eel, vector<double>& tpm, vector<double>& fpkm) {
249 //calculate fraction of count over all mappabile reads
251 frac.assign(M + 1, 0.0);
252 for (int i = 1; i <= M; i++)
253 if (eel[i] >= EPSILON) {
257 general_assert(denom >= EPSILON, "No alignable reads?!");
258 for (int i = 1; i <= M; i++) frac[i] /= denom;
261 fpkm.assign(M + 1, 0.0);
262 for (int i = 1; i <= M; i++)
263 if (eel[i] >= EPSILON) fpkm[i] = frac[i] * 1e9 / eel[i];
266 tpm.assign(M + 1, 0.0);
268 for (int i = 1; i <= M; i++) denom += fpkm[i];
269 for (int i = 1; i <= M; i++) tpm[i] = fpkm[i] / denom * 1e6;
272 void* Gibbs(void* arg) {
274 HIT_INT_TYPE len, fr, to;
275 Params *params = (Params*)arg;
277 vector<double> theta, tpm, fpkm;
278 vector<int> z, counts;
281 uniform01 rg(*params->engine);
283 // generate initial state
284 sampleTheta(*params->engine, theta);
288 counts.assign(M + 1, 1); // 1 pseudo count
291 for (READ_INT_TYPE i = 0; i < N1; i++) {
292 fr = s[i]; to = s[i + 1];
295 for (HIT_INT_TYPE j = fr; j < to; j++) {
296 arr[j - fr] = theta[hits[j].sid] * hits[j].conprb;
297 if (j > fr) arr[j - fr] += arr[j - fr - 1]; // cumulative
299 z[i] = hits[fr + sample(rg, arr, len)].sid;
304 CHAINLEN = 1 + (params->nsamples - 1) * GAP;
305 for (int ROUND = 1; ROUND <= BURNIN + CHAINLEN; ROUND++) {
307 for (READ_INT_TYPE i = 0; i < N1; i++) {
309 fr = s[i]; to = s[i + 1]; len = to - fr;
311 for (HIT_INT_TYPE j = fr; j < to; j++) {
312 arr[j - fr] = counts[hits[j].sid] * hits[j].conprb;
313 if (j > fr) arr[j - fr] += arr[j - fr - 1]; //cumulative
315 z[i] = hits[fr + sample(rg, arr, len)].sid;
319 if (ROUND > BURNIN) {
320 if ((ROUND - BURNIN - 1) % GAP == 0) {
321 writeCountVector(params->fo, counts);
322 for (int i = 0; i <= M; i++) theta[i] = counts[i] / totc;
323 polishTheta(theta, eel, mw);
324 calcExpressionValues(theta, eel, tpm, fpkm);
325 for (int i = 0; i <= M; i++) {
326 params->pme_c[i] += counts[i] - 1;
327 params->pve_c[i] += (counts[i] - 1) * (counts[i] - 1);
328 params->pme_tpm[i] += tpm[i];
329 params->pme_fpkm[i] += fpkm[i];
334 if (verbose && ROUND % 100 == 0) { printf("Thread %d, ROUND %d is finished!\n", params->no, ROUND); }
341 // char inpF[STRLEN], command[STRLEN];
344 /* destroy attribute */
345 pthread_attr_destroy(&attr);
348 pme_c.assign(M + 1, 0);
349 pve_c.assign(M + 1, 0);
350 pme_tpm.assign(M + 1, 0);
351 pme_fpkm.assign(M + 1, 0);
352 for (int i = 0; i < nThreads; i++) {
353 fclose(paramsArray[i].fo);
354 delete paramsArray[i].engine;
355 for (int j = 0; j <= M; j++) {
356 pme_c[j] += paramsArray[i].pme_c[j];
357 pve_c[j] += paramsArray[i].pve_c[j];
358 pme_tpm[j] += paramsArray[i].pme_tpm[j];
359 pme_fpkm[j] += paramsArray[i].pme_fpkm[j];
361 delete[] paramsArray[i].pme_c;
362 delete[] paramsArray[i].pve_c;
363 delete[] paramsArray[i].pme_tpm;
364 delete[] paramsArray[i].pme_fpkm;
366 delete[] paramsArray;
369 for (int i = 0; i <= M; i++) {
370 pme_c[i] /= NSAMPLES;
371 pve_c[i] = (pve_c[i] - NSAMPLES * pme_c[i] * pme_c[i]) / (NSAMPLES - 1);
372 pme_tpm[i] /= NSAMPLES;
373 pme_fpkm[i] /= NSAMPLES;
377 void writeResults(char* imdName) {
381 vector<double> isopct;
382 vector<double> gene_counts, gene_tpm, gene_fpkm;
384 //calculate IsoPct, etc.
385 isopct.assign(M + 1, 0.0);
386 gene_counts.assign(m, 0.0); gene_tpm.assign(m, 0.0); gene_fpkm.assign(m, 0.0);
388 for (int i = 0; i < m; i++) {
389 int b = gi.spAt(i), e = gi.spAt(i + 1);
390 for (int j = b; j < e; j++) {
391 gene_counts[i] += pme_c[j];
392 gene_tpm[i] += pme_tpm[j];
393 gene_fpkm[i] += pme_fpkm[j];
395 if (gene_tpm[i] < EPSILON) continue;
396 for (int j = b; j < e; j++)
397 isopct[j] = pme_tpm[j] / gene_tpm[i];
400 //isoform level results
401 sprintf(outF, "%s.iso_res", imdName);
402 fo = fopen(outF, "a");
403 general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
405 for (int i = 1; i <= M; i++)
406 fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n'));
407 for (int i = 1; i <= M; i++)
408 fprintf(fo, "%.2f%c", pme_tpm[i], (i < M ? '\t' : '\n'));
409 for (int i = 1; i <= M; i++)
410 fprintf(fo, "%.2f%c", pme_fpkm[i], (i < M ? '\t' : '\n'));
411 for (int i = 1; i <= M; i++)
412 fprintf(fo, "%.2f%c", isopct[i] * 1e2, (i < M ? '\t' : '\n'));
416 sprintf(outF, "%s.gene_res", imdName);
417 fo = fopen(outF, "a");
418 general_assert(fo != NULL, "Cannot open " + cstrtos(outF) + "!");
420 for (int i = 0; i < m; i++)
421 fprintf(fo, "%.2f%c", gene_counts[i], (i < m - 1 ? '\t' : '\n'));
422 for (int i = 0; i < m; i++)
423 fprintf(fo, "%.2f%c", gene_tpm[i], (i < m - 1 ? '\t' : '\n'));
424 for (int i = 0; i < m; i++)
425 fprintf(fo, "%.2f%c", gene_fpkm[i], (i < m - 1 ? '\t' : '\n'));
428 if (verbose) { printf("Gibbs based expression values are written!\n"); }
431 int main(int argc, char* argv[]) {
433 printf("Usage: rsem-run-gibbs reference_name imdName statName BURNIN NSAMPLES GAP [-p #Threads] [--var] [-q]\n");
437 strcpy(imdName, argv[2]);
438 strcpy(statName, argv[3]);
440 BURNIN = atoi(argv[4]);
441 NSAMPLES = atoi(argv[5]);
448 for (int i = 7; i < argc; i++) {
449 if (!strcmp(argv[i], "-p")) nThreads = atoi(argv[i + 1]);
450 if (!strcmp(argv[i], "--var")) var_opt = true;
451 if (!strcmp(argv[i], "-q")) quiet = true;
455 assert(NSAMPLES > 1); // Otherwise, we cannot calculate posterior variance
457 if (nThreads > NSAMPLES) {
459 printf("Warning: Number of samples is less than number of threads! Change the number of threads to %d!\n", nThreads);
462 load_data(argv[1], statName, imdName);
464 sprintf(modelF, "%s.model", statName);
465 FILE *fi = fopen(modelF, "r");
466 general_assert(fi != NULL, "Cannot open " + cstrtos(modelF) + "!");
467 assert(fscanf(fi, "%d", &model_type) == 1);
470 mw = new double[M + 1]; // make an extra copy
473 case 0 : init_model_related<SingleModel>(modelF); break;
474 case 1 : init_model_related<SingleQModel>(modelF); break;
475 case 2 : init_model_related<PairedEndModel>(modelF); break;
476 case 3 : init_model_related<PairedEndQModel>(modelF); break;
479 if (verbose) printf("Gibbs started!\n");
482 for (int i = 0; i < nThreads; i++) {
483 rc = pthread_create(&threads[i], &attr, Gibbs, (void*)(¶msArray[i]));
484 pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0)!");
486 for (int i = 0; i < nThreads; i++) {
487 rc = pthread_join(threads[i], NULL);
488 pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0)!");
492 if (verbose) printf("Gibbs finished!\n");
494 writeResults(imdName);
499 sprintf(varF, "%s.var", statName);
500 FILE *fo = fopen(varF, "w");
501 general_assert(fo != NULL, "Cannot open " + cstrtos(varF) + "!");
502 for (int i = 0; i < m; i++) {
503 int b = gi.spAt(i), e = gi.spAt(i + 1), number_of_isoforms = e - b;
504 for (int j = b; j < e; j++) {
505 fprintf(fo, "%s\t%d\t%.15g\t%.15g\n", refs.getRef(j).getName().c_str(), number_of_isoforms, pme_c[j], pve_c[j]);
511 delete mw; // delete the copy