12 #include "boost/random.hpp"
17 #include "SingleModel.h"
18 #include "SingleQModel.h"
19 #include "PairedEndModel.h"
20 #include "PairedEndQModel.h"
23 #include "GroupInfo.h"
27 typedef unsigned int seedType;
28 typedef boost::mt19937 engine_type;
29 typedef boost::gamma_distribution<> distribution_type;
30 typedef boost::variate_generator<engine_type&, distribution_type> generator_type;
31 typedef boost::uniform_01<engine_type> uniform_generator;
37 double *pme_c, *pme_theta;
44 Item(int sid, double conprb) {
46 this->conprb = conprb;
53 int m, M, N0, N1, nHits;
55 int BURNIN, NSAMPLES, GAP;
56 char imdName[STRLEN], statName[STRLEN];
57 char thetaF[STRLEN], ofgF[STRLEN], groupF[STRLEN], refF[STRLEN], modelF[STRLEN];
63 vector<double> theta, pme_theta, pme_c, eel;
71 engine_type engine(time(NULL));
78 void load_data(char* reference_name, char* statName, char* imdName) {
84 sprintf(refF, "%s.seq", reference_name);
85 refs.loadRefs(refF, 1);
89 sprintf(groupF, "%s.grp", reference_name);
94 sprintf(thetaF, "%s.theta",statName);
97 fprintf(stderr, "Cannot open %s!\n", thetaF);
101 if (tmpVal != M + 1) {
102 fprintf(stderr, "Number of transcripts is not consistent in %s and %s!\n", refF, thetaF);
105 theta.clear(); theta.resize(M + 1);
106 for (int i = 0; i <= M; i++) fin>>theta[i];
110 sprintf(ofgF, "%s.ofg", imdName);
112 if (!fin.is_open()) {
113 fprintf(stderr, "Cannot open %s!\n", ofgF);
118 fprintf(stderr, "M in %s is not consistent with %s!\n", ofgF, refF);
123 s.clear(); hits.clear();
125 while (getline(fin, line)) {
126 istringstream strin(line);
130 while (strin>>sid>>conprb) {
131 hits.push_back(Item(sid, conprb));
133 s.push_back(hits.size());
140 totc = N0 + N1 + (M + 1);
142 if (verbose) { printf("Loading Data is finished!\n"); }
149 set<seedType> seedSet;
150 set<seedType>::iterator iter;
153 quotient = NSAMPLES / nThreads;
154 left = NSAMPLES % nThreads;
156 sprintf(cvsF, "%s.countvectors", imdName);
158 params = new Params[nThreads];
159 threads = new pthread_t[nThreads];
160 for (int i = 0; i < nThreads; i++) {
163 params[i].nsamples = quotient;
164 if (i < left) params[i].nsamples++;
167 params[i].fo = fopen(cvsF, "w");
168 fprintf(params[i].fo, "%d %d\n", NSAMPLES, M + 1);
171 sprintf(outF, "%s%d", cvsF, i);
172 params[i].fo = fopen(outF, "w");
177 iter = seedSet.find(seed);
178 } while (iter != seedSet.end());
179 params[i].rng = new engine_type(seed);
180 seedSet.insert(seed);
182 params[i].pme_c = new double[M + 1];
183 memset(params[i].pme_c, 0, sizeof(double) * (M + 1));
184 params[i].pme_theta = new double[M + 1];
185 memset(params[i].pme_theta, 0, sizeof(double) * (M + 1));
188 /* set thread attribute to be joinable */
189 pthread_attr_init(&attr);
190 pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
194 void sampleTheta(engine_type& engine, vector<double>& theta) {
195 distribution_type gm(1);
196 generator_type gmg(engine, gm);
202 for (int i = 0; i <= M; i++) {
206 assert(denom > EPSILON);
207 for (int i = 0; i <= M; i++) theta[i] /= denom;
210 // arr should be cumulative!
212 // random number should be in [0, arr[len - 1])
213 // If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0...len-1
214 int sample(uniform_generator& rg, vector<double>& arr, int len) {
216 double prb = rg() * arr[len - 1];
221 if (arr[mid] <= prb) l = mid + 1;
225 if (l >= len) { printf("%d %lf %lf\n", len, arr[len - 1], prb); }
231 void writeForSee(FILE* fo, vector<int>& counts) {
232 for (int i = 1; i < M; i++) fprintf(fo, "%d ", counts[i]);
233 fprintf(fo, "%d\n", counts[M]);
236 void writeCountVector(FILE* fo, vector<int>& counts) {
237 for (int i = 0; i < M; i++) {
238 fprintf(fo, "%d ", counts[i]);
240 fprintf(fo, "%d\n", counts[M]);
243 void* Gibbs(void* arg) {
246 Params *params = (Params*)arg;
248 vector<double> theta;
249 vector<int> z, counts;
252 uniform_generator rg(*params->rng);
254 sampleTheta(*params->rng, theta);
257 // generate initial state
264 counts.resize(M + 1, 1); // 1 pseudo count
267 for (int i = 0; i < N1; i++) {
268 fr = s[i]; to = s[i + 1];
271 for (int j = fr; j < to; j++) {
272 arr[j - fr] = theta[hits[j].sid] * hits[j].conprb;
273 if (j > fr) arr[j - fr] += arr[j - fr - 1]; // cumulative
275 z[i] = hits[fr + sample(rg, arr, len)].sid;
281 sprintf(seeF, "%s.see", statName);
282 FILE *fsee = fopen(seeF, "w");
284 CHAINLEN = 1 + (params->nsamples - 1) * GAP;
285 for (int ROUND = 1; ROUND <= BURNIN + CHAINLEN ; ROUND++) {
287 for (int i = 0; i < N1; i++) {
289 fr = s[i]; to = s[i + 1]; len = to - fr;
291 for (int j = fr; j < to; j++) {
292 arr[j - fr] = counts[hits[j].sid] * hits[j].conprb;
293 if (j > fr) arr[j - fr] += arr[j - fr - 1]; //cumulative
295 z[i] = hits[fr + sample(rg, arr, len)].sid;
299 writeForSee(fsee, counts);
301 if (ROUND > BURNIN) {
302 if ((ROUND - BURNIN - 1) % GAP == 0) writeCountVector(params->fo, counts);
303 for (int i = 0; i <= M; i++) {
304 params->pme_c[i] += counts[i] - 1;
305 params->pme_theta[i] += counts[i] / totc;
309 if (verbose & ROUND % 10 == 0) { printf("Thread %d, ROUND %d is finished!\n", params->no, ROUND); }
317 char inpF[STRLEN], command[STRLEN];
320 /* destroy attribute */
321 pthread_attr_destroy(&attr);
324 pme_c.clear(); pme_c.resize(M + 1, 0.0);
325 pme_theta.clear(); pme_theta.resize(M + 1, 0.0);
326 for (int i = 0; i < nThreads; i++) {
327 fclose(params[i].fo);
328 delete params[i].rng;
329 for (int j = 0; j <= M; j++) {
330 pme_c[j] += params[i].pme_c[j];
331 pme_theta[j] += params[i].pme_theta[j];
333 delete[] params[i].pme_c;
334 delete[] params[i].pme_theta;
338 for (int i = 0; i <= M; i++) {
339 pme_c[i] /= NSAMPLES;
340 pme_theta[i] /= NSAMPLES;
344 FILE *fo = fopen(cvsF, "a");
345 for (int i = 1; i < nThreads; i++) {
346 sprintf(inpF, "%s%d", cvsF, i);
348 while (getline(fin, line)) {
349 fprintf(fo, "%s\n", line.c_str());
352 sprintf(command, "rm -f %s", inpF);
353 int status = system(command);
354 if (status != 0) { fprintf(stderr, "Fail to delete file %s!\n", inpF); exit(-1); }
359 template<class ModelType>
360 void calcExpectedEffectiveLengths(ModelType& model) {
362 double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
364 model.getGLD().copyTo(pdf, cdf, lb, ub, span);
365 clen = new double[span + 1];
367 for (int i = 1; i <= span; i++) {
368 clen[i] = clen[i - 1] + pdf[i] * (lb + i);
372 eel.resize(M + 1, 0.0);
373 for (int i = 1; i <= M; i++) {
374 int totLen = refs.getRef(i).getTotLen();
375 int fullLen = refs.getRef(i).getFullLen();
376 int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
377 int pos2 = max(min(totLen, ub) - lb, 0);
379 if (pos2 == 0) { eel[i] = 0.0; continue; }
381 eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
383 if (eel[i] < MINEEL) { eel[i] = 0.0; }
391 template<class ModelType>
392 void writeEstimatedParameters(char* modelF, char* imdName) {
400 calcExpectedEffectiveLengths<ModelType>(model);
402 denom = pme_theta[0];
403 for (int i = 1; i <= M; i++)
404 if (eel[i] < EPSILON) pme_theta[i] = 0.0;
405 else denom += pme_theta[i];
406 if (denom <= 0) { fprintf(stderr, "No Expected Effective Length is no less than %.6g?!\n", MINEEL); exit(-1); }
407 for (int i = 0; i <= M; i++) pme_theta[i] /= denom;
410 double *mw = model.getMW();
411 for (int i = 0; i <= M; i++) {
412 pme_theta[i] = (mw[i] < EPSILON ? 0.0 : pme_theta[i] / mw[i]);
413 denom += pme_theta[i];
415 assert(denom >= EPSILON);
416 for (int i = 0; i <= M; i++) pme_theta[i] /= denom;
418 //calculate tau values
419 double *tau = new double[M + 1];
420 memset(tau, 0, sizeof(double) * (M + 1));
423 for (int i = 1; i <= M; i++)
424 if (eel[i] > EPSILON) {
425 tau[i] = pme_theta[i] / eel[i];
428 if (denom <= 0) { fprintf(stderr, "No alignable reads?!\n"); exit(-1); }
430 for (int i = 1; i <= M; i++) {
434 //isoform level results
435 sprintf(outF, "%s.iso_res", imdName);
436 fo = fopen(outF, "a");
437 if (fo == NULL) { fprintf(stderr, "Cannot open %s!\n", outF); exit(-1); }
438 for (int i = 1; i <= M; i++)
439 fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n'));
440 for (int i = 1; i <= M; i++)
441 fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n'));
445 sprintf(outF, "%s.gene_res", imdName);
446 fo = fopen(outF, "a");
447 if (fo == NULL) { fprintf(stderr, "Cannot open %s!\n", outF); exit(-1); }
448 for (int i = 0; i < m; i++) {
449 double sumC = 0.0; // sum of pme counts
450 int b = gi.spAt(i), e = gi.spAt(i + 1);
451 for (int j = b; j < e; j++) {
454 fprintf(fo, "%.15g%c", sumC, (i < m - 1 ? '\t' : '\n'));
456 for (int i = 0; i < m; i++) {
457 double sumT = 0.0; // sum of tau values
458 int b = gi.spAt(i), e = gi.spAt(i + 1);
459 for (int j = b; j < e; j++) {
462 fprintf(fo, "%.15g%c", sumT, (i < m - 1 ? '\t' : '\n'));
468 if (verbose) { printf("Gibbs based expression values are written!\n"); }
472 int main(int argc, char* argv[]) {
474 printf("Usage: rsem-run-gibbs reference_name sample_name sampleToken BURNIN NSAMPLES GAP [-p #Threads] [-q]\n");
478 BURNIN = atoi(argv[4]);
479 NSAMPLES = atoi(argv[5]);
481 sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
482 sprintf(statName, "%s.stat/%s", argv[2], argv[3]);
483 load_data(argv[1], statName, imdName);
487 for (int i = 7; i < argc; i++) {
488 if (!strcmp(argv[i], "-p")) nThreads = atoi(argv[i + 1]);
489 if (!strcmp(argv[i], "-q")) quiet = true;
493 if (nThreads > NSAMPLES) {
495 printf("Warning: Number of samples is less than number of threads! Change the number of threads to %d!\n", nThreads);
498 if (verbose) printf("Gibbs started!\n");
500 for (int i = 0; i < nThreads; i++) {
501 rc = pthread_create(&threads[i], &attr, Gibbs, (void*)(¶ms[i]));
502 if (rc != 0) { fprintf(stderr, "Cannot create thread %d! (numbered from 0)\n", i); }
504 for (int i = 0; i < nThreads; i++) {
505 rc = pthread_join(threads[i], &status);
506 if (rc != 0) { fprintf(stderr, "Cannot join thread %d! (numbered from 0)\n", i); }
509 if (verbose) printf("Gibbs finished!\n");
511 sprintf(modelF, "%s.model", statName);
512 FILE *fi = fopen(modelF, "r");
513 if (fi == NULL) { fprintf(stderr, "Cannot open %s!\n", modelF); exit(-1); }
514 fscanf(fi, "%d", &model_type);
518 case 0 : writeEstimatedParameters<SingleModel>(modelF, imdName); break;
519 case 1 : writeEstimatedParameters<SingleQModel>(modelF, imdName); break;
520 case 2 : writeEstimatedParameters<PairedEndModel>(modelF, imdName); break;
521 case 3 : writeEstimatedParameters<PairedEndQModel>(modelF, imdName); break;