10 #include "boost/random.hpp"
15 #include "SingleModel.h"
16 #include "SingleQModel.h"
17 #include "PairedEndModel.h"
18 #include "PairedEndQModel.h"
21 #include "GroupInfo.h"
29 Item(int sid, double conprb) {
31 this->conprb = conprb;
36 int m, M, N0, N1, nHits;
38 int BURNIN, CHAINLEN, GAP;
39 char imdName[STRLEN], statName[STRLEN];
40 char thetaF[STRLEN], ofgF[STRLEN], groupF[STRLEN], refF[STRLEN], modelF[STRLEN];
46 vector<double> theta, pme_theta, pme_c, eel;
56 boost::mt19937 rng(time(NULL));
57 boost::uniform_01<boost::mt19937> rg(rng);
59 void load_data(char* reference_name, char* statName, char* imdName) {
65 sprintf(refF, "%s.seq", reference_name);
66 refs.loadRefs(refF, 1);
70 sprintf(groupF, "%s.grp", reference_name);
75 sprintf(thetaF, "%s.theta",statName);
78 fprintf(stderr, "Cannot open %s!\n", thetaF);
82 if (tmpVal != M + 1) {
83 fprintf(stderr, "Number of transcripts is not consistent in %s and %s!\n", refF, thetaF);
86 theta.clear(); theta.resize(M + 1);
87 for (int i = 0; i <= M; i++) fin>>theta[i];
91 sprintf(ofgF, "%s.ofg", imdName);
94 fprintf(stderr, "Cannot open %s!\n", ofgF);
99 fprintf(stderr, "M in %s is not consistent with %s!\n", ofgF, refF);
104 s.clear(); hits.clear();
106 while (getline(fin, line)) {
107 istringstream strin(line);
111 while (strin>>sid>>conprb) {
112 hits.push_back(Item(sid, conprb));
114 s.push_back(hits.size());
121 if (verbose) { printf("Loading Data is finished!\n"); }
124 // arr should be cumulative!
126 // random number should be in [0, arr[len - 1])
127 // If by chance arr[len - 1] == 0.0, one possibility is to sample uniformly from 0...len-1
128 int sample(vector<double>& arr, int len) {
130 double prb = rg() * arr[len - 1];
135 if (arr[mid] <= prb) l = mid + 1;
139 if (l >= len) { printf("%d %lf %lf\n", len, arr[len - 1], prb); }
153 counts.resize(M + 1, 1); // 1 pseudo count
156 for (int i = 0; i < N1; i++) {
157 fr = s[i]; to = s[i + 1];
160 for (int j = fr; j < to; j++) {
161 arr[j - fr] = theta[hits[j].sid] * hits[j].conprb;
162 if (j > fr) arr[j - fr] += arr[j - fr - 1]; // cumulative
164 z[i] = hits[fr + sample(arr, len)].sid;
168 totc = N0 + N1 + (M + 1);
170 if (verbose) { printf("Initialization is finished!\n"); }
173 void writeCountVector(FILE* fo) {
174 for (int i = 0; i < M; i++) {
175 fprintf(fo, "%d ", counts[i]);
177 fprintf(fo, "%d\n", counts[M]);
180 void Gibbs(char* imdName) {
184 sprintf(cvsF, "%s.countvectors", imdName);
185 fo = fopen(cvsF, "w");
186 assert(CHAINLEN % GAP == 0);
187 fprintf(fo, "%d %d\n", CHAINLEN / GAP, M + 1);
188 //fprintf(fo, "%d %d\n", CHAINLEN, M + 1);
190 pme_c.clear(); pme_c.resize(M + 1, 0.0);
191 pme_theta.clear(); pme_theta.resize(M + 1, 0.0);
192 for (int ROUND = 1; ROUND <= BURNIN + CHAINLEN; ROUND++) {
194 for (int i = 0; i < N1; i++) {
196 fr = s[i]; to = s[i + 1]; len = to - fr;
198 for (int j = fr; j < to; j++) {
199 arr[j - fr] = counts[hits[j].sid] * hits[j].conprb;
200 if (j > fr) arr[j - fr] += arr[j - fr - 1]; //cumulative
202 z[i] = hits[fr + sample(arr, len)].sid;
206 if (ROUND > BURNIN) {
207 if ((ROUND - BURNIN - 1) % GAP == 0) writeCountVector(fo);
208 for (int i = 0; i <= M; i++) {
209 pme_c[i] += counts[i] - 1;
210 pme_theta[i] += counts[i] / totc;
214 if (verbose) { printf("ROUND %d is finished!\n", ROUND); }
218 for (int i = 0; i <= M; i++) {
219 pme_c[i] /= CHAINLEN;
220 pme_theta[i] /= CHAINLEN;
223 if (verbose) { printf("Gibbs is finished!\n"); }
226 template<class ModelType>
227 void calcExpectedEffectiveLengths(ModelType& model) {
229 double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
231 model.getGLD().copyTo(pdf, cdf, lb, ub, span);
232 clen = new double[span + 1];
234 for (int i = 1; i <= span; i++) {
235 clen[i] = clen[i - 1] + pdf[i] * (lb + i);
239 eel.resize(M + 1, 0.0);
240 for (int i = 1; i <= M; i++) {
241 int totLen = refs.getRef(i).getTotLen();
242 int fullLen = refs.getRef(i).getFullLen();
243 int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
244 int pos2 = max(min(totLen, ub) - lb, 0);
246 if (pos2 == 0) { eel[i] = 0.0; continue; }
248 eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
250 if (eel[i] < MINEEL) { eel[i] = 0.0; }
258 template<class ModelType>
259 void writeEstimatedParameters(char* modelF, char* imdName) {
267 calcExpectedEffectiveLengths<ModelType>(model);
269 denom = pme_theta[0];
270 for (int i = 1; i <= M; i++)
271 if (eel[i] < EPSILON) pme_theta[i] = 0.0;
272 else denom += pme_theta[i];
273 if (denom <= 0) { fprintf(stderr, "No Expected Effective Length is no less than %.6g?!\n", MINEEL); exit(-1); }
274 for (int i = 0; i <= M; i++) pme_theta[i] /= denom;
277 double *mw = model.getMW();
278 for (int i = 0; i <= M; i++) {
279 pme_theta[i] = (mw[i] < EPSILON ? 0.0 : pme_theta[i] / mw[i]);
280 denom += pme_theta[i];
282 assert(denom >= EPSILON);
283 for (int i = 0; i <= M; i++) pme_theta[i] /= denom;
285 //calculate tau values
286 double *tau = new double[M + 1];
287 memset(tau, 0, sizeof(double) * (M + 1));
290 for (int i = 1; i <= M; i++)
291 if (eel[i] > EPSILON) {
292 tau[i] = pme_theta[i] / eel[i];
295 if (denom <= 0) { fprintf(stderr, "No alignable reads?!\n"); exit(-1); }
297 for (int i = 1; i <= M; i++) {
301 //isoform level results
302 sprintf(outF, "%s.iso_res", imdName);
303 fo = fopen(outF, "a");
304 if (fo == NULL) { fprintf(stderr, "Cannot open %s!\n", outF); exit(-1); }
305 for (int i = 1; i <= M; i++)
306 fprintf(fo, "%.2f%c", pme_c[i], (i < M ? '\t' : '\n'));
307 for (int i = 1; i <= M; i++)
308 fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n'));
312 sprintf(outF, "%s.gene_res", imdName);
313 fo = fopen(outF, "a");
314 if (fo == NULL) { fprintf(stderr, "Cannot open %s!\n", outF); exit(-1); }
315 for (int i = 0; i < m; i++) {
316 double sumC = 0.0; // sum of pme counts
317 int b = gi.spAt(i), e = gi.spAt(i + 1);
318 for (int j = b; j < e; j++) {
321 fprintf(fo, "%.15g%c", sumC, (i < m - 1 ? '\t' : '\n'));
323 for (int i = 0; i < m; i++) {
324 double sumT = 0.0; // sum of tau values
325 int b = gi.spAt(i), e = gi.spAt(i + 1);
326 for (int j = b; j < e; j++) {
329 fprintf(fo, "%.15g%c", sumT, (i < m - 1 ? '\t' : '\n'));
335 if (verbose) { printf("Gibbs based expression values are written!\n"); }
339 int main(int argc, char* argv[]) {
341 printf("Usage: rsem-run-gibbs reference_name sample_name sampleToken BURNIN CHAINLEN GAP [-q]\n");
345 BURNIN = atoi(argv[4]);
346 CHAINLEN = atoi(argv[5]);
348 sprintf(imdName, "%s.temp/%s", argv[2], argv[3]);
349 sprintf(statName, "%s.stat/%s", argv[2], argv[3]);
350 load_data(argv[1], statName, imdName);
353 if (argc > 7 && !strcmp(argv[7], "-q")) {
361 sprintf(modelF, "%s.model", statName);
362 FILE *fi = fopen(modelF, "r");
363 if (fi == NULL) { fprintf(stderr, "Cannot open %s!\n", modelF); exit(-1); }
364 fscanf(fi, "%d", &model_type);
368 case 0 : writeEstimatedParameters<SingleModel>(modelF, imdName); break;
369 case 1 : writeEstimatedParameters<SingleQModel>(modelF, imdName); break;
370 case 2 : writeEstimatedParameters<PairedEndModel>(modelF, imdName); break;
371 case 3 : writeEstimatedParameters<PairedEndQModel>(modelF, imdName); break;