9 #include "boost/random.hpp"
14 #include "SingleModel.h"
15 #include "SingleQModel.h"
16 #include "PairedEndModel.h"
17 #include "PairedEndQModel.h"
20 #include "GroupInfo.h"
24 typedef unsigned long bufsize_type;
25 typedef boost::mt19937 engine_type;
26 typedef boost::gamma_distribution<> distribution_type;
27 typedef boost::variate_generator<engine_type&, distribution_type> generator_type;
29 const int FLOATSIZE = sizeof(float);
32 float lb, ub; // the interval is [lb, ub]
34 CIType() { lb = ub = 0.0; }
42 int nC, cvlen, nSpC, nSamples; // nSpC : number of sample theta vectors per count vector
43 int fr, to; // each flush, sample fr .. to - 1
48 char cvsF[STRLEN], tmpF[STRLEN], command[STRLEN];
53 CIType *iso_nrf, *gene_nrf, *iso_tau, *gene_tau;
55 engine_type engine(time(NULL));
56 distribution_type **gammas;
62 char modelF[STRLEN], groupF[STRLEN], refF[STRLEN];
64 vector<double> eel; //expected effective lengths
65 double *tau_denoms; // denominators for tau values
67 template<class ModelType>
68 void calcExpectedEffectiveLengths(ModelType& model) {
70 double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = sigma_{j=1}^{i}pdf[i]*(lb+i)
72 model.getGLD().copyTo(pdf, cdf, lb, ub, span);
73 clen = new double[span + 1];
75 for (int i = 1; i <= span; i++) {
76 clen[i] = clen[i - 1] + pdf[i] * (lb + i);
80 eel.resize(M + 1, 0.0);
81 for (int i = 1; i <= M; i++) {
82 int totLen = refs.getRef(i).getTotLen();
83 int fullLen = refs.getRef(i).getFullLen();
84 int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0);
85 int pos2 = max(min(totLen, ub) - lb, 0);
87 if (pos2 == 0) { eel[i] = 0.0; continue; }
89 eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1]));
91 if (eel[i] < MINEEL) { eel[i] = 0.0; }
99 void flushToTempFile() {
100 int gap1 = fr * FLOATSIZE;
101 int gap2 = (nSamples - to) * FLOATSIZE;
104 ftmpOut.seekp(0, ios_base::beg);
105 for (int i = 0; i < cvlen; i++) {
107 ftmpOut.seekp(gap1, ios_base::cur);
108 for (int j = fr; j < to; j++) {
109 ftmpOut.write((char*)p, FLOATSIZE);
112 ftmpOut.seekp(gap2, ios_base::cur);
116 template<class ModelType>
123 calcExpectedEffectiveLengths<ModelType>(model);
125 ftmpOut.open(tmpF, ios::binary);
128 assert(cvlen = M + 1);
130 nSamples = nC * nSpC;
134 size = bufsize_type(nMB) * 1024 * 1024 / FLOATSIZE / cvlen;
135 if (size > (bufsize_type)nSamples) size = nSamples;
137 buffer = new float[size];
142 cvec = new int[cvlen];
143 theta = new double[cvlen];
144 gammas = new distribution_type*[cvlen];
145 rgs = new generator_type*[cvlen];
147 tau_denoms = new double[nSamples];
148 memset(tau_denoms, 0, sizeof(double) * nSamples);
150 double *mw = model.getMW();
151 for (int i = 0; i < nC; i++) {
152 for (int j = 0; j < cvlen; j++) {
156 for (int j = 0; j < cvlen; j++) {
157 gammas[j] = new distribution_type(cvec[j]); // need change back before publishing
158 rgs[j] = new generator_type(engine, *gammas[j]);
161 for (int j = 0; j < nSpC; j++) {
163 for (int k = 0; k < cvlen; k++) {
164 theta[k] = (k == 0 || eel[k] > EPSILON ? (*rgs[k])() : 0.0);
168 for (int k = 0; k < cvlen; k++) theta[k] /= sum;
171 for (int k = 0; k < cvlen; k++) {
172 theta[k] = (mw[k] < EPSILON ? 0.0 : theta[k] / mw[k]);
175 assert(sum >= EPSILON);
176 for (int k = 0; k < cvlen; k++) theta[k] /= sum;
178 *p = (float)theta[0]; ++p;
179 assert(1.0 - theta[0] > 0.0);
180 for (int k = 1; k < cvlen; k++) {
181 if (eel[k] > EPSILON) {
182 theta[k] /= (1.0 - theta[0]);
183 tau_denoms[to] += theta[k] / eel[k];
186 if (theta[k] != 0.0) { fprintf(stderr, "K=%d Theta_K=%lf\n", k, theta[k]); exit(-1); }
189 *p = (float)theta[k];
197 if (verbose) { printf("%d vectors are sampled!\n", to); }
201 for (int j = 0; j < cvlen; j++) {
207 if (fr != to) { flushToTempFile(); }
219 if (verbose) { printf("Sampling is finished!\n"); }
222 void calcCI(int nSamples, float *samples, float &lb, float &ub) {
223 int p, q; // p pointer for lb, q pointer for ub;
225 int threshold = nSamples - (int(confidence * nSamples - 1e-8) + 1);
228 sort(samples, samples + nSamples);
230 p = 0; q = nSamples - 1;
234 while (newq > 0 && samples[newq - 1] == samples[newq]) newq--;
236 } while (newq >= 0 && nSamples - (newq + 1) <= threshold);
238 nOutside = nSamples - (q + 1);
240 lb = -1e30; ub = 1e30;
242 if (samples[q] - samples[p] < ub - lb) {
248 while (newp < nSamples - 1 && samples[newp] == samples[newp + 1]) newp++;
250 if (newp <= threshold) {
251 nOutside += newp - p;
253 while (nOutside > threshold && q < nSamples - 1) {
255 while (newq < nSamples - 1 && samples[newq] == samples[newq + 1]) newq++;
256 nOutside -= newq - q;
259 assert(nOutside <= threshold);
262 } while (p <= threshold);
265 void generateResults(char* imdName) {
266 float *izsamples, *gzsamples, *itsamples, *gtsamples;
271 iso_nrf = new CIType[M + 1];
272 iso_tau = new CIType[M + 1];
273 gene_nrf = new CIType[m];
274 gene_tau = new CIType[m];
276 izsamples = new float[nSamples];
277 itsamples = new float[nSamples];
278 gzsamples = new float[nSamples];
279 gtsamples = new float[nSamples];
281 fin.open(tmpF, ios::binary);
283 for (int k = 0; k < nSamples; k++) fin.read((char*)(&izsamples[k]), FLOATSIZE);
284 calcCI(nSamples, izsamples, iso_nrf[0].lb, iso_nrf[0].ub);
286 for (int i = 0; i < m; i++) {
287 int b = gi.spAt(i), e = gi.spAt(i + 1);
288 memset(gzsamples, 0, FLOATSIZE * nSamples);
289 memset(gtsamples, 0, FLOATSIZE * nSamples);
290 for (int j = b; j < e; j++) {
291 for (int k = 0; k < nSamples; k++) {
292 fin.read((char*)(&izsamples[k]), FLOATSIZE);
293 if (eel[j] > EPSILON && tau_denoms[k] > EPSILON) { itsamples[k] = izsamples[k] / eel[j] / tau_denoms[k]; }
295 if (izsamples[k] != 0.0) { fprintf(stderr, "K=%d, IZSAMPLES_K=%lf\n", k, izsamples[k]); exit(-1); }
298 gzsamples[k] += izsamples[k];
299 gtsamples[k] += itsamples[k];
301 calcCI(nSamples, izsamples, iso_nrf[j].lb, iso_nrf[j].ub);
302 calcCI(nSamples, itsamples, iso_tau[j].lb, iso_tau[j].ub);
304 calcCI(nSamples, gzsamples, gene_nrf[i].lb, gene_nrf[i].ub);
305 calcCI(nSamples, gtsamples, gene_tau[i].lb, gene_tau[i].ub);
307 if (verbose && (i + 1) % 1000 == 0) { printf("%d genes are done!\n", i + 1); }
312 //isoform level results
313 sprintf(outF, "%s.iso_res", imdName);
314 fo = fopen(outF, "a");
315 for (int i = 1; i <= M; i++)
316 fprintf(fo, "%.6g%c", iso_nrf[i].lb, (i < M ? '\t' : '\n'));
317 for (int i = 1; i <= M; i++)
318 fprintf(fo, "%.6g%c", iso_nrf[i].ub, (i < M ? '\t' : '\n'));
319 for (int i = 1; i <= M; i++)
320 fprintf(fo, "%.6g%c", iso_tau[i].lb, (i < M ? '\t' : '\n'));
321 for (int i = 1; i <= M; i++)
322 fprintf(fo, "%.6g%c", iso_tau[i].ub, (i < M ? '\t' : '\n'));
326 sprintf(outF, "%s.gene_res", imdName);
327 fo = fopen(outF, "a");
328 for (int i = 0; i < m; i++)
329 fprintf(fo, "%.6g%c", gene_nrf[i].lb, (i < m - 1 ? '\t' : '\n'));
330 for (int i = 0; i < m; i++)
331 fprintf(fo, "%.6g%c", gene_nrf[i].ub, (i < m - 1 ? '\t' : '\n'));
332 for (int i = 0; i < m; i++)
333 fprintf(fo, "%.6g%c", gene_tau[i].lb, (i < m - 1 ? '\t' : '\n'));
334 for (int i = 0; i < m; i++)
335 fprintf(fo, "%.6g%c", gene_tau[i].ub, (i < m - 1 ? '\t' : '\n'));
338 printf("CI of noise isoform is [%.6g, %.6g]\n", iso_nrf[0].lb, iso_nrf[0].ub);
350 if (verbose) { printf("All credibility intervals are calculated!\n"); }
352 sprintf(outF, "%s.tau_denoms", imdName);
353 fo = fopen(outF, "w");
354 fprintf(fo, "%d\n", nSamples);
355 for (int i = 0; i < nSamples; i++) fprintf(fo, "%.15g ", tau_denoms[i]);
361 int main(int argc, char* argv[]) {
363 printf("Usage: rsem-calculate-credibility-intervals reference_name sample_name imdName confidence nSpC nMB[-q]\n");
367 confidence = atof(argv[4]);
368 nSpC = atoi(argv[5]);
372 if (argc > 7 && !strcmp(argv[7], "-q")) {
377 sprintf(modelF, "%s.model", argv[2]);
378 FILE *fi = fopen(modelF, "r");
379 if (fi == NULL) { fprintf(stderr, "Cannot open %s!\n", modelF); exit(-1); }
380 fscanf(fi, "%d", &model_type);
383 sprintf(refF, "%s.seq", argv[1]);
384 refs.loadRefs(refF, 1);
386 sprintf(groupF, "%s.grp", argv[1]);
390 sprintf(tmpF, "%s.tmp", argv[3]);
391 sprintf(cvsF, "%s.countvectors", argv[3]);
394 case 0 : sampling<SingleModel>(); break;
395 case 1 : sampling<SingleQModel>(); break;
396 case 2 : sampling<PairedEndModel>(); break;
397 case 3 : sampling<PairedEndQModel>(); break;
400 generateResults(argv[3]);
404 sprintf(command, "rm -f %s", tmpF);
405 int status = system(command);
407 fprintf(stderr, "Cannot delete %s!\n", tmpF);