X-Git-Url: https://git.donarmstrong.com/?p=rsem.git;a=blobdiff_plain;f=calcCI.cpp;h=86b3937ed8009fda5a40cfefdcc7825a0de0fc80;hp=89414f9b02f00ba641b8f987845aea8e977ef1b1;hb=7cd7abcad92a44bbd5d8d1b2bb3a871cd479c0bf;hpb=757263aa0ff5367f826468db30d983282e51e7f0 diff --git a/calcCI.cpp b/calcCI.cpp index 89414f9..86b3937 100644 --- a/calcCI.cpp +++ b/calcCI.cpp @@ -20,15 +20,22 @@ #include "Refs.h" #include "GroupInfo.h" +#include "WriteResults.h" #include "Buffer.h" + using namespace std; struct Params { int no; FILE *fi; engine_type *engine; - double *mw; + const double *mw; +}; + +struct CIParams { + int no; + int start_gene_id, end_gene_id; }; struct CIType { @@ -37,29 +44,31 @@ struct CIType { CIType() { lb = ub = 0.0; } }; -struct CIParams { - int no; - int start_gene_id, end_gene_id; -}; - int model_type; int nMB; double confidence; int nCV, nSpC, nSamples; // nCV: number of count vectors; nSpC: number of theta vectors sampled per count vector; nSamples: nCV * nSpC int nThreads; -int cvlen; + +float *l_bars; char cvsF[STRLEN], tmpF[STRLEN], command[STRLEN]; -CIType *iso_tau, *gene_tau; +CIType *tpm, *fpkm; +CIType *iso_tpm = NULL, *iso_fpkm = NULL; +CIType *gene_tpm, *gene_fpkm; int M, m; Refs refs; GroupInfo gi; -char imdName[STRLEN], statName[STRLEN]; +char refName[STRLEN], imdName[STRLEN], statName[STRLEN]; char modelF[STRLEN], groupF[STRLEN], refF[STRLEN]; +bool alleleS; +int m_trans; +GroupInfo ta; + vector eel; //expected effective lengths Buffer *buffer; @@ -69,106 +78,66 @@ bool quiet; Params *paramsArray; pthread_t *threads; pthread_attr_t attr; -void *status; int rc; -CIParams *ciParamsArray; +bool hasSeed; +seedType seed; -template -void calcExpectedEffectiveLengths(ModelType& model) { - int lb, ub, span; - double *pdf = NULL, *cdf = NULL, *clen = NULL; // clen[i] = \sigma_{j=1}^{i}pdf[i]*(lb+i) - - model.getGLD().copyTo(pdf, cdf, lb, ub, span); - clen = new double[span + 1]; - clen[0] = 0.0; - for (int i = 1; i <= span; i++) { - clen[i] = clen[i - 1] + pdf[i] * (lb + i); - } - - eel.assign(M + 1, 0.0); - for (int i = 1; i <= M; i++) { - int totLen = refs.getRef(i).getTotLen(); - int fullLen = refs.getRef(i).getFullLen(); - int pos1 = max(min(totLen - fullLen + 1, ub) - lb, 0); - int pos2 = max(min(totLen, ub) - lb, 0); - - if (pos2 == 0) { eel[i] = 0.0; continue; } - - eel[i] = fullLen * cdf[pos1] + ((cdf[pos2] - cdf[pos1]) * (totLen + 1) - (clen[pos2] - clen[pos1])); - assert(eel[i] >= 0); - if (eel[i] < MINEEL) { eel[i] = 0.0; } - } - - delete[] pdf; - delete[] cdf; - delete[] clen; -} +CIParams *ciParamsArray; void* sample_theta_from_c(void* arg) { - int *cvec; double *theta; + float *tpm; gamma_dist **gammas; gamma_generator **rgs; Params *params = (Params*)arg; FILE *fi = params->fi; - double *mw = params->mw; - - cvec = new int[cvlen]; - theta = new double[cvlen]; - gammas = new gamma_dist*[cvlen]; - rgs = new gamma_generator*[cvlen]; + const double *mw = params->mw; - float **vecs = new float*[nSpC]; - for (int i = 0; i < nSpC; i++) vecs[i] = new float[cvlen]; + cvec = new int[M + 1]; + theta = new double[M + 1]; + gammas = new gamma_dist*[M + 1]; + rgs = new gamma_generator*[M + 1]; + tpm = new float[M + 1]; + float l_bar; // the mean transcript length over the sample int cnt = 0; while (fscanf(fi, "%d", &cvec[0]) == 1) { - for (int j = 1; j < cvlen; j++) assert(fscanf(fi, "%d", &cvec[j]) == 1); + for (int j = 1; j <= M; j++) assert(fscanf(fi, "%d", &cvec[j]) == 1); ++cnt; - for (int j = 0; j < cvlen; j++) { + for (int j = 0; j <= M; j++) { gammas[j] = new gamma_dist(cvec[j]); rgs[j] = new gamma_generator(*(params->engine), *gammas[j]); } for (int i = 0; i < nSpC; i++) { double sum = 0.0; - for (int j = 0; j < cvlen; j++) { - theta[j] = ((j == 0 || eel[j] >= EPSILON) ? (*rgs[j])() : 0.0); + for (int j = 0; j <= M; j++) { + theta[j] = ((j == 0 || (eel[j] >= EPSILON && mw[j] >= EPSILON)) ? (*rgs[j])() / mw[j] : 0.0); sum += theta[j]; } assert(sum >= EPSILON); - for (int j = 0; j < cvlen; j++) theta[j] /= sum; + for (int j = 0; j <= M; j++) theta[j] /= sum; sum = 0.0; - for (int j = 0; j < cvlen; j++) { - theta[j] = (mw[j] < EPSILON ? 0.0 : theta[j] / mw[j]); - sum += theta[j]; - } - assert(sum >= EPSILON); - for (int j = 0; j < cvlen; j++) theta[j] /= sum; - - - sum = 0.0; - vecs[i][0] = theta[0]; - for (int j = 1; j < cvlen; j++) + tpm[0] = 0.0; + for (int j = 1; j <= M; j++) if (eel[j] >= EPSILON) { - vecs[i][j] = theta[j] / eel[j]; - sum += vecs[i][j]; + tpm[j] = theta[j] / eel[j]; + sum += tpm[j]; } else assert(theta[j] < EPSILON); - assert(sum >= EPSILON); - for (int j = 1; j < cvlen; j++) vecs[i][j] /= sum; + l_bar = 0.0; // store mean effective length of the sample + for (int j = 1; j <= M; j++) { tpm[j] /= sum; l_bar += tpm[j] * eel[j]; tpm[j] *= 1e6; } + buffer->write(l_bar, tpm + 1); // ommit the first element in tpm } - buffer->write(nSpC, vecs); - - for (int j = 0; j < cvlen; j++) { + for (int j = 0; j <= M; j++) { delete gammas[j]; delete rgs[j]; } @@ -180,9 +149,7 @@ void* sample_theta_from_c(void* arg) { delete[] theta; delete[] gammas; delete[] rgs; - - for (int i = 0; i < nSpC; i++) delete[] vecs[i]; - delete[] vecs; + delete[] tpm; return NULL; } @@ -191,17 +158,17 @@ template void sample_theta_vectors_from_count_vectors() { ModelType model; model.read(modelF); - calcExpectedEffectiveLengths(model); - + calcExpectedEffectiveLengths(M, refs, model, eel); int num_threads = min(nThreads, nCV); - buffer = new Buffer(nMB, nSamples, cvlen, tmpF); + buffer = new Buffer(nMB, nSamples, M, l_bars, tmpF); paramsArray = new Params[num_threads]; threads = new pthread_t[num_threads]; char inpF[STRLEN]; + hasSeed ? engineFactory::init(seed) : engineFactory::init(); for (int i = 0; i < num_threads; i++) { paramsArray[i].no = i; sprintf(inpF, "%s%d", cvsF, i); @@ -209,6 +176,7 @@ void sample_theta_vectors_from_count_vectors() { paramsArray[i].engine = engineFactory::new_engine(); paramsArray[i].mw = model.getMW(); } + engineFactory::finish(); /* set thread attribute to be joinable */ pthread_attr_init(&attr); @@ -219,7 +187,7 @@ void sample_theta_vectors_from_count_vectors() { pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0) in sample_theta_vectors_from_count_vectors!"); } for (int i = 0; i < num_threads; i++) { - rc = pthread_join(threads[i], &status); + rc = pthread_join(threads[i], NULL); pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0) in sample_theta_vectors_from_count_vectors!"); } @@ -282,38 +250,102 @@ void calcCI(int nSamples, float *samples, float &lb, float &ub) { } void* calcCI_batch(void* arg) { - float *itsamples, *gtsamples; + float *tsamples, *fsamples; + float *itsamples = NULL, *ifsamples = NULL, *gtsamples, *gfsamples; ifstream fin; CIParams *ciParams = (CIParams*)arg; + int curtid, curaid, tid; - itsamples = new float[nSamples]; + tsamples = new float[nSamples]; + fsamples = new float[nSamples]; + if (alleleS) { + itsamples = new float[nSamples]; + ifsamples = new float[nSamples]; + } gtsamples = new float[nSamples]; + gfsamples = new float[nSamples]; fin.open(tmpF, ios::binary); - streampos pos = streampos(gi.spAt(ciParams->start_gene_id)) * nSamples * FLOATSIZE; + // minus 1 here for that theta0 is not written! + streampos pos = streampos(gi.spAt(ciParams->start_gene_id) - 1) * nSamples * FLOATSIZE; fin.seekg(pos, ios::beg); int cnt = 0; + if (alleleS) { + curtid = curaid = -1; + memset(itsamples, 0, FLOATSIZE * nSamples); + memset(ifsamples, 0, FLOATSIZE * nSamples); + } for (int i = ciParams->start_gene_id; i < ciParams->end_gene_id; i++) { int b = gi.spAt(i), e = gi.spAt(i + 1); memset(gtsamples, 0, FLOATSIZE * nSamples); + memset(gfsamples, 0, FLOATSIZE * nSamples); for (int j = b; j < e; j++) { + if (alleleS) { + tid = ta.gidAt(j); + if (curtid != tid) { + if (curtid >= 0) { + if (j - curaid > 1) { + calcCI(nSamples, itsamples, iso_tpm[curtid].lb, iso_tpm[curtid].ub); + calcCI(nSamples, ifsamples, iso_fpkm[curtid].lb, iso_fpkm[curtid].ub); + } + else { + iso_tpm[curtid] = tpm[curaid]; + iso_fpkm[curtid] = fpkm[curaid]; + } + } + curtid = tid; + curaid = j; + } + } + for (int k = 0; k < nSamples; k++) { - fin.read((char*)(&itsamples[k]), FLOATSIZE); - gtsamples[k] += itsamples[k]; + fin.read((char*)(&tsamples[k]), FLOATSIZE); + fsamples[k] = 1e3 / l_bars[k] * tsamples[k]; + if (alleleS) { + itsamples[k] += tsamples[k]; + ifsamples[k] += fsamples[k]; + } + gtsamples[k] += tsamples[k]; + gfsamples[k] += fsamples[k]; } - calcCI(nSamples, itsamples, iso_tau[j].lb, iso_tau[j].ub); + calcCI(nSamples, tsamples, tpm[j].lb, tpm[j].ub); + calcCI(nSamples, fsamples, fpkm[j].lb, fpkm[j].ub); + } + + if (e - b > 1) { + calcCI(nSamples, gtsamples, gene_tpm[i].lb, gene_tpm[i].ub); + calcCI(nSamples, gfsamples, gene_fpkm[i].lb, gene_fpkm[i].ub); + } + else { + gene_tpm[i] = tpm[b]; + gene_fpkm[i] = fpkm[b]; } - calcCI(nSamples, gtsamples, gene_tau[i].lb, gene_tau[i].ub); ++cnt; if (verbose && cnt % 1000 == 0) { printf("In thread %d, %d genes are processed for CI calculation!\n", ciParams->no, cnt); } } - fin.close(); - delete[] itsamples; + if (alleleS && (curtid >= 0)) { + if (gi.spAt(ciParams->end_gene_id) - curaid > 1) { + calcCI(nSamples, itsamples, iso_tpm[curtid].lb, iso_tpm[curtid].ub); + calcCI(nSamples, ifsamples, iso_fpkm[curtid].lb, iso_fpkm[curtid].ub); + } + else { + iso_tpm[curtid] = tpm[curaid]; + iso_fpkm[curtid] = fpkm[curaid]; + } + } + + delete[] tsamples; + delete[] fsamples; + if (alleleS) { + delete[] itsamples; + delete[] ifsamples; + } delete[] gtsamples; + delete[] gfsamples; return NULL; } @@ -323,8 +355,14 @@ void calculate_credibility_intervals(char* imdName) { char outF[STRLEN]; int num_threads = nThreads; - iso_tau = new CIType[M + 1]; - gene_tau = new CIType[m]; + tpm = new CIType[M + 1]; + fpkm = new CIType[M + 1]; + if (alleleS) { + iso_tpm = new CIType[m_trans]; + iso_fpkm = new CIType[m_trans]; + } + gene_tpm = new CIType[m]; + gene_fpkm = new CIType[m]; assert(M > 0); int quotient = M / num_threads; @@ -359,7 +397,7 @@ void calculate_credibility_intervals(char* imdName) { pthread_assert(rc, "pthread_create", "Cannot create thread " + itos(i) + " (numbered from 0) in calculate_credibility_intervals!"); } for (int i = 0; i < num_threads; i++) { - rc = pthread_join(threads[i], &status); + rc = pthread_join(threads[i], NULL); pthread_assert(rc, "pthread_join", "Cannot join thread " + itos(i) + " (numbered from 0) in calculate_credibility_intervals!"); } @@ -371,36 +409,68 @@ void calculate_credibility_intervals(char* imdName) { delete[] ciParamsArray; - //isoform level results - sprintf(outF, "%s.iso_res", imdName); + alleleS ? sprintf(outF, "%s.allele_res", imdName) : sprintf(outF, "%s.iso_res", imdName); fo = fopen(outF, "a"); for (int i = 1; i <= M; i++) - fprintf(fo, "%.6g%c", iso_tau[i].lb, (i < M ? '\t' : '\n')); + fprintf(fo, "%.6g%c", tpm[i].lb, (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.6g%c", tpm[i].ub, (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) - fprintf(fo, "%.6g%c", iso_tau[i].ub, (i < M ? '\t' : '\n')); + fprintf(fo, "%.6g%c", fpkm[i].lb, (i < M ? '\t' : '\n')); + for (int i = 1; i <= M; i++) + fprintf(fo, "%.6g%c", fpkm[i].ub, (i < M ? '\t' : '\n')); fclose(fo); + if (alleleS) { + //isoform level results + sprintf(outF, "%s.iso_res", imdName); + fo = fopen(outF, "a"); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.6g%c", iso_tpm[i].lb, (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.6g%c", iso_tpm[i].ub, (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.6g%c", iso_fpkm[i].lb, (i < m_trans - 1 ? '\t' : '\n')); + for (int i = 0; i < m_trans; i++) + fprintf(fo, "%.6g%c", iso_fpkm[i].ub, (i < m_trans - 1 ? '\t' : '\n')); + fclose(fo); + } + //gene level results sprintf(outF, "%s.gene_res", imdName); fo = fopen(outF, "a"); for (int i = 0; i < m; i++) - fprintf(fo, "%.6g%c", gene_tau[i].lb, (i < m - 1 ? '\t' : '\n')); + fprintf(fo, "%.6g%c", gene_tpm[i].lb, (i < m - 1 ? '\t' : '\n')); + for (int i = 0; i < m; i++) + fprintf(fo, "%.6g%c", gene_tpm[i].ub, (i < m - 1 ? '\t' : '\n')); + for (int i = 0; i < m; i++) + fprintf(fo, "%.6g%c", gene_fpkm[i].lb, (i < m - 1 ? '\t' : '\n')); for (int i = 0; i < m; i++) - fprintf(fo, "%.6g%c", gene_tau[i].ub, (i < m - 1 ? '\t' : '\n')); + fprintf(fo, "%.6g%c", gene_fpkm[i].ub, (i < m - 1 ? '\t' : '\n')); fclose(fo); - delete[] iso_tau; - delete[] gene_tau; + delete[] tpm; + delete[] fpkm; + if (alleleS) { + delete[] iso_tpm; + delete[] iso_fpkm; + } + delete[] gene_tpm; + delete[] gene_fpkm; if (verbose) { printf("All credibility intervals are calculated!\n"); } } int main(int argc, char* argv[]) { if (argc < 8) { - printf("Usage: rsem-calculate-credibility-intervals reference_name sample_name sampleToken confidence nCV nSpC nMB [-p #Threads] [-q]\n"); + printf("Usage: rsem-calculate-credibility-intervals reference_name imdName statName confidence nCV nSpC nMB [-p #Threads] [--seed seed] [-q]\n"); exit(-1); } + strcpy(refName, argv[1]); + strcpy(imdName, argv[2]); + strcpy(statName, argv[3]); + confidence = atof(argv[4]); nCV = atoi(argv[5]); nSpC = atoi(argv[6]); @@ -408,25 +478,35 @@ int main(int argc, char* argv[]) { nThreads = 1; quiet = false; + hasSeed = false; for (int i = 8; i < argc; i++) { if (!strcmp(argv[i], "-p")) nThreads = atoi(argv[i + 1]); + if (!strcmp(argv[i], "--seed")) { + hasSeed = true; + int len = strlen(argv[i + 1]); + seed = 0; + for (int k = 0; k < len; k++) seed = seed * 10 + (argv[i + 1][k] - '0'); + } if (!strcmp(argv[i], "-q")) quiet = true; } verbose = !quiet; - sprintf(refF, "%s.seq", argv[1]); + sprintf(refF, "%s.seq", refName); refs.loadRefs(refF, 1); M = refs.getM(); - sprintf(groupF, "%s.grp", argv[1]); + + sprintf(groupF, "%s.grp", refName); gi.load(groupF); m = gi.getm(); + // allele-specific + alleleS = isAlleleSpecific(refName, NULL, &ta); + if (alleleS) m_trans = ta.getm(); + nSamples = nCV * nSpC; - cvlen = M + 1; - assert(nSamples > 0 && cvlen > 1); // for Buffter.h: (bufsize_type)nSamples + assert(nSamples > 0 && M > 0); // for Buffter.h: (bufsize_type)nSamples + l_bars = new float[nSamples]; - sprintf(imdName, "%s.temp/%s", argv[2], argv[3]); - sprintf(statName, "%s.stat/%s", argv[2], argv[3]); sprintf(tmpF, "%s.tmp", imdName); sprintf(cvsF, "%s.countvectors", imdName); @@ -447,14 +527,7 @@ int main(int argc, char* argv[]) { // Phase II calculate_credibility_intervals(imdName); - /* - sprintf(command, "rm -f %s", tmpF); - int status = system(command); - if (status != 0) { - fprintf(stderr, "Cannot delete %s!\n", tmpF); - exit(-1); - } - */ + delete l_bars; return 0; }