X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=EM.cpp;h=322a3063ff6596c9b1cce15d037065c8c2c1af85;hb=791b978145b0d22865ee350db9d58072bb99d816;hp=00cb8527afe966771b61408debe86a74ad0294a1;hpb=a95154919f950f86de9104b2b9dcf1f0c7e83387;p=rsem.git diff --git a/EM.cpp b/EM.cpp index 00cb852..322a306 100644 --- a/EM.cpp +++ b/EM.cpp @@ -61,9 +61,11 @@ bool genBamF; // If user wants to generate bam file, true; otherwise, false. bool updateModel, calcExpectedWeights; bool genGibbsOut; // generate file for Gibbs sampler -char refName[STRLEN], imdName[STRLEN], outName[STRLEN]; +char refName[STRLEN], outName[STRLEN]; +char imdName[STRLEN], statName[STRLEN]; char refF[STRLEN], groupF[STRLEN], cntF[STRLEN], tiF[STRLEN]; char mparamsF[STRLEN], bmparamsF[STRLEN]; +char modelF[STRLEN], thetaF[STRLEN]; char inpSamType; char *pt_fn_list, *pt_chr_list; @@ -127,7 +129,7 @@ void init(ReadReader **&readers, HitContainer **&hitvs, doubl if (!readers[i]->locate(curnr)) { fprintf(stderr, "Read indices files do not match!\n"); exit(-1); } //assert(readers[i]->locate(curnr)); - while (nrLeft > ntLeft && hitvs[i]->getNHits() < nhT) { + while (nrLeft > ntLeft && (i == nThreads - 1 || hitvs[i]->getNHits() < nhT)) { if (!hitvs[i]->read(fin)) { fprintf(stderr, "Cannot read alignments from .dat file!\n"); exit(-1); } //assert(hitvs[i]->read(fin)); --nrLeft; @@ -301,35 +303,19 @@ void calcExpectedEffectiveLengths(ModelType& model) { template void writeResults(ModelType& model, double* counts) { double denom; - char modelF[STRLEN], thetaF[STRLEN]; char outF[STRLEN]; FILE *fo; - sprintf(modelF, "%s.model", outName); + sprintf(modelF, "%s.model", statName); model.write(modelF); - sprintf(thetaF, "%s.theta", outName); - fo = fopen(thetaF, "w"); - fprintf(fo, "%d\n", M + 1); - for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]); - fprintf(fo, "%.15g\n", theta[M]); - fclose(fo); - - - //calculate normalized read fraction - double *nrf = new double[M + 1]; - memset(nrf, 0, sizeof(double) * (M + 1)); - denom = 1.0 - theta[0]; - if (denom <= 0) { fprintf(stderr, "No alignable reads?!\n"); exit(-1); } - for (int i = 1; i <= M; i++) nrf[i] = theta[i] / denom; - //calculate tau values double *tau = new double[M + 1]; memset(tau, 0, sizeof(double) * (M + 1)); denom = 0.0; for (int i = 1; i <= M; i++) - if (eel[i] > EPSILON) { + if (eel[i] >= EPSILON) { tau[i] = theta[i] / eel[i]; denom += tau[i]; } @@ -348,8 +334,6 @@ void writeResults(ModelType& model, double* counts) { } for (int i = 1; i <= M; i++) fprintf(fo, "%.2f%c", counts[i], (i < M ? '\t' : '\n')); - for (int i = 1; i <= M; i++) - fprintf(fo, "%.15g%c", nrf[i], (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) fprintf(fo, "%.15g%c", tau[i], (i < M ? '\t' : '\n')); for (int i = 1; i <= M; i++) { @@ -371,12 +355,6 @@ void writeResults(ModelType& model, double* counts) { for (int j = b; j < e; j++) sumC += counts[j]; fprintf(fo, "%.2f%c", sumC, (i < m - 1 ? '\t' : '\n')); } - for (int i = 0; i < m; i++) { - double sumN = 0.0; // sum of normalized read fraction - int b = gi.spAt(i), e = gi.spAt(i + 1); - for (int j = b; j < e; j++) sumN += nrf[j]; - fprintf(fo, "%.15g%c", sumN, (i < m - 1 ? '\t' : '\n')); - } for (int i = 0; i < m; i++) { double sumT = 0.0; // sum of tau values int b = gi.spAt(i), e = gi.spAt(i + 1); @@ -391,7 +369,6 @@ void writeResults(ModelType& model, double* counts) { } fclose(fo); - delete[] nrf; delete[] tau; if (verbose) { printf("Expression Results are written!\n"); } @@ -417,14 +394,18 @@ void release(ReadReader **readers, HitContainer **hitvs, doub delete[] mhps; } +int tmp_n; + inline bool doesUpdateModel(int ROUND) { - //return false; // never update, for debugging only - return ROUND <= 20 || ROUND % 100 == 0; + // return ROUND <= 20 || ROUND % 100 == 0; + return ROUND <= 10; } //Including initialize, algorithm and results saving template void EM() { + FILE *fo; + int ROUND; double sum; @@ -443,6 +424,7 @@ void EM() { void *status; int rc; + //initialize boolean variables updateModel = calcExpectedWeights = false; @@ -472,7 +454,6 @@ void EM() { pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - ROUND = 0; do { ++ROUND; @@ -527,22 +508,11 @@ void EM() { } if (verbose) printf("ROUND = %d, SUM = %.15g, bChange = %f, totNum = %d\n", ROUND, sum, bChange, totNum); - } while (ROUND < MIN_ROUND || totNum > 0 && ROUND < MAX_ROUND); + } while (ROUND < MIN_ROUND || (totNum > 0 && ROUND < MAX_ROUND)); //while (ROUND < MAX_ROUND); if (totNum > 0) fprintf(stderr, "Warning: RSEM reaches %d iterations before meeting the convergence criteria.\n", MAX_ROUND); - //calculate expected effective lengths for each isoform - calcExpectedEffectiveLengths(model); - - //correct theta vector - sum = theta[0]; - for (int i = 1; i <= M; i++) - if (eel[i] < EPSILON) { theta[i] = 0.0; } - else sum += theta[i]; - if (sum < EPSILON) { fprintf(stderr, "No Expected Effective Length is no less than %.6g?!\n", MINEEL); exit(-1); } - for (int i = 0; i <= M; i++) theta[i] /= sum; - //generate output file used by Gibbs sampler if (genGibbsOut) { if (model.getNeedCalcConPrb()) { @@ -558,7 +528,7 @@ void EM() { model.setNeedCalcConPrb(false); sprintf(out_for_gibbs_F, "%s.ofg", imdName); - FILE *fo = fopen(out_for_gibbs_F, "w"); + fo = fopen(out_for_gibbs_F, "w"); fprintf(fo, "%d %d\n", M, N0); for (int i = 0; i < nThreads; i++) { int numN = hitvs[i]->getN(); @@ -567,7 +537,7 @@ void EM() { int to = hitvs[i]->getSAt(j + 1); int totNum = 0; - if (ncpvs[i][j] > 0.0) { ++totNum; fprintf(fo, "%d %.15g ", 0, ncpvs[i][j]); } + if (ncpvs[i][j] >= EPSILON) { ++totNum; fprintf(fo, "%d %.15g ", 0, ncpvs[i][j]); } for (int k = fr; k < to; k++) { HitType &hit = hitvs[i]->getHitAt(k); if (hit.getConPrb() >= EPSILON) { @@ -582,8 +552,28 @@ void EM() { fclose(fo); } + sprintf(thetaF, "%s.theta", statName); + fo = fopen(thetaF, "w"); + fprintf(fo, "%d\n", M + 1); + + // output theta' + for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]); + fprintf(fo, "%.15g\n", theta[M]); + + //calculate expected effective lengths for each isoform + calcExpectedEffectiveLengths(model); + + //correct theta vector + sum = theta[0]; + for (int i = 1; i <= M; i++) + if (eel[i] < EPSILON) { theta[i] = 0.0; } + else sum += theta[i]; + if (sum < EPSILON) { fprintf(stderr, "No Expected Effective Length is no less than %.6g?!\n", MINEEL); exit(-1); } + for (int i = 0; i <= M; i++) theta[i] /= sum; + //calculate expected weights and counts using learned parameters updateModel = false; calcExpectedWeights = true; + for (int i = 0; i <= M; i++) probv[i] = theta[i]; for (int i = 0; i < nThreads; i++) { rc = pthread_create(&threads[i], &attr, E_STEP, (void*)(&fparams[i])); if (rc != 0) { fprintf(stderr, "Cannot create thread %d when calculate expected weights! (numbered from 0)\n", i); exit(-1); } @@ -605,8 +595,7 @@ void EM() { /* destroy attribute */ pthread_attr_destroy(&attr); - - //for all + //convert theta' to theta double *mw = model.getMW(); sum = 0.0; for (int i = 0; i <= M; i++) { @@ -616,6 +605,12 @@ void EM() { assert(sum >= EPSILON); for (int i = 0; i <= M; i++) theta[i] /= sum; + // output theta + for (int i = 0; i < M; i++) fprintf(fo, "%.15g ", theta[i]); + fprintf(fo, "%.15g\n", theta[M]); + + fclose(fo); + writeResults(model, countvs[0]); if (genBamF) { @@ -638,11 +633,11 @@ int main(int argc, char* argv[]) { bool quiet = false; if (argc < 5) { - printf("Usage : rsem-run-em refName read_type imdName outName [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out]\n\n"); + printf("Usage : rsem-run-em refName read_type sampleName sampleToken [-p #Threads] [-b samInpType samInpF has_fn_list_? [fn_list]] [-q] [--gibbs-out]\n\n"); printf(" refName: reference name\n"); printf(" read_type: 0 single read without quality score; 1 single read with quality score; 2 paired-end read without quality score; 3 paired-end read with quality score.\n"); - printf(" imdName: name for all upstream/downstream user-unseen files. (different files have different suffices)\n"); - printf(" outName: name for all output files. (different files have different suffices)\n"); + printf(" sampleName: sample's name, including the path\n"); + printf(" sampleToken: sampleName excludes the path\n"); printf(" -p: number of threads which user wants to use. (default: 1)\n"); printf(" -b: produce bam format output file. (default: off)\n"); printf(" -q: set it quiet\n"); @@ -655,8 +650,9 @@ int main(int argc, char* argv[]) { strcpy(refName, argv[1]); read_type = atoi(argv[2]); - strcpy(imdName, argv[3]); - strcpy(outName, argv[4]); + strcpy(outName, argv[3]); + sprintf(imdName, "%s.temp/%s", argv[3], argv[4]); + sprintf(statName, "%s.stat/%s", argv[3], argv[4]); nThreads = 1; @@ -694,7 +690,7 @@ int main(int argc, char* argv[]) { sprintf(tiF, "%s.ti", refName); transcripts.readFrom(tiF); - sprintf(cntF, "%s.cnt", imdName); + sprintf(cntF, "%s.cnt", statName); fin.open(cntF); if (!fin.is_open()) { fprintf(stderr, "Cannot open %s! It may not exist.\n", cntF); exit(-1); } fin>>N0>>N1>>N2>>N_tot;