From 9eef8b58056b7cdaad1b4bdb2b2904d9fc0ff430 Mon Sep 17 00:00:00 2001 From: Bo Li Date: Wed, 21 Mar 2012 10:44:21 -0500 Subject: [PATCH] Allowed > 2^31 hits --- BamConverter.h | 2 +- BamWriter.h | 14 ++++--- EM.cpp | 88 +++++++++++++++++++++------------------ Gibbs.cpp | 35 +++++----------- HitContainer.h | 45 ++++++++++---------- HitWrapper.h | 5 ++- ModelParams.h | 2 +- PairedEndModel.h | 15 +++---- PairedEndQModel.h | 15 +++---- ReadIndex.h | 4 +- ReadReader.h | 8 ++-- SingleHit.h | 2 +- SingleModel.h | 17 ++++---- SingleQModel.h | 15 +++---- buildReadIndex.cpp | 6 +-- extractRef.cpp | 16 +++---- getUnique.cpp | 8 ++-- parseIt.cpp | 16 +++---- samValidator.cpp | 3 +- scanForPairedEndReads.cpp | 3 +- simulation.cpp | 10 ++--- utils.h | 4 ++ wiggle.cpp | 6 ++- 23 files changed, 174 insertions(+), 165 deletions(-) diff --git a/BamConverter.h b/BamConverter.h index af984ac..bd81127 100644 --- a/BamConverter.h +++ b/BamConverter.h @@ -76,7 +76,7 @@ void BamConverter::process() { std::string cqname; bool isPaired = false; - int cnt = 0; + HIT_INT_TYPE cnt = 0; cqname = ""; b = bam_init1(); b2 = bam_init1(); diff --git a/BamWriter.h b/BamWriter.h index 12ab087..a73e3da 100644 --- a/BamWriter.h +++ b/BamWriter.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include "sam/bam.h" @@ -14,6 +15,8 @@ #include "sam_rsem_aux.h" #include "sam_rsem_cvt.h" +#include "utils.h" + #include "SingleHit.h" #include "PairedEndHit.h" @@ -76,13 +79,13 @@ void BamWriter::work(HitWrapper wrapper) { bam1_t *b; SingleHit *hit; - int cnt = 0; + HIT_INT_TYPE cnt = 0; b = bam_init1(); while (samread(in, b) >= 0) { ++cnt; - if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); } + if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< "alignment lines are loaded!"<< std::endl; } if (b->core.flag & 0x0004) continue; @@ -97,7 +100,7 @@ void BamWriter::work(HitWrapper wrapper) { assert(wrapper.getNextHit() == NULL); bam_destroy1(b); - if (verbose) { printf("Bam output file is generated!\n"); } + if (verbose) { std::cout<< "Bam output file is generated!"<< std::endl; } } void BamWriter::work(HitWrapper wrapper) { @@ -111,8 +114,7 @@ void BamWriter::work(HitWrapper wrapper) { while (samread(in, b) >= 0 && samread(in, b2) >= 0) { cnt += 2; - if (verbose && cnt % 1000000 == 0) { printf("%d alignment lines are loaded!\n", cnt); } - + if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< "alignment lines are loaded!"<< std::endl; } //mate info is not complete, skip if (!(((b->core.flag & 0x0040) && (b2->core.flag & 0x0080)) || ((b->core.flag & 0x0080) && (b2->core.flag & 0x0040)))) continue; //unalignable reads, skip @@ -148,7 +150,7 @@ void BamWriter::work(HitWrapper wrapper) { bam_destroy1(b); bam_destroy1(b2); - if (verbose) { printf("Bam output file is generated!\n"); } + if (verbose) { std::cout<< "Bam output file is generated!"<< std::endl; } } void BamWriter::convert(bam1_t *b, double prb) { diff --git a/EM.cpp b/EM.cpp index f884979..e106335 100644 --- a/EM.cpp +++ b/EM.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include #include "utils.h" @@ -55,7 +57,7 @@ struct Params { int read_type; int m, M; // m genes, M isoforms -int N0, N1, N2, N_tot; +READ_INT_TYPE N0, N1, N2, N_tot; int nThreads; @@ -67,7 +69,7 @@ bool genGibbsOut; // generate file for Gibbs sampler char refName[STRLEN], outName[STRLEN]; char imdName[STRLEN], statName[STRLEN]; char refF[STRLEN], groupF[STRLEN], cntF[STRLEN], tiF[STRLEN]; -char mparamsF[STRLEN], bmparamsF[STRLEN]; +char mparamsF[STRLEN]; char modelF[STRLEN], thetaF[STRLEN]; char inpSamType; @@ -88,8 +90,12 @@ ModelParams mparams; template void init(ReadReader **&readers, HitContainer **&hitvs, double **&ncpvs, ModelType **&mhps) { - int nReads, nHits, rt; - int nrLeft, nhT, curnr; // nrLeft : number of reads left, nhT : hit threshold per thread, curnr: current number of reads + READ_INT_TYPE nReads; + HIT_INT_TYPE nHits; + int rt; // read type + + READ_INT_TYPE nrLeft, curnr; // nrLeft : number of reads left, curnr: current number of reads + HIT_INT_TYPE nhT; // nhT : hit threshold per thread char datF[STRLEN]; int s; @@ -127,7 +133,7 @@ void init(ReadReader **&readers, HitContainer **&hitvs, doubl ncpvs = new double*[nThreads]; for (int i = 0; i < nThreads; i++) { - int ntLeft = nThreads - i - 1; // # of threads left + HIT_INT_TYPE ntLeft = nThreads - i - 1; // # of threads left general_assert(readers[i]->locate(curnr), "Read indices files do not match!"); @@ -135,13 +141,13 @@ void init(ReadReader **&readers, HitContainer **&hitvs, doubl general_assert(hitvs[i]->read(fin), "Cannot read alignments from .dat file!"); --nrLeft; - if (verbose && nrLeft % 1000000 == 0) { printf("DAT %d reads left!\n", nrLeft); } + if (verbose && nrLeft % 1000000 == 0) { cout<< "DAT "<< nrLeft << " reads left"<< endl; } } ncpvs[i] = new double[hitvs[i]->getN()]; memset(ncpvs[i], 0, sizeof(double) * hitvs[i]->getN()); curnr += hitvs[i]->getN(); - if (verbose) { printf("Thread %d : N = %d, NHit = %d\n", i, hitvs[i]->getN(), hitvs[i]->getNHits()); } + if (verbose) { cout<<"Thread "<< i<< " : N = "<< hitvs[i]->getN()<< ", NHit = "<< hitvs[i]->getNHits()<< endl; } } fin.close(); @@ -175,16 +181,16 @@ void* E_STEP(void* arg) { ReadType read; - int N = hitv->getN(); + READ_INT_TYPE N = hitv->getN(); double sum; vector fracs; //to remove this, do calculation twice - int fr, to, id; + HIT_INT_TYPE fr, to, id; if (needCalcConPrb || updateModel) { reader->reset(); } if (updateModel) { mhp->init(); } memset(countv, 0, sizeof(double) * (M + 1)); - for (int i = 0; i < N; i++) { + for (READ_INT_TYPE i = 0; i < N; i++) { if (needCalcConPrb || updateModel) { general_assert(reader->next(read), "Can not load a read!"); } @@ -199,7 +205,7 @@ void* E_STEP(void* arg) { fracs[0] = probv[0] * ncpv[i]; if (fracs[0] < EPSILON) fracs[0] = 0.0; sum += fracs[0]; - for (int j = fr; j < to; j++) { + for (HIT_INT_TYPE j = fr; j < to; j++) { HitType &hit = hitv->getHitAt(j); if (needCalcConPrb) { hit.setConPrb(model->getConPrb(read, hit)); } id = j - fr + 1; @@ -213,7 +219,7 @@ void* E_STEP(void* arg) { countv[0] += fracs[0]; if (updateModel) { mhp->updateNoise(read, fracs[0]); } if (calcExpectedWeights) { ncpv[i] = fracs[0]; } - for (int j = fr; j < to; j++) { + for (HIT_INT_TYPE j = fr; j < to; j++) { HitType &hit = hitv->getHitAt(j); id = j - fr + 1; fracs[id] /= sum; @@ -224,7 +230,7 @@ void* E_STEP(void* arg) { } else if (calcExpectedWeights) { ncpv[i] = 0.0; - for (int j = fr; j < to; j++) { + for (HIT_INT_TYPE j = fr; j < to; j++) { HitType &hit = hitv->getHitAt(j); hit.setConPrb(0.0); } @@ -243,20 +249,20 @@ void* calcConProbs(void* arg) { double *ncpv = (double*)(params->ncpv); ReadType read; - int N = hitv->getN(); - int fr, to; + READ_INT_TYPE N = hitv->getN(); + HIT_INT_TYPE fr, to; assert(model->getNeedCalcConPrb()); reader->reset(); - for (int i = 0; i < N; i++) { + for (READ_INT_TYPE i = 0; i < N; i++) { general_assert(reader->next(read), "Can not load a read!"); fr = hitv->getSAt(i); to = hitv->getSAt(i + 1); ncpv[i] = model->getNoiseConPrb(read); - for (int j = fr; j < to; j++) { + for (HIT_INT_TYPE j = fr; j < to; j++) { HitType &hit = hitv->getHitAt(j); hit.setConPrb(model->getConPrb(read, hit)); } @@ -406,7 +412,7 @@ void EM() { double sum; double bChange = 0.0, change = 0.0; // bChange : biggest change - int totNum = 0; + READ_INT_TYPE totNum = 0; ModelType model(mparams); //master model ReadReader **readers; @@ -501,11 +507,11 @@ void EM() { if (bChange < change) bChange = change; } - if (verbose) printf("ROUND = %d, SUM = %.15g, bChange = %f, totNum = %d\n", ROUND, sum, bChange, totNum); + if (verbose) { cout<< "ROUND = "<< ROUND<< ", SUM = "<< setprecision(15)<< sum<< ", bChange = " << setprecision(6)<< bChange<< ", totNum = %" << totNum<< endl; } } while (ROUND < MIN_ROUND || (totNum > 0 && ROUND < MAX_ROUND)); // } while (ROUND < 1); - if (totNum > 0) fprintf(stderr, "Warning: RSEM reaches %d iterations before meeting the convergence criteria.\n", MAX_ROUND); + if (totNum > 0) { cout<< "Warning: RSEM reaches "<< MAX_ROUND<< " iterations before meeting the convergence criteria."<< endl; } //generate output file used by Gibbs sampler if (genGibbsOut) { @@ -522,28 +528,28 @@ void EM() { model.setNeedCalcConPrb(false); sprintf(out_for_gibbs_F, "%s.ofg", imdName); - fo = fopen(out_for_gibbs_F, "w"); - fprintf(fo, "%d %d\n", M, N0); + ofstream fout(out_for_gibbs_F); + fout<< M<< " "<< N0<< endl; for (int i = 0; i < nThreads; i++) { - int numN = hitvs[i]->getN(); - for (int j = 0; j < numN; j++) { - int fr = hitvs[i]->getSAt(j); - int to = hitvs[i]->getSAt(j + 1); - int totNum = 0; - - if (ncpvs[i][j] >= EPSILON) { ++totNum; fprintf(fo, "%d %.15g ", 0, ncpvs[i][j]); } - for (int k = fr; k < to; k++) { + READ_INT_TYPE numN = hitvs[i]->getN(); + for (READ_INT_TYPE j = 0; j < numN; j++) { + HIT_INT_TYPE fr = hitvs[i]->getSAt(j); + HIT_INT_TYPE to = hitvs[i]->getSAt(j + 1); + HIT_INT_TYPE totNum = 0; + + if (ncpvs[i][j] >= EPSILON) { ++totNum; fout<< "0 "<< setprecision(15)<< ncpvs[i][j]<< " "; } + for (HIT_INT_TYPE k = fr; k < to; k++) { HitType &hit = hitvs[i]->getHitAt(k); if (hit.getConPrb() >= EPSILON) { ++totNum; - fprintf(fo, "%d %.15g ", hit.getSid(), hit.getConPrb()); + fout<< hit.getSid()<< " "<< setprecision(15)<< hit.getConPrb()<< " "; } } - if (totNum > 0) { fprintf(fo, "\n"); } + if (totNum > 0) { fout<< endl; } } } - fclose(fo); + fout.close(); } sprintf(thetaF, "%s.theta", statName); @@ -611,27 +617,27 @@ void EM() { sprintf(outBamF, "%s.transcript.bam", outName); if (bamSampling) { - int local_N; - int fr, to, len, id; + READ_INT_TYPE local_N; + HIT_INT_TYPE fr, to, len, id; vector arr; uniform01 rg(engine_type(time(NULL))); - if (verbose) printf("Begin to sample reads from their posteriors.\n"); + if (verbose) cout<< "Begin to sample reads from their posteriors."<< endl; for (int i = 0; i < nThreads; i++) { local_N = hitvs[i]->getN(); - for (int j = 0; j < local_N; j++) { + for (READ_INT_TYPE j = 0; j < local_N; j++) { fr = hitvs[i]->getSAt(j); to = hitvs[i]->getSAt(j + 1); len = to - fr + 1; arr.assign(len, 0); arr[0] = ncpvs[i][j]; - for (int k = fr; k < to; k++) arr[k - fr + 1] = arr[k - fr] + hitvs[i]->getHitAt(k).getConPrb(); + for (HIT_INT_TYPE k = fr; k < to; k++) arr[k - fr + 1] = arr[k - fr] + hitvs[i]->getHitAt(k).getConPrb(); id = (arr[len - 1] < EPSILON ? -1 : sample(rg, arr, len)); // if all entries in arr are 0, let id be -1 - for (int k = fr; k < to; k++) hitvs[i]->getHitAt(k).setConPrb(k - fr + 1 == id ? 1.0 : 0.0); + for (HIT_INT_TYPE k = fr; k < to; k++) hitvs[i]->getHitAt(k).setConPrb(k - fr + 1 == id ? 1.0 : 0.0); } } - if (verbose) printf("Sampling is finished.\n"); + if (verbose) cout<< "Sampling is finished."<< endl; } BamWriter writer(inpSamType, inpSamF, pt_fn_list, outBamF, transcripts); @@ -717,7 +723,7 @@ int main(int argc, char* argv[]) { general_assert(N1 > 0, "There are no alignable reads!"); - if (nThreads > N1) nThreads = N1; + if ((READ_INT_TYPE)nThreads > N1) nThreads = N1; //set model parameters mparams.M = M; diff --git a/Gibbs.cpp b/Gibbs.cpp index 1a084d6..c1f4fe2 100644 --- a/Gibbs.cpp +++ b/Gibbs.cpp @@ -44,7 +44,9 @@ struct Item { int nThreads; int model_type; -int m, M, N0, N1, nHits; +int m, M; +READ_INT_TYPE N0, N1; +HIT_INT_TYPE nHits; double totc; int BURNIN, NSAMPLES, GAP; char imdName[STRLEN], statName[STRLEN]; @@ -54,7 +56,7 @@ char cvsF[STRLEN]; Refs refs; GroupInfo gi; -vector s; +vector s; vector hits; vector theta; @@ -160,7 +162,7 @@ void init() { pthread_attr_init(&attr); pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - if (verbose) { printf("Initialization finished!"); } + if (verbose) { printf("Initialization finished!\n"); } } //sample theta from Dir(1) @@ -187,8 +189,8 @@ void writeCountVector(FILE* fo, vector& counts) { } void* Gibbs(void* arg) { - int len, fr, to; int CHAINLEN; + HIT_INT_TYPE len, fr, to; Params *params = (Params*)arg; vector theta; @@ -205,11 +207,11 @@ void* Gibbs(void* arg) { counts.assign(M + 1, 1); // 1 pseudo count counts[0] += N0; - for (int i = 0; i < N1; i++) { + for (READ_INT_TYPE i = 0; i < N1; i++) { fr = s[i]; to = s[i + 1]; len = to - fr; arr.assign(len, 0); - for (int j = fr; j < to; j++) { + for (HIT_INT_TYPE j = fr; j < to; j++) { arr[j - fr] = theta[hits[j].sid] * hits[j].conprb; if (j > fr) arr[j - fr] += arr[j - fr - 1]; // cumulative } @@ -221,11 +223,11 @@ void* Gibbs(void* arg) { CHAINLEN = 1 + (params->nsamples - 1) * GAP; for (int ROUND = 1; ROUND <= BURNIN + CHAINLEN; ROUND++) { - for (int i = 0; i < N1; i++) { + for (READ_INT_TYPE i = 0; i < N1; i++) { --counts[z[i]]; fr = s[i]; to = s[i + 1]; len = to - fr; arr.assign(len, 0); - for (int j = fr; j < to; j++) { + for (HIT_INT_TYPE j = fr; j < to; j++) { arr[j - fr] = counts[hits[j].sid] * hits[j].conprb; if (j > fr) arr[j - fr] += arr[j - fr - 1]; //cumulative } @@ -281,23 +283,6 @@ void release() { pve_c[i] = (pve_c[i] - NSAMPLES * pme_c[i] * pme_c[i]) / (NSAMPLES - 1); pme_theta[i] /= NSAMPLES; } - - /* - // combine files - FILE *fo = fopen(cvsF, "a"); - for (int i = 1; i < nThreads; i++) { - sprintf(inpF, "%s%d", cvsF, i); - ifstream fin(inpF); - while (getline(fin, line)) { - fprintf(fo, "%s\n", line.c_str()); - } - fin.close(); - sprintf(command, "rm -f %s", inpF); - int status = system(command); - general_assert(status == 0, "Fail to delete file " + cstrtos(inpF) + "!"); - } - fclose(fo); - */ } template diff --git a/HitContainer.h b/HitContainer.h index b051ed4..5dfbf7e 100644 --- a/HitContainer.h +++ b/HitContainer.h @@ -4,8 +4,9 @@ #include #include #include - #include + +#include "utils.h" #include "GroupInfo.h" template @@ -39,32 +40,32 @@ public: } } - int getN() { return n; } + READ_INT_TYPE getN() { return n; } - int getNHits() { return nhits; } + HIT_INT_TYPE getNHits() { return nhits; } - int calcNumGeneMultiReads(const GroupInfo&); - int calcNumIsoformMultiReads(); + READ_INT_TYPE calcNumGeneMultiReads(const GroupInfo&); + READ_INT_TYPE calcNumIsoformMultiReads(); - int getSAt(int pos) { assert(pos >= 0 && pos <= n); return s[pos]; } + HIT_INT_TYPE getSAt(READ_INT_TYPE pos) { assert(pos >= 0 && pos <= n); return s[pos]; } - HitType& getHitAt(int pos) { assert(pos >= 0 && pos < nhits); return hits[pos]; } + HitType& getHitAt(HIT_INT_TYPE pos) { assert(pos >= 0 && pos < nhits); return hits[pos]; } private: - int n; // n reads in total - int nhits; // # of hits - std::vector s; + READ_INT_TYPE n; // n reads in total + HIT_INT_TYPE nhits; // # of hits + std::vector s; std::vector hits; }; //Each time only read one read's hits. If you want to start over, must call clear() first! template bool HitContainer::read(std::istream& in) { - int tot; + HIT_INT_TYPE tot; if (!(in>>tot)) return false; assert(tot > 0); - for (int i = 0; i < tot; i++) { + for (HIT_INT_TYPE i = 0; i < tot; i++) { HitType hit; if (!hit.read(in)) return false; hits.push_back(hit); @@ -80,9 +81,9 @@ bool HitContainer::read(std::istream& in) { template void HitContainer::write(std::ostream& out) { if (n <= 0) return; - for (int i = 0; i < n; i++) { + for (READ_INT_TYPE i = 0; i < n; i++) { out<::write(std::ostream& out) { } template -int HitContainer::calcNumGeneMultiReads(const GroupInfo& gi) { - int res = 0; +READ_INT_TYPE HitContainer::calcNumGeneMultiReads(const GroupInfo& gi) { + READ_INT_TYPE res = 0; int *sortgids = NULL; - for (int i = 0; i < n; i++) { - int num = s[i + 1] - s[i]; + for (READ_INT_TYPE i = 0; i < n; i++) { + HIT_INT_TYPE num = s[i + 1] - s[i]; sortgids = new int[num]; - for (int j = s[i]; j < s[i + 1]; j++) sortgids[j] = gi.gidAt(hits[j].getSid()); + for (HIT_INT_TYPE j = s[i]; j < s[i + 1]; j++) sortgids[j] = gi.gidAt(hits[j].getSid()); std::sort(sortgids, sortgids + num); if (std::unique(sortgids, sortgids + num) - sortgids > 1) ++res; delete[] sortgids; @@ -107,9 +108,9 @@ int HitContainer::calcNumGeneMultiReads(const GroupInfo& gi) { } template -int HitContainer::calcNumIsoformMultiReads() { - int res = 0; - for (int i = 0; i < n; i++) +READ_INT_TYPE HitContainer::calcNumIsoformMultiReads() { + READ_INT_TYPE res = 0; + for (READ_INT_TYPE i = 0; i < n; i++) if (s[i + 1] - s[i] > 1) ++res; return res; } diff --git a/HitWrapper.h b/HitWrapper.h index 7fb940c..b2fbbd8 100644 --- a/HitWrapper.h +++ b/HitWrapper.h @@ -1,6 +1,7 @@ #ifndef HITWRAPPER_H_ #define HITWRAPPER_H_ +#include "utils.h" #include "HitContainer.h" // assume each hit vector contains at least one hit @@ -26,8 +27,8 @@ public: } private: - int i, j; - int nThreads; + int i, nThreads; + HIT_INT_TYPE j; HitContainer **hitvs; }; diff --git a/ModelParams.h b/ModelParams.h index 563c462..069db2a 100644 --- a/ModelParams.h +++ b/ModelParams.h @@ -9,7 +9,7 @@ struct ModelParams { int M; - int N[3]; + READ_INT_TYPE N[3]; int minL, maxL; bool estRSPD; // true if user wants to estimate RSPD; false if use uniform distribution int B; // number of bins in RSPD diff --git a/PairedEndModel.h b/PairedEndModel.h index 575344c..f3f95fe 100644 --- a/PairedEndModel.h +++ b/PairedEndModel.h @@ -8,6 +8,7 @@ #include #include #include +#include #include "utils.h" #include "my_assert.h" @@ -193,7 +194,7 @@ public: const LenDist& getGLD() { return *gld; } void startSimulation(simul*, double*); - bool simulate(int, PairedEndRead&, int&); + bool simulate(READ_INT_TYPE, PairedEndRead&, int&); void finishSimulation(); //Use it after function 'read' or 'estimateFromReads' @@ -209,7 +210,7 @@ private: static const int read_type = 2; int M; - int N[3]; + READ_INT_TYPE N[3]; Refs *refs; int seedLen; @@ -241,7 +242,7 @@ void PairedEndModel::estimateFromReads(const char* readFN) { genReadFileNames(readFN, i, read_type, s, readFs); ReadReader reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function - int cnt = 0; + READ_INT_TYPE cnt = 0; while (reader.next(read)) { SingleRead mate1 = read.getMate1(); SingleRead mate2 = read.getMate2(); @@ -256,14 +257,14 @@ void PairedEndModel::estimateFromReads(const char* readFN) { } } else if (verbose && (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen)) { - printf("Warning: Read %s is ignored due to at least one of the mates' length < seed length %d!\n", read.getName().c_str(), seedLen); + std::cout<< "Warning: Read "<< read.getName()<< " is ignored due to at least one of the mates' length < seed length "<< seedLen<< "!"<< std::endl; } ++cnt; - if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); } + if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; } } - if (verbose) { printf("estimateFromReads, N%d finished.\n", i); } + if (verbose) { std::cout<< "estimateFromReads, N"<< i<< " finished."<< std::endl; } } mld->finish(); @@ -362,7 +363,7 @@ void PairedEndModel::startSimulation(simul* sampler, double* theta) { npro->startSimulation(); } -bool PairedEndModel::simulate(int rid, PairedEndRead& read, int& sid) { +bool PairedEndModel::simulate(READ_INT_TYPE rid, PairedEndRead& read, int& sid) { int dir, pos; int insertL, mateL1, mateL2; std::string name; diff --git a/PairedEndQModel.h b/PairedEndQModel.h index c61f6c8..a2f7981 100644 --- a/PairedEndQModel.h +++ b/PairedEndQModel.h @@ -8,6 +8,7 @@ #include #include #include +#include #include "utils.h" #include "my_assert.h" @@ -197,7 +198,7 @@ public: const LenDist& getGLD() { return *gld; } void startSimulation(simul*, double*); - bool simulate(int, PairedEndReadQ&, int&); + bool simulate(READ_INT_TYPE, PairedEndReadQ&, int&); void finishSimulation(); //Use it after function 'read' or 'estimateFromReads' @@ -213,7 +214,7 @@ private: static const int read_type = 3; int M; - int N[3]; + READ_INT_TYPE N[3]; Refs *refs; int seedLen; @@ -246,7 +247,7 @@ void PairedEndQModel::estimateFromReads(const char* readFN) { genReadFileNames(readFN, i, read_type, s, readFs); ReadReader reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function - int cnt = 0; + READ_INT_TYPE cnt = 0; while (reader.next(read)) { SingleReadQ mate1 = read.getMate1(); SingleReadQ mate2 = read.getMate2(); @@ -264,14 +265,14 @@ void PairedEndQModel::estimateFromReads(const char* readFN) { } } else if (verbose && (mate1.getReadLength() < seedLen || mate2.getReadLength() < seedLen)) { - printf("Warning: Read %s is ignored due to at least one of the mates' length < seed length %d!\n", read.getName().c_str(), seedLen); + std::cout<< "Warning: Read "<< read.getName()<< " is ignored due to at least one of the mates' length < seed length "<< seedLen<< "!"<< std::endl; } ++cnt; - if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); } + if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; } } - if (verbose) { printf("estimateFromReads, N%d finished.\n", i); } + if (verbose) { std::cout<<"estimateFromReads, N"<< i<<" finished."<< std::endl; } } mld->finish(); @@ -375,7 +376,7 @@ void PairedEndQModel::startSimulation(simul* sampler, double* theta) { nqpro->startSimulation(); } -bool PairedEndQModel::simulate(int rid, PairedEndReadQ& read, int& sid) { +bool PairedEndQModel::simulate(READ_INT_TYPE rid, PairedEndReadQ& read, int& sid) { int dir, pos; int insertL, mateL1, mateL2; std::string name; diff --git a/ReadIndex.h b/ReadIndex.h index 61ecd14..293e4a3 100644 --- a/ReadIndex.h +++ b/ReadIndex.h @@ -9,7 +9,7 @@ #include "utils.h" struct ReadIndex { - long nReads; + READ_INT_TYPE nReads; int gap, nPos; std::streampos *index; @@ -45,7 +45,7 @@ struct ReadIndex { } //rid 0-based , return crid : current seeked rid - long locate(long rid, std::ifstream& out) { + READ_INT_TYPE locate(READ_INT_TYPE rid, std::ifstream& out) { if (index == NULL) { out.seekg(0, std::ios::beg); return 0; diff --git a/ReadReader.h b/ReadReader.h index d244efa..9cd88e9 100644 --- a/ReadReader.h +++ b/ReadReader.h @@ -27,7 +27,7 @@ public: this->indices = indices; } - bool locate(long); // You should guarantee that indices exist and rid is valid, otherwise return false; If it fails, you should reset it manually! + bool locate(READ_INT_TYPE); // You should guarantee that indices exist and rid is valid, otherwise return false; If it fails, you should reset it manually! void reset(); bool next(ReadType& read, int flags = 7) { @@ -78,15 +78,15 @@ ReadReader::~ReadReader() { } template -bool ReadReader::locate(long rid) { - long crid = -1; +bool ReadReader::locate(READ_INT_TYPE rid) { + READ_INT_TYPE crid = -1; ReadType read; if (indices == NULL) return false; //We should make sure that crid returned by each indices is the same for (int i = 0; i < s; i++) { - long val = indices[i]->locate(rid, *arr[i]); + READ_INT_TYPE val = indices[i]->locate(rid, *arr[i]); if (i == 0) { crid = val; } else { assert(crid == val); } } assert(crid <= rid); diff --git a/SingleHit.h b/SingleHit.h index f2e6aa6..b157a15 100644 --- a/SingleHit.h +++ b/SingleHit.h @@ -23,7 +23,7 @@ public: //makes no sense for noise gene int getDir() const { return sid < 0; } - int getSid() const { return abs(sid); } + int getSid() const { return abs(sid); } int getPos() const { return pos; } diff --git a/SingleModel.h b/SingleModel.h index 8b26e0e..7ad3464 100644 --- a/SingleModel.h +++ b/SingleModel.h @@ -8,6 +8,7 @@ #include #include #include +#include #include "utils.h" #include "my_assert.h" @@ -230,7 +231,7 @@ public: const LenDist& getGLD() { return *gld; } void startSimulation(simul*, double*); - bool simulate(int, SingleRead&, int&); + bool simulate(READ_INT_TYPE, SingleRead&, int&); void finishSimulation(); double* getMW() { @@ -245,7 +246,7 @@ private: static const int read_type = 0; int M; - int N[3]; + READ_INT_TYPE N[3]; Refs *refs; double mean, sd; int seedLen; @@ -279,21 +280,21 @@ void SingleModel::estimateFromReads(const char* readFN) { genReadFileNames(readFN, i, read_type, s, readFs); ReadReader reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function - int cnt = 0; + READ_INT_TYPE cnt = 0; while (reader.next(read)) { if (!read.isLowQuality()) { mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0); if (i == 0) { npro->updateC(read.getReadSeq()); } } else if (verbose && read.getReadLength() < seedLen) { - printf("Warning: Read %s is ignored due to read length %d < seed length %d!\n", read.getName().c_str(), read.getReadLength(), seedLen); + std::cout<< "Warning: Read "<< read.getName()<< " is ignored due to read length "<< read.getReadLength()<< " < seed length "<< seedLen<< "!"<< std::endl; } - + ++cnt; - if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); } + if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; } } - if (verbose) { printf("estimateFromReads, N%d finished.\n", i); } + if (verbose) { std::cout<< "estimateFromReads, N"<< i<< " finished."<< std::endl; } } mld != NULL ? mld->finish() : gld->finish(); @@ -403,7 +404,7 @@ void SingleModel::startSimulation(simul* sampler, double* theta) { npro->startSimulation(); } -bool SingleModel::simulate(int rid, SingleRead& read, int& sid) { +bool SingleModel::simulate(READ_INT_TYPE rid, SingleRead& read, int& sid) { int dir, pos, readLen, fragLen; std::string name; std::string readseq; diff --git a/SingleQModel.h b/SingleQModel.h index ffc0639..5da4754 100644 --- a/SingleQModel.h +++ b/SingleQModel.h @@ -8,6 +8,7 @@ #include #include #include +#include #include "utils.h" #include "my_assert.h" @@ -238,7 +239,7 @@ public: const LenDist& getGLD() { return *gld; } void startSimulation(simul*, double*); - bool simulate(int, SingleReadQ&, int&); + bool simulate(READ_INT_TYPE, SingleReadQ&, int&); void finishSimulation(); //Use it after function 'read' or 'estimateFromReads' @@ -254,7 +255,7 @@ private: static const int read_type = 1; int M; - int N[3]; + READ_INT_TYPE N[3]; Refs *refs; double mean, sd; int seedLen; @@ -289,7 +290,7 @@ void SingleQModel::estimateFromReads(const char* readFN) { genReadFileNames(readFN, i, read_type, s, readFs); ReadReader reader(s, readFs, refs->hasPolyA(), seedLen); // allow calculation of calc_lq() function - int cnt = 0; + READ_INT_TYPE cnt = 0; while (reader.next(read)) { if (!read.isLowQuality()) { mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0); @@ -297,14 +298,14 @@ void SingleQModel::estimateFromReads(const char* readFN) { if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); } } else if (verbose && read.getReadLength() < seedLen) { - printf("Warning: Read %s is ignored due to read length %d < seed length %d!\n", read.getName().c_str(), read.getReadLength(), seedLen); + std::cout<< "Warning: Read "<< read.getName()<< " is ignored due to read length "<< read.getReadLength()<< " < seed length "<< seedLen<< "!"<< std::endl; } ++cnt; - if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); } + if (verbose && cnt % 1000000 == 0) { std::cout<< cnt<< " READS PROCESSED"<< std::endl; } } - if (verbose) { printf("estimateFromReads, N%d finished.\n", i); } + if (verbose) { std::cout<< "estimateFromReads, N"<< i<< " finished."<< std::endl; } } mld != NULL ? mld->finish() : gld->finish(); @@ -419,7 +420,7 @@ void SingleQModel::startSimulation(simul* sampler, double* theta) { nqpro->startSimulation(); } -bool SingleQModel::simulate(int rid, SingleReadQ& read, int& sid) { +bool SingleQModel::simulate(READ_INT_TYPE rid, SingleReadQ& read, int& sid) { int dir, pos, readLen, fragLen; std::string name; std::string qual, readseq; diff --git a/buildReadIndex.cpp b/buildReadIndex.cpp index c9e1deb..7fde025 100644 --- a/buildReadIndex.cpp +++ b/buildReadIndex.cpp @@ -13,7 +13,7 @@ bool hasQ; void buildIndex(char* readF, int gap, bool hasQ) { int nPos; - long nReads; + READ_INT_TYPE nReads; bool success; string line; char idxF[STRLEN]; @@ -53,7 +53,7 @@ void buildIndex(char* readF, int gap, bool hasQ) { } ++nReads; - if (verbose && nReads % 1000000 == 0) { printf("FIN %lld\n", (long long)nReads); } + if (verbose && nReads % 1000000 == 0) { cout<< "FIN "<< nReads<< endl; } } while (success); fout.seekp(startPos); @@ -64,7 +64,7 @@ void buildIndex(char* readF, int gap, bool hasQ) { fin.close(); fout.close(); - if (verbose) { printf("Build Index %s is Done!\n", readF); } + if (verbose) { cout<< "Build Index "<< readF<< " is Done!"<< endl; } } int main(int argc, char* argv[]) { diff --git a/extractRef.cpp b/extractRef.cpp index da0effc..0a46de0 100644 --- a/extractRef.cpp +++ b/extractRef.cpp @@ -134,14 +134,14 @@ void parse_gtf_file(char* gtfF) { else { if (hasMappingFile) { tid = item.getTranscriptID(); - mi_iter = mi_table.find(tid); - if (mi_iter == mi_table.end()) { - fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", tid.c_str()); - exit(-1); - } - //assert(iter != table.end()); - gid = mi_iter->second; - item.setGeneID(gid); + mi_iter = mi_table.find(tid); + if (mi_iter == mi_table.end()) { + fprintf(stderr, "Mapping Info is not correct, cannot find %s's gene_id!\n", tid.c_str()); + exit(-1); + } + //assert(iter != table.end()); + gid = mi_iter->second; + item.setGeneID(gid); } items.push_back(item); } diff --git a/getUnique.cpp b/getUnique.cpp index 7272a81..c7925f2 100644 --- a/getUnique.cpp +++ b/getUnique.cpp @@ -9,6 +9,8 @@ #include "sam/bam.h" #include "sam/sam.h" +#include "utils.h" + using namespace std; string cqname; @@ -21,7 +23,7 @@ void output() { if (unaligned || arr.size() == 0) return; bool isPaired = (arr[0]->core.flag & 0x0001); if ((isPaired && arr.size() != 2) || (!isPaired && arr.size() != 1)) return; - for (int i = 0; i < (int)arr.size(); i++) samwrite(out, arr[i]); + for (size_t i = 0; i < arr.size(); i++) samwrite(out, arr[i]); } int main(int argc, char* argv[]) { @@ -35,7 +37,7 @@ int main(int argc, char* argv[]) { out = samopen(argv[2], "wb", in->header); assert(out != 0); - int cnt = 0; + HIT_INT_TYPE cnt = 0; cqname = ""; arr.clear(); @@ -46,7 +48,7 @@ int main(int argc, char* argv[]) { if (cqname != bam1_qname(b)) { output(); cqname = bam1_qname(b); - for (int i = 0; i < (int)arr.size(); i++) bam_destroy1(arr[i]); + for (size_t i = 0; i < arr.size(); i++) bam_destroy1(arr[i]); arr.clear(); unaligned = false; } diff --git a/parseIt.cpp b/parseIt.cpp index 8620432..c760c79 100644 --- a/parseIt.cpp +++ b/parseIt.cpp @@ -28,9 +28,9 @@ using namespace std; int read_type; // 0 SingleRead, 1 SingleReadQ, 2 PairedEndRead, 3 PairedEndReadQ -int N[3]; // note, N = N0 + N1 + N2 , but may not be equal to the total number of reads in data -int nHits; // # of hits -int nUnique, nMulti, nIsoMulti; +READ_INT_TYPE N[3]; // note, N = N0 + N1 + N2 , but may not be equal to the total number of reads in data +HIT_INT_TYPE nHits; // # of hits +READ_INT_TYPE nUnique, nMulti, nIsoMulti; char fn_list[STRLEN]; char groupF[STRLEN], tiF[STRLEN]; char imdName[STRLEN]; @@ -46,8 +46,8 @@ int n_os; // number of ostreams ostream *cat[3][2]; // cat : category 1-dim 0 N0 1 N1 2 N2; 2-dim 0 mate1 1 mate2 char readOutFs[3][2][STRLEN]; -map counter; -map::iterator iter; +map counter; +map::iterator iter; void init(const char* imdName, char alignFType, const char* alignF) { @@ -85,7 +85,7 @@ void parseIt(SamParser *parser) { nUnique = nMulti = nIsoMulti = 0; memset(N, 0, sizeof(N)); - long long cnt = 0; + READ_INT_TYPE cnt = 0; record_val = -2; //indicate no recorded read now while ((val = parser->parseNext(read, hit)) >= 0) { @@ -122,7 +122,7 @@ void parseIt(SamParser *parser) { } ++cnt; - if (verbose && (cnt % 1000000 == 0)) { printf("Parsed %lld entries\n", cnt); } + if (verbose && (cnt % 1000000 == 0)) { cout<< "Parsed "<< cnt<< " entries"<< endl; } } if (record_val >= 0) { @@ -203,7 +203,7 @@ int main(int argc, char* argv[]) { hit_out.open(datF); - string firstLine(59, ' '); + string firstLine(99, ' '); firstLine.append(1, '\n'); //May be dangerous! hit_out<= 0); bool isPaired; - long cnt = 0; + HIT_INT_TYPE cnt = 0; printf("."); fflush(stdout); diff --git a/simulation.cpp b/simulation.cpp index e0c9cec..97cee6e 100644 --- a/simulation.cpp +++ b/simulation.cpp @@ -31,7 +31,7 @@ using namespace std; -int N; +READ_INT_TYPE N; int model_type, M, m; Refs refs; @@ -138,19 +138,19 @@ void simulate(char* modelF, char* resultsF) { fin.close(); for (int i = 1; i <= M; i++) theta[i] = theta[i] / denom * (1.0 - theta[0]); - int resimulation_count = 0; + READ_INT_TYPE resimulation_count = 0; //simulating... model.startSimulation(&sampler, theta); - for (int i = 0; i < N; i++) { + for (READ_INT_TYPE i = 0; i < N; i++) { while (!model.simulate(i, read, sid)) { ++resimulation_count; } read.write(n_os, os); ++counts[sid]; - if ((i + 1) % 1000000 == 0 && verbose) printf("GEN %d\n", i + 1); + if ((i + 1) % 1000000 == 0 && verbose) cout<<"GEN "<< i + 1<< endl; } model.finishSimulation(); - printf("Total number of resimulation is %d\n", resimulation_count); + cout<< "Total number of resimulation is "<< resimulation_count<< endl; } void writeResFiles(char* outFN) { diff --git a/utils.h b/utils.h index e3f8bcb..b15f154 100644 --- a/utils.h +++ b/utils.h @@ -10,6 +10,10 @@ #include #include #include +#include + +typedef uint64_t HIT_INT_TYPE; +typedef uint64_t READ_INT_TYPE; const int STRLEN = 10005 ; const double EPSILON = 1e-300; diff --git a/wiggle.cpp b/wiggle.cpp index 65a3d4a..c0cbdcc 100644 --- a/wiggle.cpp +++ b/wiggle.cpp @@ -1,11 +1,13 @@ #include #include #include +#include #include #include "sam/bam.h" #include "sam/sam.h" +#include "utils.h" #include "wiggle.h" void add_bam_record_to_wiggle(const bam1_t *b, Wiggle& wiggle) { @@ -42,7 +44,7 @@ void build_wiggles(const std::string& bam_filename, memset(used, 0, sizeof(bool) * header->n_targets); int cur_tid = -1; //current tid; - int cnt = 0; + HIT_INT_TYPE cnt; bam1_t *b = bam_init1(); Wiggle wiggle; while (samread(bam_in, b) >= 0) { @@ -57,7 +59,7 @@ void build_wiggles(const std::string& bam_filename, } add_bam_record_to_wiggle(b, wiggle); ++cnt; - if (cnt % 1000000 == 0) fprintf(stderr, "%d FIN\n", cnt); + if (cnt % 1000000 == 0) std::cout<< cnt<< std::endl; } if (cur_tid >= 0) { used[cur_tid] = true; processor.process(wiggle); } -- 2.39.2