X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=pintail.cpp;h=84c3219822920e21bc20d89f2fdf6b4f58ead1f3;hb=fdc1f6eaf544f695fc1511f24bddd7e6069c33ba;hp=46da9137a18c072bfc1033025fd2551ea92c728a;hpb=81276c241b984898f8d30ad123c00592ee6db7b8;p=mothur.git diff --git a/pintail.cpp b/pintail.cpp index 46da913..84c3219 100644 --- a/pintail.cpp +++ b/pintail.cpp @@ -18,10 +18,30 @@ inline bool compareQuanMembers(quanMember left, quanMember right){ } //*************************************************************************************************************** -Pintail::Pintail(string filename, string o) { - fastafile = filename; outputDir = o; - distcalculator = new eachGapDist(); - decalc = new DeCalculator(); +Pintail::Pintail(string filename, string temp, bool f, int p, string mask, string cons, string q, int win, int inc, string o) : Chimera() { + try { + + fastafile = filename; + templateFileName = temp; templateSeqs = readSeqs(temp); + filter = f; + processors = p; + setMask(mask); + consfile = cons; + quanfile = q; + window = win; + increment = inc; + outputDir = o; + + distcalculator = new eachGapDist(); + decalc = new DeCalculator(); + + doPrep(); + } + catch(exception& e) { + m->errorOut(e, "Pintail", "Pintail"); + exit(1); + } + } //*************************************************************************************************************** @@ -32,12 +52,12 @@ Pintail::~Pintail() { delete decalc; } catch(exception& e) { - errorOut(e, "Pintail", "~Pintail"); + m->errorOut(e, "Pintail", "~Pintail"); exit(1); } } //*************************************************************************************************************** -void Pintail::doPrep() { +int Pintail::doPrep() { try { mergedFilterString = ""; @@ -51,6 +71,9 @@ void Pintail::doPrep() { decalc->setMask(seqMask); + #ifdef USE_MPI + //do nothing + #else #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) //find breakup of templatefile for quantiles if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); } @@ -64,51 +87,51 @@ void Pintail::doPrep() { #else templateLines.push_back(new linePair(0, templateSeqs.size())); #endif - + #endif - mothurOut("Getting conservation... "); cout.flush(); + m->mothurOut("Getting conservation... "); cout.flush(); if (consfile == "") { - mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush(); + m->mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush(); probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName)); - mothurOut("Done."); mothurOutEndLine(); - }else { probabilityProfile = readFreq(); mothurOut("Done."); } - mothurOutEndLine(); + if (m->control_pressed) { return 0; } + m->mothurOut("Done."); m->mothurOutEndLine(); + }else { probabilityProfile = readFreq(); m->mothurOut("Done."); } + m->mothurOutEndLine(); //make P into Q - for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl; + for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } // bool reRead = false; //create filter if needed for later if (filter) { - reRead = true; - - if (seqMask != "") { - //mask templates - for (int i = 0; i < templateSeqs.size(); i++) { - decalc->runMask(templateSeqs[i]); - } - } - + //read in all query seqs - ifstream in; - openInputFile(fastafile, in); - - vector tempQuerySeqs; - while(!in.eof()){ - Sequence* s = new Sequence(in); - gobble(in); + vector tempQuerySeqs = readSeqs(fastafile); - if (s->getName() != "") { tempQuerySeqs.push_back(s); } - } - in.close(); - vector temp; //merge query seqs and template seqs temp = templateSeqs; for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); } - + + if (seqMask != "") { + reRead = true; + //mask templates + for (int i = 0; i < temp.size(); i++) { + if (m->control_pressed) { + for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } + return 0; + } + decalc->runMask(temp[i]); + } + } + mergedFilterString = createFilter(temp, 0.5); + if (m->control_pressed) { + for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } + return 0; + } + //reread template seqs for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } } @@ -124,33 +147,43 @@ void Pintail::doPrep() { reRead = true; //mask templates for (int i = 0; i < templateSeqs.size(); i++) { + if (m->control_pressed) { return 0; } decalc->runMask(templateSeqs[i]); } } - mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); + if (filter) { + reRead = true; + for (int i = 0; i < templateSeqs.size(); i++) { + if (m->control_pressed) { return 0; } + runFilter(templateSeqs[i]); + } + } + + m->mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); if (processors == 1) { quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); }else { createProcessesQuan(); } + if (m->control_pressed) { return 0; } - ofstream out4, out5; string noOutliers, outliers; if ((!filter) && (seqMask == "")) { noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan"; - }else if ((filter) && (seqMask == "")) { - noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.quan"; }else if ((!filter) && (seqMask != "")) { noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan"; }else if ((filter) && (seqMask != "")) { - noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.masked.quan"; + noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "masked.quan"; + }else if ((filter) && (seqMask == "")) { + noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered." + getSimpleName(getRootName(fastafile)) + "quan"; } - decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size()); - openOutputFile(noOutliers, out5); + if (m->control_pressed) { return 0; } + + string outputString = ""; //adjust quantiles for (int i = 0; i < quantilesMembers.size(); i++) { @@ -181,15 +214,17 @@ void Pintail::doPrep() { } //output quan value - out5 << i+1 << '\t'; - for (int u = 0; u < temp.size(); u++) { out5 << temp[u] << '\t'; } - out5 << endl; + outputString += toString(i+1) + "\t"; + for (int u = 0; u < temp.size(); u++) { outputString += toString(temp[u]) + "\t"; } + outputString += "\n"; quantiles[i] = temp; } - - mothurOut("Done."); mothurOutEndLine(); + + printQuanFile(noOutliers, outputString); + + m->mothurOut("Done."); m->mothurOutEndLine(); } if (reRead) { @@ -202,14 +237,16 @@ void Pintail::doPrep() { //free memory for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } + return 0; + } catch(exception& e) { - errorOut(e, "Pintail", "doPrep"); + m->errorOut(e, "Pintail", "doPrep"); exit(1); } } //*************************************************************************************************************** -void Pintail::print(ostream& out) { +int Pintail::print(ostream& out, ostream& outAcc) { try { int index = ceil(deviation); @@ -226,7 +263,8 @@ void Pintail::print(ostream& out) { out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl; if (chimera == "Yes") { - mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); mothurOutEndLine(); + m->mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); m->mothurOutEndLine(); + outAcc << querySeq->getName() << endl; } out << "Observed\t"; @@ -238,14 +276,72 @@ void Pintail::print(ostream& out) { for (int m = 0; m < expectedDistance.size(); m++) { out << expectedDistance[m] << '\t'; } out << endl; + return 0; } catch(exception& e) { - errorOut(e, "Pintail", "print"); + m->errorOut(e, "Pintail", "print"); exit(1); } } +#ifdef USE_MPI +//*************************************************************************************************************** +int Pintail::print(MPI_File& out, MPI_File& outAcc) { + try { + bool results = false; + string outputString = ""; + int index = ceil(deviation); + + //is your DE value higher than the 95% + string chimera; + if (index != 0) { //if index is 0 then its an exact match to a template seq + if (quantiles[index][4] == 0.0) { + chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index); + }else { + if (DE > quantiles[index][4]) { chimera = "Yes"; } + else { chimera = "No"; } + } + }else{ chimera = "No"; } + outputString += querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera + "\n"; + if (chimera == "Yes") { + cout << querySeq->getName() << "\tdiv: " << toString(deviation) << "\tstDev: " << toString(DE) << "\tchimera flag: " << chimera << endl; + string outAccString = querySeq->getName() + "\n"; + + MPI_Status statusAcc; + int length = outAccString.length(); + char buf[length]; + strcpy(buf, outAccString.c_str()); + + MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc); + + results = true; + } + outputString += "Observed\t"; + + for (int j = 0; j < obsDistance.size(); j++) { outputString += toString(obsDistance[j]) + "\t"; } + outputString += "\n"; + + outputString += "Expected\t"; + + for (int m = 0; m < expectedDistance.size(); m++) { outputString += toString(expectedDistance[m]) + "\t"; } + outputString += "\n"; + + MPI_Status status; + int length = outputString.length(); + char buf2[length]; + strcpy(buf2, outputString.c_str()); + + MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status); + + return results; + } + catch(exception& e) { + m->errorOut(e, "Pintail", "print"); + exit(1); + } +} +#endif //*************************************************************************************************************** int Pintail::getChimeras(Sequence* query) { try { @@ -256,6 +352,8 @@ int Pintail::getChimeras(Sequence* query) { //find pairs has to be done before a mask bestfit = findPairs(query); + if (m->control_pressed) { return 0; } + //if they mask if (seqMask != "") { decalc->runMask(query); @@ -277,8 +375,12 @@ int Pintail::getChimeras(Sequence* query) { //find observed distance obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes); + + if (m->control_pressed) { return 0; } Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile); + + if (m->control_pressed) { return 0; } //find alpha seqCoef = decalc->getCoef(obsDistance, Qav); @@ -286,9 +388,13 @@ int Pintail::getChimeras(Sequence* query) { //calculating expected distance expectedDistance = decalc->calcExpected(Qav, seqCoef); + if (m->control_pressed) { return 0; } + //finding de DE = decalc->calcDE(obsDistance, expectedDistance); + if (m->control_pressed) { return 0; } + //find distance between query and closest match it = trimmed.begin(); deviation = decalc->calcDist(query, bestfit, it->first, it->second); @@ -298,7 +404,7 @@ int Pintail::getChimeras(Sequence* query) { return 0; } catch(exception& e) { - errorOut(e, "Pintail", "getChimeras"); + m->errorOut(e, "Pintail", "getChimeras"); exit(1); } } @@ -307,16 +413,56 @@ int Pintail::getChimeras(Sequence* query) { vector Pintail::readFreq() { try { - - ifstream in; - openInputFile(consfile, in); - + //read in probabilities and store in vector + int pos; float num; + vector prob; set h = decalc->getPos(); //positions of bases in masking sequence - //read in probabilities and store in vector - int pos; float num; + #ifdef USE_MPI + + MPI_File inMPI; + MPI_Offset size; + MPI_Status status; + + char inFileName[consfile.length()]; + strcpy(inFileName, consfile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); + MPI_File_get_size(inMPI, &size); + + char buffer[size]; + MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); + + string tempBuf = buffer; + + if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } + istringstream iss (tempBuf,istringstream::in); + while(!iss.eof()) { + iss >> pos >> num; + + if (h.count(pos) > 0) { + float Pi; + Pi = (num - 0.25) / 0.75; + + //cannot have probability less than 0. + if (Pi < 0) { Pi = 0.0; } + + //do you want this spot + prob.push_back(Pi); + } + + gobble(iss); + } + + MPI_File_close(&inMPI); + + #else + + ifstream in; + openInputFile(consfile, in); + while(!in.eof()){ in >> pos >> num; @@ -334,13 +480,15 @@ vector Pintail::readFreq() { gobble(in); } - in.close(); + + #endif + return prob; } catch(exception& e) { - errorOut(e, "Pintail", "readFreq"); + m->errorOut(e, "Pintail", "readFreq"); exit(1); } } @@ -357,11 +505,11 @@ Sequence* Pintail::findPairs(Sequence* q) { } catch(exception& e) { - errorOut(e, "Pintail", "findPairs"); + m->errorOut(e, "Pintail", "findPairs"); exit(1); } } -/**************************************************************************************************/ +//************************************************************************************************** void Pintail::createProcessesQuan() { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) @@ -397,7 +545,7 @@ void Pintail::createProcessesQuan() { out.close(); exit(0); - }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } + }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } } //force parent to wait until all the processes are done @@ -425,13 +573,12 @@ void Pintail::createProcessesQuan() { vector q; float w; int b, n; for (int j = 0; j < num; j++) { in >> w >> b >> n; - //cout << w << '\t' << b << '\t' n << endl; + quanMember newMember(w, b, n); q.push_back(newMember); } -//cout << "here" << endl; + quan[m] = q; -//cout << "now here" << endl; gobble(in); } @@ -452,12 +599,138 @@ void Pintail::createProcessesQuan() { #endif } catch(exception& e) { - errorOut(e, "Pintail", "createProcessesQuan"); + m->errorOut(e, "Pintail", "createProcessesQuan"); exit(1); } } +//*************************************************************************************************************** +vector< vector > Pintail::readQuantiles() { + try { + int num; + float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; + + vector< vector > quan; + vector temp; temp.resize(6, 0); + + //to fill 0 + quan.push_back(temp); + + #ifdef USE_MPI + + MPI_File inMPI; + MPI_Offset size; + MPI_Status status; + + char inFileName[quanfile.length()]; + strcpy(inFileName, quanfile.c_str()); + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); + MPI_File_get_size(inMPI, &size); + + char buffer[size]; + MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); + + string tempBuf = buffer; + if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } + istringstream iss (tempBuf,istringstream::in); + + while(!iss.eof()) { + iss >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; + + temp.clear(); + + temp.push_back(ten); + temp.push_back(twentyfive); + temp.push_back(fifty); + temp.push_back(seventyfive); + temp.push_back(ninetyfive); + temp.push_back(ninetynine); + + quan.push_back(temp); + + gobble(iss); + } + + MPI_File_close(&inMPI); + + #else + + ifstream in; + openInputFile(quanfile, in); + + while(!in.eof()){ + + in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; + + temp.clear(); + + temp.push_back(ten); + temp.push_back(twentyfive); + temp.push_back(fifty); + temp.push_back(seventyfive); + temp.push_back(ninetyfive); + temp.push_back(ninetynine); + + quan.push_back(temp); + + gobble(in); + } + in.close(); + #endif + + return quan; + + } + catch(exception& e) { + m->errorOut(e, "Pintail", "readQuantiles"); + exit(1); + } +} +//***************************************************************************************************************/ + +void Pintail::printQuanFile(string file, string outputString) { + try { + + #ifdef USE_MPI + + MPI_File outQuan; + MPI_Status status; + + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + + char FileName[file.length()]; + strcpy(FileName, file.c_str()); + + if (pid == 0) { + MPI_File_open(MPI_COMM_SELF, FileName, outMode, MPI_INFO_NULL, &outQuan); //comm, filename, mode, info, filepointer + + int length = outputString.length(); + char buf[length]; + strcpy(buf, outputString.c_str()); + + MPI_File_write(outQuan, buf, length, MPI_CHAR, &status); + + MPI_File_close(&outQuan); + } + #else + ofstream outQuan; + openOutputFile(file, outQuan); + + outQuan << outputString; + + outQuan.close(); + #endif + } + catch(exception& e) { + m->errorOut(e, "Pintail", "printQuanFile"); + exit(1); + } +} + +//***************************************************************************************************************/ -//***************************************************************************************************************