X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=pintail.cpp;h=b9f2434d19d4cc842bfac1dce4f6de5a0d1b742d;hp=46da9137a18c072bfc1033025fd2551ea92c728a;hb=a8e2df1b96a57f5f29576b08361b86a96a8eff4f;hpb=81276c241b984898f8d30ad123c00592ee6db7b8 diff --git a/pintail.cpp b/pintail.cpp index 46da913..b9f2434 100644 --- a/pintail.cpp +++ b/pintail.cpp @@ -18,10 +18,30 @@ inline bool compareQuanMembers(quanMember left, quanMember right){ } //*************************************************************************************************************** -Pintail::Pintail(string filename, string o) { - fastafile = filename; outputDir = o; - distcalculator = new eachGapDist(); - decalc = new DeCalculator(); +Pintail::Pintail(string filename, string temp, bool f, int p, string mask, string cons, string q, int win, int inc, string o) : Chimera() { + try { + + fastafile = filename; + templateFileName = temp; templateSeqs = readSeqs(temp); + filter = f; + processors = p; + setMask(mask); + consfile = cons; + quanfile = q; + window = win; + increment = inc; + outputDir = o; + + distcalculator = new eachGapDist(); + decalc = new DeCalculator(); + + doPrep(); + } + catch(exception& e) { + m->errorOut(e, "Pintail", "Pintail"); + exit(1); + } + } //*************************************************************************************************************** @@ -32,12 +52,12 @@ Pintail::~Pintail() { delete decalc; } catch(exception& e) { - errorOut(e, "Pintail", "~Pintail"); + m->errorOut(e, "Pintail", "~Pintail"); exit(1); } } //*************************************************************************************************************** -void Pintail::doPrep() { +int Pintail::doPrep() { try { mergedFilterString = ""; @@ -51,7 +71,10 @@ void Pintail::doPrep() { decalc->setMask(seqMask); - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #ifdef USE_MPI + //do nothing + #else + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) //find breakup of templatefile for quantiles if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); } else { @@ -64,51 +87,51 @@ void Pintail::doPrep() { #else templateLines.push_back(new linePair(0, templateSeqs.size())); #endif - + #endif - mothurOut("Getting conservation... "); cout.flush(); + m->mothurOut("Getting conservation... "); cout.flush(); if (consfile == "") { - mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush(); - probabilityProfile = decalc->calcFreq(templateSeqs, outputDir + getSimpleName(templateFileName)); - mothurOut("Done."); mothurOutEndLine(); - }else { probabilityProfile = readFreq(); mothurOut("Done."); } - mothurOutEndLine(); + m->mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush(); + probabilityProfile = decalc->calcFreq(templateSeqs, templateFileName); + if (m->control_pressed) { return 0; } + m->mothurOut("Done."); m->mothurOutEndLine(); + }else { probabilityProfile = readFreq(); m->mothurOut("Done."); } + m->mothurOutEndLine(); //make P into Q - for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl; + for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } // bool reRead = false; //create filter if needed for later if (filter) { - reRead = true; - - if (seqMask != "") { - //mask templates - for (int i = 0; i < templateSeqs.size(); i++) { - decalc->runMask(templateSeqs[i]); - } - } - + //read in all query seqs - ifstream in; - openInputFile(fastafile, in); - - vector tempQuerySeqs; - while(!in.eof()){ - Sequence* s = new Sequence(in); - gobble(in); + vector tempQuerySeqs = readSeqs(fastafile); - if (s->getName() != "") { tempQuerySeqs.push_back(s); } - } - in.close(); - vector temp; //merge query seqs and template seqs temp = templateSeqs; for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); } - + + if (seqMask != "") { + reRead = true; + //mask templates + for (int i = 0; i < temp.size(); i++) { + if (m->control_pressed) { + for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } + return 0; + } + decalc->runMask(temp[i]); + } + } + mergedFilterString = createFilter(temp, 0.5); + if (m->control_pressed) { + for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } + return 0; + } + //reread template seqs for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } } @@ -124,33 +147,43 @@ void Pintail::doPrep() { reRead = true; //mask templates for (int i = 0; i < templateSeqs.size(); i++) { + if (m->control_pressed) { return 0; } decalc->runMask(templateSeqs[i]); } } - mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); + if (filter) { + reRead = true; + for (int i = 0; i < templateSeqs.size(); i++) { + if (m->control_pressed) { return 0; } + runFilter(templateSeqs[i]); + } + } + + m->mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); if (processors == 1) { quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); }else { createProcessesQuan(); } + if (m->control_pressed) { return 0; } - ofstream out4, out5; string noOutliers, outliers; if ((!filter) && (seqMask == "")) { - noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.quan"; - }else if ((filter) && (seqMask == "")) { - noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.quan"; + noOutliers = m->getRootName(m->getSimpleName(templateFileName)) + "pintail.quan"; }else if ((!filter) && (seqMask != "")) { - noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.masked.quan"; + noOutliers =m->getRootName(m->getSimpleName(templateFileName)) + "pintail.masked.quan"; }else if ((filter) && (seqMask != "")) { - noOutliers = outputDir + getRootName(getSimpleName(templateFileName)) + "pintail.filtered.masked.quan"; + noOutliers = m->getRootName(m->getSimpleName(templateFileName)) + "pintail.filtered." + m->getSimpleName(m->getRootName(fastafile)) + "masked.quan"; + }else if ((filter) && (seqMask == "")) { + noOutliers = m->getRootName(m->getSimpleName(templateFileName)) + "pintail.filtered." + m->getSimpleName(m->getRootName(fastafile)) + "quan"; } - decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size()); - openOutputFile(noOutliers, out5); + if (m->control_pressed) { return 0; } + + string outputString = "#" + m->getVersion() + "\n"; //adjust quantiles for (int i = 0; i < quantilesMembers.size(); i++) { @@ -163,33 +196,38 @@ void Pintail::doPrep() { } }else{ - sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers); + sort(quantilesMembers[i].begin(), quantilesMembers[i].end()); //save 10% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)]); //save 25% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)]); //save 50% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)]); //save 75% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)]); //save 95% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)]); //save 99% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)]); } //output quan value - out5 << i+1 << '\t'; - for (int u = 0; u < temp.size(); u++) { out5 << temp[u] << '\t'; } - out5 << endl; + outputString += toString(i+1) + "\t"; + for (int u = 0; u < temp.size(); u++) { outputString += toString(temp[u]) + "\t"; } + outputString += "\n"; quantiles[i] = temp; } - - mothurOut("Done."); mothurOutEndLine(); + + printQuanFile(noOutliers, outputString); + + //free memory + quantilesMembers.clear(); + + m->mothurOut("Done."); m->mothurOutEndLine(); } if (reRead) { @@ -202,15 +240,18 @@ void Pintail::doPrep() { //free memory for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } + return 0; + } catch(exception& e) { - errorOut(e, "Pintail", "doPrep"); + m->errorOut(e, "Pintail", "doPrep"); exit(1); } } //*************************************************************************************************************** -void Pintail::print(ostream& out) { +Sequence Pintail::print(ostream& out, ostream& outAcc) { try { + int index = ceil(deviation); //is your DE value higher than the 95% @@ -226,7 +267,8 @@ void Pintail::print(ostream& out) { out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl; if (chimera == "Yes") { - mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); mothurOutEndLine(); + m->mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); m->mothurOutEndLine(); + outAcc << querySeq->getName() << endl; } out << "Observed\t"; @@ -238,14 +280,74 @@ void Pintail::print(ostream& out) { for (int m = 0; m < expectedDistance.size(); m++) { out << expectedDistance[m] << '\t'; } out << endl; + return *querySeq; } catch(exception& e) { - errorOut(e, "Pintail", "print"); + m->errorOut(e, "Pintail", "print"); exit(1); } } +#ifdef USE_MPI +//*************************************************************************************************************** +Sequence Pintail::print(MPI_File& out, MPI_File& outAcc) { + try { + + string outputString = ""; + int index = ceil(deviation); + + //is your DE value higher than the 95% + string chimera; + if (index != 0) { //if index is 0 then its an exact match to a template seq + if (quantiles[index][4] == 0.0) { + chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index); + }else { + if (DE > quantiles[index][4]) { chimera = "Yes"; } + else { chimera = "No"; } + } + }else{ chimera = "No"; } + + outputString += querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera + "\n"; + if (chimera == "Yes") { + cout << querySeq->getName() << "\tdiv: " << toString(deviation) << "\tstDev: " << toString(DE) << "\tchimera flag: " << chimera << endl; + string outAccString = querySeq->getName() + "\n"; + + MPI_Status statusAcc; + int length = outAccString.length(); + char* buf = new char[length]; + memcpy(buf, outAccString.c_str(), length); + + MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc); + delete buf; + return *querySeq; + } + outputString += "Observed\t"; + + for (int j = 0; j < obsDistance.size(); j++) { outputString += toString(obsDistance[j]) + "\t"; } + outputString += "\n"; + + outputString += "Expected\t"; + + for (int m = 0; m < expectedDistance.size(); m++) { outputString += toString(expectedDistance[m]) + "\t"; } + outputString += "\n"; + + MPI_Status status; + int length = outputString.length(); + char* buf2 = new char[length]; + memcpy(buf2, outputString.c_str(), length); + + MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status); + delete buf2; + + return *querySeq; + } + catch(exception& e) { + m->errorOut(e, "Pintail", "print"); + exit(1); + } +} +#endif //*************************************************************************************************************** int Pintail::getChimeras(Sequence* query) { try { @@ -256,6 +358,8 @@ int Pintail::getChimeras(Sequence* query) { //find pairs has to be done before a mask bestfit = findPairs(query); + if (m->control_pressed) { return 0; } + //if they mask if (seqMask != "") { decalc->runMask(query); @@ -277,8 +381,12 @@ int Pintail::getChimeras(Sequence* query) { //find observed distance obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes); + + if (m->control_pressed) { return 0; } Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile); + + if (m->control_pressed) { return 0; } //find alpha seqCoef = decalc->getCoef(obsDistance, Qav); @@ -286,9 +394,13 @@ int Pintail::getChimeras(Sequence* query) { //calculating expected distance expectedDistance = decalc->calcExpected(Qav, seqCoef); + if (m->control_pressed) { return 0; } + //finding de DE = decalc->calcDE(obsDistance, expectedDistance); + if (m->control_pressed) { return 0; } + //find distance between query and closest match it = trimmed.begin(); deviation = decalc->calcDist(query, bestfit, it->first, it->second); @@ -298,7 +410,7 @@ int Pintail::getChimeras(Sequence* query) { return 0; } catch(exception& e) { - errorOut(e, "Pintail", "getChimeras"); + m->errorOut(e, "Pintail", "getChimeras"); exit(1); } } @@ -307,16 +419,67 @@ int Pintail::getChimeras(Sequence* query) { vector Pintail::readFreq() { try { - - ifstream in; - openInputFile(consfile, in); - + //read in probabilities and store in vector + int pos; float num; + vector prob; set h = decalc->getPos(); //positions of bases in masking sequence - //read in probabilities and store in vector - int pos; float num; + #ifdef USE_MPI + + MPI_File inMPI; + MPI_Offset size; + MPI_Status status; + + //char* inFileName = new char[consfile.length()]; + //memcpy(inFileName, consfile.c_str(), consfile.length()); + + char inFileName[1024]; + strcpy(inFileName, consfile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); + MPI_File_get_size(inMPI, &size); + //delete inFileName; + + char* buffer = new char[size]; + MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); + + string tempBuf = buffer; + delete buffer; + + if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } + istringstream iss (tempBuf,istringstream::in); + + //read version + string line = m->getline(iss); m->gobble(iss); + + while(!iss.eof()) { + iss >> pos >> num; + + if (h.count(pos) > 0) { + float Pi; + Pi = (num - 0.25) / 0.75; + + //cannot have probability less than 0. + if (Pi < 0) { Pi = 0.0; } + + //do you want this spot + prob.push_back(Pi); + } + + m->gobble(iss); + } + + MPI_File_close(&inMPI); + #else + + ifstream in; + m->openInputFile(consfile, in); + + //read version + string line = m->getline(in); m->gobble(in); + while(!in.eof()){ in >> pos >> num; @@ -332,15 +495,17 @@ vector Pintail::readFreq() { prob.push_back(Pi); } - gobble(in); + m->gobble(in); } - in.close(); + + #endif + return prob; } catch(exception& e) { - errorOut(e, "Pintail", "readFreq"); + m->errorOut(e, "Pintail", "readFreq"); exit(1); } } @@ -357,15 +522,15 @@ Sequence* Pintail::findPairs(Sequence* q) { } catch(exception& e) { - errorOut(e, "Pintail", "findPairs"); + m->errorOut(e, "Pintail", "findPairs"); exit(1); } } -/**************************************************************************************************/ +//************************************************************************************************** void Pintail::createProcessesQuan() { try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + int process = 1; vector processIDS; //loop through and create all the processes you want @@ -382,14 +547,13 @@ void Pintail::createProcessesQuan() { //write out data to file so parent can read it ofstream out; string s = toString(getpid()) + ".temp"; - openOutputFile(s, out); - + m->openOutputFile(s, out); //output observed distances for (int i = 0; i < quantilesMembers.size(); i++) { out << quantilesMembers[i].size() << '\t'; for (int j = 0; j < quantilesMembers[i].size(); j++) { - out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t'; + out << quantilesMembers[i][j] << '\t'; } out << endl; } @@ -397,42 +561,46 @@ void Pintail::createProcessesQuan() { out.close(); exit(0); - }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } } + //parent does its part + quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[0]->start, templateLines[0]->end); + //force parent to wait until all the processes are done - for (int i=0;iopenInputFile(s, in); - vector< vector > quan; + vector< vector > quan; quan.resize(100); //get quantiles - for (int m = 0; m < quan.size(); m++) { + for (int h = 0; h < quan.size(); h++) { int num; in >> num; - gobble(in); + m->gobble(in); - vector q; float w; int b, n; + vector q; float w; for (int j = 0; j < num; j++) { - in >> w >> b >> n; - //cout << w << '\t' << b << '\t' n << endl; - quanMember newMember(w, b, n); - q.push_back(newMember); + in >> w; + q.push_back(w); } -//cout << "here" << endl; - quan[m] = q; -//cout << "now here" << endl; - gobble(in); + + quan[h] = q; + m->gobble(in); } @@ -444,20 +612,164 @@ void Pintail::createProcessesQuan() { } in.close(); - remove(s.c_str()); + m->mothurRemove(s); } - + #else quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); #endif } catch(exception& e) { - errorOut(e, "Pintail", "createProcessesQuan"); + m->errorOut(e, "Pintail", "createProcessesQuan"); exit(1); } } +//*************************************************************************************************************** +vector< vector > Pintail::readQuantiles() { + try { + int num; + float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; + + vector< vector > quan; + vector temp; temp.resize(6, 0); + + //to fill 0 + quan.push_back(temp); + #ifdef USE_MPI + + MPI_File inMPI; + MPI_Offset size; + MPI_Status status; + + //char* inFileName = new char[quanfile.length()]; + //memcpy(inFileName, quanfile.c_str(), quanfile.length()); + + char inFileName[1024]; + strcpy(inFileName, quanfile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); + MPI_File_get_size(inMPI, &size); + //delete inFileName; + + + char* buffer = new char[size]; + MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); + + string tempBuf = buffer; + if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } + istringstream iss (tempBuf,istringstream::in); + delete buffer; + + //read version + string line = m->getline(iss); m->gobble(iss); + + while(!iss.eof()) { + iss >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; + + temp.clear(); + + temp.push_back(ten); + temp.push_back(twentyfive); + temp.push_back(fifty); + temp.push_back(seventyfive); + temp.push_back(ninetyfive); + temp.push_back(ninetynine); + + quan.push_back(temp); + + m->gobble(iss); + } + + MPI_File_close(&inMPI); + + #else + + ifstream in; + m->openInputFile(quanfile, in); + + //read version + string line = m->getline(in); m->gobble(in); + + while(!in.eof()){ + + in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; + + temp.clear(); + + temp.push_back(ten); + temp.push_back(twentyfive); + temp.push_back(fifty); + temp.push_back(seventyfive); + temp.push_back(ninetyfive); + temp.push_back(ninetynine); + + quan.push_back(temp); + + m->gobble(in); + } + in.close(); + #endif + + return quan; + + } + catch(exception& e) { + m->errorOut(e, "Pintail", "readQuantiles"); + exit(1); + } +} +//***************************************************************************************************************/ + +void Pintail::printQuanFile(string file, string outputString) { + try { + + #ifdef USE_MPI + + MPI_File outQuan; + MPI_Status status; + + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; + + //char* FileName = new char[file.length()]; + //memcpy(FileName, file.c_str(), file.length()); + + char FileName[1024]; + strcpy(FileName, file.c_str()); + + if (pid == 0) { + MPI_File_open(MPI_COMM_SELF, FileName, outMode, MPI_INFO_NULL, &outQuan); //comm, filename, mode, info, filepointer + + int length = outputString.length(); + char* buf = new char[length]; + memcpy(buf, outputString.c_str(), length); + + MPI_File_write(outQuan, buf, length, MPI_CHAR, &status); + delete buf; + + MPI_File_close(&outQuan); + } + + //delete FileName; + #else + ofstream outQuan; + m->openOutputFile(file, outQuan); + + outQuan << outputString; + + outQuan.close(); + #endif + } + catch(exception& e) { + m->errorOut(e, "Pintail", "printQuanFile"); + exit(1); + } +} + +//***************************************************************************************************************/ -//***************************************************************************************************************