X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;ds=sidebyside;f=pintail.cpp;fp=pintail.cpp;h=0000000000000000000000000000000000000000;hb=4a877efa127e56e81a21f53cfdbbfd3bfbe8c4ff;hp=b9f2434d19d4cc842bfac1dce4f6de5a0d1b742d;hpb=a6cf29fa4dac0909c7582cb1094151d34093ee76;p=mothur.git diff --git a/pintail.cpp b/pintail.cpp deleted file mode 100644 index b9f2434..0000000 --- a/pintail.cpp +++ /dev/null @@ -1,775 +0,0 @@ -/* - * pintail.cpp - * Mothur - * - * Created by Sarah Westcott on 7/9/09. - * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. - * - */ - -#include "pintail.h" -#include "ignoregaps.h" -#include "eachgapdist.h" - -//******************************************************************************************************************** -//sorts lowest to highest -inline bool compareQuanMembers(quanMember left, quanMember right){ - return (left.score < right.score); -} -//*************************************************************************************************************** - -Pintail::Pintail(string filename, string temp, bool f, int p, string mask, string cons, string q, int win, int inc, string o) : Chimera() { - try { - - fastafile = filename; - templateFileName = temp; templateSeqs = readSeqs(temp); - filter = f; - processors = p; - setMask(mask); - consfile = cons; - quanfile = q; - window = win; - increment = inc; - outputDir = o; - - distcalculator = new eachGapDist(); - decalc = new DeCalculator(); - - doPrep(); - } - catch(exception& e) { - m->errorOut(e, "Pintail", "Pintail"); - exit(1); - } - -} -//*************************************************************************************************************** - -Pintail::~Pintail() { - try { - - delete distcalculator; - delete decalc; - } - catch(exception& e) { - m->errorOut(e, "Pintail", "~Pintail"); - exit(1); - } -} -//*************************************************************************************************************** -int Pintail::doPrep() { - try { - - mergedFilterString = ""; - windowSizesTemplate.resize(templateSeqs.size(), window); - quantiles.resize(100); //one for every percent mismatch - quantilesMembers.resize(100); //one for every percent mismatch - - //if the user does not enter a mask then you want to keep all the spots in the alignment - if (seqMask.length() == 0) { decalc->setAlignmentLength(templateSeqs[0]->getAligned().length()); } - else { decalc->setAlignmentLength(seqMask.length()); } - - decalc->setMask(seqMask); - - #ifdef USE_MPI - //do nothing - #else - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) - //find breakup of templatefile for quantiles - if (processors == 1) { templateLines.push_back(new linePair(0, templateSeqs.size())); } - else { - for (int i = 0; i < processors; i++) { - templateLines.push_back(new linePair()); - templateLines[i]->start = int (sqrt(float(i)/float(processors)) * templateSeqs.size()); - templateLines[i]->end = int (sqrt(float(i+1)/float(processors)) * templateSeqs.size()); - } - } - #else - templateLines.push_back(new linePair(0, templateSeqs.size())); - #endif - #endif - - m->mothurOut("Getting conservation... "); cout.flush(); - if (consfile == "") { - m->mothurOut("Calculating probability of conservation for your template sequences. This can take a while... I will output the frequency of the highest base in each position to a .freq file so that you can input them using the conservation parameter next time you run this command. Providing the .freq file will improve speed. "); cout.flush(); - probabilityProfile = decalc->calcFreq(templateSeqs, templateFileName); - if (m->control_pressed) { return 0; } - m->mothurOut("Done."); m->mothurOutEndLine(); - }else { probabilityProfile = readFreq(); m->mothurOut("Done."); } - m->mothurOutEndLine(); - - //make P into Q - for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } // - - bool reRead = false; - //create filter if needed for later - if (filter) { - - //read in all query seqs - vector tempQuerySeqs = readSeqs(fastafile); - - vector temp; - //merge query seqs and template seqs - temp = templateSeqs; - for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); } - - if (seqMask != "") { - reRead = true; - //mask templates - for (int i = 0; i < temp.size(); i++) { - if (m->control_pressed) { - for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } - return 0; - } - decalc->runMask(temp[i]); - } - } - - mergedFilterString = createFilter(temp, 0.5); - - if (m->control_pressed) { - for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } - return 0; - } - - //reread template seqs - for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } - } - - - //quantiles are used to determine whether the de values found indicate a chimera - //if you have to calculate them, its time intensive because you are finding the de and deviation values for each - //combination of sequences in the template - if (quanfile != "") { - quantiles = readQuantiles(); - }else { - if ((!filter) && (seqMask != "")) { //if you didn't filter but you want to mask. if you filtered then you did mask first above. - reRead = true; - //mask templates - for (int i = 0; i < templateSeqs.size(); i++) { - if (m->control_pressed) { return 0; } - decalc->runMask(templateSeqs[i]); - } - } - - if (filter) { - reRead = true; - for (int i = 0; i < templateSeqs.size(); i++) { - if (m->control_pressed) { return 0; } - runFilter(templateSeqs[i]); - } - } - - m->mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); - if (processors == 1) { - quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); - }else { createProcessesQuan(); } - - if (m->control_pressed) { return 0; } - - string noOutliers, outliers; - - if ((!filter) && (seqMask == "")) { - noOutliers = m->getRootName(m->getSimpleName(templateFileName)) + "pintail.quan"; - }else if ((!filter) && (seqMask != "")) { - noOutliers =m->getRootName(m->getSimpleName(templateFileName)) + "pintail.masked.quan"; - }else if ((filter) && (seqMask != "")) { - noOutliers = m->getRootName(m->getSimpleName(templateFileName)) + "pintail.filtered." + m->getSimpleName(m->getRootName(fastafile)) + "masked.quan"; - }else if ((filter) && (seqMask == "")) { - noOutliers = m->getRootName(m->getSimpleName(templateFileName)) + "pintail.filtered." + m->getSimpleName(m->getRootName(fastafile)) + "quan"; - } - - decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size()); - - if (m->control_pressed) { return 0; } - - string outputString = "#" + m->getVersion() + "\n"; - - //adjust quantiles - for (int i = 0; i < quantilesMembers.size(); i++) { - vector temp; - - if (quantilesMembers[i].size() == 0) { - //in case this is not a distance found in your template files - for (int g = 0; g < 6; g++) { - temp.push_back(0.0); - } - }else{ - - sort(quantilesMembers[i].begin(), quantilesMembers[i].end()); - - //save 10% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)]); - //save 25% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)]); - //save 50% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)]); - //save 75% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)]); - //save 95% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)]); - //save 99% - temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)]); - - } - - //output quan value - outputString += toString(i+1) + "\t"; - for (int u = 0; u < temp.size(); u++) { outputString += toString(temp[u]) + "\t"; } - outputString += "\n"; - - quantiles[i] = temp; - - } - - printQuanFile(noOutliers, outputString); - - //free memory - quantilesMembers.clear(); - - m->mothurOut("Done."); m->mothurOutEndLine(); - } - - if (reRead) { - for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } - templateSeqs.clear(); - templateSeqs = readSeqs(templateFileName); - } - - - //free memory - for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } - - return 0; - - } - catch(exception& e) { - m->errorOut(e, "Pintail", "doPrep"); - exit(1); - } -} -//*************************************************************************************************************** -Sequence Pintail::print(ostream& out, ostream& outAcc) { - try { - - int index = ceil(deviation); - - //is your DE value higher than the 95% - string chimera; - if (index != 0) { //if index is 0 then its an exact match to a template seq - if (quantiles[index][4] == 0.0) { - chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index); - }else { - if (DE > quantiles[index][4]) { chimera = "Yes"; } - else { chimera = "No"; } - } - }else{ chimera = "No"; } - - out << querySeq->getName() << '\t' << "div: " << deviation << "\tstDev: " << DE << "\tchimera flag: " << chimera << endl; - if (chimera == "Yes") { - m->mothurOut(querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera); m->mothurOutEndLine(); - outAcc << querySeq->getName() << endl; - } - out << "Observed\t"; - - for (int j = 0; j < obsDistance.size(); j++) { out << obsDistance[j] << '\t'; } - out << endl; - - out << "Expected\t"; - - for (int m = 0; m < expectedDistance.size(); m++) { out << expectedDistance[m] << '\t'; } - out << endl; - - return *querySeq; - - } - catch(exception& e) { - m->errorOut(e, "Pintail", "print"); - exit(1); - } -} -#ifdef USE_MPI -//*************************************************************************************************************** -Sequence Pintail::print(MPI_File& out, MPI_File& outAcc) { - try { - - string outputString = ""; - int index = ceil(deviation); - - //is your DE value higher than the 95% - string chimera; - if (index != 0) { //if index is 0 then its an exact match to a template seq - if (quantiles[index][4] == 0.0) { - chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index); - }else { - if (DE > quantiles[index][4]) { chimera = "Yes"; } - else { chimera = "No"; } - } - }else{ chimera = "No"; } - - outputString += querySeq->getName() + "\tdiv: " + toString(deviation) + "\tstDev: " + toString(DE) + "\tchimera flag: " + chimera + "\n"; - if (chimera == "Yes") { - cout << querySeq->getName() << "\tdiv: " << toString(deviation) << "\tstDev: " << toString(DE) << "\tchimera flag: " << chimera << endl; - string outAccString = querySeq->getName() + "\n"; - - MPI_Status statusAcc; - int length = outAccString.length(); - char* buf = new char[length]; - memcpy(buf, outAccString.c_str(), length); - - MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc); - delete buf; - - return *querySeq; - } - outputString += "Observed\t"; - - for (int j = 0; j < obsDistance.size(); j++) { outputString += toString(obsDistance[j]) + "\t"; } - outputString += "\n"; - - outputString += "Expected\t"; - - for (int m = 0; m < expectedDistance.size(); m++) { outputString += toString(expectedDistance[m]) + "\t"; } - outputString += "\n"; - - MPI_Status status; - int length = outputString.length(); - char* buf2 = new char[length]; - memcpy(buf2, outputString.c_str(), length); - - MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status); - delete buf2; - - return *querySeq; - } - catch(exception& e) { - m->errorOut(e, "Pintail", "print"); - exit(1); - } -} -#endif -//*************************************************************************************************************** -int Pintail::getChimeras(Sequence* query) { - try { - querySeq = query; - trimmed.clear(); - windowSizes = window; - - //find pairs has to be done before a mask - bestfit = findPairs(query); - - if (m->control_pressed) { return 0; } - - //if they mask - if (seqMask != "") { - decalc->runMask(query); - decalc->runMask(bestfit); - } - - if (filter) { //must be done after a mask - runFilter(query); - runFilter(bestfit); - } - - - //trim seq - decalc->trimSeqs(query, bestfit, trimmed); - - //find windows - it = trimmed.begin(); - windowsForeachQuery = decalc->findWindows(query, it->first, it->second, windowSizes, increment); - - //find observed distance - obsDistance = decalc->calcObserved(query, bestfit, windowsForeachQuery, windowSizes); - - if (m->control_pressed) { return 0; } - - Qav = decalc->findQav(windowsForeachQuery, windowSizes, probabilityProfile); - - if (m->control_pressed) { return 0; } - - //find alpha - seqCoef = decalc->getCoef(obsDistance, Qav); - - //calculating expected distance - expectedDistance = decalc->calcExpected(Qav, seqCoef); - - if (m->control_pressed) { return 0; } - - //finding de - DE = decalc->calcDE(obsDistance, expectedDistance); - - if (m->control_pressed) { return 0; } - - //find distance between query and closest match - it = trimmed.begin(); - deviation = decalc->calcDist(query, bestfit, it->first, it->second); - - delete bestfit; - - return 0; - } - catch(exception& e) { - m->errorOut(e, "Pintail", "getChimeras"); - exit(1); - } -} - -//*************************************************************************************************************** - -vector Pintail::readFreq() { - try { - //read in probabilities and store in vector - int pos; float num; - - vector prob; - set h = decalc->getPos(); //positions of bases in masking sequence - - #ifdef USE_MPI - - MPI_File inMPI; - MPI_Offset size; - MPI_Status status; - - //char* inFileName = new char[consfile.length()]; - //memcpy(inFileName, consfile.c_str(), consfile.length()); - - char inFileName[1024]; - strcpy(inFileName, consfile.c_str()); - - MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); - MPI_File_get_size(inMPI, &size); - //delete inFileName; - - char* buffer = new char[size]; - MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); - - string tempBuf = buffer; - delete buffer; - - if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } - istringstream iss (tempBuf,istringstream::in); - - //read version - string line = m->getline(iss); m->gobble(iss); - - while(!iss.eof()) { - iss >> pos >> num; - - if (h.count(pos) > 0) { - float Pi; - Pi = (num - 0.25) / 0.75; - - //cannot have probability less than 0. - if (Pi < 0) { Pi = 0.0; } - - //do you want this spot - prob.push_back(Pi); - } - - m->gobble(iss); - } - - MPI_File_close(&inMPI); - - #else - - ifstream in; - m->openInputFile(consfile, in); - - //read version - string line = m->getline(in); m->gobble(in); - - while(!in.eof()){ - - in >> pos >> num; - - if (h.count(pos) > 0) { - float Pi; - Pi = (num - 0.25) / 0.75; - - //cannot have probability less than 0. - if (Pi < 0) { Pi = 0.0; } - - //do you want this spot - prob.push_back(Pi); - } - - m->gobble(in); - } - in.close(); - - #endif - - return prob; - - } - catch(exception& e) { - m->errorOut(e, "Pintail", "readFreq"); - exit(1); - } -} - -//*************************************************************************************************************** -//calculate the distances from each query sequence to all sequences in the template to find the closest sequence -Sequence* Pintail::findPairs(Sequence* q) { - try { - - Sequence* seqsMatches; - - seqsMatches = decalc->findClosest(q, templateSeqs); - return seqsMatches; - - } - catch(exception& e) { - m->errorOut(e, "Pintail", "findPairs"); - exit(1); - } -} -//************************************************************************************************** -void Pintail::createProcessesQuan() { - try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) - int process = 1; - vector processIDS; - - //loop through and create all the processes you want - while (process != processors) { - int pid = fork(); - - if (pid > 0) { - processIDS.push_back(pid); - process++; - }else if (pid == 0){ - - quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end); - - //write out data to file so parent can read it - ofstream out; - string s = toString(getpid()) + ".temp"; - m->openOutputFile(s, out); - - //output observed distances - for (int i = 0; i < quantilesMembers.size(); i++) { - out << quantilesMembers[i].size() << '\t'; - for (int j = 0; j < quantilesMembers[i].size(); j++) { - out << quantilesMembers[i][j] << '\t'; - } - out << endl; - } - - out.close(); - - exit(0); - }else { - m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); - for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } - exit(0); - } - } - - //parent does its part - quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[0]->start, templateLines[0]->end); - - //force parent to wait until all the processes are done - for (int i=0;i<(processors-1);i++) { - int temp = processIDS[i]; - wait(&temp); - } - - //get data created by processes - for (int i=0;i<(processors-1);i++) { - ifstream in; - string s = toString(processIDS[i]) + ".temp"; - m->openInputFile(s, in); - - vector< vector > quan; - quan.resize(100); - - //get quantiles - for (int h = 0; h < quan.size(); h++) { - int num; - in >> num; - - m->gobble(in); - - vector q; float w; - for (int j = 0; j < num; j++) { - in >> w; - q.push_back(w); - } - - quan[h] = q; - m->gobble(in); - } - - - //save quan in quantiles - for (int j = 0; j < quan.size(); j++) { - //put all values of q[i] into quan[i] - for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); } - //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end()); - } - - in.close(); - m->mothurRemove(s); - } - -#else - quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); -#endif - } - catch(exception& e) { - m->errorOut(e, "Pintail", "createProcessesQuan"); - exit(1); - } -} -//*************************************************************************************************************** -vector< vector > Pintail::readQuantiles() { - try { - int num; - float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; - - vector< vector > quan; - vector temp; temp.resize(6, 0); - - //to fill 0 - quan.push_back(temp); - - #ifdef USE_MPI - - MPI_File inMPI; - MPI_Offset size; - MPI_Status status; - - //char* inFileName = new char[quanfile.length()]; - //memcpy(inFileName, quanfile.c_str(), quanfile.length()); - - char inFileName[1024]; - strcpy(inFileName, quanfile.c_str()); - - MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); - MPI_File_get_size(inMPI, &size); - //delete inFileName; - - - char* buffer = new char[size]; - MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); - - string tempBuf = buffer; - if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } - istringstream iss (tempBuf,istringstream::in); - delete buffer; - - //read version - string line = m->getline(iss); m->gobble(iss); - - while(!iss.eof()) { - iss >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; - - temp.clear(); - - temp.push_back(ten); - temp.push_back(twentyfive); - temp.push_back(fifty); - temp.push_back(seventyfive); - temp.push_back(ninetyfive); - temp.push_back(ninetynine); - - quan.push_back(temp); - - m->gobble(iss); - } - - MPI_File_close(&inMPI); - - #else - - ifstream in; - m->openInputFile(quanfile, in); - - //read version - string line = m->getline(in); m->gobble(in); - - while(!in.eof()){ - - in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; - - temp.clear(); - - temp.push_back(ten); - temp.push_back(twentyfive); - temp.push_back(fifty); - temp.push_back(seventyfive); - temp.push_back(ninetyfive); - temp.push_back(ninetynine); - - quan.push_back(temp); - - m->gobble(in); - } - in.close(); - #endif - - return quan; - - } - catch(exception& e) { - m->errorOut(e, "Pintail", "readQuantiles"); - exit(1); - } -} -//***************************************************************************************************************/ - -void Pintail::printQuanFile(string file, string outputString) { - try { - - #ifdef USE_MPI - - MPI_File outQuan; - MPI_Status status; - - int pid; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - - int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; - - //char* FileName = new char[file.length()]; - //memcpy(FileName, file.c_str(), file.length()); - - char FileName[1024]; - strcpy(FileName, file.c_str()); - - if (pid == 0) { - MPI_File_open(MPI_COMM_SELF, FileName, outMode, MPI_INFO_NULL, &outQuan); //comm, filename, mode, info, filepointer - - int length = outputString.length(); - char* buf = new char[length]; - memcpy(buf, outputString.c_str(), length); - - MPI_File_write(outQuan, buf, length, MPI_CHAR, &status); - delete buf; - - MPI_File_close(&outQuan); - } - - //delete FileName; - #else - ofstream outQuan; - m->openOutputFile(file, outQuan); - - outQuan << outputString; - - outQuan.close(); - #endif - } - catch(exception& e) { - m->errorOut(e, "Pintail", "printQuanFile"); - exit(1); - } -} - -//***************************************************************************************************************/ - - -