X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=pintail.cpp;h=7ae3ff6dbb96224197dff97db4b435880738d87b;hb=98ea55ba2d46a429f031086b4b3272780d0ec894;hp=74c34e18be897ecb2c7966e5a481a5fff81d76df;hpb=ac1da2a793271cb67f2a2155c5558e1fabdd6aba;p=mothur.git diff --git a/pintail.cpp b/pintail.cpp index 74c34e1..7ae3ff6 100644 --- a/pintail.cpp +++ b/pintail.cpp @@ -9,7 +9,13 @@ #include "pintail.h" #include "ignoregaps.h" +#include "eachgapdist.h" +//******************************************************************************************************************** +//sorts lowest to highest +inline bool compareQuanMembers(quanMember left, quanMember right){ + return (left.score < right.score); +} //*************************************************************************************************************** Pintail::Pintail(string filename, string temp) { fastafile = filename; templateFile = temp; } @@ -19,8 +25,7 @@ Pintail::~Pintail() { try { for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } - - if (processors != 1) { for (int i = 0; i < bestfit.size(); i++) { delete bestfit[i]; } } + for (int i = 0; i < bestfit.size(); i++) { delete bestfit[i]; } } catch(exception& e) { errorOut(e, "Pintail", "~Pintail"); @@ -31,19 +36,20 @@ Pintail::~Pintail() { void Pintail::print(ostream& out) { try { + mothurOutEndLine(); + for (int i = 0; i < querySeqs.size(); i++) { int index = ceil(deviation[i]); -float quan = 2.64 * log10(deviation[i]) + 1.46; -cout << "dist = " << index << endl; -cout << "de = " << DE[i] << endl; -cout << "mallard quantile = " << quan << endl; -cout << "my quantile = " << quantiles[index][4] << endl; - + //is your DE value higher than the 95% string chimera; - if (DE[i] > quantiles[index][4]) { chimera = "Yes"; } - else { chimera = "No"; } + if (quantiles[index][4] == 0.0) { + chimera = "Your template does not include sequences that provide quantile values at distance " + toString(index); + }else { + if (DE[i] > quantiles[index][4]) { chimera = "Yes"; } + else { chimera = "No"; } + } out << querySeqs[i]->getName() << '\t' << "div: " << deviation[i] << "\tstDev: " << DE[i] << "\tchimera flag: " << chimera << endl; if (chimera == "Yes") { @@ -92,6 +98,7 @@ void Pintail::getChimeras() { windowsForeachQuery.resize(numSeqs); h.resize(numSeqs); quantiles.resize(100); //one for every percent mismatch + quantilesMembers.resize(100); //one for every percent mismatch //break up file if needed int linesPerProcess = numSeqs / processors ; @@ -123,7 +130,7 @@ void Pintail::getChimeras() { templateLines.push_back(new linePair(0, templateSeqs.size())); #endif - distcalculator = new ignoreGaps(); + distcalculator = new eachGapDist(); decalc = new DeCalculator(); //if the user does enter a mask then you want to keep all the spots in the alignment @@ -139,25 +146,15 @@ void Pintail::getChimeras() { mothurOut("Done."); mothurOutEndLine(); }else { createProcessesPairs(); } - - for (int j = 0; j < bestfit.size(); j++) { - //chops off beginning and end of sequences so they both start and end with a base - ofstream out; - string s = querySeqs[j]->getName(); - - openOutputFile(s, out); - out << ">" << querySeqs[j]->getName() << endl; - out << querySeqs[j]->getAligned() << endl; - out.close(); - - string t =querySeqs[j]->getName() + ".ref"; - openOutputFile(t, out); - out << ">" << bestfit[j]->getName() << endl; - out << bestfit[j]->getAligned() << endl; - out.close(); - } +//string o = "closestmatch.eachgap.fasta"; +//ofstream out7; +//openOutputFile(o, out7); - +//for (int i = 0; i < bestfit.size(); i++) { + //out7 << ">" << querySeqs[i]->getName() << "-"<< bestfit[i]->getName() << endl; + //out7 << bestfit[i]->getAligned() << endl; +//} +//out7.close(); //find P mothurOut("Getting conservation... "); cout.flush(); if (consfile == "") { @@ -170,51 +167,58 @@ void Pintail::getChimeras() { for (int i = 0; i < probabilityProfile.size(); i++) { probabilityProfile[i] = 1 - probabilityProfile[i]; } //cout << i << '\t' << probabilityProfile[i] << endl; mothurOut("Done."); mothurOutEndLine(); - //mask querys - for (int i = 0; i < querySeqs.size(); i++) { - //cout << querySeqs[i]->getName() << " before mask = " << querySeqs[i]->getAligned() << endl << endl; - decalc->runMask(querySeqs[i]); - //cout << querySeqs[i]->getName() << " after mask = " << querySeqs[i]->getAligned() << endl << endl; - } + //mask sequences if the user wants to + if (seqMask != "") { + //mask querys + for (int i = 0; i < querySeqs.size(); i++) { + decalc->runMask(querySeqs[i]); + } - //mask templates - for (int i = 0; i < templateSeqs.size(); i++) { - decalc->runMask(templateSeqs[i]); + //mask templates + for (int i = 0; i < templateSeqs.size(); i++) { + decalc->runMask(templateSeqs[i]); + } + + for (int i = 0; i < bestfit.size(); i++) { + decalc->runMask(bestfit[i]); + } + } -//for (int i = 0; i < lines.size(); i++) { cout << "line pair " << i << " = " << lines[i]->start << '\t' << lines[i]->end << endl; } - + if (filter) { + vector temp = templateSeqs; + for (int i = 0; i < querySeqs.size(); i++) { temp.push_back(querySeqs[i]); } + + createFilter(temp); + + runFilter(querySeqs); + runFilter(templateSeqs); + runFilter(bestfit); + } + + if (processors == 1) { for (int j = 0; j < bestfit.size(); j++) { - //cout << querySeqs[j]->getName() << " after mask = " << querySeqs[j]->getAligned() << endl << endl; - ///cout << bestfit[j]->getName() << " after mask = " << bestfit[j]->getAligned() << endl << endl; decalc->trimSeqs(querySeqs[j], bestfit[j], trimmed[j]); } mothurOut("Finding window breaks... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { it = trimmed[i].begin(); -//cout << querySeqs[i]->getName() << '\t' << "trimmed = " << it->first << '\t' << it->second << endl; vector win = decalc->findWindows(querySeqs[i], it->first, it->second, windowSizes[i], increment); windowsForeachQuery[i] = win; } mothurOut("Done."); mothurOutEndLine(); }else { createProcessesSpots(); } - if (processors == 1) { mothurOut("Calculating observed distance... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { - //cout << querySeqs[i]->getName() << '\t' << bestfit[i]->getName() << " windows = " << windowsForeachQuery[i].size() << " size = " << windowSizes[i] << endl; vector obsi = decalc->calcObserved(querySeqs[i], bestfit[i], windowsForeachQuery[i], windowSizes[i]); - for (int j = 0; j < obsi.size(); j++) { - //cout << obsi[j] << '\t'; - } - //cout << endl; obsDistance[i] = obsi; } mothurOut("Done."); mothurOutEndLine(); @@ -225,12 +229,6 @@ void Pintail::getChimeras() { vector q = decalc->findQav(windowsForeachQuery[i], windowSizes[i], probabilityProfile); Qav[i] = q; -//cout << querySeqs[i]->getName() << endl; -for (int j = 0; j < Qav[i].size(); j++) { - //cout << Qav[i][j] << '\t'; -} -//cout << endl << endl; - } mothurOut("Done."); mothurOutEndLine(); @@ -238,7 +236,6 @@ for (int j = 0; j < Qav[i].size(); j++) { mothurOut("Calculating alpha... "); cout.flush(); for (int i = lines[0]->start; i < lines[0]->end; i++) { float alpha = decalc->getCoef(obsDistance[i], Qav[i]); -//cout << querySeqs[i]->getName() << "\tcoef = " << alpha << endl; seqCoef[i] = alpha; } mothurOut("Done."); mothurOutEndLine(); @@ -256,10 +253,9 @@ for (int j = 0; j < Qav[i].size(); j++) { for (int i = lines[0]->start; i < lines[0]->end; i++) { float de = decalc->calcDE(obsDistance[i], expectedDistance[i]); DE[i] = de; -//cout << querySeqs[i]->getName() << '\t' << "de value = " << de << endl; + it = trimmed[i].begin(); float dist = decalc->calcDist(querySeqs[i], bestfit[i], it->first, it->second); -//cout << querySeqs[i]->getName() << '\t' << "dist value = " << dist << endl; deviation[i] = dist; } mothurOut("Done."); mothurOutEndLine(); @@ -267,64 +263,119 @@ for (int j = 0; j < Qav[i].size(); j++) { } else { createProcesses(); } - //quantiles are used to determine whether the de values found indicate a chimera //if you have to calculate them, its time intensive because you are finding the de and deviation values for each //combination of sequences in the template - if (quanfile != "") { quantiles = readQuantiles(); } + if (quanfile != "") { quantiles = readQuantiles(); } else { mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); if (processors == 1) { - quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); + quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); }else { createProcessesQuan(); } + + ofstream out4, out5; + string noOutliers, outliers; + + if ((!filter) && (seqMask == "")) { + noOutliers = getRootName(templateFile) + "pintail.quan"; + }else if ((filter) && (seqMask == "")) { + noOutliers = getRootName(templateFile) + "pintail.filtered.quan"; + }else if ((!filter) && (seqMask != "")) { + noOutliers = getRootName(templateFile) + "pintail.masked.quan"; + }else if ((filter) && (seqMask != "")) { + noOutliers = getRootName(templateFile) + "pintail.filtered.masked.quan"; + } + + //outliers = getRootName(templateFile) + "pintail.quanYESOUTLIERS"; - decalc->removeObviousOutliers(quantiles); - - ofstream out4; - string o = getRootName(templateFile) + "quan"; - - openOutputFile(o, out4); + /*openOutputFile(outliers, out4); //adjust quantiles - for (int i = 0; i < quantiles.size(); i++) { - if (quantiles[i].size() == 0) { + for (int i = 0; i < quantilesMembers.size(); i++) { + vector temp; + + if (quantilesMembers[i].size() == 0) { //in case this is not a distance found in your template files for (int g = 0; g < 6; g++) { - quantiles[i].push_back(0.0); + temp.push_back(0.0); } }else{ - sort(quantiles[i].begin(), quantiles[i].end()); + sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers); - vector temp; //save 10% - temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)]); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score); //save 25% - temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)]); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score); //save 50% - temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)]); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score); //save 75% - temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)]); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score); //save 95% - temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)]); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score); //save 99% - temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)]); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score); - quantiles[i] = temp; } //output quan value out4 << i+1 << '\t'; - for (int u = 0; u < quantiles[i].size(); u++) { out4 << quantiles[i][u] << '\t'; } + for (int u = 0; u < temp.size(); u++) { out4 << temp[u] << '\t'; } out4 << endl; - + + quantiles[i] = temp; + } + out4.close();*/ + + decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size()); + + openOutputFile(noOutliers, out5); + + //adjust quantiles + for (int i = 0; i < quantilesMembers.size(); i++) { + vector temp; + + if (quantilesMembers[i].size() == 0) { + //in case this is not a distance found in your template files + for (int g = 0; g < 6; g++) { + temp.push_back(0.0); + } + }else{ + + sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers); + + //save 10% + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score); + //save 25% + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score); + //save 50% + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score); + //save 75% + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score); + //save 95% + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score); + //save 99% + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score); + + } + + //output quan value + out5 << i+1 << '\t'; + for (int u = 0; u < temp.size(); u++) { out5 << temp[u] << '\t'; } + out5 << endl; + + quantiles[i] = temp; + + } + mothurOut("Done."); mothurOutEndLine(); + } - + //free memory for (int i = 0; i < lines.size(); i++) { delete lines[i]; } for (int i = 0; i < templateLines.size(); i++) { delete templateLines[i]; } @@ -380,45 +431,6 @@ vector Pintail::readFreq() { } } -//*************************************************************************************************************** - -vector< vector > Pintail::readQuantiles() { - try { - - ifstream in; - openInputFile(quanfile, in); - - vector< vector > quan; - - int num; float ten, twentyfive, fifty, seventyfive, ninetyfive, ninetynine; - - while(!in.eof()){ - - in >> num >> ten >> twentyfive >> fifty >> seventyfive >> ninetyfive >> ninetynine; - - vector temp; - - temp.push_back(ten); - temp.push_back(twentyfive); - temp.push_back(fifty); - temp.push_back(seventyfive); - temp.push_back(ninetyfive); - temp.push_back(ninetynine); - - quan.push_back(temp); - - gobble(in); - } - - in.close(); - return quan; - - } - catch(exception& e) { - errorOut(e, "Pintail", "readQuantiles"); - exit(1); - } -} //*************************************************************************************************************** //calculate the distances from each query sequence to all sequences in the template to find the closest sequence vector Pintail::findPairs(int start, int end) { @@ -445,7 +457,10 @@ vector Pintail::findPairs(int start, int end) { } } - seqsMatches.push_back(match); + //make copy so trimSeqs doesn't overtrim + Sequence* copy = new Sequence(match->getName(), match->getAligned()); + + seqsMatches.push_back(copy); } return seqsMatches; @@ -478,6 +493,7 @@ void Pintail::createProcessesSpots() { //chops off beginning and end of sequences so they both start and end with a base map trim; + decalc->trimSeqs(querySeqs[j], bestfit[j], trim); trimmed[j] = trim; @@ -512,8 +528,7 @@ void Pintail::createProcessesSpots() { //output trimmed values for (int i = lines[process]->start; i < lines[process]->end; i++) { - it = trimmed[i].begin(); - + it = trimmed[i].begin(); out << it->first << '\t' << it->second << endl; } out.close(); @@ -568,7 +583,7 @@ void Pintail::createProcessesSpots() { for (int m = 0; m < size; m++) { int front, back; in >> front >> back; - + map t; t[front] = back; @@ -923,7 +938,7 @@ void Pintail::createProcessesQuan() { process++; }else if (pid == 0){ - quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end); + quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end); //write out data to file so parent can read it ofstream out; @@ -932,10 +947,10 @@ void Pintail::createProcessesQuan() { //output observed distances - for (int i = 0; i < quantiles.size(); i++) { - out << quantiles[i].size() << '\t'; - for (int j = 0; j < quantiles[i].size(); j++) { - out << quantiles[i][j] << '\t'; + for (int i = 0; i < quantilesMembers.size(); i++) { + out << quantilesMembers[i].size() << '\t'; + for (int j = 0; j < quantilesMembers[i].size(); j++) { + out << quantilesMembers[i][j].score << '\t' << quantilesMembers[i][j].member1 << '\t' << quantilesMembers[i][j].member2 << '\t'; } out << endl; } @@ -951,35 +966,42 @@ void Pintail::createProcessesQuan() { int temp = processIDS[i]; wait(&temp); } - + //get data created by processes for (int i=0;i > quan; quan.resize(100); + vector< vector > quan; + quan.resize(100); //get quantiles for (int m = 0; m < quan.size(); m++) { int num; - in >> num; - - vector q; float w; + in >> num; + + gobble(in); + + vector q; float w; int b, n; for (int j = 0; j < num; j++) { - in >> w; - q.push_back(w); + in >> w >> b >> n; + //cout << w << '\t' << b << '\t' n << endl; + quanMember newMember(w, b, n); + q.push_back(newMember); } - +//cout << "here" << endl; quan[m] = q; +//cout << "now here" << endl; gobble(in); } - + //save quan in quantiles - for (int i = 0; i < quan.size(); i++) { + for (int j = 0; j < quan.size(); j++) { //put all values of q[i] into quan[i] - quantiles[i].insert(quantiles[i].begin(), quan[i].begin(), quan[i].end()); + for (int l = 0; l < quan[j].size(); l++) { quantilesMembers[j].push_back(quan[j][l]); } + //quantilesMembers[j].insert(quantilesMembers[j].begin(), quan[j].begin(), quan[j].end()); } in.close(); @@ -987,7 +1009,7 @@ void Pintail::createProcessesQuan() { } #else - quantiles = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); + quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); #endif } catch(exception& e) {