From f11da43e273f150a8ece7924c4c7f7b43862b372 Mon Sep 17 00:00:00 2001 From: westcott Date: Thu, 13 Aug 2009 14:01:10 +0000 Subject: [PATCH] last changes before move --- bellerophon.cpp | 32 +++++++++++----------- bellerophon.h | 2 +- decalc.cpp | 72 +++---------------------------------------------- decalc.h | 4 +-- mothur.cpp | 4 +-- pintail.cpp | 32 +++++++++++++--------- pintail.h | 2 -- 7 files changed, 44 insertions(+), 104 deletions(-) diff --git a/bellerophon.cpp b/bellerophon.cpp index 86bc752..5e219d4 100644 --- a/bellerophon.cpp +++ b/bellerophon.cpp @@ -80,7 +80,7 @@ void Bellerophon::getChimeras() { //do soft filter if (filter) { - string optionString = "fasta=" + fastafile + ", soft=50, vertical=F"; + string optionString = "fasta=" + fastafile + ", soft=50"; filterSeqs = new FilterSeqsCommand(optionString); filterSeqs->execute(); delete filterSeqs; @@ -92,39 +92,39 @@ void Bellerophon::getChimeras() { distCalculator = new eachGapDist(); //read in sequences - //seqs = readSeqs(fastafile); + seqs = readSeqs(fastafile); int numSeqs = seqs.size(); if (numSeqs == 0) { mothurOut("Error in reading you sequences."); mothurOutEndLine(); exit(1); } //set default window to 25% of sequence length - string seq0 = seqs[0].getAligned(); + string seq0 = seqs[0]->getAligned(); if (window == 0) { window = seq0.length() / 4; } else if (window > (seq0.length() / 2)) { mothurOut("Your sequence length is = " + toString(seq0.length()) + ". You have selected a window size greater than the length of half your aligned sequence. I will run it with a window size of " + toString((seq0.length() / 2))); mothurOutEndLine(); window = (seq0.length() / 2); } - if (increment > (seqs[0].getAlignLength() - (2*window))) { + if (increment > (seqs[0]->getAlignLength() - (2*window))) { if (increment != 10) { mothurOut("You have selected a increment that is too large. I will use the default."); mothurOutEndLine(); increment = 10; - if (increment > (seqs[0].getAlignLength() - (2*window))) { increment = 0; } + if (increment > (seqs[0]->getAlignLength() - (2*window))) { increment = 0; } }else{ increment = 0; } } cout << "increment = " << increment << endl; if (increment == 0) { iters = 1; } - else { iters = ((seqs[0].getAlignLength() - (2*window)) / increment); } + else { iters = ((seqs[0]->getAlignLength() - (2*window)) / increment); } //initialize pref pref.resize(numSeqs); for (int i = 0; i < numSeqs; i++ ) { pref[i].leftParent.resize(2); pref[i].rightParent.resize(2); pref[i].score.resize(2); pref[i].closestLeft.resize(2); pref[i].closestRight.resize(3); - pref[i].name = seqs[i].getName(); + pref[i].name = seqs[i]->getName(); pref[i].score[0] = 0.0; pref[i].score[1] = 0.0; pref[i].closestLeft[0] = 100000.0; pref[i].closestLeft[1] = 100000.0; pref[i].closestRight[0] = 100000.0; pref[i].closestRight[1] = 100000.0; @@ -140,16 +140,16 @@ cout << "increment = " << increment << endl; for (int i = 0; i < seqs.size(); i++) { //cout << "whole = " << seqs[i].getAligned() << endl; //save left side - string seqLeft = seqs[i].getAligned().substr(midpoint-window, window); + string seqLeft = seqs[i]->getAligned().substr(midpoint-window, window); Sequence tempLeft; - tempLeft.setName(seqs[i].getName()); + tempLeft.setName(seqs[i]->getName()); tempLeft.setAligned(seqLeft); left.push_back(tempLeft); //cout << "left = " << tempLeft.getAligned() << endl; //save right side - string seqRight = seqs[i].getAligned().substr(midpoint, window); + string seqRight = seqs[i]->getAligned().substr(midpoint, window); Sequence tempRight; - tempRight.setName(seqs[i].getName()); + tempRight.setName(seqs[i]->getName()); tempRight.setAligned(seqRight); right.push_back(tempRight); //cout << "right = " << seqRight << endl; @@ -217,6 +217,8 @@ cout << "increment = " << increment << endl; //how much higher or lower is this than expected pref[i].score[0] = pref[i].score[0] / expectedPercent; + + } @@ -310,24 +312,24 @@ void Bellerophon::generatePreferences(vector left, vector right, if (itL->second < pref[i].closestLeft[1]) { pref[i].closestLeft[1] = itL->second; - pref[i].leftParent[1] = seqs[j].getName(); + pref[i].leftParent[1] = seqs[j]->getName(); //cout << "updating closest left to " << pref[i].leftParent[1] << endl; } //cout << "pref[" << j << "].closestLeft[1] = " << pref[j].closestLeft[1] << " parent = " << pref[j].leftParent[1] << endl; if (itL->second < pref[j].closestLeft[1]) { pref[j].closestLeft[1] = itL->second; - pref[j].leftParent[1] = seqs[i].getName(); + pref[j].leftParent[1] = seqs[i]->getName(); //cout << "updating closest left to " << pref[j].leftParent[1] << endl; } //are you the closest right sequence if (itR->second < pref[i].closestRight[1]) { pref[i].closestRight[1] = itR->second; - pref[i].rightParent[1] = seqs[j].getName(); + pref[i].rightParent[1] = seqs[j]->getName(); } if (itR->second < pref[j].closestRight[1]) { pref[j].closestRight[1] = itR->second; - pref[j].rightParent[1] = seqs[i].getName(); + pref[j].rightParent[1] = seqs[i]->getName(); } } diff --git a/bellerophon.h b/bellerophon.h index b79089c..bc86c18 100644 --- a/bellerophon.h +++ b/bellerophon.h @@ -46,7 +46,7 @@ class Bellerophon : public Chimera { private: Dist* distCalculator; FilterSeqsCommand* filterSeqs; - vector seqs; + vector seqs; vector pref; string fastafile; int iters; diff --git a/decalc.cpp b/decalc.cpp index c951910..860b12f 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -358,14 +358,9 @@ vector DeCalculator::findQav(vector window, int size, vector exit(1); } } -//******************************************************************************************************************** -//sorts lowest to highest -inline bool compareQuanMembers(quanMember left, quanMember right){ - return (left.score < right.score); -} //*************************************************************************************************************** //seqs have already been masked -vector< vector > DeCalculator::getQuantiles(vector seqs, vector windowSizesTemplate, int window, vector probProfile, int increment, int start, int end, vector& highestDE) { +vector< vector > DeCalculator::getQuantiles(vector seqs, vector windowSizesTemplate, int window, vector probProfile, int increment, int start, int end) { try { vector< vector > quan; @@ -415,10 +410,6 @@ vector< vector > DeCalculator::getQuantiles(vector seqs, //dist-1 because vector indexes start at 0. quan[dist-1].push_back(newScore); - //save highestDE - if (de > highestDE[i]) { highestDE[i] = de; } - if(de > highestDE[j]) { highestDE[j] = de; } - delete subject; } @@ -555,65 +546,8 @@ cout << "high = " << high << endl; } } //*************************************************************************************************************** -//follows Mallard algorythn in paper referenced from mallard class -vector DeCalculator::returnObviousOutliers(vector< vector > quantiles, int num) { - try { - vector< vector > quan; - quan.resize(100); - - map contributions; //map of quanMember to distance from high or low - how bad is it. - vector marked; //marked[0] is the penalty of template seqs[0]. the higher the penalty the more likely the sequence is chimeric - marked.resize(num,0); - - //find contributions - for (int i = 0; i < quantiles.size(); i++) { - - //find mean of this quantile score - sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers); - - float high = quantiles[i][int(quantiles[i].size() * 0.99)].score; - - //look at each value in quantiles to see if it is an outlier - for (int j = 0; j < quantiles[i].size(); j++) { - - //is this score between above 99% - if (quantiles[i][j].score > high) { - //find out how "bad" of an outlier you are - so you can rank the outliers - float dist = quantiles[i][j].score - high; - contributions[&(quantiles[i][j])] = dist; - - //penalizing sequences for being in multiple outliers - marked[quantiles[i][j].member1]++; - marked[quantiles[i][j].member2]++; - } - } - } - - //find contributer with most offending score related to it - vector outliers = sortContrib(contributions); - - //go through the outliers marking the potential chimeras - for (int i = 0; i < outliers.size(); i++) { - - //who is responsible for this outlying score? - //if member1 has greater score mark him - //if member2 has greater score mark her - //if they are the same mark both - if (marked[outliers[i].member1] > marked[outliers[i].member2]) { marked[outliers[i].member1]++; } - else if (marked[outliers[i].member2] > marked[outliers[i].member1]) { marked[outliers[i].member2]++; } - else if (marked[outliers[i].member2] == marked[outliers[i].member1]) { marked[outliers[i].member2]++; marked[outliers[i].member1]++; } - } - - return marked; - } - catch(exception& e) { - errorOut(e, "DeCalculator", "removeObviousOutliers"); - exit(1); - } -} -//*************************************************************************************************************** //put quanMember in the vector based on how far they are from the 99% or 1%. Biggest offenders in front. -vector DeCalculator::sortContrib(map quan) { +/*vector DeCalculator::sortContrib(map quan) { try{ vector newQuan; @@ -646,7 +580,7 @@ cout << largest->second << '\t' << largest->first->score << '\t' << largest->fir //*************************************************************************************************************** //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used... -/*int DeCalculator::findLargestContrib(vector seen) { +int DeCalculator::findLargestContrib(vector seen) { try{ int largest = 0; diff --git a/decalc.h b/decalc.h index e2a6cc7..a307497 100644 --- a/decalc.h +++ b/decalc.h @@ -53,12 +53,12 @@ class DeCalculator { float calcDE(vector, vector); float calcDist(Sequence*, Sequence*, int, int); float getCoef(vector, vector); - vector< vector > getQuantiles(vector, vector, int, vector, int, int, int, vector&); + vector< vector > getQuantiles(vector, vector, int, vector, int, int, int); vector returnObviousOutliers(vector< vector >, int); private: - vector sortContrib(map); //used by mallard + //vector sortContrib(map); //used by mallard float findAverage(vector); //int findLargestContrib(vector); //void removeContrib(int, vector&); diff --git a/mothur.cpp b/mothur.cpp index 9208a25..400c497 100644 --- a/mothur.cpp +++ b/mothur.cpp @@ -41,9 +41,9 @@ int main(int argc, char *argv[]){ //header - mothurOut("mothur v.1.4.1"); + mothurOut("mothur v.1.5.0"); mothurOutEndLine(); - mothurOut("Last updated: 6/23/2009"); + mothurOut("Last updated: 8/03/2009"); mothurOutEndLine(); mothurOutEndLine(); mothurOut("by"); diff --git a/pintail.cpp b/pintail.cpp index 53bdd9e..f46e0e2 100644 --- a/pintail.cpp +++ b/pintail.cpp @@ -10,6 +10,11 @@ #include "pintail.h" #include "ignoregaps.h" +//******************************************************************************************************************** +//sorts lowest to highest +inline bool compareQuanMembers(quanMember left, quanMember right){ + return (left.score < right.score); +} //*************************************************************************************************************** Pintail::Pintail(string filename, string temp) { fastafile = filename; templateFile = temp; } @@ -89,7 +94,6 @@ void Pintail::getChimeras() { h.resize(numSeqs); quantiles.resize(100); //one for every percent mismatch quantilesMembers.resize(100); //one for every percent mismatch - makeCompliant.resize(templateSeqs.size(), 0.0); //break up file if needed int linesPerProcess = numSeqs / processors ; @@ -242,10 +246,12 @@ void Pintail::getChimeras() { mothurOut("Calculating quantiles for your template. This can take a while... I will output the quantiles to a .quan file that you can input them using the quantiles parameter next time you run this command. Providing the .quan file will dramatically improve speed. "); cout.flush(); if (processors == 1) { - quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size(), makeCompliant); + quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); }else { createProcessesQuan(); } + + //decided against this because we were having trouble setting the sensitivity... may want to revisit this... //quantiles = decalc->removeObviousOutliers(quantilesMembers, templateSeqs.size()); @@ -257,30 +263,30 @@ void Pintail::getChimeras() { openOutputFile(o, out4); //adjust quantiles - for (int i = 0; i < quantiles.size(); i++) { + for (int i = 0; i < quantilesMembers.size(); i++) { vector temp; - if (quantiles[i].size() == 0) { + if (quantilesMembers[i].size() == 0) { //in case this is not a distance found in your template files for (int g = 0; g < 6; g++) { temp.push_back(0.0); } }else{ - sort(quantiles[i].begin(), quantiles[i].end()); + sort(quantilesMembers[i].begin(), quantilesMembers[i].end(), compareQuanMembers); //save 10% - temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)]); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.10)].score); //save 25% - temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)]); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.25)].score); //save 50% - temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)]); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.5)].score); //save 75% - temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)]); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.75)].score); //save 95% - temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)]); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.95)].score); //save 99% - temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)]); + temp.push_back(quantilesMembers[i][int(quantilesMembers[i].size() * 0.99)].score); } @@ -900,7 +906,7 @@ void Pintail::createProcessesQuan() { process++; }else if (pid == 0){ - quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end, makeCompliant); + quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, templateLines[process]->start, templateLines[process]->end); //write out data to file so parent can read it ofstream out; @@ -971,7 +977,7 @@ void Pintail::createProcessesQuan() { } #else - quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size(), makeCompliant); + quantilesMembers = decalc->getQuantiles(templateSeqs, windowSizesTemplate, window, probabilityProfile, increment, 0, templateSeqs.size()); #endif } catch(exception& e) { diff --git a/pintail.h b/pintail.h index 181be2c..ddfba85 100644 --- a/pintail.h +++ b/pintail.h @@ -85,8 +85,6 @@ class Pintail : public Chimera { void createProcesses(); void createProcessesQuan(); - vector makeCompliant; //used by decalc->getQuantiles so pintail and mallard can use same function, it contains the highest de value for each seq in the template - }; /***********************************************************/ -- 2.39.2