X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=decalc.cpp;h=98545f8e24a72dca332bf7571fc0e5e9e6936988;hp=74cd8652909d17f0cd0e4e899b1472c6eea2ef93;hb=cf9987b67aa49777a4c91c2d21f96e58bf17aa82;hpb=cad05a21b084833b07808c1586e755be48fe7e1a diff --git a/decalc.cpp b/decalc.cpp index 74cd865..98545f8 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -12,7 +12,7 @@ #include "dist.h" #include "eachgapdist.h" #include "ignoregaps.h" - +#include "eachgapdist.h" //*************************************************************************************************************** void DeCalculator::setMask(string ms) { @@ -150,7 +150,7 @@ vector DeCalculator::findWindows(Sequence* query, int front, int back, int */ //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps - for (int m = front; m < (back - size) ; m+=increment) { win.push_back(m); } + for (int i = front; i < (back - size) ; i+=increment) { win.push_back(i); } @@ -169,10 +169,10 @@ vector DeCalculator::calcObserved(Sequence* query, Sequence* subject, vec vector temp; //int gaps = 0; - for (int m = 0; m < window.size(); m++) { + for (int i = 0; i < window.size(); i++) { - string seqFrag = query->getAligned().substr(window[m], size); - string seqFragsub = subject->getAligned().substr(window[m], size); + string seqFrag = query->getAligned().substr(window[i], size); + string seqFragsub = subject->getAligned().substr(window[i], size); int diff = 0; for (int b = 0; b < seqFrag.length(); b++) { @@ -244,9 +244,9 @@ vector DeCalculator::calcExpected(vector qav, float coef) { //for each window vector queryExpected; - for (int m = 0; m < qav.size(); m++) { + for (int j = 0; j < qav.size(); j++) { - float expected = qav[m] * coef; + float expected = qav[j] * coef; queryExpected.push_back(expected); } @@ -264,12 +264,12 @@ float DeCalculator::calcDE(vector obs, vector exp) { try { //for each window - float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2 + float sum = 0.0; //sum = sum from 1 to i of (oi-ei)^2 int numZeros = 0; - for (int m = 0; m < obs.size(); m++) { + for (int j = 0; j < obs.size(); j++) { - //if (obs[m] != 0.0) { - sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); + //if (obs[j] != 0.0) { + sum += ((obs[j] - exp[j]) * (obs[j] - exp[j])); //}else { numZeros++; } } @@ -290,10 +290,12 @@ vector DeCalculator::calcFreq(vector seqs, string filename) { try { vector prob; - string freqfile = getRootName(filename) + "freq"; + string freqfile = m->getRootName(filename) + "freq"; ofstream outFreq; - openOutputFile(freqfile, outFreq); + m->openOutputFile(freqfile, outFreq); + + outFreq << "#" << m->getVersion() << endl; string length = toString(seqs.size()); //if there are 5000 seqs in the template then set precision to 3 int precision = length.length() - 1; @@ -321,7 +323,7 @@ vector DeCalculator::calcFreq(vector seqs, string filename) { //find base with highest frequency int highest = 0; - for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } } + for (int j = 0; j < freq.size(); j++) { if (freq[j] > highest) { highest = freq[j]; } } float highFreq = highest / (float) (seqs.size()); @@ -355,13 +357,13 @@ vector DeCalculator::findQav(vector window, int size, vector vector averages; //for each window find average - for (int m = 0; m < window.size(); m++) { + for (int i = 0; i < window.size(); i++) { float average = 0.0; //while you are in the window for this sequence int count = 0; - for (int j = window[m]; j < (window[m]+size); j++) { + for (int j = window[i]; j < (window[i]+size); j++) { average += probabilityProfile[j]; count++; } @@ -381,16 +383,13 @@ vector DeCalculator::findQav(vector window, int size, vector } //*************************************************************************************************************** //seqs have already been masked -vector< vector > DeCalculator::getQuantiles(vector seqs, vector windowSizesTemplate, int window, vector probProfile, int increment, int start, int end) { +vector< vector > DeCalculator::getQuantiles(vector seqs, vector windowSizesTemplate, int window, vector probProfile, int increment, int start, int end) { try { - vector< vector > quan; + vector< vector > quan; //percentage of mismatched pairs 1 to 100 quan.resize(100); -//ofstream o; -//string out = "getQuantiles.out"; -//openOutputFile(out, o); - + //for each sequence for(int i = start; i < end; i++){ @@ -431,15 +430,16 @@ vector< vector > DeCalculator::getQuantiles(vector seqs, //cout << i << '\t' << j << '\t' << dist << '\t' << de << endl; dist = ceil(dist); - quanMember newScore(de, i, j); - - quan[dist].push_back(newScore); + //quanMember newScore(de, i, j); + quan[dist].push_back(de); + delete subject; } delete query; } + return quan; @@ -456,27 +456,27 @@ inline bool compareQuanMembers(quanMember left, quanMember right){ } //*************************************************************************************************************** //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right. may want to revisit in the future... -void DeCalculator::removeObviousOutliers(vector< vector >& quantiles, int num) { +void DeCalculator::removeObviousOutliers(vector< vector >& quantiles, int num) { try { for (int i = 0; i < quantiles.size(); i++) { - - //find mean of this quantile score - sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers); - float high = quantiles[i][int(quantiles[i].size() * 0.99)].score; - float low = quantiles[i][int(quantiles[i].size() * 0.01)].score; - - vector temp; + //find mean of this quantile score + sort(quantiles[i].begin(), quantiles[i].end()); - //look at each value in quantiles to see if it is an outlier - for (int j = 0; j < quantiles[i].size(); j++) { - //is this score between 1 and 99% - if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) { - temp.push_back(quantiles[i][j]); + vector temp; + if (quantiles[i].size() != 0) { + float high = quantiles[i][int(quantiles[i].size() * 0.99)]; + float low = quantiles[i][int(quantiles[i].size() * 0.01)]; + + //look at each value in quantiles to see if it is an outlier + for (int j = 0; j < quantiles[i].size(); j++) { + //is this score between 1 and 99% + if ((quantiles[i][j] > low) && (quantiles[i][j] < high)) { + temp.push_back(quantiles[i][j]); + } } } - quantiles[i] = temp; } @@ -599,7 +599,7 @@ cout << largest->second << '\t' << largest->first->score << '\t' << largest->fir } } -//*************************************************************************************************************** +*************************************************************************************************************** //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used... int DeCalculator::findLargestContrib(vector seen) { try{ @@ -624,7 +624,7 @@ int DeCalculator::findLargestContrib(vector seen) { exit(1); } } -//*************************************************************************************************************** +*************************************************************************************************************** void DeCalculator::removeContrib(int bad, vector& quan) { try{ @@ -683,22 +683,23 @@ float DeCalculator::getCoef(vector obs, vector qav) { } //*************************************************************************************************************** //gets closest matches to each end, since chimeras will most likely have different parents on each end -vector DeCalculator::findClosest(Sequence* querySeq, vector db, int& numWanted, vector& indexes) { +vector DeCalculator::findClosest(Sequence querySeq, vector& thisTemplate, vector& thisFilteredTemplate, int numWanted, int minSim) { try { - indexes.clear(); + //indexes.clear(); + + vector seqsMatches; - vector seqsMatches; vector distsLeft; vector distsRight; Dist* distcalculator = new eachGapDist(); - string queryUnAligned = querySeq->getUnaligned(); + string queryUnAligned = querySeq.getUnaligned(); int numBases = int(queryUnAligned.length() * 0.33); string leftQuery = ""; //first 1/3 of the sequence string rightQuery = ""; //last 1/3 of the sequence - string queryAligned = querySeq->getAligned(); + string queryAligned = querySeq.getAligned(); //left side bool foundFirstBase = false; @@ -725,7 +726,7 @@ vector DeCalculator::findClosest(Sequence* querySeq, vector= numBases) { rightSpot = i; break; } //last 1/3 + if (baseCount > numBases + 1) { rightSpot = i; break; } //last 1/3 } //trim end @@ -737,36 +738,37 @@ vector DeCalculator::findClosest(Sequence* querySeq, vectorgetName(), leftQuery); - Sequence queryRight(querySeq->getName(), rightQuery); //cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << '\t' << firstBaseSpot << '\t' << lastBaseSpot << endl; //cout << queryUnAligned.length() << '\t' << queryLeft.getUnaligned().length() << '\t' << queryRight.getUnaligned().length() << endl; - for(int j = 0; j < db.size(); j++){ + for(int j = 0; j < thisFilteredTemplate.size(); j++){ - string dbAligned = db[j]->getAligned(); + string dbAligned = thisFilteredTemplate[j]->getAligned(); string leftDB = dbAligned.substr(firstBaseSpot, (leftSpot-firstBaseSpot+1)); //first 1/3 of the sequence - string rightDB = dbAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //last 1/3 of the sequence - - Sequence dbLeft(db[j]->getName(), leftDB); - Sequence dbRight(db[j]->getName(), rightDB); + string rightDB = dbAligned.substr(rightSpot, (lastBaseSpot-rightSpot+1)); //last 1/3 of the sequence + + Sequence dbLeft(thisFilteredTemplate[j]->getName(), leftDB); + Sequence dbRight(thisFilteredTemplate[j]->getName(), rightDB); distcalculator->calcDist(queryLeft, dbLeft); float distLeft = distcalculator->getDist(); distcalculator->calcDist(queryRight, dbRight); float distRight = distcalculator->getDist(); - + SeqDist subjectLeft; - subjectLeft.seq = db[j]; + subjectLeft.seq = NULL; subjectLeft.dist = distLeft; subjectLeft.index = j; distsLeft.push_back(subjectLeft); SeqDist subjectRight; - subjectRight.seq = db[j]; + subjectRight.seq = NULL; subjectRight.dist = distRight; subjectRight.index = j; @@ -779,6 +781,7 @@ vector DeCalculator::findClosest(Sequence* querySeq, vector seen; @@ -787,51 +790,63 @@ vector DeCalculator::findClosest(Sequence* querySeq, vector dists; float lastRight = distsRight[0].dist; float lastLeft = distsLeft[0].dist; - int lasti = 0; - for (int i = 0; i < distsLeft.size(); i++) { + + float maxDist = 1.0 - (minSim / 100.0); + + for (int i = 0; i < numWanted+1; i++) { + if (m->control_pressed) { return seqsMatches; } + //add left if you havent already - it = seen.find(distsLeft[i].seq->getName()); - if (it == seen.end()) { + it = seen.find(thisTemplate[distsLeft[i].index]->getName()); + if (it == seen.end() && distsLeft[i].dist <= maxDist) { dists.push_back(distsLeft[i]); - seen[distsLeft[i].seq->getName()] = distsLeft[i].seq->getName(); + seen[thisTemplate[distsLeft[i].index]->getName()] = thisTemplate[distsLeft[i].index]->getName(); lastLeft = distsLeft[i].dist; +// cout << "loop-left\t" << db[distsLeft[i].index]->getName() << '\t' << distsLeft[i].dist << endl; } //add right if you havent already - it = seen.find(distsRight[i].seq->getName()); - if (it == seen.end()) { + it = seen.find(thisTemplate[distsRight[i].index]->getName()); + if (it == seen.end() && distsRight[i].dist <= maxDist) { dists.push_back(distsRight[i]); - seen[distsRight[i].seq->getName()] = distsRight[i].seq->getName(); + seen[thisTemplate[distsRight[i].index]->getName()] = thisTemplate[distsRight[i].index]->getName(); lastRight = distsRight[i].dist; +// cout << "loop-right\t" << db[distsRight[i].index]->getName() << '\t' << distsRight[i].dist << endl; } - if (dists.size() > numWanted) { lasti = i; break; } //you have enough results + if (i == numWanted) { break; } + } - //add in dups - lasti++; - int i = lasti; - while (i < distsLeft.size()) { - if (distsLeft[i].dist == lastLeft) { dists.push_back(distsLeft[i]); numWanted++; } - else { break; } - i++; + //are we still above the minimum similarity cutoff + if ((lastLeft >= minSim) || (lastRight >= minSim)) { + //add in ties from left + int i = numWanted; + while (i < distsLeft.size()) { + if (distsLeft[i].dist == lastLeft) { dists.push_back(distsLeft[i]); } + else { break; } + i++; + } + + //add in ties from right + i = numWanted; + while (i < distsRight.size()) { + if (distsRight[i].dist == lastRight) { dists.push_back(distsRight[i]); } + else { break; } + i++; + } } - i = lasti; - while (i < distsRight.size()) { - if (distsRight[i].dist == lastRight) { dists.push_back(distsRight[i]); numWanted++; } - else { break; } - i++; - } + //cout << numWanted << endl; + for (int i = 0; i < dists.size(); i++) { +// cout << db[dists[i].index]->getName() << '\t' << dists[i].dist << endl; - if (numWanted > dists.size()) { m->mothurOut("numwanted is larger than the number of template sequences, adjusting numwanted."); m->mothurOutEndLine(); numWanted = dists.size(); } + if ((thisTemplate[dists[i].index]->getName() != querySeq.getName()) && (((1.0-dists[i].dist)*100) >= minSim)) { + Sequence temp(thisTemplate[dists[i].index]->getName(), thisTemplate[dists[i].index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother. + //cout << querySeq->getName() << '\t' << thisTemplate[dists[i].index]->getName() << '\t' << dists[i].dist << endl; + seqsMatches.push_back(temp); + } -//cout << numWanted << endl; - for (int i = 0; i < numWanted; i++) { -//cout << dists[i].seq->getName() << '\t' << dists[i].dist << endl; - Sequence* temp = new Sequence(dists[i].seq->getName(), dists[i].seq->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother. - seqsMatches.push_back(temp); - indexes.push_back(dists[i].index); } return seqsMatches; @@ -874,17 +889,17 @@ Sequence* DeCalculator::findClosest(Sequence* querySeq, vector db) { } } /***************************************************************************************************************/ -map DeCalculator::trimSeqs(Sequence* query, vector topMatches) { +map DeCalculator::trimSeqs(Sequence& query, vector& topMatches) { try { int frontPos = 0; //should contain first position in all seqs that is not a gap character - int rearPos = query->getAligned().length(); + int rearPos = query.getAligned().length(); //********find first position in topMatches that is a non gap character***********// //find first position all query seqs that is a non gap character for (int i = 0; i < topMatches.size(); i++) { - string aligned = topMatches[i]->getAligned(); + string aligned = topMatches[i].getAligned(); int pos = 0; //find first spot in this seq @@ -900,7 +915,7 @@ map DeCalculator::trimSeqs(Sequence* query, vector topMatch } - string aligned = query->getAligned(); + string aligned = query.getAligned(); int pos = 0; //find first position in query that is a non gap character @@ -918,7 +933,7 @@ map DeCalculator::trimSeqs(Sequence* query, vector topMatch //********find last position in topMatches that is a non gap character***********// for (int i = 0; i < topMatches.size(); i++) { - string aligned = topMatches[i]->getAligned(); + string aligned = topMatches[i].getAligned(); int pos = aligned.length(); //find first spot in this seq @@ -934,7 +949,7 @@ map DeCalculator::trimSeqs(Sequence* query, vector topMatch } - aligned = query->getAligned(); + aligned = query.getAligned(); pos = aligned.length(); //find last position in query that is a non gap character @@ -947,28 +962,34 @@ map DeCalculator::trimSeqs(Sequence* query, vector topMatch //save this spot if it is the farthest if (pos < rearPos) { rearPos = pos; } - - //check to make sure that is not whole seq - if ((rearPos - frontPos - 1) <= 0) { m->mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); m->mothurOutEndLine(); exit(1); } -//cout << query->getName() << " front = " << frontPos << " rear = " << rearPos << endl; - //trim query - string newAligned = query->getAligned(); - newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1)); - query->setAligned(newAligned); - - //trim topMatches - for (int i = 0; i < topMatches.size(); i++) { - newAligned = topMatches[i]->getAligned(); - newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1)); - topMatches[i]->setAligned(newAligned); - } map trimmedPos; - - for (int i = 0; i < newAligned.length(); i++) { - trimmedPos[i] = i+frontPos; + //check to make sure that is not whole seq + if ((rearPos - frontPos - 1) <= 0) { + query.setAligned(""); + //trim topMatches + for (int i = 0; i < topMatches.size(); i++) { + topMatches[i].setAligned(""); + } + + }else { + + //trim query + string newAligned = query.getAligned(); + newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1)); + query.setAligned(newAligned); + + //trim topMatches + for (int i = 0; i < topMatches.size(); i++) { + newAligned = topMatches[i].getAligned(); + newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1)); + topMatches[i].setAligned(newAligned); + } + + for (int i = 0; i < newAligned.length(); i++) { + trimmedPos[i] = i+frontPos; + } } - return trimmedPos; } catch(exception& e) {