X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=decalc.cpp;h=b2e8be61f8df0aff52753058532a3ef42c6b6f0f;hb=4f8291bc0a482c68dc739ef5aa7a311fc74f5f43;hp=d57f43586bceea765b01cfe4ca39b22d4e747c4e;hpb=7597c66c08bac938fedcacc1f760186bc077b3f1;p=mothur.git diff --git a/decalc.cpp b/decalc.cpp index d57f435..b2e8be6 100644 --- a/decalc.cpp +++ b/decalc.cpp @@ -13,14 +13,26 @@ void DeCalculator::setMask(string m) { try { seqMask = m; + int count = 0; + maskMap.clear(); - //whereever there is a base in the mask, save that value is query and subject - for (int i = 0; i < seqMask.length(); i++) { - if (isalpha(seqMask[i])) { - h.insert(i); + if (seqMask.length() != 0) { + //whereever there is a base in the mask, save that value is query and subject + for (int i = 0; i < seqMask.length(); i++) { + if (isalpha(seqMask[i])) { + h.insert(i); + maskMap[count] = i; + count++; + + } + } + }else { + for (int i = 0; i < alignLength; i++) { + h.insert(i); + maskMap[count] = i; + count++; } } - } catch(exception& e) { errorOut(e, "DeCalculator", "setMask"); @@ -51,19 +63,21 @@ void DeCalculator::runMask(Sequence* seq) { } //*************************************************************************************************************** //num is query's spot in querySeqs -void DeCalculator::trimSeqs(Sequence* query, Sequence subject, map& trim) { +void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map& trim) { try { string q = query->getAligned(); - string s = subject.getAligned(); + string s = subject->getAligned(); int front = 0; for (int i = 0; i < q.length(); i++) { +//cout << "query = " << q[i] << " subject = " << s[i] << endl; if (isalpha(q[i]) && isalpha(s[i])) { front = i; break; } } - +//cout << endl << endl; int back = 0; for (int i = q.length(); i >= 0; i--) { +//cout << "query = " << q[i] << " subject = " << s[i] << endl; if (isalpha(q[i]) && isalpha(s[i])) { back = i; break; } } @@ -76,7 +90,6 @@ void DeCalculator::trimSeqs(Sequence* query, Sequence subject, map& tr } } //*************************************************************************************************************** -//find the window breaks for each sequence - this is so you can move ahead by bases. vector DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) { try { @@ -92,15 +105,15 @@ vector DeCalculator::findWindows(Sequence* query, int front, int back, int size = (cutoff / 4); } - string seq = query->getAligned().substr(front, cutoff); + /* string seq = query->getAligned().substr(front, cutoff); //count bases int numBases = 0; for (int l = 0; l < seq.length(); l++) { if (isalpha(seq[l])) { numBases++; } } - +//cout << "num Bases = " << numBases << endl; //save start of seq win.push_back(front); - +//cout << front << '\t'; //move ahead increment bases at a time until all bases are in a window int countBases = 0; int totalBases = 0; //used to eliminate window of blanks at end of sequence @@ -109,18 +122,33 @@ vector DeCalculator::findWindows(Sequence* query, int front, int back, int for (int m = front; m < (back - size) ; m++) { //count number of bases you see - if (isalpha(seq[m])) { countBases++; totalBases++; } + if (isalpha(seq[m])) { countBases++; } //if you have seen enough bases to make a new window if (countBases >= increment) { + //total bases is the number of bases in a window already. + totalBases += countBases; +//cout << "total bases = " << totalBases << endl; win.push_back(m); //save spot in alignment +//cout << m << '\t'; countBases = 0; //reset bases you've seen in this window } //no need to continue if all your bases are in a window if (totalBases == numBases) { break; } } - + + + //get last window if needed + if (totalBases < numBases) { win.push_back(back-size); } +//cout << endl << endl; +*/ + + //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps + for (int m = front; m < (back - size) ; m+=increment) { win.push_back(m); } + + + return win; } @@ -131,26 +159,36 @@ vector DeCalculator::findWindows(Sequence* query, int front, int back, int } //*************************************************************************************************************** -vector DeCalculator::calcObserved(Sequence* query, Sequence subject, vector window, int size) { +vector DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector window, int size) { try { - vector temp; -//cout << "query length = " << query->getAligned().length() << '\t' << " subject length = " << subject.getAligned().length() << endl; + vector temp; + //int gaps = 0; for (int m = 0; m < window.size(); m++) { string seqFrag = query->getAligned().substr(window[m], size); - string seqFragsub = subject.getAligned().substr(window[m], size); - //cout << "start point = " << window[m] << " end point = " << window[m]+size << endl; + string seqFragsub = subject->getAligned().substr(window[m], size); + int diff = 0; for (int b = 0; b < seqFrag.length(); b++) { - - if (seqFrag[b] != seqFragsub[b]) { diff++; } + //if at least one is a base and they are not equal + if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; } + + //ignore gaps + //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; } } //percentage of mismatched bases float dist; - dist = diff / (float) seqFrag.length() * 100; - + + //if the whole fragment is 0 distance = 0 + //if ((seqFrag.length()-gaps) == 0) { dist = 0.0; } + + //percentage of mismatched bases + //else { dist = diff / (float) (seqFrag.length()-gaps) * 100; } + + dist = diff / (float) (seqFrag.length()) * 100; + temp.push_back(dist); } @@ -162,23 +200,29 @@ vector DeCalculator::calcObserved(Sequence* query, Sequence subject, vect } } //*************************************************************************************************************** -float DeCalculator::calcDist(Sequence* query, Sequence subject, int front, int back) { +float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) { try { //so you only look at the trimmed part of the sequence int cutoff = back - front; + int gaps = 0; //from first startpoint with length back-front string seqFrag = query->getAligned().substr(front, cutoff); - string seqFragsub = subject.getAligned().substr(front, cutoff); + string seqFragsub = subject->getAligned().substr(front, cutoff); int diff = 0; for (int b = 0; b < seqFrag.length(); b++) { + //ignore gaps + if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; } if (seqFrag[b] != seqFragsub[b]) { diff++; } } + + //if the whole fragment is 0 distance = 0 + if ((seqFrag.length()-gaps) == 0) { return 0.0; } //percentage of mismatched bases - float dist = diff / (float) seqFrag.length() * 100; + float dist = diff / (float) (seqFrag.length()-gaps) * 100; return dist; } @@ -216,9 +260,16 @@ float DeCalculator::calcDE(vector obs, vector exp) { //for each window float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2 - for (int m = 0; m < obs.size(); m++) { sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); } + int numZeros = 0; + for (int m = 0; m < obs.size(); m++) { + + //if (obs[m] != 0.0) { + sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); + //}else { numZeros++; } + + } - float de = sqrt((sum / (obs.size() - 1))); + float de = sqrt((sum / (obs.size() - 1 - numZeros))); return de; } @@ -234,11 +285,17 @@ vector DeCalculator::calcFreq(vector seqs, string filename) { try { vector prob; - string freqfile = getRootName(filename) + "prob"; + string freqfile = getRootName(filename) + "freq"; ofstream outFreq; openOutputFile(freqfile, outFreq); + string length = toString(seqs.size()); //if there are 5000 seqs in the template then set precision to 3 + int precision = length.length() - 1; + + //format output + outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint); + //at each position in the sequence for (int i = 0; i < seqs[0]->getAligned().length(); i++) { @@ -261,11 +318,8 @@ vector DeCalculator::calcFreq(vector seqs, string filename) { int highest = 0; for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } } - float highFreq; - //subtract gaps to "ignore them" - if ( (seqs.size() - gaps) == 0 ) { highFreq = 1.0; } - else { highFreq = highest / (float) (seqs.size() - gaps); } - + float highFreq = highest / (float) (seqs.size()); + float Pi; Pi = (highFreq - 0.25) / 0.75; @@ -273,9 +327,11 @@ vector DeCalculator::calcFreq(vector seqs, string filename) { if (Pi < 0) { Pi = 0.0; } //saves this for later - outFreq << i+1 << '\t' << Pi << endl; - - prob.push_back(Pi); + outFreq << setprecision(precision) << i << '\t' << highFreq << endl; + + if (h.count(i) > 0) { + prob.push_back(Pi); + } } outFreq.close(); @@ -301,12 +357,8 @@ vector DeCalculator::findQav(vector window, int size, vector //while you are in the window for this sequence int count = 0; for (int j = window[m]; j < (window[m]+size); j++) { - - //is this a spot that is included in the mask - if (h.count(j) > 0) { - average += probabilityProfile[j]; - count++; - } + average += probabilityProfile[j]; + count++; } average = average / count; @@ -322,26 +374,28 @@ vector DeCalculator::findQav(vector window, int size, vector exit(1); } } - //*************************************************************************************************************** -vector< vector > DeCalculator::getQuantiles(vector seqs, vector windowSizesTemplate, int window, vector probProfile, int increment, int start, int end) { +//seqs have already been masked +vector< vector > DeCalculator::getQuantiles(vector seqs, vector windowSizesTemplate, int window, vector probProfile, int increment, int start, int end) { try { - vector< vector > quan; + vector< vector > quan; //percentage of mismatched pairs 1 to 100 quan.resize(100); - +//ofstream o; +//string out = "getQuantiles.out"; +//openOutputFile(out, o); //for each sequence for(int i = start; i < end; i++){ - mothurOut("Processing template sequence " + toString(i)); mothurOutEndLine(); - Sequence* query = seqs[i]; + mothurOut("Processing sequence " + toString(i)); mothurOutEndLine(); + Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned()); //compare to every other sequence in template for(int j = 0; j < i; j++){ - Sequence subject = *(seqs[j]); + Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned()); map trim; map::iterator it; @@ -367,20 +421,237 @@ vector< vector > DeCalculator::getQuantiles(vector seqs, vecto float de = calcDE(obsi, exp); float dist = calcDist(query, subject, front, back); - + //o << i << '\t' << j << '\t' << dist << '\t' << de << endl; dist = ceil(dist); + quanMember newScore(de, i, j); + //dist-1 because vector indexes start at 0. - quan[dist-1].push_back(de); + quan[dist-1].push_back(newScore); + delete subject; } + + delete query; } return quan; } catch(exception& e) { - errorOut(e, "DeCalculator", "findQav"); + errorOut(e, "DeCalculator", "getQuantiles"); + exit(1); + } +} +//******************************************************************************************************************** +//sorts lowest to highest +inline bool compareQuanMembers(quanMember left, quanMember right){ + return (left.score < right.score); +} +//*************************************************************************************************************** +//this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right. may want to revisit in the future... +void DeCalculator::removeObviousOutliers(vector< vector >& quantiles, int num) { + try { + + for (int i = 0; i < quantiles.size(); i++) { + + //find mean of this quantile score + sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers); + + float high = quantiles[i][int(quantiles[i].size() * 0.99)].score; + float low = quantiles[i][int(quantiles[i].size() * 0.01)].score; + + vector temp; + + //look at each value in quantiles to see if it is an outlier + for (int j = 0; j < quantiles[i].size(); j++) { + //is this score between 1 and 99% + if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) { + temp.push_back(quantiles[i][j]); + } + } + + quantiles[i] = temp; + } + +/* + //find contributer with most offending score related to it + int largestContrib = findLargestContrib(seen); + + //while you still have guys to eliminate + while (contributions.size() > 0) { + + mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); mothurOutEndLine(); + + //remove from quantiles all scores that were made using this largestContrib + for (int i = 0; i < quantiles.size(); i++) { +//cout << "before remove " << quantiles[i].size() << '\t'; + removeContrib(largestContrib, quantiles[i]); +//cout << "after remove " << quantiles[i].size() << endl; + } +//cout << count << " largest contrib = " << largestContrib << endl; count++; + //remove from contributions all scores that were made using this largestContrib + removeContrib(largestContrib, contributions); + + //"erase" largestContrib + seen[largestContrib] = -1; + + //get next largestContrib + largestContrib = findLargestContrib(seen); + } +ABOVE IS ATTEMPT #1 +************************************************************************************************** +BELOW IS ATTEMPT #2 + vector marked = returnObviousOutliers(quantiles, num); + vector copyMarked = marked; + + //find the 99% of marked + sort(copyMarked.begin(), copyMarked.end()); + int high = copyMarked[int(copyMarked.size() * 0.99)]; +cout << "high = " << high << endl; + + for(int i = 0; i < marked.size(); i++) { + if (marked[i] > high) { + mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); mothurOutEndLine(); + for (int i = 0; i < quantiles.size(); i++) { + removeContrib(marked[i], quantiles[i]); + } + } + + } + + + //adjust quantiles + for (int i = 0; i < quantiles.size(); i++) { + vector temp; + + if (quantiles[i].size() == 0) { + //in case this is not a distance found in your template files + for (int g = 0; g < 6; g++) { + temp.push_back(0.0); + } + }else{ + + sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers); + + //save 10% + temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score); + //save 25% + temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score); + //save 50% + temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score); + //save 75% + temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score); + //save 95% + temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score); + //save 99% + temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score); + + } + + quan[i] = temp; + + } +*/ + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "removeObviousOutliers"); + exit(1); + } +} +//*************************************************************************************************************** +//put quanMember in the vector based on how far they are from the 99% or 1%. Biggest offenders in front. +/*vector DeCalculator::sortContrib(map quan) { + try{ + + vector newQuan; + map::iterator it; + + while (quan.size() > 0) { + + map::iterator largest = quan.begin(); + + //find biggest member + for (it = quan.begin(); it != quan.end(); it++) { + if (it->second > largest->second) { largest = it; } + } +cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl; + //save it + newQuan.push_back(*(largest->first)); + + //erase from quan + quan.erase(largest); + } + + return newQuan; + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "sortContrib"); + exit(1); + } +} + +//*************************************************************************************************************** +//used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used... +int DeCalculator::findLargestContrib(vector seen) { + try{ + + int largest = 0; + + int largestContribs; + + for (int i = 0; i < seen.size(); i++) { + + if (seen[i] > largest) { + largestContribs = i; + largest = seen[i]; + } + } + + return largestContribs; + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "findLargestContribs"); + exit(1); + } +} +//*************************************************************************************************************** +void DeCalculator::removeContrib(int bad, vector& quan) { + try{ + + vector newQuan; + for (int i = 0; i < quan.size(); i++) { + if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) { + newQuan.push_back(quan[i]); + } + } + + quan = newQuan; + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "removeContrib"); + exit(1); + } +} +*/ +//*************************************************************************************************************** +float DeCalculator::findAverage(vector myVector) { + try{ + + float total = 0.0; + for (int i = 0; i < myVector.size(); i++) { total += myVector[i]; } + + float average = total / (float) myVector.size(); + + return average; + + } + catch(exception& e) { + errorOut(e, "DeCalculator", "findAverage"); exit(1); } } @@ -390,16 +661,11 @@ float DeCalculator::getCoef(vector obs, vector qav) { try { //find average prob for this seqs windows - float probAverage = 0.0; - for (int j = 0; j < qav.size(); j++) { probAverage += qav[j]; } - probAverage = probAverage / (float) qav.size(); - + float probAverage = findAverage(qav); + //find observed average - float obsAverage = 0.0; - for (int j = 0; j < obs.size(); j++) { obsAverage += obs[j]; } - obsAverage = obsAverage / (float) obs.size(); -//cout << "sum ai / m = " << probAverage << endl; -//cout << "sum oi / m = " << obsAverage << endl; + float obsAverage = findAverage(obs); + float coef = obsAverage / probAverage; return coef;