5 * Created by Sarah Westcott on 7/22/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
13 #include "eachgapdist.h"
16 //***************************************************************************************************************
17 void DeCalculator::setMask(string m) {
23 if (seqMask.length() != 0) {
24 //whereever there is a base in the mask, save that value is query and subject
25 for (int i = 0; i < seqMask.length(); i++) {
26 if (isalpha(seqMask[i])) {
34 for (int i = 0; i < alignLength; i++) {
42 errorOut(e, "DeCalculator", "setMask");
46 //***************************************************************************************************************
47 void DeCalculator::runMask(Sequence* seq) {
50 string q = seq->getAligned();
51 string tempQuery = "";
53 //whereever there is a base in the mask, save that value is query and subject
54 set<int>::iterator setit;
55 for ( setit=h.begin() ; setit != h.end(); setit++ ) {
56 tempQuery += q[*setit];
60 seq->setAligned(tempQuery);
61 seq->setUnaligned(tempQuery);
64 errorOut(e, "DeCalculator", "runMask");
68 //***************************************************************************************************************
69 //num is query's spot in querySeqs
70 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
73 string q = query->getAligned();
74 string s = subject->getAligned();
77 for (int i = 0; i < q.length(); i++) {
78 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
79 if (isalpha(q[i]) && isalpha(s[i])) { front = i; break; }
81 //cout << endl << endl;
83 for (int i = q.length(); i >= 0; i--) {
84 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
85 if (isalpha(q[i]) && isalpha(s[i])) { back = i; break; }
92 errorOut(e, "DeCalculator", "trimSeqs");
96 //***************************************************************************************************************
97 vector<int> DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
102 int cutoff = back - front; //back - front
104 //if window is set to default
105 if (size == 0) { if (cutoff > 1200) { size = 300; }
106 else{ size = (cutoff / 4); } }
107 else if (size > (cutoff / 4)) {
108 mothurOut("You have selected to large a window size for sequence " + query->getName() + ". I will choose an appropriate window size."); mothurOutEndLine();
112 /* string seq = query->getAligned().substr(front, cutoff);
116 for (int l = 0; l < seq.length(); l++) { if (isalpha(seq[l])) { numBases++; } }
117 //cout << "num Bases = " << numBases << endl;
119 win.push_back(front);
120 //cout << front << '\t';
121 //move ahead increment bases at a time until all bases are in a window
123 int totalBases = 0; //used to eliminate window of blanks at end of sequence
125 seq = query->getAligned();
126 for (int m = front; m < (back - size) ; m++) {
128 //count number of bases you see
129 if (isalpha(seq[m])) { countBases++; }
131 //if you have seen enough bases to make a new window
132 if (countBases >= increment) {
133 //total bases is the number of bases in a window already.
134 totalBases += countBases;
135 //cout << "total bases = " << totalBases << endl;
136 win.push_back(m); //save spot in alignment
138 countBases = 0; //reset bases you've seen in this window
141 //no need to continue if all your bases are in a window
142 if (totalBases == numBases) { break; }
146 //get last window if needed
147 if (totalBases < numBases) { win.push_back(back-size); }
148 //cout << endl << endl;
151 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
152 for (int m = front; m < (back - size) ; m+=increment) { win.push_back(m); }
159 catch(exception& e) {
160 errorOut(e, "DeCalculator", "findWindows");
165 //***************************************************************************************************************
166 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
171 for (int m = 0; m < window.size(); m++) {
173 string seqFrag = query->getAligned().substr(window[m], size);
174 string seqFragsub = subject->getAligned().substr(window[m], size);
177 for (int b = 0; b < seqFrag.length(); b++) {
178 //if at least one is a base and they are not equal
179 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
182 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
185 //percentage of mismatched bases
188 //if the whole fragment is 0 distance = 0
189 //if ((seqFrag.length()-gaps) == 0) { dist = 0.0; }
191 //percentage of mismatched bases
192 //else { dist = diff / (float) (seqFrag.length()-gaps) * 100; }
194 dist = diff / (float) (seqFrag.length()) * 100;
196 temp.push_back(dist);
201 catch(exception& e) {
202 errorOut(e, "DeCalculator", "calcObserved");
206 //***************************************************************************************************************
207 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
210 //so you only look at the trimmed part of the sequence
211 int cutoff = back - front;
214 //from first startpoint with length back-front
215 string seqFrag = query->getAligned().substr(front, cutoff);
216 string seqFragsub = subject->getAligned().substr(front, cutoff);
219 for (int b = 0; b < seqFrag.length(); b++) {
221 if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
222 if (seqFrag[b] != seqFragsub[b]) { diff++; }
225 //if the whole fragment is 0 distance = 0
226 if ((seqFrag.length()-gaps) == 0) { return 0.0; }
228 //percentage of mismatched bases
229 float dist = diff / (float) (seqFrag.length()-gaps) * 100;
233 catch(exception& e) {
234 errorOut(e, "DeCalculator", "calcDist");
239 //***************************************************************************************************************
240 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
244 vector<float> queryExpected;
246 for (int m = 0; m < qav.size(); m++) {
248 float expected = qav[m] * coef;
250 queryExpected.push_back(expected);
253 return queryExpected;
256 catch(exception& e) {
257 errorOut(e, "DeCalculator", "calcExpected");
261 //***************************************************************************************************************
262 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
266 float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2
268 for (int m = 0; m < obs.size(); m++) {
270 //if (obs[m] != 0.0) {
271 sum += ((obs[m] - exp[m]) * (obs[m] - exp[m]));
272 //}else { numZeros++; }
276 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
280 catch(exception& e) {
281 errorOut(e, "DeCalculator", "calcDE");
286 //***************************************************************************************************************
288 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
292 string freqfile = getRootName(filename) + "freq";
295 openOutputFile(freqfile, outFreq);
297 string length = toString(seqs.size()); //if there are 5000 seqs in the template then set precision to 3
298 int precision = length.length() - 1;
301 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
303 //at each position in the sequence
304 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
306 vector<int> freq; freq.resize(4,0);
309 //find the frequency of each nucleotide
310 for (int j = 0; j < seqs.size(); j++) {
312 char value = seqs[j]->getAligned()[i];
314 if(toupper(value) == 'A') { freq[0]++; }
315 else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; }
316 else if(toupper(value) == 'G') { freq[2]++; }
317 else if(toupper(value) == 'C') { freq[3]++; }
321 //find base with highest frequency
323 for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } }
325 float highFreq = highest / (float) (seqs.size());
328 Pi = (highFreq - 0.25) / 0.75;
330 //cannot have probability less than 0.
331 if (Pi < 0) { Pi = 0.0; }
333 //saves this for later
334 outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
336 if (h.count(i) > 0) {
346 catch(exception& e) {
347 errorOut(e, "DeCalculator", "calcFreq");
351 //***************************************************************************************************************
352 vector<float> DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
354 vector<float> averages;
356 //for each window find average
357 for (int m = 0; m < window.size(); m++) {
361 //while you are in the window for this sequence
363 for (int j = window[m]; j < (window[m]+size); j++) {
364 average += probabilityProfile[j];
368 average = average / count;
370 //save this windows average
371 averages.push_back(average);
376 catch(exception& e) {
377 errorOut(e, "DeCalculator", "findQav");
381 //***************************************************************************************************************
382 //seqs have already been masked
383 vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
385 vector< vector<quanMember> > quan;
387 //percentage of mismatched pairs 1 to 100
390 //string out = "getQuantiles.out";
391 //openOutputFile(out, o);
394 for(int i = start; i < end; i++){
396 mothurOut("Processing sequence " + toString(i)); mothurOutEndLine();
397 Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
399 //compare to every other sequence in template
400 for(int j = 0; j < i; j++){
402 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
405 map<int, int>::iterator it;
407 trimSeqs(query, subject, trim);
410 int front = it->first; int back = it->second;
412 //reset window for each new comparison
413 windowSizesTemplate[i] = window;
415 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
417 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
419 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
421 float alpha = getCoef(obsi, q);
423 vector<float> exp = calcExpected(q, alpha);
425 float de = calcDE(obsi, exp);
427 float dist = calcDist(query, subject, front, back);
428 //o << i << '\t' << j << '\t' << dist << '\t' << de << endl;
431 quanMember newScore(de, i, j);
433 quan[dist].push_back(newScore);
444 catch(exception& e) {
445 errorOut(e, "DeCalculator", "getQuantiles");
449 //********************************************************************************************************************
450 //sorts lowest to highest
451 inline bool compareQuanMembers(quanMember left, quanMember right){
452 return (left.score < right.score);
454 //***************************************************************************************************************
455 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right. may want to revisit in the future...
456 void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
459 for (int i = 0; i < quantiles.size(); i++) {
461 //find mean of this quantile score
462 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
464 float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
465 float low = quantiles[i][int(quantiles[i].size() * 0.01)].score;
467 vector<quanMember> temp;
469 //look at each value in quantiles to see if it is an outlier
470 for (int j = 0; j < quantiles[i].size(); j++) {
471 //is this score between 1 and 99%
472 if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
473 temp.push_back(quantiles[i][j]);
481 //find contributer with most offending score related to it
482 int largestContrib = findLargestContrib(seen);
484 //while you still have guys to eliminate
485 while (contributions.size() > 0) {
487 mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); mothurOutEndLine();
489 //remove from quantiles all scores that were made using this largestContrib
490 for (int i = 0; i < quantiles.size(); i++) {
491 //cout << "before remove " << quantiles[i].size() << '\t';
492 removeContrib(largestContrib, quantiles[i]);
493 //cout << "after remove " << quantiles[i].size() << endl;
495 //cout << count << " largest contrib = " << largestContrib << endl; count++;
496 //remove from contributions all scores that were made using this largestContrib
497 removeContrib(largestContrib, contributions);
499 //"erase" largestContrib
500 seen[largestContrib] = -1;
502 //get next largestContrib
503 largestContrib = findLargestContrib(seen);
506 **************************************************************************************************
508 vector<int> marked = returnObviousOutliers(quantiles, num);
509 vector<int> copyMarked = marked;
511 //find the 99% of marked
512 sort(copyMarked.begin(), copyMarked.end());
513 int high = copyMarked[int(copyMarked.size() * 0.99)];
514 cout << "high = " << high << endl;
516 for(int i = 0; i < marked.size(); i++) {
517 if (marked[i] > high) {
518 mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); mothurOutEndLine();
519 for (int i = 0; i < quantiles.size(); i++) {
520 removeContrib(marked[i], quantiles[i]);
528 for (int i = 0; i < quantiles.size(); i++) {
531 if (quantiles[i].size() == 0) {
532 //in case this is not a distance found in your template files
533 for (int g = 0; g < 6; g++) {
538 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
541 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
543 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
545 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
547 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
549 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
551 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
561 catch(exception& e) {
562 errorOut(e, "DeCalculator", "removeObviousOutliers");
566 //***************************************************************************************************************
567 //put quanMember in the vector based on how far they are from the 99% or 1%. Biggest offenders in front.
568 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
571 vector<quanMember> newQuan;
572 map<quanMember*, float>::iterator it;
574 while (quan.size() > 0) {
576 map<quanMember*, float>::iterator largest = quan.begin();
578 //find biggest member
579 for (it = quan.begin(); it != quan.end(); it++) {
580 if (it->second > largest->second) { largest = it; }
582 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
584 newQuan.push_back(*(largest->first));
593 catch(exception& e) {
594 errorOut(e, "DeCalculator", "sortContrib");
599 //***************************************************************************************************************
600 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
601 int DeCalculator::findLargestContrib(vector<int> seen) {
608 for (int i = 0; i < seen.size(); i++) {
610 if (seen[i] > largest) {
616 return largestContribs;
619 catch(exception& e) {
620 errorOut(e, "DeCalculator", "findLargestContribs");
624 //***************************************************************************************************************
625 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
628 vector<quanMember> newQuan;
629 for (int i = 0; i < quan.size(); i++) {
630 if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {
631 newQuan.push_back(quan[i]);
638 catch(exception& e) {
639 errorOut(e, "DeCalculator", "removeContrib");
644 //***************************************************************************************************************
645 float DeCalculator::findAverage(vector<float> myVector) {
649 for (int i = 0; i < myVector.size(); i++) { total += myVector[i]; }
651 float average = total / (float) myVector.size();
656 catch(exception& e) {
657 errorOut(e, "DeCalculator", "findAverage");
662 //***************************************************************************************************************
663 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
666 //find average prob for this seqs windows
667 float probAverage = findAverage(qav);
669 //find observed average
670 float obsAverage = findAverage(obs);
672 float coef = obsAverage / probAverage;
676 catch(exception& e) {
677 errorOut(e, "DeCalculator", "getCoef");
681 //***************************************************************************************************************
682 vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db, int numWanted) {
685 vector<Sequence*> seqsMatches;
686 vector<SeqDist> dists;
688 Dist* distcalculator = new eachGapDist();
690 Sequence query = *(querySeq);
692 for(int j = 0; j < db.size(); j++){
694 Sequence temp = *(db[j]);
696 distcalculator->calcDist(query, temp);
697 float dist = distcalculator->getDist();
703 dists.push_back(subject);
706 delete distcalculator;
708 sort(dists.begin(), dists.end(), compareSeqDist);
710 for (int i = 0; i < numWanted; i++) {
711 Sequence* temp = new Sequence(dists[i].seq->getName(), dists[i].seq->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
712 seqsMatches.push_back(temp);
717 catch(exception& e) {
718 errorOut(e, "DeCalculator", "findClosest");
722 /***************************************************************************************************************/
723 void DeCalculator::trimSeqs(Sequence* query, vector<Sequence*> topMatches) {
726 int frontPos = 0; //should contain first position in all seqs that is not a gap character
727 int rearPos = query->getAligned().length();
729 //********find first position in topMatches that is a non gap character***********//
730 //find first position all query seqs that is a non gap character
731 for (int i = 0; i < topMatches.size(); i++) {
733 string aligned = topMatches[i]->getAligned();
736 //find first spot in this seq
737 for (int j = 0; j < aligned.length(); j++) {
738 if (isalpha(aligned[j])) {
744 //save this spot if it is the farthest
745 if (pos > frontPos) { frontPos = pos; }
749 string aligned = query->getAligned();
752 //find first position in query that is a non gap character
753 for (int j = 0; j < aligned.length(); j++) {
754 if (isalpha(aligned[j])) {
760 //save this spot if it is the farthest
761 if (pos > frontPos) { frontPos = pos; }
764 //********find last position in topMatches that is a non gap character***********//
765 for (int i = 0; i < topMatches.size(); i++) {
767 string aligned = topMatches[i]->getAligned();
768 int pos = aligned.length();
770 //find first spot in this seq
771 for (int j = aligned.length()-1; j >= 0; j--) {
772 if (isalpha(aligned[j])) {
778 //save this spot if it is the farthest
779 if (pos < rearPos) { rearPos = pos; }
783 aligned = query->getAligned();
784 pos = aligned.length();
786 //find last position in query that is a non gap character
787 for (int j = aligned.length()-1; j >= 0; j--) {
788 if (isalpha(aligned[j])) {
794 //save this spot if it is the farthest
795 if (pos < rearPos) { rearPos = pos; }
797 //check to make sure that is not whole seq
798 if ((rearPos - frontPos - 1) <= 0) { mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); mothurOutEndLine(); exit(1); }
799 //cout << "front = " << frontPos << " rear = " << rearPos << endl;
801 string newAligned = query->getAligned();
802 newAligned = newAligned.substr(frontPos, (rearPos-frontPos));
803 query->setAligned(newAligned);
806 for (int i = 0; i < topMatches.size(); i++) {
807 newAligned = topMatches[i]->getAligned();
808 newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
809 topMatches[i]->setAligned(newAligned);
813 catch(exception& e) {
814 errorOut(e, "DeCalculator", "trimSequences");
819 //***************************************************************************************************************