5 * Created by Sarah Westcott on 7/22/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
12 //***************************************************************************************************************
13 void DeCalculator::setMask(string m) {
19 if (seqMask.length() != 0) {
20 //whereever there is a base in the mask, save that value is query and subject
21 for (int i = 0; i < seqMask.length(); i++) {
22 if (isalpha(seqMask[i])) {
30 for (int i = 0; i < alignLength; i++) {
38 errorOut(e, "DeCalculator", "setMask");
42 //***************************************************************************************************************
43 void DeCalculator::runMask(Sequence* seq) {
46 string q = seq->getAligned();
47 string tempQuery = "";
49 //whereever there is a base in the mask, save that value is query and subject
50 set<int>::iterator setit;
51 for ( setit=h.begin() ; setit != h.end(); setit++ ) {
52 tempQuery += q[*setit];
56 seq->setAligned(tempQuery);
57 seq->setUnaligned(tempQuery);
60 errorOut(e, "DeCalculator", "runMask");
64 //***************************************************************************************************************
65 //num is query's spot in querySeqs
66 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
69 string q = query->getAligned();
70 string s = subject->getAligned();
73 for (int i = 0; i < q.length(); i++) {
74 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
75 if (isalpha(q[i]) && isalpha(s[i])) { front = i; break; }
77 //cout << endl << endl;
79 for (int i = q.length(); i >= 0; i--) {
80 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
81 if (isalpha(q[i]) && isalpha(s[i])) { back = i; break; }
88 errorOut(e, "DeCalculator", "trimSeqs");
92 //***************************************************************************************************************
93 vector<int> DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
98 int cutoff = back - front; //back - front
100 //if window is set to default
101 if (size == 0) { if (cutoff > 1200) { size = 300; }
102 else{ size = (cutoff / 4); } }
103 else if (size > (cutoff / 4)) {
104 mothurOut("You have selected to large a window size for sequence " + query->getName() + ". I will choose an appropriate window size."); mothurOutEndLine();
108 /* string seq = query->getAligned().substr(front, cutoff);
112 for (int l = 0; l < seq.length(); l++) { if (isalpha(seq[l])) { numBases++; } }
113 //cout << "num Bases = " << numBases << endl;
115 win.push_back(front);
116 //cout << front << '\t';
117 //move ahead increment bases at a time until all bases are in a window
119 int totalBases = 0; //used to eliminate window of blanks at end of sequence
121 seq = query->getAligned();
122 for (int m = front; m < (back - size) ; m++) {
124 //count number of bases you see
125 if (isalpha(seq[m])) { countBases++; }
127 //if you have seen enough bases to make a new window
128 if (countBases >= increment) {
129 //total bases is the number of bases in a window already.
130 totalBases += countBases;
131 //cout << "total bases = " << totalBases << endl;
132 win.push_back(m); //save spot in alignment
134 countBases = 0; //reset bases you've seen in this window
137 //no need to continue if all your bases are in a window
138 if (totalBases == numBases) { break; }
142 //get last window if needed
143 if (totalBases < numBases) { win.push_back(back-size); }
144 //cout << endl << endl;
147 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
148 for (int m = front; m < (back - size) ; m+=increment) { win.push_back(m); }
155 catch(exception& e) {
156 errorOut(e, "DeCalculator", "findWindows");
161 //***************************************************************************************************************
162 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
167 for (int m = 0; m < window.size(); m++) {
169 string seqFrag = query->getAligned().substr(window[m], size);
170 string seqFragsub = subject->getAligned().substr(window[m], size);
173 for (int b = 0; b < seqFrag.length(); b++) {
174 //if at least one is a base and they are not equal
175 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
178 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
181 //percentage of mismatched bases
184 //if the whole fragment is 0 distance = 0
185 //if ((seqFrag.length()-gaps) == 0) { dist = 0.0; }
187 //percentage of mismatched bases
188 //else { dist = diff / (float) (seqFrag.length()-gaps) * 100; }
190 dist = diff / (float) (seqFrag.length()) * 100;
192 temp.push_back(dist);
197 catch(exception& e) {
198 errorOut(e, "DeCalculator", "calcObserved");
202 //***************************************************************************************************************
203 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
206 //so you only look at the trimmed part of the sequence
207 int cutoff = back - front;
210 //from first startpoint with length back-front
211 string seqFrag = query->getAligned().substr(front, cutoff);
212 string seqFragsub = subject->getAligned().substr(front, cutoff);
215 for (int b = 0; b < seqFrag.length(); b++) {
217 if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
218 if (seqFrag[b] != seqFragsub[b]) { diff++; }
221 //if the whole fragment is 0 distance = 0
222 if ((seqFrag.length()-gaps) == 0) { return 0.0; }
224 //percentage of mismatched bases
225 float dist = diff / (float) (seqFrag.length()-gaps) * 100;
229 catch(exception& e) {
230 errorOut(e, "DeCalculator", "calcDist");
235 //***************************************************************************************************************
236 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
240 vector<float> queryExpected;
242 for (int m = 0; m < qav.size(); m++) {
244 float expected = qav[m] * coef;
246 queryExpected.push_back(expected);
249 return queryExpected;
252 catch(exception& e) {
253 errorOut(e, "DeCalculator", "calcExpected");
257 //***************************************************************************************************************
258 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
262 float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2
264 for (int m = 0; m < obs.size(); m++) {
266 //if (obs[m] != 0.0) {
267 sum += ((obs[m] - exp[m]) * (obs[m] - exp[m]));
268 //}else { numZeros++; }
272 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
276 catch(exception& e) {
277 errorOut(e, "DeCalculator", "calcDE");
282 //***************************************************************************************************************
284 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
288 string freqfile = getRootName(filename) + "freq";
291 openOutputFile(freqfile, outFreq);
293 string length = toString(seqs.size()); //if there are 5000 seqs in the template then set precision to 3
294 int precision = length.length() - 1;
297 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
299 //at each position in the sequence
300 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
302 vector<int> freq; freq.resize(4,0);
305 //find the frequency of each nucleotide
306 for (int j = 0; j < seqs.size(); j++) {
308 char value = seqs[j]->getAligned()[i];
310 if(toupper(value) == 'A') { freq[0]++; }
311 else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; }
312 else if(toupper(value) == 'G') { freq[2]++; }
313 else if(toupper(value) == 'C') { freq[3]++; }
317 //find base with highest frequency
319 for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } }
321 float highFreq = highest / (float) (seqs.size());
324 Pi = (highFreq - 0.25) / 0.75;
326 //cannot have probability less than 0.
327 if (Pi < 0) { Pi = 0.0; }
329 //saves this for later
330 outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
332 if (h.count(i) > 0) {
342 catch(exception& e) {
343 errorOut(e, "DeCalculator", "calcFreq");
347 //***************************************************************************************************************
348 vector<float> DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
350 vector<float> averages;
352 //for each window find average
353 for (int m = 0; m < window.size(); m++) {
357 //while you are in the window for this sequence
359 for (int j = window[m]; j < (window[m]+size); j++) {
360 average += probabilityProfile[j];
364 average = average / count;
366 //save this windows average
367 averages.push_back(average);
372 catch(exception& e) {
373 errorOut(e, "DeCalculator", "findQav");
377 //***************************************************************************************************************
378 //seqs have already been masked
379 vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
381 vector< vector<quanMember> > quan;
383 //percentage of mismatched pairs 1 to 100
386 //string out = "getQuantiles.out";
387 //openOutputFile(out, o);
390 for(int i = start; i < end; i++){
392 mothurOut("Processing sequence " + toString(i)); mothurOutEndLine();
393 Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
395 //compare to every other sequence in template
396 for(int j = 0; j < i; j++){
398 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
401 map<int, int>::iterator it;
403 trimSeqs(query, subject, trim);
406 int front = it->first; int back = it->second;
408 //reset window for each new comparison
409 windowSizesTemplate[i] = window;
411 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
413 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
415 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
417 float alpha = getCoef(obsi, q);
419 vector<float> exp = calcExpected(q, alpha);
421 float de = calcDE(obsi, exp);
423 float dist = calcDist(query, subject, front, back);
424 //o << i << '\t' << j << '\t' << dist << '\t' << de << endl;
427 quanMember newScore(de, i, j);
429 quan[dist].push_back(newScore);
440 catch(exception& e) {
441 errorOut(e, "DeCalculator", "getQuantiles");
445 //********************************************************************************************************************
446 //sorts lowest to highest
447 inline bool compareQuanMembers(quanMember left, quanMember right){
448 return (left.score < right.score);
450 //***************************************************************************************************************
451 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right. may want to revisit in the future...
452 void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
455 for (int i = 0; i < quantiles.size(); i++) {
457 //find mean of this quantile score
458 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
460 float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
461 float low = quantiles[i][int(quantiles[i].size() * 0.01)].score;
463 vector<quanMember> temp;
465 //look at each value in quantiles to see if it is an outlier
466 for (int j = 0; j < quantiles[i].size(); j++) {
467 //is this score between 1 and 99%
468 if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
469 temp.push_back(quantiles[i][j]);
477 //find contributer with most offending score related to it
478 int largestContrib = findLargestContrib(seen);
480 //while you still have guys to eliminate
481 while (contributions.size() > 0) {
483 mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); mothurOutEndLine();
485 //remove from quantiles all scores that were made using this largestContrib
486 for (int i = 0; i < quantiles.size(); i++) {
487 //cout << "before remove " << quantiles[i].size() << '\t';
488 removeContrib(largestContrib, quantiles[i]);
489 //cout << "after remove " << quantiles[i].size() << endl;
491 //cout << count << " largest contrib = " << largestContrib << endl; count++;
492 //remove from contributions all scores that were made using this largestContrib
493 removeContrib(largestContrib, contributions);
495 //"erase" largestContrib
496 seen[largestContrib] = -1;
498 //get next largestContrib
499 largestContrib = findLargestContrib(seen);
502 **************************************************************************************************
504 vector<int> marked = returnObviousOutliers(quantiles, num);
505 vector<int> copyMarked = marked;
507 //find the 99% of marked
508 sort(copyMarked.begin(), copyMarked.end());
509 int high = copyMarked[int(copyMarked.size() * 0.99)];
510 cout << "high = " << high << endl;
512 for(int i = 0; i < marked.size(); i++) {
513 if (marked[i] > high) {
514 mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); mothurOutEndLine();
515 for (int i = 0; i < quantiles.size(); i++) {
516 removeContrib(marked[i], quantiles[i]);
524 for (int i = 0; i < quantiles.size(); i++) {
527 if (quantiles[i].size() == 0) {
528 //in case this is not a distance found in your template files
529 for (int g = 0; g < 6; g++) {
534 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
537 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
539 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
541 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
543 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
545 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
547 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
557 catch(exception& e) {
558 errorOut(e, "DeCalculator", "removeObviousOutliers");
562 //***************************************************************************************************************
563 //put quanMember in the vector based on how far they are from the 99% or 1%. Biggest offenders in front.
564 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
567 vector<quanMember> newQuan;
568 map<quanMember*, float>::iterator it;
570 while (quan.size() > 0) {
572 map<quanMember*, float>::iterator largest = quan.begin();
574 //find biggest member
575 for (it = quan.begin(); it != quan.end(); it++) {
576 if (it->second > largest->second) { largest = it; }
578 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
580 newQuan.push_back(*(largest->first));
589 catch(exception& e) {
590 errorOut(e, "DeCalculator", "sortContrib");
595 //***************************************************************************************************************
596 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
597 int DeCalculator::findLargestContrib(vector<int> seen) {
604 for (int i = 0; i < seen.size(); i++) {
606 if (seen[i] > largest) {
612 return largestContribs;
615 catch(exception& e) {
616 errorOut(e, "DeCalculator", "findLargestContribs");
620 //***************************************************************************************************************
621 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
624 vector<quanMember> newQuan;
625 for (int i = 0; i < quan.size(); i++) {
626 if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {
627 newQuan.push_back(quan[i]);
634 catch(exception& e) {
635 errorOut(e, "DeCalculator", "removeContrib");
640 //***************************************************************************************************************
641 float DeCalculator::findAverage(vector<float> myVector) {
645 for (int i = 0; i < myVector.size(); i++) { total += myVector[i]; }
647 float average = total / (float) myVector.size();
652 catch(exception& e) {
653 errorOut(e, "DeCalculator", "findAverage");
658 //***************************************************************************************************************
659 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
662 //find average prob for this seqs windows
663 float probAverage = findAverage(qav);
665 //find observed average
666 float obsAverage = findAverage(obs);
668 float coef = obsAverage / probAverage;
672 catch(exception& e) {
673 errorOut(e, "DeCalculator", "getCoef");
677 //***************************************************************************************************************