5 * Created by Sarah Westcott on 7/22/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
12 //***************************************************************************************************************
13 void DeCalculator::setMask(string m) {
17 if (seqMask.length() != 0) {
18 //whereever there is a base in the mask, save that value is query and subject
19 for (int i = 0; i < seqMask.length(); i++) {
20 if (isalpha(seqMask[i])) {
25 for (int i = 0; i < alignLength; i++) { h.insert(i); }
29 errorOut(e, "DeCalculator", "setMask");
33 //***************************************************************************************************************
34 void DeCalculator::runMask(Sequence* seq) {
37 string q = seq->getAligned();
38 string tempQuery = "";
40 //whereever there is a base in the mask, save that value is query and subject
41 set<int>::iterator setit;
42 for ( setit=h.begin() ; setit != h.end(); setit++ ) {
43 tempQuery += q[*setit];
47 seq->setAligned(tempQuery);
48 seq->setUnaligned(tempQuery);
51 errorOut(e, "DeCalculator", "runMask");
55 //***************************************************************************************************************
56 //num is query's spot in querySeqs
57 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
60 string q = query->getAligned();
61 string s = subject->getAligned();
64 for (int i = 0; i < q.length(); i++) {
65 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
66 if (isalpha(q[i]) && isalpha(s[i])) { front = i; break; }
68 //cout << endl << endl;
70 for (int i = q.length(); i >= 0; i--) {
71 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
72 if (isalpha(q[i]) && isalpha(s[i])) { back = i; break; }
79 errorOut(e, "DeCalculator", "trimSeqs");
83 //***************************************************************************************************************
84 vector<int> DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
89 int cutoff = back - front; //back - front
91 //if window is set to default
92 if (size == 0) { if (cutoff > 1200) { size = 300; }
93 else{ size = (cutoff / 4); } }
94 else if (size > (cutoff / 4)) {
95 mothurOut("You have selected to large a window size for sequence " + query->getName() + ". I will choose an appropriate window size."); mothurOutEndLine();
99 /* string seq = query->getAligned().substr(front, cutoff);
103 for (int l = 0; l < seq.length(); l++) { if (isalpha(seq[l])) { numBases++; } }
104 //cout << "num Bases = " << numBases << endl;
106 win.push_back(front);
107 //cout << front << '\t';
108 //move ahead increment bases at a time until all bases are in a window
110 int totalBases = 0; //used to eliminate window of blanks at end of sequence
112 seq = query->getAligned();
113 for (int m = front; m < (back - size) ; m++) {
115 //count number of bases you see
116 if (isalpha(seq[m])) { countBases++; }
118 //if you have seen enough bases to make a new window
119 if (countBases >= increment) {
120 //total bases is the number of bases in a window already.
121 totalBases += countBases;
122 //cout << "total bases = " << totalBases << endl;
123 win.push_back(m); //save spot in alignment
125 countBases = 0; //reset bases you've seen in this window
128 //no need to continue if all your bases are in a window
129 if (totalBases == numBases) { break; }
133 //get last window if needed
134 if (totalBases < numBases) { win.push_back(back-size); }
135 //cout << endl << endl;
138 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
139 for (int m = front; m < (back - size) ; m+=increment) { win.push_back(m); }
146 catch(exception& e) {
147 errorOut(e, "DeCalculator", "findWindows");
152 //***************************************************************************************************************
153 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
158 for (int m = 0; m < window.size(); m++) {
160 string seqFrag = query->getAligned().substr(window[m], size);
161 string seqFragsub = subject->getAligned().substr(window[m], size);
164 for (int b = 0; b < seqFrag.length(); b++) {
165 //if at least one is a base and they are not equal
166 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
169 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
172 //percentage of mismatched bases
175 //if the whole fragment is 0 distance = 0
176 //if ((seqFrag.length()-gaps) == 0) { dist = 0.0; }
178 //percentage of mismatched bases
179 //else { dist = diff / (float) (seqFrag.length()-gaps) * 100; }
181 dist = diff / (float) (seqFrag.length()) * 100;
183 temp.push_back(dist);
188 catch(exception& e) {
189 errorOut(e, "DeCalculator", "calcObserved");
193 //***************************************************************************************************************
194 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
197 //so you only look at the trimmed part of the sequence
198 int cutoff = back - front;
201 //from first startpoint with length back-front
202 string seqFrag = query->getAligned().substr(front, cutoff);
203 string seqFragsub = subject->getAligned().substr(front, cutoff);
206 for (int b = 0; b < seqFrag.length(); b++) {
208 if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
209 if (seqFrag[b] != seqFragsub[b]) { diff++; }
212 //if the whole fragment is 0 distance = 0
213 if ((seqFrag.length()-gaps) == 0) { return 0.0; }
215 //percentage of mismatched bases
216 float dist = diff / (float) (seqFrag.length()-gaps) * 100;
220 catch(exception& e) {
221 errorOut(e, "DeCalculator", "calcDist");
226 //***************************************************************************************************************
227 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
231 vector<float> queryExpected;
233 for (int m = 0; m < qav.size(); m++) {
235 float expected = qav[m] * coef;
237 queryExpected.push_back(expected);
240 return queryExpected;
243 catch(exception& e) {
244 errorOut(e, "DeCalculator", "calcExpected");
248 //***************************************************************************************************************
249 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
253 float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2
255 for (int m = 0; m < obs.size(); m++) {
257 //if (obs[m] != 0.0) {
258 sum += ((obs[m] - exp[m]) * (obs[m] - exp[m]));
259 //}else { numZeros++; }
263 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
267 catch(exception& e) {
268 errorOut(e, "DeCalculator", "calcDE");
273 //***************************************************************************************************************
275 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
279 string freqfile = getRootName(filename) + "freq";
282 openOutputFile(freqfile, outFreq);
284 string length = toString(seqs.size()); //if there are 5000 seqs in the template then set precision to 3
285 int precision = length.length() - 1;
288 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
290 //at each position in the sequence
291 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
293 vector<int> freq; freq.resize(4,0);
296 //find the frequency of each nucleotide
297 for (int j = 0; j < seqs.size(); j++) {
299 char value = seqs[j]->getAligned()[i];
301 if(toupper(value) == 'A') { freq[0]++; }
302 else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; }
303 else if(toupper(value) == 'G') { freq[2]++; }
304 else if(toupper(value) == 'C') { freq[3]++; }
308 //find base with highest frequency
310 for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } }
312 float highFreq = highest / (float) (seqs.size());
315 Pi = (highFreq - 0.25) / 0.75;
317 //cannot have probability less than 0.
318 if (Pi < 0) { Pi = 0.0; }
320 //saves this for later
321 outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
323 if (h.count(i) > 0) {
333 catch(exception& e) {
334 errorOut(e, "DeCalculator", "calcFreq");
338 //***************************************************************************************************************
339 vector<float> DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
341 vector<float> averages;
343 //for each window find average
344 for (int m = 0; m < window.size(); m++) {
348 //while you are in the window for this sequence
350 for (int j = window[m]; j < (window[m]+size); j++) {
351 average += probabilityProfile[j];
355 average = average / count;
357 //save this windows average
358 averages.push_back(average);
363 catch(exception& e) {
364 errorOut(e, "DeCalculator", "findQav");
368 //***************************************************************************************************************
369 //seqs have already been masked
370 vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
372 vector< vector<quanMember> > quan;
374 //percentage of mismatched pairs 1 to 100
377 //string out = "getQuantiles.out";
378 //openOutputFile(out, o);
381 for(int i = start; i < end; i++){
383 mothurOut("Processing sequence " + toString(i)); mothurOutEndLine();
384 Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
386 //compare to every other sequence in template
387 for(int j = 0; j < i; j++){
389 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
392 map<int, int>::iterator it;
394 trimSeqs(query, subject, trim);
397 int front = it->first; int back = it->second;
399 //reset window for each new comparison
400 windowSizesTemplate[i] = window;
402 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
404 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
406 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
408 float alpha = getCoef(obsi, q);
410 vector<float> exp = calcExpected(q, alpha);
412 float de = calcDE(obsi, exp);
414 float dist = calcDist(query, subject, front, back);
415 //o << i << '\t' << j << '\t' << dist << '\t' << de << endl;
418 quanMember newScore(de, i, j);
420 //dist-1 because vector indexes start at 0.
421 quan[dist-1].push_back(newScore);
432 catch(exception& e) {
433 errorOut(e, "DeCalculator", "getQuantiles");
437 //********************************************************************************************************************
438 //sorts lowest to highest
439 inline bool compareQuanMembers(quanMember left, quanMember right){
440 return (left.score < right.score);
442 //***************************************************************************************************************
443 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right. may want to revisit in the future...
444 void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
447 for (int i = 0; i < quantiles.size(); i++) {
449 //find mean of this quantile score
450 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
452 float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
453 float low = quantiles[i][int(quantiles[i].size() * 0.01)].score;
455 vector<quanMember> temp;
457 //look at each value in quantiles to see if it is an outlier
458 for (int j = 0; j < quantiles[i].size(); j++) {
459 //is this score between 1 and 99%
460 if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
461 temp.push_back(quantiles[i][j]);
469 //find contributer with most offending score related to it
470 int largestContrib = findLargestContrib(seen);
472 //while you still have guys to eliminate
473 while (contributions.size() > 0) {
475 mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); mothurOutEndLine();
477 //remove from quantiles all scores that were made using this largestContrib
478 for (int i = 0; i < quantiles.size(); i++) {
479 //cout << "before remove " << quantiles[i].size() << '\t';
480 removeContrib(largestContrib, quantiles[i]);
481 //cout << "after remove " << quantiles[i].size() << endl;
483 //cout << count << " largest contrib = " << largestContrib << endl; count++;
484 //remove from contributions all scores that were made using this largestContrib
485 removeContrib(largestContrib, contributions);
487 //"erase" largestContrib
488 seen[largestContrib] = -1;
490 //get next largestContrib
491 largestContrib = findLargestContrib(seen);
494 **************************************************************************************************
496 vector<int> marked = returnObviousOutliers(quantiles, num);
497 vector<int> copyMarked = marked;
499 //find the 99% of marked
500 sort(copyMarked.begin(), copyMarked.end());
501 int high = copyMarked[int(copyMarked.size() * 0.99)];
502 cout << "high = " << high << endl;
504 for(int i = 0; i < marked.size(); i++) {
505 if (marked[i] > high) {
506 mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); mothurOutEndLine();
507 for (int i = 0; i < quantiles.size(); i++) {
508 removeContrib(marked[i], quantiles[i]);
516 for (int i = 0; i < quantiles.size(); i++) {
519 if (quantiles[i].size() == 0) {
520 //in case this is not a distance found in your template files
521 for (int g = 0; g < 6; g++) {
526 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
529 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
531 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
533 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
535 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
537 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
539 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
549 catch(exception& e) {
550 errorOut(e, "DeCalculator", "removeObviousOutliers");
554 //***************************************************************************************************************
555 //put quanMember in the vector based on how far they are from the 99% or 1%. Biggest offenders in front.
556 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
559 vector<quanMember> newQuan;
560 map<quanMember*, float>::iterator it;
562 while (quan.size() > 0) {
564 map<quanMember*, float>::iterator largest = quan.begin();
566 //find biggest member
567 for (it = quan.begin(); it != quan.end(); it++) {
568 if (it->second > largest->second) { largest = it; }
570 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
572 newQuan.push_back(*(largest->first));
581 catch(exception& e) {
582 errorOut(e, "DeCalculator", "sortContrib");
587 //***************************************************************************************************************
588 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
589 int DeCalculator::findLargestContrib(vector<int> seen) {
596 for (int i = 0; i < seen.size(); i++) {
598 if (seen[i] > largest) {
604 return largestContribs;
607 catch(exception& e) {
608 errorOut(e, "DeCalculator", "findLargestContribs");
612 //***************************************************************************************************************
613 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
616 vector<quanMember> newQuan;
617 for (int i = 0; i < quan.size(); i++) {
618 if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {
619 newQuan.push_back(quan[i]);
626 catch(exception& e) {
627 errorOut(e, "DeCalculator", "removeContrib");
632 //***************************************************************************************************************
633 float DeCalculator::findAverage(vector<float> myVector) {
637 for (int i = 0; i < myVector.size(); i++) { total += myVector[i]; }
639 float average = total / (float) myVector.size();
644 catch(exception& e) {
645 errorOut(e, "DeCalculator", "findAverage");
650 //***************************************************************************************************************
651 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
654 //find average prob for this seqs windows
655 float probAverage = findAverage(qav);
657 //find observed average
658 float obsAverage = findAverage(obs);
660 float coef = obsAverage / probAverage;
664 catch(exception& e) {
665 errorOut(e, "DeCalculator", "getCoef");
669 //***************************************************************************************************************