5 * Created by Sarah Westcott on 7/22/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
12 //***************************************************************************************************************
13 void DeCalculator::setMask(string m) {
19 if (seqMask.length() != 0) {
20 //whereever there is a base in the mask, save that value is query and subject
21 for (int i = 0; i < seqMask.length(); i++) {
22 if (isalpha(seqMask[i])) {
30 for (int i = 0; i < alignLength; i++) {
38 errorOut(e, "DeCalculator", "setMask");
42 //***************************************************************************************************************
43 void DeCalculator::runMask(Sequence* seq) {
46 string q = seq->getAligned();
47 string tempQuery = "";
49 //whereever there is a base in the mask, save that value is query and subject
50 set<int>::iterator setit;
51 for ( setit=h.begin() ; setit != h.end(); setit++ ) {
52 tempQuery += q[*setit];
56 seq->setAligned(tempQuery);
57 seq->setUnaligned(tempQuery);
60 errorOut(e, "DeCalculator", "runMask");
64 //***************************************************************************************************************
65 //num is query's spot in querySeqs
66 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
69 string q = query->getAligned();
70 string s = subject->getAligned();
73 for (int i = 0; i < q.length(); i++) {
74 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
75 if (isalpha(q[i]) && isalpha(s[i])) { front = i; break; }
77 //cout << endl << endl;
79 for (int i = q.length(); i >= 0; i--) {
80 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
81 if (isalpha(q[i]) && isalpha(s[i])) { back = i; break; }
88 errorOut(e, "DeCalculator", "trimSeqs");
92 //***************************************************************************************************************
93 vector<int> DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
98 int cutoff = back - front; //back - front
100 //if window is set to default
101 if (size == 0) { if (cutoff > 1200) { size = 300; }
102 else{ size = (cutoff / 4); } }
103 else if (size > (cutoff / 4)) {
104 mothurOut("You have selected to large a window size for sequence " + query->getName() + ". I will choose an appropriate window size."); mothurOutEndLine();
108 /* string seq = query->getAligned().substr(front, cutoff);
112 for (int l = 0; l < seq.length(); l++) { if (isalpha(seq[l])) { numBases++; } }
113 //cout << "num Bases = " << numBases << endl;
115 win.push_back(front);
116 //cout << front << '\t';
117 //move ahead increment bases at a time until all bases are in a window
119 int totalBases = 0; //used to eliminate window of blanks at end of sequence
121 seq = query->getAligned();
122 for (int m = front; m < (back - size) ; m++) {
124 //count number of bases you see
125 if (isalpha(seq[m])) { countBases++; }
127 //if you have seen enough bases to make a new window
128 if (countBases >= increment) {
129 //total bases is the number of bases in a window already.
130 totalBases += countBases;
131 //cout << "total bases = " << totalBases << endl;
132 win.push_back(m); //save spot in alignment
134 countBases = 0; //reset bases you've seen in this window
137 //no need to continue if all your bases are in a window
138 if (totalBases == numBases) { break; }
142 //get last window if needed
143 if (totalBases < numBases) { win.push_back(back-size); }
144 //cout << endl << endl;
147 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
148 for (int m = front; m < (back - size) ; m+=increment) { win.push_back(m); }
155 catch(exception& e) {
156 errorOut(e, "DeCalculator", "findWindows");
161 //***************************************************************************************************************
162 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
167 for (int m = 0; m < window.size(); m++) {
169 string seqFrag = query->getAligned().substr(window[m], size);
170 string seqFragsub = subject->getAligned().substr(window[m], size);
173 for (int b = 0; b < seqFrag.length(); b++) {
174 //if at least one is a base and they are not equal
175 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
178 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
181 //percentage of mismatched bases
184 //if the whole fragment is 0 distance = 0
185 //if ((seqFrag.length()-gaps) == 0) { dist = 0.0; }
187 //percentage of mismatched bases
188 //else { dist = diff / (float) (seqFrag.length()-gaps) * 100; }
190 dist = diff / (float) (seqFrag.length()) * 100;
192 temp.push_back(dist);
197 catch(exception& e) {
198 errorOut(e, "DeCalculator", "calcObserved");
202 //***************************************************************************************************************
203 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
206 //so you only look at the trimmed part of the sequence
207 int cutoff = back - front;
210 //from first startpoint with length back-front
211 string seqFrag = query->getAligned().substr(front, cutoff);
212 string seqFragsub = subject->getAligned().substr(front, cutoff);
215 for (int b = 0; b < seqFrag.length(); b++) {
217 if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
218 if (seqFrag[b] != seqFragsub[b]) { diff++; }
221 //if the whole fragment is 0 distance = 0
222 if ((seqFrag.length()-gaps) == 0) { return 0.0; }
224 //percentage of mismatched bases
225 float dist = diff / (float) (seqFrag.length()-gaps) * 100;
229 catch(exception& e) {
230 errorOut(e, "DeCalculator", "calcDist");
235 //***************************************************************************************************************
236 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
240 vector<float> queryExpected;
242 for (int m = 0; m < qav.size(); m++) {
244 float expected = qav[m] * coef;
246 queryExpected.push_back(expected);
249 return queryExpected;
252 catch(exception& e) {
253 errorOut(e, "DeCalculator", "calcExpected");
257 //***************************************************************************************************************
258 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
262 float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2
264 for (int m = 0; m < obs.size(); m++) {
266 //if (obs[m] != 0.0) {
267 sum += ((obs[m] - exp[m]) * (obs[m] - exp[m]));
268 //}else { numZeros++; }
272 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
276 catch(exception& e) {
277 errorOut(e, "DeCalculator", "calcDE");
282 //***************************************************************************************************************
284 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
288 string freqfile = getRootName(filename) + "freq";
291 openOutputFile(freqfile, outFreq);
293 string length = toString(seqs.size()); //if there are 5000 seqs in the template then set precision to 3
294 int precision = length.length() - 1;
297 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
299 //at each position in the sequence
300 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
302 vector<int> freq; freq.resize(4,0);
305 //find the frequency of each nucleotide
306 for (int j = 0; j < seqs.size(); j++) {
308 char value = seqs[j]->getAligned()[i];
310 if(toupper(value) == 'A') { freq[0]++; }
311 else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; }
312 else if(toupper(value) == 'G') { freq[2]++; }
313 else if(toupper(value) == 'C') { freq[3]++; }
317 //find base with highest frequency
319 for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } }
321 float highFreq = highest / (float) (seqs.size());
324 Pi = (highFreq - 0.25) / 0.75;
326 //cannot have probability less than 0.
327 if (Pi < 0) { Pi = 0.0; }
329 //saves this for later
330 outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
332 if (h.count(i) > 0) {
342 catch(exception& e) {
343 errorOut(e, "DeCalculator", "calcFreq");
347 //***************************************************************************************************************
348 vector<float> DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
350 vector<float> averages;
352 //for each window find average
353 for (int m = 0; m < window.size(); m++) {
357 //while you are in the window for this sequence
359 for (int j = window[m]; j < (window[m]+size); j++) {
360 average += probabilityProfile[j];
364 average = average / count;
366 //save this windows average
367 averages.push_back(average);
372 catch(exception& e) {
373 errorOut(e, "DeCalculator", "findQav");
377 //***************************************************************************************************************
378 //seqs have already been masked
379 vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
381 vector< vector<quanMember> > quan;
383 //percentage of mismatched pairs 1 to 100
386 //string out = "getQuantiles.out";
387 //openOutputFile(out, o);
390 for(int i = start; i < end; i++){
392 mothurOut("Processing sequence " + toString(i)); mothurOutEndLine();
393 Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
395 //compare to every other sequence in template
396 for(int j = 0; j < i; j++){
398 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
401 map<int, int>::iterator it;
403 trimSeqs(query, subject, trim);
406 int front = it->first; int back = it->second;
408 //reset window for each new comparison
409 windowSizesTemplate[i] = window;
411 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
413 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
415 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
417 float alpha = getCoef(obsi, q);
419 vector<float> exp = calcExpected(q, alpha);
421 float de = calcDE(obsi, exp);
423 float dist = calcDist(query, subject, front, back);
424 //o << i << '\t' << j << '\t' << dist << '\t' << de << endl;
427 quanMember newScore(de, i, j);
429 //dist-1 because vector indexes start at 0.
430 quan[dist-1].push_back(newScore);
441 catch(exception& e) {
442 errorOut(e, "DeCalculator", "getQuantiles");
446 //********************************************************************************************************************
447 //sorts lowest to highest
448 inline bool compareQuanMembers(quanMember left, quanMember right){
449 return (left.score < right.score);
451 //***************************************************************************************************************
452 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right. may want to revisit in the future...
453 void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
456 for (int i = 0; i < quantiles.size(); i++) {
458 //find mean of this quantile score
459 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
461 float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
462 float low = quantiles[i][int(quantiles[i].size() * 0.01)].score;
464 vector<quanMember> temp;
466 //look at each value in quantiles to see if it is an outlier
467 for (int j = 0; j < quantiles[i].size(); j++) {
468 //is this score between 1 and 99%
469 if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
470 temp.push_back(quantiles[i][j]);
478 //find contributer with most offending score related to it
479 int largestContrib = findLargestContrib(seen);
481 //while you still have guys to eliminate
482 while (contributions.size() > 0) {
484 mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); mothurOutEndLine();
486 //remove from quantiles all scores that were made using this largestContrib
487 for (int i = 0; i < quantiles.size(); i++) {
488 //cout << "before remove " << quantiles[i].size() << '\t';
489 removeContrib(largestContrib, quantiles[i]);
490 //cout << "after remove " << quantiles[i].size() << endl;
492 //cout << count << " largest contrib = " << largestContrib << endl; count++;
493 //remove from contributions all scores that were made using this largestContrib
494 removeContrib(largestContrib, contributions);
496 //"erase" largestContrib
497 seen[largestContrib] = -1;
499 //get next largestContrib
500 largestContrib = findLargestContrib(seen);
503 **************************************************************************************************
505 vector<int> marked = returnObviousOutliers(quantiles, num);
506 vector<int> copyMarked = marked;
508 //find the 99% of marked
509 sort(copyMarked.begin(), copyMarked.end());
510 int high = copyMarked[int(copyMarked.size() * 0.99)];
511 cout << "high = " << high << endl;
513 for(int i = 0; i < marked.size(); i++) {
514 if (marked[i] > high) {
515 mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); mothurOutEndLine();
516 for (int i = 0; i < quantiles.size(); i++) {
517 removeContrib(marked[i], quantiles[i]);
525 for (int i = 0; i < quantiles.size(); i++) {
528 if (quantiles[i].size() == 0) {
529 //in case this is not a distance found in your template files
530 for (int g = 0; g < 6; g++) {
535 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
538 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
540 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
542 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
544 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
546 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
548 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
558 catch(exception& e) {
559 errorOut(e, "DeCalculator", "removeObviousOutliers");
563 //***************************************************************************************************************
564 //put quanMember in the vector based on how far they are from the 99% or 1%. Biggest offenders in front.
565 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
568 vector<quanMember> newQuan;
569 map<quanMember*, float>::iterator it;
571 while (quan.size() > 0) {
573 map<quanMember*, float>::iterator largest = quan.begin();
575 //find biggest member
576 for (it = quan.begin(); it != quan.end(); it++) {
577 if (it->second > largest->second) { largest = it; }
579 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
581 newQuan.push_back(*(largest->first));
590 catch(exception& e) {
591 errorOut(e, "DeCalculator", "sortContrib");
596 //***************************************************************************************************************
597 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
598 int DeCalculator::findLargestContrib(vector<int> seen) {
605 for (int i = 0; i < seen.size(); i++) {
607 if (seen[i] > largest) {
613 return largestContribs;
616 catch(exception& e) {
617 errorOut(e, "DeCalculator", "findLargestContribs");
621 //***************************************************************************************************************
622 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
625 vector<quanMember> newQuan;
626 for (int i = 0; i < quan.size(); i++) {
627 if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {
628 newQuan.push_back(quan[i]);
635 catch(exception& e) {
636 errorOut(e, "DeCalculator", "removeContrib");
641 //***************************************************************************************************************
642 float DeCalculator::findAverage(vector<float> myVector) {
646 for (int i = 0; i < myVector.size(); i++) { total += myVector[i]; }
648 float average = total / (float) myVector.size();
653 catch(exception& e) {
654 errorOut(e, "DeCalculator", "findAverage");
659 //***************************************************************************************************************
660 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
663 //find average prob for this seqs windows
664 float probAverage = findAverage(qav);
666 //find observed average
667 float obsAverage = findAverage(obs);
669 float coef = obsAverage / probAverage;
673 catch(exception& e) {
674 errorOut(e, "DeCalculator", "getCoef");
678 //***************************************************************************************************************