5 * Created by Sarah Westcott on 7/22/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
12 //***************************************************************************************************************
13 void DeCalculator::setMask(string m) {
19 if (seqMask.length() != 0) {
20 //whereever there is a base in the mask, save that value is query and subject
21 for (int i = 0; i < seqMask.length(); i++) {
22 if (isalpha(seqMask[i])) {
29 for (int i = 0; i < alignLength; i++) { h.insert(i); }
33 errorOut(e, "DeCalculator", "setMask");
37 //***************************************************************************************************************
38 void DeCalculator::runMask(Sequence* seq) {
41 string q = seq->getAligned();
42 string tempQuery = "";
44 //whereever there is a base in the mask, save that value is query and subject
45 set<int>::iterator setit;
46 for ( setit=h.begin() ; setit != h.end(); setit++ ) {
47 tempQuery += q[*setit];
51 seq->setAligned(tempQuery);
52 seq->setUnaligned(tempQuery);
55 errorOut(e, "DeCalculator", "runMask");
59 //***************************************************************************************************************
60 //num is query's spot in querySeqs
61 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
64 string q = query->getAligned();
65 string s = subject->getAligned();
68 for (int i = 0; i < q.length(); i++) {
69 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
70 if (isalpha(q[i]) && isalpha(s[i])) { front = i; break; }
72 //cout << endl << endl;
74 for (int i = q.length(); i >= 0; i--) {
75 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
76 if (isalpha(q[i]) && isalpha(s[i])) { back = i; break; }
83 errorOut(e, "DeCalculator", "trimSeqs");
87 //***************************************************************************************************************
88 vector<int> DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
93 int cutoff = back - front; //back - front
95 //if window is set to default
96 if (size == 0) { if (cutoff > 1200) { size = 300; }
97 else{ size = (cutoff / 4); } }
98 else if (size > (cutoff / 4)) {
99 mothurOut("You have selected to large a window size for sequence " + query->getName() + ". I will choose an appropriate window size."); mothurOutEndLine();
103 /* string seq = query->getAligned().substr(front, cutoff);
107 for (int l = 0; l < seq.length(); l++) { if (isalpha(seq[l])) { numBases++; } }
108 //cout << "num Bases = " << numBases << endl;
110 win.push_back(front);
111 //cout << front << '\t';
112 //move ahead increment bases at a time until all bases are in a window
114 int totalBases = 0; //used to eliminate window of blanks at end of sequence
116 seq = query->getAligned();
117 for (int m = front; m < (back - size) ; m++) {
119 //count number of bases you see
120 if (isalpha(seq[m])) { countBases++; }
122 //if you have seen enough bases to make a new window
123 if (countBases >= increment) {
124 //total bases is the number of bases in a window already.
125 totalBases += countBases;
126 //cout << "total bases = " << totalBases << endl;
127 win.push_back(m); //save spot in alignment
129 countBases = 0; //reset bases you've seen in this window
132 //no need to continue if all your bases are in a window
133 if (totalBases == numBases) { break; }
137 //get last window if needed
138 if (totalBases < numBases) { win.push_back(back-size); }
139 //cout << endl << endl;
142 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
143 for (int m = front; m < (back - size) ; m+=increment) { win.push_back(m); }
150 catch(exception& e) {
151 errorOut(e, "DeCalculator", "findWindows");
156 //***************************************************************************************************************
157 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
162 for (int m = 0; m < window.size(); m++) {
164 string seqFrag = query->getAligned().substr(window[m], size);
165 string seqFragsub = subject->getAligned().substr(window[m], size);
168 for (int b = 0; b < seqFrag.length(); b++) {
169 //if at least one is a base and they are not equal
170 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
173 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
176 //percentage of mismatched bases
179 //if the whole fragment is 0 distance = 0
180 //if ((seqFrag.length()-gaps) == 0) { dist = 0.0; }
182 //percentage of mismatched bases
183 //else { dist = diff / (float) (seqFrag.length()-gaps) * 100; }
185 dist = diff / (float) (seqFrag.length()) * 100;
187 temp.push_back(dist);
192 catch(exception& e) {
193 errorOut(e, "DeCalculator", "calcObserved");
197 //***************************************************************************************************************
198 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
201 //so you only look at the trimmed part of the sequence
202 int cutoff = back - front;
205 //from first startpoint with length back-front
206 string seqFrag = query->getAligned().substr(front, cutoff);
207 string seqFragsub = subject->getAligned().substr(front, cutoff);
210 for (int b = 0; b < seqFrag.length(); b++) {
212 if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
213 if (seqFrag[b] != seqFragsub[b]) { diff++; }
216 //if the whole fragment is 0 distance = 0
217 if ((seqFrag.length()-gaps) == 0) { return 0.0; }
219 //percentage of mismatched bases
220 float dist = diff / (float) (seqFrag.length()-gaps) * 100;
224 catch(exception& e) {
225 errorOut(e, "DeCalculator", "calcDist");
230 //***************************************************************************************************************
231 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
235 vector<float> queryExpected;
237 for (int m = 0; m < qav.size(); m++) {
239 float expected = qav[m] * coef;
241 queryExpected.push_back(expected);
244 return queryExpected;
247 catch(exception& e) {
248 errorOut(e, "DeCalculator", "calcExpected");
252 //***************************************************************************************************************
253 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
257 float sum = 0.0; //sum = sum from 1 to m of (oi-ei)^2
259 for (int m = 0; m < obs.size(); m++) {
261 //if (obs[m] != 0.0) {
262 sum += ((obs[m] - exp[m]) * (obs[m] - exp[m]));
263 //}else { numZeros++; }
267 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
271 catch(exception& e) {
272 errorOut(e, "DeCalculator", "calcDE");
277 //***************************************************************************************************************
279 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
283 string freqfile = getRootName(filename) + "freq";
286 openOutputFile(freqfile, outFreq);
288 string length = toString(seqs.size()); //if there are 5000 seqs in the template then set precision to 3
289 int precision = length.length() - 1;
292 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
294 //at each position in the sequence
295 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
297 vector<int> freq; freq.resize(4,0);
300 //find the frequency of each nucleotide
301 for (int j = 0; j < seqs.size(); j++) {
303 char value = seqs[j]->getAligned()[i];
305 if(toupper(value) == 'A') { freq[0]++; }
306 else if(toupper(value) == 'T' || toupper(value) == 'U') { freq[1]++; }
307 else if(toupper(value) == 'G') { freq[2]++; }
308 else if(toupper(value) == 'C') { freq[3]++; }
312 //find base with highest frequency
314 for (int m = 0; m < freq.size(); m++) { if (freq[m] > highest) { highest = freq[m]; } }
316 float highFreq = highest / (float) (seqs.size());
319 Pi = (highFreq - 0.25) / 0.75;
321 //cannot have probability less than 0.
322 if (Pi < 0) { Pi = 0.0; }
324 //saves this for later
325 outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
327 if (h.count(i) > 0) {
337 catch(exception& e) {
338 errorOut(e, "DeCalculator", "calcFreq");
342 //***************************************************************************************************************
343 vector<float> DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
345 vector<float> averages;
347 //for each window find average
348 for (int m = 0; m < window.size(); m++) {
352 //while you are in the window for this sequence
354 for (int j = window[m]; j < (window[m]+size); j++) {
355 average += probabilityProfile[j];
359 average = average / count;
361 //save this windows average
362 averages.push_back(average);
367 catch(exception& e) {
368 errorOut(e, "DeCalculator", "findQav");
372 //***************************************************************************************************************
373 //seqs have already been masked
374 vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
376 vector< vector<quanMember> > quan;
378 //percentage of mismatched pairs 1 to 100
381 //string out = "getQuantiles.out";
382 //openOutputFile(out, o);
385 for(int i = start; i < end; i++){
387 mothurOut("Processing sequence " + toString(i)); mothurOutEndLine();
388 Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
390 //compare to every other sequence in template
391 for(int j = 0; j < i; j++){
393 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
396 map<int, int>::iterator it;
398 trimSeqs(query, subject, trim);
401 int front = it->first; int back = it->second;
403 //reset window for each new comparison
404 windowSizesTemplate[i] = window;
406 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
408 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
410 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
412 float alpha = getCoef(obsi, q);
414 vector<float> exp = calcExpected(q, alpha);
416 float de = calcDE(obsi, exp);
418 float dist = calcDist(query, subject, front, back);
419 //o << i << '\t' << j << '\t' << dist << '\t' << de << endl;
422 quanMember newScore(de, i, j);
424 //dist-1 because vector indexes start at 0.
425 quan[dist-1].push_back(newScore);
436 catch(exception& e) {
437 errorOut(e, "DeCalculator", "getQuantiles");
441 //********************************************************************************************************************
442 //sorts lowest to highest
443 inline bool compareQuanMembers(quanMember left, quanMember right){
444 return (left.score < right.score);
446 //***************************************************************************************************************
447 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right. may want to revisit in the future...
448 void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
451 for (int i = 0; i < quantiles.size(); i++) {
453 //find mean of this quantile score
454 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
456 float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
457 float low = quantiles[i][int(quantiles[i].size() * 0.01)].score;
459 vector<quanMember> temp;
461 //look at each value in quantiles to see if it is an outlier
462 for (int j = 0; j < quantiles[i].size(); j++) {
463 //is this score between 1 and 99%
464 if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
465 temp.push_back(quantiles[i][j]);
473 //find contributer with most offending score related to it
474 int largestContrib = findLargestContrib(seen);
476 //while you still have guys to eliminate
477 while (contributions.size() > 0) {
479 mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); mothurOutEndLine();
481 //remove from quantiles all scores that were made using this largestContrib
482 for (int i = 0; i < quantiles.size(); i++) {
483 //cout << "before remove " << quantiles[i].size() << '\t';
484 removeContrib(largestContrib, quantiles[i]);
485 //cout << "after remove " << quantiles[i].size() << endl;
487 //cout << count << " largest contrib = " << largestContrib << endl; count++;
488 //remove from contributions all scores that were made using this largestContrib
489 removeContrib(largestContrib, contributions);
491 //"erase" largestContrib
492 seen[largestContrib] = -1;
494 //get next largestContrib
495 largestContrib = findLargestContrib(seen);
498 **************************************************************************************************
500 vector<int> marked = returnObviousOutliers(quantiles, num);
501 vector<int> copyMarked = marked;
503 //find the 99% of marked
504 sort(copyMarked.begin(), copyMarked.end());
505 int high = copyMarked[int(copyMarked.size() * 0.99)];
506 cout << "high = " << high << endl;
508 for(int i = 0; i < marked.size(); i++) {
509 if (marked[i] > high) {
510 mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); mothurOutEndLine();
511 for (int i = 0; i < quantiles.size(); i++) {
512 removeContrib(marked[i], quantiles[i]);
520 for (int i = 0; i < quantiles.size(); i++) {
523 if (quantiles[i].size() == 0) {
524 //in case this is not a distance found in your template files
525 for (int g = 0; g < 6; g++) {
530 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
533 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
535 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
537 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
539 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
541 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
543 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
553 catch(exception& e) {
554 errorOut(e, "DeCalculator", "removeObviousOutliers");
558 //***************************************************************************************************************
559 //put quanMember in the vector based on how far they are from the 99% or 1%. Biggest offenders in front.
560 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
563 vector<quanMember> newQuan;
564 map<quanMember*, float>::iterator it;
566 while (quan.size() > 0) {
568 map<quanMember*, float>::iterator largest = quan.begin();
570 //find biggest member
571 for (it = quan.begin(); it != quan.end(); it++) {
572 if (it->second > largest->second) { largest = it; }
574 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
576 newQuan.push_back(*(largest->first));
585 catch(exception& e) {
586 errorOut(e, "DeCalculator", "sortContrib");
591 //***************************************************************************************************************
592 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
593 int DeCalculator::findLargestContrib(vector<int> seen) {
600 for (int i = 0; i < seen.size(); i++) {
602 if (seen[i] > largest) {
608 return largestContribs;
611 catch(exception& e) {
612 errorOut(e, "DeCalculator", "findLargestContribs");
616 //***************************************************************************************************************
617 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
620 vector<quanMember> newQuan;
621 for (int i = 0; i < quan.size(); i++) {
622 if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {
623 newQuan.push_back(quan[i]);
630 catch(exception& e) {
631 errorOut(e, "DeCalculator", "removeContrib");
636 //***************************************************************************************************************
637 float DeCalculator::findAverage(vector<float> myVector) {
641 for (int i = 0; i < myVector.size(); i++) { total += myVector[i]; }
643 float average = total / (float) myVector.size();
648 catch(exception& e) {
649 errorOut(e, "DeCalculator", "findAverage");
654 //***************************************************************************************************************
655 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
658 //find average prob for this seqs windows
659 float probAverage = findAverage(qav);
661 //find observed average
662 float obsAverage = findAverage(obs);
664 float coef = obsAverage / probAverage;
668 catch(exception& e) {
669 errorOut(e, "DeCalculator", "getCoef");
673 //***************************************************************************************************************