]> git.donarmstrong.com Git - mothur.git/blob - decalc.cpp
checking in chimera files in progress after move to michigan
[mothur.git] / decalc.cpp
1 /*
2  *  decalc.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/22/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "decalc.h"
11
12 //***************************************************************************************************************
13 void DeCalculator::setMask(string m) { 
14         try {
15                 seqMask = m; 
16                 
17                 if (seqMask.length() != 0) {
18                         //whereever there is a base in the mask, save that value is query and subject
19                         for (int i = 0; i < seqMask.length(); i++) {
20                                 if (isalpha(seqMask[i])) {
21                                         h.insert(i);
22                                 }
23                         }
24                 }else {
25                         for (int i = 0; i < alignLength; i++) {   h.insert(i);  }
26                 }
27         }
28         catch(exception& e) {
29                 errorOut(e, "DeCalculator", "setMask");
30                 exit(1);
31         } 
32 }
33 //***************************************************************************************************************
34 void DeCalculator::runMask(Sequence* seq) {
35         try{
36                 
37                 string q = seq->getAligned();
38                 string tempQuery = "";
39                 
40                 //whereever there is a base in the mask, save that value is query and subject
41                 set<int>::iterator setit;
42                 for ( setit=h.begin() ; setit != h.end(); setit++ )  {
43                         tempQuery += q[*setit];
44                 }
45                 
46                 //save masked values
47                 seq->setAligned(tempQuery);
48                 seq->setUnaligned(tempQuery);
49         }
50         catch(exception& e) {
51                 errorOut(e, "DeCalculator", "runMask");
52                 exit(1);
53         }
54 }
55 //***************************************************************************************************************
56 //num is query's spot in querySeqs
57 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
58         try {
59                 
60                 string q = query->getAligned();
61                 string s = subject->getAligned();
62                 
63                 int front = 0;
64                 for (int i = 0; i < q.length(); i++) {
65 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
66                         if (isalpha(q[i]) && isalpha(s[i])) { front = i; break;  }
67                 }
68 //cout << endl << endl;         
69                 int back = 0;           
70                 for (int i = q.length(); i >= 0; i--) {
71 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
72                         if (isalpha(q[i]) && isalpha(s[i])) { back = i; break;  }
73                 }
74                 
75                 trim[front] = back;
76                 
77         }
78         catch(exception& e) {
79                 errorOut(e, "DeCalculator", "trimSeqs");
80                 exit(1);
81         }
82 }
83 //***************************************************************************************************************
84 vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
85         try {
86                 
87                 vector<int> win; 
88                 
89                 int cutoff = back - front;  //back - front 
90                         
91                 //if window is set to default
92                 if (size == 0) {  if (cutoff > 1200) {  size = 300; }
93                                                         else{  size = (cutoff / 4); }  } 
94                 else if (size > (cutoff / 4)) { 
95                                 mothurOut("You have selected to large a window size for sequence " + query->getName() + ".  I will choose an appropriate window size."); mothurOutEndLine();
96                                 size = (cutoff / 4); 
97                 }
98         
99         /*      string seq = query->getAligned().substr(front, cutoff);
100                         
101                 //count bases
102                 int numBases = 0;
103                 for (int l = 0; l < seq.length(); l++) {  if (isalpha(seq[l])) { numBases++; }  }
104 //cout << "num Bases = " << numBases << endl;                   
105                 //save start of seq
106                 win.push_back(front);
107 //cout << front << '\t';                
108                 //move ahead increment bases at a time until all bases are in a window
109                 int countBases = 0;
110                 int totalBases = 0;  //used to eliminate window of blanks at end of sequence
111                         
112                 seq = query->getAligned();
113                 for (int m = front; m < (back - size) ; m++) {
114                                 
115                         //count number of bases you see
116                         if (isalpha(seq[m])) { countBases++;  }
117                                 
118                         //if you have seen enough bases to make a new window
119                         if (countBases >= increment) {
120                                 //total bases is the number of bases in a window already.
121                                 totalBases += countBases;
122 //cout << "total bases = " << totalBases << endl;
123                                 win.push_back(m);  //save spot in alignment
124 //cout << m << '\t';
125                                 countBases = 0;                         //reset bases you've seen in this window
126                         }
127                                 
128                         //no need to continue if all your bases are in a window
129                         if (totalBases == numBases) {   break;  }
130                 }
131         
132
133                 //get last window if needed
134                 if (totalBases < numBases) {   win.push_back(back-size);  }
135 //cout << endl << endl;
136 */      
137                 
138                 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
139                 for (int m = front; m < (back - size) ; m+=increment) {  win.push_back(m);  }
140
141
142         
143                 return win;
144         
145         }
146         catch(exception& e) {
147                 errorOut(e, "DeCalculator", "findWindows");
148                 exit(1);
149         }
150 }
151
152 //***************************************************************************************************************
153 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
154         try {
155                 
156                 vector<float> temp;     
157                 //int gaps = 0;         
158                 for (int m = 0; m < window.size(); m++) {
159                                                 
160                         string seqFrag = query->getAligned().substr(window[m], size);
161                         string seqFragsub = subject->getAligned().substr(window[m], size);
162                                 
163                         int diff = 0;
164                         for (int b = 0; b < seqFrag.length(); b++) {
165                                 //if at least one is a base and they are not equal
166                                 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
167                                 
168                                 //ignore gaps
169                                 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
170                         }
171                
172                         //percentage of mismatched bases
173                         float dist;
174                         
175                         //if the whole fragment is 0 distance = 0
176                         //if ((seqFrag.length()-gaps) == 0)  { dist =  0.0; }
177                
178                         //percentage of mismatched bases
179                         //else {  dist = diff / (float) (seqFrag.length()-gaps) * 100;   } 
180                         
181                         dist = diff / (float) (seqFrag.length()) * 100; 
182                         
183                         temp.push_back(dist);
184                 }
185                         
186                 return temp;
187         }
188         catch(exception& e) {
189                 errorOut(e, "DeCalculator", "calcObserved");
190                 exit(1);
191         }
192 }
193 //***************************************************************************************************************
194 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
195         try {
196                 
197                 //so you only look at the trimmed part of the sequence
198                 int cutoff = back - front;  
199                 int gaps = 0;
200                         
201                 //from first startpoint with length back-front
202                 string seqFrag = query->getAligned().substr(front, cutoff);
203                 string seqFragsub = subject->getAligned().substr(front, cutoff);
204                                                                                                                 
205                 int diff = 0;
206                 for (int b = 0; b < seqFrag.length(); b++) {
207                         //ignore gaps
208                         if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
209                         if (seqFrag[b] != seqFragsub[b]) { diff++; }
210                 }
211                 
212                 //if the whole fragment is 0 distance = 0
213                 if ((seqFrag.length()-gaps) == 0)  { return 0.0; }
214                
215                 //percentage of mismatched bases
216                 float dist = diff / (float) (seqFrag.length()-gaps) * 100;       
217                                 
218                 return dist;
219         }
220         catch(exception& e) {
221                 errorOut(e, "DeCalculator", "calcDist");
222                 exit(1);
223         }
224 }
225
226 //***************************************************************************************************************
227 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
228         try {
229                 
230                 //for each window
231                 vector<float> queryExpected;
232                         
233                 for (int m = 0; m < qav.size(); m++) {          
234                                 
235                         float expected = qav[m] * coef;
236                                 
237                         queryExpected.push_back(expected);      
238                 }
239                         
240                 return queryExpected;
241                                 
242         }
243         catch(exception& e) {
244                 errorOut(e, "DeCalculator", "calcExpected");
245                 exit(1);
246         }
247 }
248 //***************************************************************************************************************
249 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
250         try {
251                 
252                 //for each window
253                 float sum = 0.0;  //sum = sum from 1 to m of (oi-ei)^2
254                 int numZeros = 0;
255                 for (int m = 0; m < obs.size(); m++) {          
256                         
257                         //if (obs[m] != 0.0) {
258                                 sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); 
259                         //}else {  numZeros++;   }
260                         
261                 }
262                         
263                 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
264                         
265                 return de;
266         }
267         catch(exception& e) {
268                 errorOut(e, "DeCalculator", "calcDE");
269                 exit(1);
270         }
271 }
272
273 //***************************************************************************************************************
274
275 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
276         try {
277
278                 vector<float> prob;
279                 string freqfile = getRootName(filename) + "freq";
280                 ofstream outFreq;
281                 
282                 openOutputFile(freqfile, outFreq);
283                 
284                 string length = toString(seqs.size());  //if there are 5000 seqs in the template then set precision to 3
285                 int precision = length.length() - 1;
286                 
287                 //format output
288                 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
289                 
290                 //at each position in the sequence
291                 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
292                         
293                         vector<int> freq;   freq.resize(4,0);
294                         int gaps = 0;
295                         
296                         //find the frequency of each nucleotide
297                         for (int j = 0; j < seqs.size(); j++) {
298                                 
299                                 char value = seqs[j]->getAligned()[i];
300                                 
301                                 if(toupper(value) == 'A')                                                                       {       freq[0]++;      }
302                                 else if(toupper(value) == 'T' || toupper(value) == 'U')         {       freq[1]++;      }
303                                 else if(toupper(value) == 'G')                                                          {       freq[2]++;      }
304                                 else if(toupper(value) == 'C')                                                          {       freq[3]++;      }
305                                 else { gaps++; }
306                         }
307                         
308                         //find base with highest frequency
309                         int highest = 0;
310                         for (int m = 0; m < freq.size(); m++) {   if (freq[m] > highest) {  highest = freq[m];  }               }
311                         
312                         float highFreq = highest / (float) (seqs.size());       
313                         
314                         float Pi;
315                         Pi =  (highFreq - 0.25) / 0.75; 
316                         
317                         //cannot have probability less than 0.
318                         if (Pi < 0) { Pi = 0.0; }
319                         
320                         //saves this for later
321                         outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
322         
323                         if (h.count(i) > 0) {
324                                 prob.push_back(Pi); 
325                         }
326                 }
327                 
328                 outFreq.close();
329                 
330                 return prob;
331                                 
332         }
333         catch(exception& e) {
334                 errorOut(e, "DeCalculator", "calcFreq");
335                 exit(1);
336         }
337 }
338 //***************************************************************************************************************
339 vector<float>  DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
340         try {
341                 vector<float>  averages; 
342                                 
343                 //for each window find average
344                 for (int m = 0; m < window.size(); m++) {
345                                 
346                         float average = 0.0;
347                                 
348                         //while you are in the window for this sequence
349                         int count = 0;
350                         for (int j = window[m]; j < (window[m]+size); j++) {   
351                                 average += probabilityProfile[j];
352                                 count++;
353                         }
354                                 
355                         average = average / count;
356         
357                         //save this windows average
358                         averages.push_back(average);
359                 }
360                                 
361                 return averages;
362         }
363         catch(exception& e) {
364                 errorOut(e, "DeCalculator", "findQav");
365                 exit(1);
366         }
367 }
368 //***************************************************************************************************************
369 //seqs have already been masked
370 vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
371         try {
372                 vector< vector<quanMember> > quan; 
373                 
374                 //percentage of mismatched pairs 1 to 100
375                 quan.resize(100);
376 //ofstream o;
377 //string out = "getQuantiles.out";
378 //openOutputFile(out, o);
379                 
380                 //for each sequence
381                 for(int i = start; i < end; i++){
382                 
383                         mothurOut("Processing sequence " + toString(i)); mothurOutEndLine();
384                         Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
385                         
386                         //compare to every other sequence in template
387                         for(int j = 0; j < i; j++){
388                                 
389                                 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
390                                 
391                                 map<int, int> trim;
392                                 map<int, int>::iterator it;
393                                 
394                                 trimSeqs(query, subject, trim);
395                                 
396                                 it = trim.begin();
397                                 int front = it->first; int back = it->second;
398                                 
399                                 //reset window for each new comparison
400                                 windowSizesTemplate[i] = window;
401                                 
402                                 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
403                                 
404                                 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
405                                 
406                                 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
407                                                                         
408                                 float alpha = getCoef(obsi, q);
409                                                 
410                                 vector<float> exp = calcExpected(q, alpha);
411                                 
412                                 float de = calcDE(obsi, exp);
413                                                                 
414                                 float dist = calcDist(query, subject, front, back); 
415         //o << i << '\t' <<  j << '\t' << dist << '\t' << de << endl;                   
416                                 dist = ceil(dist);
417                                 
418                                 quanMember newScore(de, i, j);
419                                 
420                                 //dist-1 because vector indexes start at 0.
421                                 quan[dist-1].push_back(newScore);
422                                 
423                                 delete subject;
424                         }
425                         
426                         delete query;
427                 }
428
429                 return quan;
430                                                 
431         }
432         catch(exception& e) {
433                 errorOut(e, "DeCalculator", "getQuantiles");
434                 exit(1);
435         }
436 }
437 //********************************************************************************************************************
438 //sorts lowest to highest
439 inline bool compareQuanMembers(quanMember left, quanMember right){
440         return (left.score < right.score);      
441
442 //***************************************************************************************************************
443 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right.  may want to revisit in the future...
444 void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
445         try {
446                                                 
447                 for (int i = 0; i < quantiles.size(); i++) {
448                 
449                         //find mean of this quantile score
450                         sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
451                         
452                         float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
453                         float low =  quantiles[i][int(quantiles[i].size() * 0.01)].score;
454                         
455                         vector<quanMember> temp;
456                         
457                         //look at each value in quantiles to see if it is an outlier
458                         for (int j = 0; j < quantiles[i].size(); j++) {
459                                 //is this score between 1 and 99%
460                                 if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
461                                         temp.push_back(quantiles[i][j]);
462                                 }
463                         }
464                         
465                         quantiles[i] = temp;
466                 }
467
468 /*
469                 //find contributer with most offending score related to it
470                 int largestContrib = findLargestContrib(seen);
471         
472                 //while you still have guys to eliminate
473                 while (contributions.size() > 0) {
474                 
475                         mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); mothurOutEndLine();
476                         
477                         //remove from quantiles all scores that were made using this largestContrib
478                         for (int i = 0; i < quantiles.size(); i++) {
479 //cout << "before remove " << quantiles[i].size() << '\t';
480                                 removeContrib(largestContrib, quantiles[i]);
481 //cout << "after remove " << quantiles[i].size() << endl;
482                         }
483 //cout << count << " largest contrib = " << largestContrib << endl;  count++;
484                         //remove from contributions all scores that were made using this largestContrib
485                         removeContrib(largestContrib, contributions);
486                         
487                         //"erase" largestContrib
488                         seen[largestContrib] = -1;
489                         
490                         //get next largestContrib
491                         largestContrib = findLargestContrib(seen);
492                 }
493 ABOVE IS ATTEMPT #1             
494 **************************************************************************************************
495 BELOW IS ATTEMPT #2             
496                 vector<int> marked = returnObviousOutliers(quantiles, num);
497                 vector<int> copyMarked = marked;
498                 
499                 //find the 99% of marked
500                 sort(copyMarked.begin(), copyMarked.end());
501                 int high = copyMarked[int(copyMarked.size() * 0.99)];
502 cout << "high = " << high << endl;              
503                 
504                 for(int i = 0; i < marked.size(); i++) {
505                         if (marked[i] > high) { 
506                                 mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); mothurOutEndLine();
507                                 for (int i = 0; i < quantiles.size(); i++) {
508                                         removeContrib(marked[i], quantiles[i]);
509                                 }
510                         }
511
512                 }
513                 
514                 
515                 //adjust quantiles
516                 for (int i = 0; i < quantiles.size(); i++) {
517                         vector<float> temp;
518                         
519                         if (quantiles[i].size() == 0) {
520                                 //in case this is not a distance found in your template files
521                                 for (int g = 0; g < 6; g++) {
522                                         temp.push_back(0.0);
523                                 }
524                         }else{
525                                 
526                                 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
527                                 
528                                 //save 10%
529                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
530                                 //save 25%
531                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
532                                 //save 50%
533                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
534                                 //save 75%
535                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
536                                 //save 95%
537                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
538                                 //save 99%
539                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
540                                 
541                         }
542                         
543                         quan[i] = temp;
544                         
545                 }
546 */
547                 
548         }
549         catch(exception& e) {
550                 errorOut(e, "DeCalculator", "removeObviousOutliers");
551                 exit(1);
552         }
553 }
554 //***************************************************************************************************************
555 //put quanMember in the vector based on how far they are from the 99% or 1%.  Biggest offenders in front.
556 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
557         try{
558                 
559                 vector<quanMember> newQuan;
560                 map<quanMember*, float>::iterator it;
561                 
562                 while (quan.size() > 0) {
563                         
564                          map<quanMember*, float>::iterator largest = quan.begin(); 
565                           
566                         //find biggest member
567                         for (it = quan.begin(); it != quan.end(); it++) {
568                                 if (it->second > largest->second) {  largest = it;  }
569                         }
570 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
571                         //save it 
572                         newQuan.push_back(*(largest->first));
573                 
574                         //erase from quan
575                         quan.erase(largest);
576                 }
577                 
578                 return newQuan;
579                 
580         }
581         catch(exception& e) {
582                 errorOut(e, "DeCalculator", "sortContrib");
583                 exit(1);
584         }
585 }
586
587 //***************************************************************************************************************
588 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
589 int DeCalculator::findLargestContrib(vector<int> seen) {
590         try{
591                 
592                 int largest = 0;
593                 
594                 int largestContribs;
595                 
596                 for (int i = 0; i < seen.size(); i++)  {  
597                         
598                         if (seen[i] > largest) {
599                                 largestContribs = i;
600                                 largest = seen[i];
601                         }
602                 }
603                 
604                 return largestContribs;
605                 
606         }
607         catch(exception& e) {
608                 errorOut(e, "DeCalculator", "findLargestContribs");
609                 exit(1);
610         }
611 }
612 //***************************************************************************************************************
613 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
614         try{
615         
616                 vector<quanMember> newQuan;
617                 for (int i = 0; i < quan.size(); i++)  {  
618                         if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {  
619                                 newQuan.push_back(quan[i]);  
620                         }
621                 }
622                 
623                 quan = newQuan;
624                 
625         }
626         catch(exception& e) {
627                 errorOut(e, "DeCalculator", "removeContrib");
628                 exit(1);
629         }
630 }
631 */
632 //***************************************************************************************************************
633 float DeCalculator::findAverage(vector<float> myVector) {
634         try{
635                 
636                 float total = 0.0;
637                 for (int i = 0; i < myVector.size(); i++)  {  total += myVector[i];  }
638                 
639                 float average = total / (float) myVector.size();
640                 
641                 return average;
642                 
643         }
644         catch(exception& e) {
645                 errorOut(e, "DeCalculator", "findAverage");
646                 exit(1);
647         }
648 }
649
650 //***************************************************************************************************************
651 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
652         try {
653         
654                 //find average prob for this seqs windows
655                 float probAverage = findAverage(qav);
656                                 
657                 //find observed average 
658                 float obsAverage = findAverage(obs);
659                         
660                 float coef = obsAverage / probAverage;
661                                                 
662                 return coef;
663         }
664         catch(exception& e) {
665                 errorOut(e, "DeCalculator", "getCoef");
666                 exit(1);
667         }
668 }
669 //***************************************************************************************************************
670
671