]> git.donarmstrong.com Git - mothur.git/blob - decalc.cpp
changed reporting in ccode to only report those who are found to be chimeric by all...
[mothur.git] / decalc.cpp
1 /*
2  *  decalc.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/22/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "decalc.h"
11
12 //***************************************************************************************************************
13 void DeCalculator::setMask(string m) { 
14         try {
15                 seqMask = m; 
16                 int count = 0;
17                 maskMap.clear();
18                 
19                 if (seqMask.length() != 0) {
20                         //whereever there is a base in the mask, save that value is query and subject
21                         for (int i = 0; i < seqMask.length(); i++) {
22                                 if (isalpha(seqMask[i])) {
23                                         h.insert(i);
24                                         maskMap[count] = i;
25                                         count++;
26                                         
27                                 }
28                         }
29                 }else {
30                         for (int i = 0; i < alignLength; i++) {   
31                                 h.insert(i); 
32                                 maskMap[count] = i;
33                                 count++;
34                         }
35                 }
36         }
37         catch(exception& e) {
38                 errorOut(e, "DeCalculator", "setMask");
39                 exit(1);
40         } 
41 }
42 //***************************************************************************************************************
43 void DeCalculator::runMask(Sequence* seq) {
44         try{
45                 
46                 string q = seq->getAligned();
47                 string tempQuery = "";
48                 
49                 //whereever there is a base in the mask, save that value is query and subject
50                 set<int>::iterator setit;
51                 for ( setit=h.begin() ; setit != h.end(); setit++ )  {
52                         tempQuery += q[*setit];
53                 }
54                 
55                 //save masked values
56                 seq->setAligned(tempQuery);
57                 seq->setUnaligned(tempQuery);
58         }
59         catch(exception& e) {
60                 errorOut(e, "DeCalculator", "runMask");
61                 exit(1);
62         }
63 }
64 //***************************************************************************************************************
65 //num is query's spot in querySeqs
66 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
67         try {
68                 
69                 string q = query->getAligned();
70                 string s = subject->getAligned();
71                 
72                 int front = 0;
73                 for (int i = 0; i < q.length(); i++) {
74 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
75                         if (isalpha(q[i]) && isalpha(s[i])) { front = i; break;  }
76                 }
77 //cout << endl << endl;         
78                 int back = 0;           
79                 for (int i = q.length(); i >= 0; i--) {
80 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
81                         if (isalpha(q[i]) && isalpha(s[i])) { back = i; break;  }
82                 }
83                 
84                 trim[front] = back;
85                 
86         }
87         catch(exception& e) {
88                 errorOut(e, "DeCalculator", "trimSeqs");
89                 exit(1);
90         }
91 }
92 //***************************************************************************************************************
93 vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
94         try {
95                 
96                 vector<int> win; 
97                 
98                 int cutoff = back - front;  //back - front 
99                         
100                 //if window is set to default
101                 if (size == 0) {  if (cutoff > 1200) {  size = 300; }
102                                                         else{  size = (cutoff / 4); }  } 
103                 else if (size > (cutoff / 4)) { 
104                                 mothurOut("You have selected to large a window size for sequence " + query->getName() + ".  I will choose an appropriate window size."); mothurOutEndLine();
105                                 size = (cutoff / 4); 
106                 }
107         
108         /*      string seq = query->getAligned().substr(front, cutoff);
109                         
110                 //count bases
111                 int numBases = 0;
112                 for (int l = 0; l < seq.length(); l++) {  if (isalpha(seq[l])) { numBases++; }  }
113 //cout << "num Bases = " << numBases << endl;                   
114                 //save start of seq
115                 win.push_back(front);
116 //cout << front << '\t';                
117                 //move ahead increment bases at a time until all bases are in a window
118                 int countBases = 0;
119                 int totalBases = 0;  //used to eliminate window of blanks at end of sequence
120                         
121                 seq = query->getAligned();
122                 for (int m = front; m < (back - size) ; m++) {
123                                 
124                         //count number of bases you see
125                         if (isalpha(seq[m])) { countBases++;  }
126                                 
127                         //if you have seen enough bases to make a new window
128                         if (countBases >= increment) {
129                                 //total bases is the number of bases in a window already.
130                                 totalBases += countBases;
131 //cout << "total bases = " << totalBases << endl;
132                                 win.push_back(m);  //save spot in alignment
133 //cout << m << '\t';
134                                 countBases = 0;                         //reset bases you've seen in this window
135                         }
136                                 
137                         //no need to continue if all your bases are in a window
138                         if (totalBases == numBases) {   break;  }
139                 }
140         
141
142                 //get last window if needed
143                 if (totalBases < numBases) {   win.push_back(back-size);  }
144 //cout << endl << endl;
145 */      
146                 
147                 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
148                 for (int m = front; m < (back - size) ; m+=increment) {  win.push_back(m);  }
149
150
151         
152                 return win;
153         
154         }
155         catch(exception& e) {
156                 errorOut(e, "DeCalculator", "findWindows");
157                 exit(1);
158         }
159 }
160
161 //***************************************************************************************************************
162 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
163         try {
164                 
165                 vector<float> temp;     
166                 //int gaps = 0;         
167                 for (int m = 0; m < window.size(); m++) {
168                                                 
169                         string seqFrag = query->getAligned().substr(window[m], size);
170                         string seqFragsub = subject->getAligned().substr(window[m], size);
171                                 
172                         int diff = 0;
173                         for (int b = 0; b < seqFrag.length(); b++) {
174                                 //if at least one is a base and they are not equal
175                                 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
176                                 
177                                 //ignore gaps
178                                 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
179                         }
180                
181                         //percentage of mismatched bases
182                         float dist;
183                         
184                         //if the whole fragment is 0 distance = 0
185                         //if ((seqFrag.length()-gaps) == 0)  { dist =  0.0; }
186                
187                         //percentage of mismatched bases
188                         //else {  dist = diff / (float) (seqFrag.length()-gaps) * 100;   } 
189                         
190                         dist = diff / (float) (seqFrag.length()) * 100; 
191                         
192                         temp.push_back(dist);
193                 }
194                         
195                 return temp;
196         }
197         catch(exception& e) {
198                 errorOut(e, "DeCalculator", "calcObserved");
199                 exit(1);
200         }
201 }
202 //***************************************************************************************************************
203 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
204         try {
205                 
206                 //so you only look at the trimmed part of the sequence
207                 int cutoff = back - front;  
208                 int gaps = 0;
209                         
210                 //from first startpoint with length back-front
211                 string seqFrag = query->getAligned().substr(front, cutoff);
212                 string seqFragsub = subject->getAligned().substr(front, cutoff);
213                                                                                                                 
214                 int diff = 0;
215                 for (int b = 0; b < seqFrag.length(); b++) {
216                         //ignore gaps
217                         if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
218                         if (seqFrag[b] != seqFragsub[b]) { diff++; }
219                 }
220                 
221                 //if the whole fragment is 0 distance = 0
222                 if ((seqFrag.length()-gaps) == 0)  { return 0.0; }
223                
224                 //percentage of mismatched bases
225                 float dist = diff / (float) (seqFrag.length()-gaps) * 100;       
226                                 
227                 return dist;
228         }
229         catch(exception& e) {
230                 errorOut(e, "DeCalculator", "calcDist");
231                 exit(1);
232         }
233 }
234
235 //***************************************************************************************************************
236 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
237         try {
238                 
239                 //for each window
240                 vector<float> queryExpected;
241                         
242                 for (int m = 0; m < qav.size(); m++) {          
243                                 
244                         float expected = qav[m] * coef;
245                                 
246                         queryExpected.push_back(expected);      
247                 }
248                         
249                 return queryExpected;
250                                 
251         }
252         catch(exception& e) {
253                 errorOut(e, "DeCalculator", "calcExpected");
254                 exit(1);
255         }
256 }
257 //***************************************************************************************************************
258 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
259         try {
260                 
261                 //for each window
262                 float sum = 0.0;  //sum = sum from 1 to m of (oi-ei)^2
263                 int numZeros = 0;
264                 for (int m = 0; m < obs.size(); m++) {          
265                         
266                         //if (obs[m] != 0.0) {
267                                 sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); 
268                         //}else {  numZeros++;   }
269                         
270                 }
271                         
272                 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
273                         
274                 return de;
275         }
276         catch(exception& e) {
277                 errorOut(e, "DeCalculator", "calcDE");
278                 exit(1);
279         }
280 }
281
282 //***************************************************************************************************************
283
284 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
285         try {
286
287                 vector<float> prob;
288                 string freqfile = getRootName(filename) + "freq";
289                 ofstream outFreq;
290                 
291                 openOutputFile(freqfile, outFreq);
292                 
293                 string length = toString(seqs.size());  //if there are 5000 seqs in the template then set precision to 3
294                 int precision = length.length() - 1;
295                 
296                 //format output
297                 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
298                 
299                 //at each position in the sequence
300                 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
301                         
302                         vector<int> freq;   freq.resize(4,0);
303                         int gaps = 0;
304                         
305                         //find the frequency of each nucleotide
306                         for (int j = 0; j < seqs.size(); j++) {
307                                 
308                                 char value = seqs[j]->getAligned()[i];
309                                 
310                                 if(toupper(value) == 'A')                                                                       {       freq[0]++;      }
311                                 else if(toupper(value) == 'T' || toupper(value) == 'U')         {       freq[1]++;      }
312                                 else if(toupper(value) == 'G')                                                          {       freq[2]++;      }
313                                 else if(toupper(value) == 'C')                                                          {       freq[3]++;      }
314                                 else { gaps++; }
315                         }
316                         
317                         //find base with highest frequency
318                         int highest = 0;
319                         for (int m = 0; m < freq.size(); m++) {   if (freq[m] > highest) {  highest = freq[m];  }               }
320                         
321                         float highFreq = highest / (float) (seqs.size());       
322                         
323                         float Pi;
324                         Pi =  (highFreq - 0.25) / 0.75; 
325                         
326                         //cannot have probability less than 0.
327                         if (Pi < 0) { Pi = 0.0; }
328                         
329                         //saves this for later
330                         outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
331         
332                         if (h.count(i) > 0) {
333                                 prob.push_back(Pi); 
334                         }
335                 }
336                 
337                 outFreq.close();
338                 
339                 return prob;
340                                 
341         }
342         catch(exception& e) {
343                 errorOut(e, "DeCalculator", "calcFreq");
344                 exit(1);
345         }
346 }
347 //***************************************************************************************************************
348 vector<float>  DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
349         try {
350                 vector<float>  averages; 
351                                 
352                 //for each window find average
353                 for (int m = 0; m < window.size(); m++) {
354                                 
355                         float average = 0.0;
356                                 
357                         //while you are in the window for this sequence
358                         int count = 0;
359                         for (int j = window[m]; j < (window[m]+size); j++) {   
360                                 average += probabilityProfile[j];
361                                 count++;
362                         }
363                                 
364                         average = average / count;
365         
366                         //save this windows average
367                         averages.push_back(average);
368                 }
369                                 
370                 return averages;
371         }
372         catch(exception& e) {
373                 errorOut(e, "DeCalculator", "findQav");
374                 exit(1);
375         }
376 }
377 //***************************************************************************************************************
378 //seqs have already been masked
379 vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
380         try {
381                 vector< vector<quanMember> > quan; 
382                 
383                 //percentage of mismatched pairs 1 to 100
384                 quan.resize(100);
385 //ofstream o;
386 //string out = "getQuantiles.out";
387 //openOutputFile(out, o);
388                 
389                 //for each sequence
390                 for(int i = start; i < end; i++){
391                 
392                         mothurOut("Processing sequence " + toString(i)); mothurOutEndLine();
393                         Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
394                         
395                         //compare to every other sequence in template
396                         for(int j = 0; j < i; j++){
397                                 
398                                 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
399                                 
400                                 map<int, int> trim;
401                                 map<int, int>::iterator it;
402                                 
403                                 trimSeqs(query, subject, trim);
404                                 
405                                 it = trim.begin();
406                                 int front = it->first; int back = it->second;
407                                 
408                                 //reset window for each new comparison
409                                 windowSizesTemplate[i] = window;
410                                 
411                                 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
412                                 
413                                 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
414                                 
415                                 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
416                                                                         
417                                 float alpha = getCoef(obsi, q);
418                                                 
419                                 vector<float> exp = calcExpected(q, alpha);
420                                 
421                                 float de = calcDE(obsi, exp);
422                                                                 
423                                 float dist = calcDist(query, subject, front, back); 
424         //o << i << '\t' <<  j << '\t' << dist << '\t' << de << endl;                   
425                                 dist = ceil(dist);
426                                 
427                                 quanMember newScore(de, i, j);
428                                 
429                                 //dist-1 because vector indexes start at 0.
430                                 quan[dist-1].push_back(newScore);
431                                 
432                                 delete subject;
433                         }
434                         
435                         delete query;
436                 }
437
438                 return quan;
439                                                 
440         }
441         catch(exception& e) {
442                 errorOut(e, "DeCalculator", "getQuantiles");
443                 exit(1);
444         }
445 }
446 //********************************************************************************************************************
447 //sorts lowest to highest
448 inline bool compareQuanMembers(quanMember left, quanMember right){
449         return (left.score < right.score);      
450
451 //***************************************************************************************************************
452 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right.  may want to revisit in the future...
453 void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
454         try {
455                                                 
456                 for (int i = 0; i < quantiles.size(); i++) {
457                 
458                         //find mean of this quantile score
459                         sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
460                         
461                         float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
462                         float low =  quantiles[i][int(quantiles[i].size() * 0.01)].score;
463                         
464                         vector<quanMember> temp;
465                         
466                         //look at each value in quantiles to see if it is an outlier
467                         for (int j = 0; j < quantiles[i].size(); j++) {
468                                 //is this score between 1 and 99%
469                                 if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
470                                         temp.push_back(quantiles[i][j]);
471                                 }
472                         }
473                         
474                         quantiles[i] = temp;
475                 }
476
477 /*
478                 //find contributer with most offending score related to it
479                 int largestContrib = findLargestContrib(seen);
480         
481                 //while you still have guys to eliminate
482                 while (contributions.size() > 0) {
483                 
484                         mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); mothurOutEndLine();
485                         
486                         //remove from quantiles all scores that were made using this largestContrib
487                         for (int i = 0; i < quantiles.size(); i++) {
488 //cout << "before remove " << quantiles[i].size() << '\t';
489                                 removeContrib(largestContrib, quantiles[i]);
490 //cout << "after remove " << quantiles[i].size() << endl;
491                         }
492 //cout << count << " largest contrib = " << largestContrib << endl;  count++;
493                         //remove from contributions all scores that were made using this largestContrib
494                         removeContrib(largestContrib, contributions);
495                         
496                         //"erase" largestContrib
497                         seen[largestContrib] = -1;
498                         
499                         //get next largestContrib
500                         largestContrib = findLargestContrib(seen);
501                 }
502 ABOVE IS ATTEMPT #1             
503 **************************************************************************************************
504 BELOW IS ATTEMPT #2             
505                 vector<int> marked = returnObviousOutliers(quantiles, num);
506                 vector<int> copyMarked = marked;
507                 
508                 //find the 99% of marked
509                 sort(copyMarked.begin(), copyMarked.end());
510                 int high = copyMarked[int(copyMarked.size() * 0.99)];
511 cout << "high = " << high << endl;              
512                 
513                 for(int i = 0; i < marked.size(); i++) {
514                         if (marked[i] > high) { 
515                                 mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); mothurOutEndLine();
516                                 for (int i = 0; i < quantiles.size(); i++) {
517                                         removeContrib(marked[i], quantiles[i]);
518                                 }
519                         }
520
521                 }
522                 
523                 
524                 //adjust quantiles
525                 for (int i = 0; i < quantiles.size(); i++) {
526                         vector<float> temp;
527                         
528                         if (quantiles[i].size() == 0) {
529                                 //in case this is not a distance found in your template files
530                                 for (int g = 0; g < 6; g++) {
531                                         temp.push_back(0.0);
532                                 }
533                         }else{
534                                 
535                                 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
536                                 
537                                 //save 10%
538                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
539                                 //save 25%
540                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
541                                 //save 50%
542                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
543                                 //save 75%
544                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
545                                 //save 95%
546                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
547                                 //save 99%
548                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
549                                 
550                         }
551                         
552                         quan[i] = temp;
553                         
554                 }
555 */
556                 
557         }
558         catch(exception& e) {
559                 errorOut(e, "DeCalculator", "removeObviousOutliers");
560                 exit(1);
561         }
562 }
563 //***************************************************************************************************************
564 //put quanMember in the vector based on how far they are from the 99% or 1%.  Biggest offenders in front.
565 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
566         try{
567                 
568                 vector<quanMember> newQuan;
569                 map<quanMember*, float>::iterator it;
570                 
571                 while (quan.size() > 0) {
572                         
573                          map<quanMember*, float>::iterator largest = quan.begin(); 
574                           
575                         //find biggest member
576                         for (it = quan.begin(); it != quan.end(); it++) {
577                                 if (it->second > largest->second) {  largest = it;  }
578                         }
579 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
580                         //save it 
581                         newQuan.push_back(*(largest->first));
582                 
583                         //erase from quan
584                         quan.erase(largest);
585                 }
586                 
587                 return newQuan;
588                 
589         }
590         catch(exception& e) {
591                 errorOut(e, "DeCalculator", "sortContrib");
592                 exit(1);
593         }
594 }
595
596 //***************************************************************************************************************
597 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
598 int DeCalculator::findLargestContrib(vector<int> seen) {
599         try{
600                 
601                 int largest = 0;
602                 
603                 int largestContribs;
604                 
605                 for (int i = 0; i < seen.size(); i++)  {  
606                         
607                         if (seen[i] > largest) {
608                                 largestContribs = i;
609                                 largest = seen[i];
610                         }
611                 }
612                 
613                 return largestContribs;
614                 
615         }
616         catch(exception& e) {
617                 errorOut(e, "DeCalculator", "findLargestContribs");
618                 exit(1);
619         }
620 }
621 //***************************************************************************************************************
622 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
623         try{
624         
625                 vector<quanMember> newQuan;
626                 for (int i = 0; i < quan.size(); i++)  {  
627                         if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {  
628                                 newQuan.push_back(quan[i]);  
629                         }
630                 }
631                 
632                 quan = newQuan;
633                 
634         }
635         catch(exception& e) {
636                 errorOut(e, "DeCalculator", "removeContrib");
637                 exit(1);
638         }
639 }
640 */
641 //***************************************************************************************************************
642 float DeCalculator::findAverage(vector<float> myVector) {
643         try{
644                 
645                 float total = 0.0;
646                 for (int i = 0; i < myVector.size(); i++)  {  total += myVector[i];  }
647                 
648                 float average = total / (float) myVector.size();
649                 
650                 return average;
651                 
652         }
653         catch(exception& e) {
654                 errorOut(e, "DeCalculator", "findAverage");
655                 exit(1);
656         }
657 }
658
659 //***************************************************************************************************************
660 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
661         try {
662         
663                 //find average prob for this seqs windows
664                 float probAverage = findAverage(qav);
665                                 
666                 //find observed average 
667                 float obsAverage = findAverage(obs);
668                         
669                 float coef = obsAverage / probAverage;
670                                                 
671                 return coef;
672         }
673         catch(exception& e) {
674                 errorOut(e, "DeCalculator", "getCoef");
675                 exit(1);
676         }
677 }
678 //***************************************************************************************************************
679
680