]> git.donarmstrong.com Git - mothur.git/blob - decalc.cpp
finished with ccode, returned bellerophon to last save before move, cleaned up pintai...
[mothur.git] / decalc.cpp
1 /*
2  *  decalc.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/22/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "decalc.h"
11
12 //***************************************************************************************************************
13 void DeCalculator::setMask(string m) { 
14         try {
15                 seqMask = m; 
16                 int count = 0;
17                 maskMap.clear();
18                 
19                 if (seqMask.length() != 0) {
20                         //whereever there is a base in the mask, save that value is query and subject
21                         for (int i = 0; i < seqMask.length(); i++) {
22                                 if (isalpha(seqMask[i])) {
23                                         h.insert(i);
24                                         maskMap[count] = i;
25                                         count++;
26                                         
27                                 }
28                         }
29                 }else {
30                         for (int i = 0; i < alignLength; i++) {   
31                                 h.insert(i); 
32                                 maskMap[count] = i;
33                                 count++;
34                         }
35                 }
36         }
37         catch(exception& e) {
38                 errorOut(e, "DeCalculator", "setMask");
39                 exit(1);
40         } 
41 }
42 //***************************************************************************************************************
43 void DeCalculator::runMask(Sequence* seq) {
44         try{
45                 
46                 string q = seq->getAligned();
47                 string tempQuery = "";
48                 
49                 //whereever there is a base in the mask, save that value is query and subject
50                 set<int>::iterator setit;
51                 for ( setit=h.begin() ; setit != h.end(); setit++ )  {
52                         tempQuery += q[*setit];
53                 }
54                 
55                 //save masked values
56                 seq->setAligned(tempQuery);
57                 seq->setUnaligned(tempQuery);
58         }
59         catch(exception& e) {
60                 errorOut(e, "DeCalculator", "runMask");
61                 exit(1);
62         }
63 }
64 //***************************************************************************************************************
65 //num is query's spot in querySeqs
66 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
67         try {
68                 
69                 string q = query->getAligned();
70                 string s = subject->getAligned();
71                 
72                 int front = 0;
73                 for (int i = 0; i < q.length(); i++) {
74 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
75                         if (isalpha(q[i]) && isalpha(s[i])) { front = i; break;  }
76                 }
77 //cout << endl << endl;         
78                 int back = 0;           
79                 for (int i = q.length(); i >= 0; i--) {
80 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
81                         if (isalpha(q[i]) && isalpha(s[i])) { back = i; break;  }
82                 }
83                 
84                 trim[front] = back;
85                 
86         }
87         catch(exception& e) {
88                 errorOut(e, "DeCalculator", "trimSeqs");
89                 exit(1);
90         }
91 }
92 //***************************************************************************************************************
93 vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
94         try {
95                 
96                 vector<int> win; 
97                 
98                 int cutoff = back - front;  //back - front 
99                         
100                 //if window is set to default
101                 if (size == 0) {  if (cutoff > 1200) {  size = 300; }
102                                                         else{  size = (cutoff / 4); }  } 
103                 else if (size > (cutoff / 4)) { 
104                                 mothurOut("You have selected to large a window size for sequence " + query->getName() + ".  I will choose an appropriate window size."); mothurOutEndLine();
105                                 size = (cutoff / 4); 
106                 }
107         
108         /*      string seq = query->getAligned().substr(front, cutoff);
109                         
110                 //count bases
111                 int numBases = 0;
112                 for (int l = 0; l < seq.length(); l++) {  if (isalpha(seq[l])) { numBases++; }  }
113 //cout << "num Bases = " << numBases << endl;                   
114                 //save start of seq
115                 win.push_back(front);
116 //cout << front << '\t';                
117                 //move ahead increment bases at a time until all bases are in a window
118                 int countBases = 0;
119                 int totalBases = 0;  //used to eliminate window of blanks at end of sequence
120                         
121                 seq = query->getAligned();
122                 for (int m = front; m < (back - size) ; m++) {
123                                 
124                         //count number of bases you see
125                         if (isalpha(seq[m])) { countBases++;  }
126                                 
127                         //if you have seen enough bases to make a new window
128                         if (countBases >= increment) {
129                                 //total bases is the number of bases in a window already.
130                                 totalBases += countBases;
131 //cout << "total bases = " << totalBases << endl;
132                                 win.push_back(m);  //save spot in alignment
133 //cout << m << '\t';
134                                 countBases = 0;                         //reset bases you've seen in this window
135                         }
136                                 
137                         //no need to continue if all your bases are in a window
138                         if (totalBases == numBases) {   break;  }
139                 }
140         
141
142                 //get last window if needed
143                 if (totalBases < numBases) {   win.push_back(back-size);  }
144 //cout << endl << endl;
145 */      
146                 
147                 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
148                 for (int m = front; m < (back - size) ; m+=increment) {  win.push_back(m);  }
149
150
151         
152                 return win;
153         
154         }
155         catch(exception& e) {
156                 errorOut(e, "DeCalculator", "findWindows");
157                 exit(1);
158         }
159 }
160
161 //***************************************************************************************************************
162 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
163         try {
164                 
165                 vector<float> temp;     
166                 //int gaps = 0;         
167                 for (int m = 0; m < window.size(); m++) {
168                                                 
169                         string seqFrag = query->getAligned().substr(window[m], size);
170                         string seqFragsub = subject->getAligned().substr(window[m], size);
171                                 
172                         int diff = 0;
173                         for (int b = 0; b < seqFrag.length(); b++) {
174                                 //if at least one is a base and they are not equal
175                                 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
176                                 
177                                 //ignore gaps
178                                 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
179                         }
180                
181                         //percentage of mismatched bases
182                         float dist;
183                         
184                         //if the whole fragment is 0 distance = 0
185                         //if ((seqFrag.length()-gaps) == 0)  { dist =  0.0; }
186                
187                         //percentage of mismatched bases
188                         //else {  dist = diff / (float) (seqFrag.length()-gaps) * 100;   } 
189                         
190                         dist = diff / (float) (seqFrag.length()) * 100; 
191                         
192                         temp.push_back(dist);
193                 }
194                         
195                 return temp;
196         }
197         catch(exception& e) {
198                 errorOut(e, "DeCalculator", "calcObserved");
199                 exit(1);
200         }
201 }
202 //***************************************************************************************************************
203 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
204         try {
205                 
206                 //so you only look at the trimmed part of the sequence
207                 int cutoff = back - front;  
208                 int gaps = 0;
209                         
210                 //from first startpoint with length back-front
211                 string seqFrag = query->getAligned().substr(front, cutoff);
212                 string seqFragsub = subject->getAligned().substr(front, cutoff);
213                                                                                                                 
214                 int diff = 0;
215                 for (int b = 0; b < seqFrag.length(); b++) {
216                         //ignore gaps
217                         if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
218                         if (seqFrag[b] != seqFragsub[b]) { diff++; }
219                 }
220                 
221                 //if the whole fragment is 0 distance = 0
222                 if ((seqFrag.length()-gaps) == 0)  { return 0.0; }
223                
224                 //percentage of mismatched bases
225                 float dist = diff / (float) (seqFrag.length()-gaps) * 100;       
226                                 
227                 return dist;
228         }
229         catch(exception& e) {
230                 errorOut(e, "DeCalculator", "calcDist");
231                 exit(1);
232         }
233 }
234
235 //***************************************************************************************************************
236 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
237         try {
238                 
239                 //for each window
240                 vector<float> queryExpected;
241                         
242                 for (int m = 0; m < qav.size(); m++) {          
243                                 
244                         float expected = qav[m] * coef;
245                                 
246                         queryExpected.push_back(expected);      
247                 }
248                         
249                 return queryExpected;
250                                 
251         }
252         catch(exception& e) {
253                 errorOut(e, "DeCalculator", "calcExpected");
254                 exit(1);
255         }
256 }
257 //***************************************************************************************************************
258 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
259         try {
260                 
261                 //for each window
262                 float sum = 0.0;  //sum = sum from 1 to m of (oi-ei)^2
263                 int numZeros = 0;
264                 for (int m = 0; m < obs.size(); m++) {          
265                         
266                         //if (obs[m] != 0.0) {
267                                 sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); 
268                         //}else {  numZeros++;   }
269                         
270                 }
271                         
272                 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
273                         
274                 return de;
275         }
276         catch(exception& e) {
277                 errorOut(e, "DeCalculator", "calcDE");
278                 exit(1);
279         }
280 }
281
282 //***************************************************************************************************************
283
284 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
285         try {
286
287                 vector<float> prob;
288                 string freqfile = getRootName(filename) + "freq";
289                 ofstream outFreq;
290                 
291                 openOutputFile(freqfile, outFreq);
292                 
293                 string length = toString(seqs.size());  //if there are 5000 seqs in the template then set precision to 3
294                 int precision = length.length() - 1;
295                 
296                 //format output
297                 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
298                 
299                 //at each position in the sequence
300                 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
301                         
302                         vector<int> freq;   freq.resize(4,0);
303                         int gaps = 0;
304                         
305                         //find the frequency of each nucleotide
306                         for (int j = 0; j < seqs.size(); j++) {
307                                 
308                                 char value = seqs[j]->getAligned()[i];
309                                 
310                                 if(toupper(value) == 'A')                                                                       {       freq[0]++;      }
311                                 else if(toupper(value) == 'T' || toupper(value) == 'U')         {       freq[1]++;      }
312                                 else if(toupper(value) == 'G')                                                          {       freq[2]++;      }
313                                 else if(toupper(value) == 'C')                                                          {       freq[3]++;      }
314                                 else { gaps++; }
315                         }
316                         
317                         //find base with highest frequency
318                         int highest = 0;
319                         for (int m = 0; m < freq.size(); m++) {   if (freq[m] > highest) {  highest = freq[m];  }               }
320                         
321                         float highFreq = highest / (float) (seqs.size());       
322                         
323                         float Pi;
324                         Pi =  (highFreq - 0.25) / 0.75; 
325                         
326                         //cannot have probability less than 0.
327                         if (Pi < 0) { Pi = 0.0; }
328                         
329                         //saves this for later
330                         outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
331         
332                         if (h.count(i) > 0) {
333                                 prob.push_back(Pi); 
334                         }
335                 }
336                 
337                 outFreq.close();
338                 
339                 return prob;
340                                 
341         }
342         catch(exception& e) {
343                 errorOut(e, "DeCalculator", "calcFreq");
344                 exit(1);
345         }
346 }
347 //***************************************************************************************************************
348 vector<float>  DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
349         try {
350                 vector<float>  averages; 
351                                 
352                 //for each window find average
353                 for (int m = 0; m < window.size(); m++) {
354                                 
355                         float average = 0.0;
356                                 
357                         //while you are in the window for this sequence
358                         int count = 0;
359                         for (int j = window[m]; j < (window[m]+size); j++) {   
360                                 average += probabilityProfile[j];
361                                 count++;
362                         }
363                                 
364                         average = average / count;
365         
366                         //save this windows average
367                         averages.push_back(average);
368                 }
369                                 
370                 return averages;
371         }
372         catch(exception& e) {
373                 errorOut(e, "DeCalculator", "findQav");
374                 exit(1);
375         }
376 }
377 //***************************************************************************************************************
378 //seqs have already been masked
379 vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
380         try {
381                 vector< vector<quanMember> > quan; 
382                 
383                 //percentage of mismatched pairs 1 to 100
384                 quan.resize(100);
385 //ofstream o;
386 //string out = "getQuantiles.out";
387 //openOutputFile(out, o);
388                 
389                 //for each sequence
390                 for(int i = start; i < end; i++){
391                 
392                         mothurOut("Processing sequence " + toString(i)); mothurOutEndLine();
393                         Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
394                         
395                         //compare to every other sequence in template
396                         for(int j = 0; j < i; j++){
397                                 
398                                 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
399                                 
400                                 map<int, int> trim;
401                                 map<int, int>::iterator it;
402                                 
403                                 trimSeqs(query, subject, trim);
404                                 
405                                 it = trim.begin();
406                                 int front = it->first; int back = it->second;
407                                 
408                                 //reset window for each new comparison
409                                 windowSizesTemplate[i] = window;
410                                 
411                                 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
412                                 
413                                 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
414                                 
415                                 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
416                                                                         
417                                 float alpha = getCoef(obsi, q);
418                                                 
419                                 vector<float> exp = calcExpected(q, alpha);
420                                 
421                                 float de = calcDE(obsi, exp);
422                                                                 
423                                 float dist = calcDist(query, subject, front, back); 
424         //o << i << '\t' <<  j << '\t' << dist << '\t' << de << endl;                   
425                                 dist = ceil(dist);
426                                 
427                                 quanMember newScore(de, i, j);
428                                 
429                                 quan[dist].push_back(newScore);
430                                 
431                                 delete subject;
432                         }
433                         
434                         delete query;
435                 }
436
437                 return quan;
438                                                 
439         }
440         catch(exception& e) {
441                 errorOut(e, "DeCalculator", "getQuantiles");
442                 exit(1);
443         }
444 }
445 //********************************************************************************************************************
446 //sorts lowest to highest
447 inline bool compareQuanMembers(quanMember left, quanMember right){
448         return (left.score < right.score);      
449
450 //***************************************************************************************************************
451 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right.  may want to revisit in the future...
452 void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
453         try {
454                                                 
455                 for (int i = 0; i < quantiles.size(); i++) {
456                 
457                         //find mean of this quantile score
458                         sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
459                         
460                         float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
461                         float low =  quantiles[i][int(quantiles[i].size() * 0.01)].score;
462                         
463                         vector<quanMember> temp;
464                         
465                         //look at each value in quantiles to see if it is an outlier
466                         for (int j = 0; j < quantiles[i].size(); j++) {
467                                 //is this score between 1 and 99%
468                                 if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
469                                         temp.push_back(quantiles[i][j]);
470                                 }
471                         }
472                         
473                         quantiles[i] = temp;
474                 }
475
476 /*
477                 //find contributer with most offending score related to it
478                 int largestContrib = findLargestContrib(seen);
479         
480                 //while you still have guys to eliminate
481                 while (contributions.size() > 0) {
482                 
483                         mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); mothurOutEndLine();
484                         
485                         //remove from quantiles all scores that were made using this largestContrib
486                         for (int i = 0; i < quantiles.size(); i++) {
487 //cout << "before remove " << quantiles[i].size() << '\t';
488                                 removeContrib(largestContrib, quantiles[i]);
489 //cout << "after remove " << quantiles[i].size() << endl;
490                         }
491 //cout << count << " largest contrib = " << largestContrib << endl;  count++;
492                         //remove from contributions all scores that were made using this largestContrib
493                         removeContrib(largestContrib, contributions);
494                         
495                         //"erase" largestContrib
496                         seen[largestContrib] = -1;
497                         
498                         //get next largestContrib
499                         largestContrib = findLargestContrib(seen);
500                 }
501 ABOVE IS ATTEMPT #1             
502 **************************************************************************************************
503 BELOW IS ATTEMPT #2             
504                 vector<int> marked = returnObviousOutliers(quantiles, num);
505                 vector<int> copyMarked = marked;
506                 
507                 //find the 99% of marked
508                 sort(copyMarked.begin(), copyMarked.end());
509                 int high = copyMarked[int(copyMarked.size() * 0.99)];
510 cout << "high = " << high << endl;              
511                 
512                 for(int i = 0; i < marked.size(); i++) {
513                         if (marked[i] > high) { 
514                                 mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); mothurOutEndLine();
515                                 for (int i = 0; i < quantiles.size(); i++) {
516                                         removeContrib(marked[i], quantiles[i]);
517                                 }
518                         }
519
520                 }
521                 
522                 
523                 //adjust quantiles
524                 for (int i = 0; i < quantiles.size(); i++) {
525                         vector<float> temp;
526                         
527                         if (quantiles[i].size() == 0) {
528                                 //in case this is not a distance found in your template files
529                                 for (int g = 0; g < 6; g++) {
530                                         temp.push_back(0.0);
531                                 }
532                         }else{
533                                 
534                                 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
535                                 
536                                 //save 10%
537                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
538                                 //save 25%
539                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
540                                 //save 50%
541                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
542                                 //save 75%
543                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
544                                 //save 95%
545                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
546                                 //save 99%
547                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
548                                 
549                         }
550                         
551                         quan[i] = temp;
552                         
553                 }
554 */
555                 
556         }
557         catch(exception& e) {
558                 errorOut(e, "DeCalculator", "removeObviousOutliers");
559                 exit(1);
560         }
561 }
562 //***************************************************************************************************************
563 //put quanMember in the vector based on how far they are from the 99% or 1%.  Biggest offenders in front.
564 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
565         try{
566                 
567                 vector<quanMember> newQuan;
568                 map<quanMember*, float>::iterator it;
569                 
570                 while (quan.size() > 0) {
571                         
572                          map<quanMember*, float>::iterator largest = quan.begin(); 
573                           
574                         //find biggest member
575                         for (it = quan.begin(); it != quan.end(); it++) {
576                                 if (it->second > largest->second) {  largest = it;  }
577                         }
578 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
579                         //save it 
580                         newQuan.push_back(*(largest->first));
581                 
582                         //erase from quan
583                         quan.erase(largest);
584                 }
585                 
586                 return newQuan;
587                 
588         }
589         catch(exception& e) {
590                 errorOut(e, "DeCalculator", "sortContrib");
591                 exit(1);
592         }
593 }
594
595 //***************************************************************************************************************
596 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
597 int DeCalculator::findLargestContrib(vector<int> seen) {
598         try{
599                 
600                 int largest = 0;
601                 
602                 int largestContribs;
603                 
604                 for (int i = 0; i < seen.size(); i++)  {  
605                         
606                         if (seen[i] > largest) {
607                                 largestContribs = i;
608                                 largest = seen[i];
609                         }
610                 }
611                 
612                 return largestContribs;
613                 
614         }
615         catch(exception& e) {
616                 errorOut(e, "DeCalculator", "findLargestContribs");
617                 exit(1);
618         }
619 }
620 //***************************************************************************************************************
621 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
622         try{
623         
624                 vector<quanMember> newQuan;
625                 for (int i = 0; i < quan.size(); i++)  {  
626                         if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {  
627                                 newQuan.push_back(quan[i]);  
628                         }
629                 }
630                 
631                 quan = newQuan;
632                 
633         }
634         catch(exception& e) {
635                 errorOut(e, "DeCalculator", "removeContrib");
636                 exit(1);
637         }
638 }
639 */
640 //***************************************************************************************************************
641 float DeCalculator::findAverage(vector<float> myVector) {
642         try{
643                 
644                 float total = 0.0;
645                 for (int i = 0; i < myVector.size(); i++)  {  total += myVector[i];  }
646                 
647                 float average = total / (float) myVector.size();
648                 
649                 return average;
650                 
651         }
652         catch(exception& e) {
653                 errorOut(e, "DeCalculator", "findAverage");
654                 exit(1);
655         }
656 }
657
658 //***************************************************************************************************************
659 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
660         try {
661         
662                 //find average prob for this seqs windows
663                 float probAverage = findAverage(qav);
664                                 
665                 //find observed average 
666                 float obsAverage = findAverage(obs);
667                         
668                 float coef = obsAverage / probAverage;
669                                                 
670                 return coef;
671         }
672         catch(exception& e) {
673                 errorOut(e, "DeCalculator", "getCoef");
674                 exit(1);
675         }
676 }
677 //***************************************************************************************************************
678
679