]> git.donarmstrong.com Git - mothur.git/blob - decalc.cpp
last changes before move
[mothur.git] / decalc.cpp
1 /*
2  *  decalc.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/22/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "decalc.h"
11
12 //***************************************************************************************************************
13 void DeCalculator::setMask(string m) { 
14         try {
15                 seqMask = m; 
16                 
17                 if (seqMask.length() != 0) {
18                         //whereever there is a base in the mask, save that value is query and subject
19                         for (int i = 0; i < seqMask.length(); i++) {
20                                 if (isalpha(seqMask[i])) {
21                                         h.insert(i);
22                                 }
23                         }
24                 }else {
25                         for (int i = 0; i < alignLength; i++) {   h.insert(i);  }
26                 }
27         }
28         catch(exception& e) {
29                 errorOut(e, "DeCalculator", "setMask");
30                 exit(1);
31         } 
32 }
33 //***************************************************************************************************************
34 void DeCalculator::runMask(Sequence* seq) {
35         try{
36                 
37                 string q = seq->getAligned();
38                 string tempQuery = "";
39                 
40                 //whereever there is a base in the mask, save that value is query and subject
41                 set<int>::iterator setit;
42                 for ( setit=h.begin() ; setit != h.end(); setit++ )  {
43                         tempQuery += q[*setit];
44                 }
45                 
46                 //save masked values
47                 seq->setAligned(tempQuery);
48                 seq->setUnaligned(tempQuery);
49         }
50         catch(exception& e) {
51                 errorOut(e, "DeCalculator", "runMask");
52                 exit(1);
53         }
54 }
55 //***************************************************************************************************************
56 //num is query's spot in querySeqs
57 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
58         try {
59                 
60                 string q = query->getAligned();
61                 string s = subject->getAligned();
62                 
63                 int front = 0;
64                 for (int i = 0; i < q.length(); i++) {
65 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
66                         if (isalpha(q[i]) && isalpha(s[i])) { front = i; break;  }
67                 }
68 //cout << endl << endl;         
69                 int back = 0;           
70                 for (int i = q.length(); i >= 0; i--) {
71 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
72                         if (isalpha(q[i]) && isalpha(s[i])) { back = i; break;  }
73                 }
74                 
75                 trim[front] = back;
76                 
77         }
78         catch(exception& e) {
79                 errorOut(e, "DeCalculator", "trimSeqs");
80                 exit(1);
81         }
82 }
83 //***************************************************************************************************************
84 vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
85         try {
86                 
87                 vector<int> win; 
88                 
89                 int cutoff = back - front;  //back - front 
90                         
91                 //if window is set to default
92                 if (size == 0) {  if (cutoff > 1200) {  size = 300; }
93                                                         else{  size = (cutoff / 4); }  } 
94                 else if (size > (cutoff / 4)) { 
95                                 mothurOut("You have selected to large a window size for sequence " + query->getName() + ".  I will choose an appropriate window size."); mothurOutEndLine();
96                                 size = (cutoff / 4); 
97                 }
98         
99         /*      string seq = query->getAligned().substr(front, cutoff);
100                         
101                 //count bases
102                 int numBases = 0;
103                 for (int l = 0; l < seq.length(); l++) {  if (isalpha(seq[l])) { numBases++; }  }
104 //cout << "num Bases = " << numBases << endl;                   
105                 //save start of seq
106                 win.push_back(front);
107 //cout << front << '\t';                
108                 //move ahead increment bases at a time until all bases are in a window
109                 int countBases = 0;
110                 int totalBases = 0;  //used to eliminate window of blanks at end of sequence
111                         
112                 seq = query->getAligned();
113                 for (int m = front; m < (back - size) ; m++) {
114                                 
115                         //count number of bases you see
116                         if (isalpha(seq[m])) { countBases++;  }
117                                 
118                         //if you have seen enough bases to make a new window
119                         if (countBases >= increment) {
120                                 //total bases is the number of bases in a window already.
121                                 totalBases += countBases;
122 //cout << "total bases = " << totalBases << endl;
123                                 win.push_back(m);  //save spot in alignment
124 //cout << m << '\t';
125                                 countBases = 0;                         //reset bases you've seen in this window
126                         }
127                                 
128                         //no need to continue if all your bases are in a window
129                         if (totalBases == numBases) {   break;  }
130                 }
131         
132
133                 //get last window if needed
134                 if (totalBases < numBases) {   win.push_back(back-size);  }
135 //cout << endl << endl;
136 */      
137                 
138                 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
139                 for (int m = front; m < (back - size) ; m+=increment) {  win.push_back(m);  }
140
141
142         
143                 return win;
144         
145         }
146         catch(exception& e) {
147                 errorOut(e, "DeCalculator", "findWindows");
148                 exit(1);
149         }
150 }
151
152 //***************************************************************************************************************
153 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
154         try {
155                 
156                 vector<float> temp;     
157                 //int gaps = 0;         
158                 for (int m = 0; m < window.size(); m++) {
159                                                 
160                         string seqFrag = query->getAligned().substr(window[m], size);
161                         string seqFragsub = subject->getAligned().substr(window[m], size);
162                                 
163                         int diff = 0;
164                         for (int b = 0; b < seqFrag.length(); b++) {
165                                 //if at least one is a base and they are not equal
166                                 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
167                                 
168                                 //ignore gaps
169                                 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
170                         }
171                
172                         //percentage of mismatched bases
173                         float dist;
174                         
175                         //if the whole fragment is 0 distance = 0
176                         //if ((seqFrag.length()-gaps) == 0)  { dist =  0.0; }
177                
178                         //percentage of mismatched bases
179                         //else {  dist = diff / (float) (seqFrag.length()-gaps) * 100;   } 
180                         
181                         dist = diff / (float) (seqFrag.length()) * 100; 
182                         
183                         temp.push_back(dist);
184                 }
185                         
186                 return temp;
187         }
188         catch(exception& e) {
189                 errorOut(e, "DeCalculator", "calcObserved");
190                 exit(1);
191         }
192 }
193 //***************************************************************************************************************
194 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
195         try {
196                 
197                 //so you only look at the trimmed part of the sequence
198                 int cutoff = back - front;  
199                 int gaps = 0;
200                         
201                 //from first startpoint with length back-front
202                 string seqFrag = query->getAligned().substr(front, cutoff);
203                 string seqFragsub = subject->getAligned().substr(front, cutoff);
204                                                                                                                 
205                 int diff = 0;
206                 for (int b = 0; b < seqFrag.length(); b++) {
207                         //ignore gaps
208                         if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
209                         if (seqFrag[b] != seqFragsub[b]) { diff++; }
210                 }
211                 
212                 //if the whole fragment is 0 distance = 0
213                 if ((seqFrag.length()-gaps) == 0)  { return 0.0; }
214                
215                 //percentage of mismatched bases
216                 float dist = diff / (float) (seqFrag.length()-gaps) * 100;       
217                                 
218                 return dist;
219         }
220         catch(exception& e) {
221                 errorOut(e, "DeCalculator", "calcDist");
222                 exit(1);
223         }
224 }
225
226 //***************************************************************************************************************
227 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
228         try {
229                 
230                 //for each window
231                 vector<float> queryExpected;
232                         
233                 for (int m = 0; m < qav.size(); m++) {          
234                                 
235                         float expected = qav[m] * coef;
236                                 
237                         queryExpected.push_back(expected);      
238                 }
239                         
240                 return queryExpected;
241                                 
242         }
243         catch(exception& e) {
244                 errorOut(e, "DeCalculator", "calcExpected");
245                 exit(1);
246         }
247 }
248 //***************************************************************************************************************
249 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
250         try {
251                 
252                 //for each window
253                 float sum = 0.0;  //sum = sum from 1 to m of (oi-ei)^2
254                 for (int m = 0; m < obs.size(); m++) {          sum += ((obs[m] - exp[m]) * (obs[m] - exp[m]));         }
255                         
256                 float de = sqrt((sum / (obs.size() - 1)));
257                         
258                 return de;
259         }
260         catch(exception& e) {
261                 errorOut(e, "DeCalculator", "calcDE");
262                 exit(1);
263         }
264 }
265
266 //***************************************************************************************************************
267
268 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
269         try {
270
271                 vector<float> prob;
272                 string freqfile = getRootName(filename) + "freq";
273                 ofstream outFreq;
274                 
275                 openOutputFile(freqfile, outFreq);
276                 
277                 string length = toString(seqs.size());  //if there are 5000 seqs in the template then set precision to 3
278                 int precision = length.length() - 1;
279                 
280                 //format output
281                 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
282                 
283                 //at each position in the sequence
284                 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
285                         
286                         vector<int> freq;   freq.resize(4,0);
287                         int gaps = 0;
288                         
289                         //find the frequency of each nucleotide
290                         for (int j = 0; j < seqs.size(); j++) {
291                                 
292                                 char value = seqs[j]->getAligned()[i];
293                                 
294                                 if(toupper(value) == 'A')                                                                       {       freq[0]++;      }
295                                 else if(toupper(value) == 'T' || toupper(value) == 'U')         {       freq[1]++;      }
296                                 else if(toupper(value) == 'G')                                                          {       freq[2]++;      }
297                                 else if(toupper(value) == 'C')                                                          {       freq[3]++;      }
298                                 else { gaps++; }
299                         }
300                         
301                         //find base with highest frequency
302                         int highest = 0;
303                         for (int m = 0; m < freq.size(); m++) {   if (freq[m] > highest) {  highest = freq[m];  }               }
304                         
305                         float highFreq = highest / (float) (seqs.size());       
306                         
307                         float Pi;
308                         Pi =  (highFreq - 0.25) / 0.75; 
309                         
310                         //cannot have probability less than 0.
311                         if (Pi < 0) { Pi = 0.0; }
312                         
313                         //saves this for later
314                         outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
315         
316                         if (h.count(i) > 0) {
317                                 prob.push_back(Pi); 
318                         }
319                 }
320                 
321                 outFreq.close();
322                 
323                 return prob;
324                                 
325         }
326         catch(exception& e) {
327                 errorOut(e, "DeCalculator", "calcFreq");
328                 exit(1);
329         }
330 }
331 //***************************************************************************************************************
332 vector<float>  DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
333         try {
334                 vector<float>  averages; 
335                                 
336                 //for each window find average
337                 for (int m = 0; m < window.size(); m++) {
338                                 
339                         float average = 0.0;
340                                 
341                         //while you are in the window for this sequence
342                         int count = 0;
343                         for (int j = window[m]; j < (window[m]+size); j++) {   
344                                 average += probabilityProfile[j];
345                                 count++;
346                         }
347                                 
348                         average = average / count;
349         
350                         //save this windows average
351                         averages.push_back(average);
352                 }
353                                 
354                 return averages;
355         }
356         catch(exception& e) {
357                 errorOut(e, "DeCalculator", "findQav");
358                 exit(1);
359         }
360 }
361 //***************************************************************************************************************
362 //seqs have already been masked
363 vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
364         try {
365                 vector< vector<quanMember> > quan; 
366                 
367                 //percentage of mismatched pairs 1 to 100
368                 quan.resize(100);
369                 
370                 //for each sequence
371                 for(int i = start; i < end; i++){
372                 
373                         mothurOut("Processing sequence " + toString(i)); mothurOutEndLine();
374                         Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
375                         
376                         //compare to every other sequence in template
377                         for(int j = 0; j < i; j++){
378                                 
379                                 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
380                                 
381                                 map<int, int> trim;
382                                 map<int, int>::iterator it;
383                                 
384                                 trimSeqs(query, subject, trim);
385                                 
386                                 it = trim.begin();
387                                 int front = it->first; int back = it->second;
388                                 
389                                 //reset window for each new comparison
390                                 windowSizesTemplate[i] = window;
391                                 
392                                 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
393                                 
394                                 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
395                                 
396                                 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
397                                                                         
398                                 float alpha = getCoef(obsi, q);
399                                                 
400                                 vector<float> exp = calcExpected(q, alpha);
401                                 
402                                 float de = calcDE(obsi, exp);
403                                                                 
404                                 float dist = calcDist(query, subject, front, back); 
405                                 
406                                 dist = ceil(dist);
407                                 
408                                 quanMember newScore(de, i, j);
409                                 
410                                 //dist-1 because vector indexes start at 0.
411                                 quan[dist-1].push_back(newScore);
412                                 
413                                 delete subject;
414                         }
415                         
416                         delete query;
417                 }
418
419                 return quan;
420                                                 
421         }
422         catch(exception& e) {
423                 errorOut(e, "DeCalculator", "getQuantiles");
424                 exit(1);
425         }
426 }
427
428 //***************************************************************************************************************
429 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right.  may want to revisit in the future...
430 vector< vector<float> > DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
431         try {
432                 vector< vector<float> > quan; 
433                 quan.resize(100);
434         
435                 /*vector<quanMember> contributions;  
436                 vector<int> seen;  //seen[0] is the number of outliers that template seqs[0] was part of.
437                 seen.resize(num,0);
438                                 
439                 //find contributions
440                 for (int i = 0; i < quantiles.size(); i++) {
441                 
442                         //find mean of this quantile score
443                         sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
444                         
445                         float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
446                         float low =  quantiles[i][int(quantiles[i].size() * 0.01)].score;
447                 
448                         //look at each value in quantiles to see if it is an outlier
449                         for (int j = 0; j < quantiles[i].size(); j++) {
450                                 
451                                 //is this score between 1 and 99%
452                                 if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
453                                         
454                                 }else {
455                                         //add to contributions
456                                         contributions.push_back(quantiles[i][j]);
457                                         seen[quantiles[i][j].member1]++;
458                                         seen[quantiles[i][j].member2]++;
459                                 }
460                         }
461                 }
462
463                 //find contributer with most offending score related to it
464                 int largestContrib = findLargestContrib(seen);
465         
466                 //while you still have guys to eliminate
467                 while (contributions.size() > 0) {
468                 
469                         mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); mothurOutEndLine();
470                         
471                         //remove from quantiles all scores that were made using this largestContrib
472                         for (int i = 0; i < quantiles.size(); i++) {
473 //cout << "before remove " << quantiles[i].size() << '\t';
474                                 removeContrib(largestContrib, quantiles[i]);
475 //cout << "after remove " << quantiles[i].size() << endl;
476                         }
477 //cout << count << " largest contrib = " << largestContrib << endl;  count++;
478                         //remove from contributions all scores that were made using this largestContrib
479                         removeContrib(largestContrib, contributions);
480                         
481                         //"erase" largestContrib
482                         seen[largestContrib] = -1;
483                         
484                         //get next largestContrib
485                         largestContrib = findLargestContrib(seen);
486                 }
487 ABOVE IS ATTEMPT #1             
488 **************************************************************************************************
489 BELOW IS ATTEMPT #2             
490                 vector<int> marked = returnObviousOutliers(quantiles, num);
491                 vector<int> copyMarked = marked;
492                 
493                 //find the 99% of marked
494                 sort(copyMarked.begin(), copyMarked.end());
495                 int high = copyMarked[int(copyMarked.size() * 0.99)];
496 cout << "high = " << high << endl;              
497                 
498                 for(int i = 0; i < marked.size(); i++) {
499                         if (marked[i] > high) { 
500                                 mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); mothurOutEndLine();
501                                 for (int i = 0; i < quantiles.size(); i++) {
502                                         removeContrib(marked[i], quantiles[i]);
503                                 }
504                         }
505
506                 }
507                 
508                 
509                 //adjust quantiles
510                 for (int i = 0; i < quantiles.size(); i++) {
511                         vector<float> temp;
512                         
513                         if (quantiles[i].size() == 0) {
514                                 //in case this is not a distance found in your template files
515                                 for (int g = 0; g < 6; g++) {
516                                         temp.push_back(0.0);
517                                 }
518                         }else{
519                                 
520                                 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
521                                 
522                                 //save 10%
523                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
524                                 //save 25%
525                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
526                                 //save 50%
527                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
528                                 //save 75%
529                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
530                                 //save 95%
531                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
532                                 //save 99%
533                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
534                                 
535                         }
536                         
537                         quan[i] = temp;
538                         
539                 }
540 */
541                 return quan;
542         }
543         catch(exception& e) {
544                 errorOut(e, "DeCalculator", "removeObviousOutliers");
545                 exit(1);
546         }
547 }
548 //***************************************************************************************************************
549 //put quanMember in the vector based on how far they are from the 99% or 1%.  Biggest offenders in front.
550 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
551         try{
552                 
553                 vector<quanMember> newQuan;
554                 map<quanMember*, float>::iterator it;
555                 
556                 while (quan.size() > 0) {
557                         
558                          map<quanMember*, float>::iterator largest = quan.begin(); 
559                           
560                         //find biggest member
561                         for (it = quan.begin(); it != quan.end(); it++) {
562                                 if (it->second > largest->second) {  largest = it;  }
563                         }
564 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
565                         //save it 
566                         newQuan.push_back(*(largest->first));
567                 
568                         //erase from quan
569                         quan.erase(largest);
570                 }
571                 
572                 return newQuan;
573                 
574         }
575         catch(exception& e) {
576                 errorOut(e, "DeCalculator", "sortContrib");
577                 exit(1);
578         }
579 }
580
581 //***************************************************************************************************************
582 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
583 int DeCalculator::findLargestContrib(vector<int> seen) {
584         try{
585                 
586                 int largest = 0;
587                 
588                 int largestContribs;
589                 
590                 for (int i = 0; i < seen.size(); i++)  {  
591                         
592                         if (seen[i] > largest) {
593                                 largestContribs = i;
594                                 largest = seen[i];
595                         }
596                 }
597                 
598                 return largestContribs;
599                 
600         }
601         catch(exception& e) {
602                 errorOut(e, "DeCalculator", "findLargestContribs");
603                 exit(1);
604         }
605 }
606 //***************************************************************************************************************
607 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
608         try{
609         
610                 vector<quanMember> newQuan;
611                 for (int i = 0; i < quan.size(); i++)  {  
612                         if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {  
613                                 newQuan.push_back(quan[i]);  
614                         }
615                 }
616                 
617                 quan = newQuan;
618                 
619         }
620         catch(exception& e) {
621                 errorOut(e, "DeCalculator", "removeContrib");
622                 exit(1);
623         }
624 }
625 */
626 //***************************************************************************************************************
627 float DeCalculator::findAverage(vector<float> myVector) {
628         try{
629                 
630                 float total = 0.0;
631                 for (int i = 0; i < myVector.size(); i++)  {  total += myVector[i];  }
632                 
633                 float average = total / (float) myVector.size();
634                 
635                 return average;
636                 
637         }
638         catch(exception& e) {
639                 errorOut(e, "DeCalculator", "findAverage");
640                 exit(1);
641         }
642 }
643
644 //***************************************************************************************************************
645 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
646         try {
647         
648                 //find average prob for this seqs windows
649                 float probAverage = findAverage(qav);
650                                 
651                 //find observed average 
652                 float obsAverage = findAverage(obs);
653                         
654                 float coef = obsAverage / probAverage;
655                                                 
656                 return coef;
657         }
658         catch(exception& e) {
659                 errorOut(e, "DeCalculator", "getCoef");
660                 exit(1);
661         }
662 }
663 //***************************************************************************************************************
664
665