]> git.donarmstrong.com Git - mothur.git/blob - decalc.cpp
ccode working - still need to paralellize
[mothur.git] / decalc.cpp
1 /*
2  *  decalc.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/22/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "decalc.h"
11
12 //***************************************************************************************************************
13 void DeCalculator::setMask(string m) { 
14         try {
15                 seqMask = m; 
16                 int count = 0;
17                 maskMap.clear();
18                 
19                 if (seqMask.length() != 0) {
20                         //whereever there is a base in the mask, save that value is query and subject
21                         for (int i = 0; i < seqMask.length(); i++) {
22                                 if (isalpha(seqMask[i])) {
23                                         h.insert(i);
24                                         maskMap[i] = count;
25                                 }
26                                 count++;
27                         }
28                 }else {
29                         for (int i = 0; i < alignLength; i++) {   h.insert(i);  }
30                 }
31         }
32         catch(exception& e) {
33                 errorOut(e, "DeCalculator", "setMask");
34                 exit(1);
35         } 
36 }
37 //***************************************************************************************************************
38 void DeCalculator::runMask(Sequence* seq) {
39         try{
40                 
41                 string q = seq->getAligned();
42                 string tempQuery = "";
43                 
44                 //whereever there is a base in the mask, save that value is query and subject
45                 set<int>::iterator setit;
46                 for ( setit=h.begin() ; setit != h.end(); setit++ )  {
47                         tempQuery += q[*setit];
48                 }
49                 
50                 //save masked values
51                 seq->setAligned(tempQuery);
52                 seq->setUnaligned(tempQuery);
53         }
54         catch(exception& e) {
55                 errorOut(e, "DeCalculator", "runMask");
56                 exit(1);
57         }
58 }
59 //***************************************************************************************************************
60 //num is query's spot in querySeqs
61 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
62         try {
63                 
64                 string q = query->getAligned();
65                 string s = subject->getAligned();
66                 
67                 int front = 0;
68                 for (int i = 0; i < q.length(); i++) {
69 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
70                         if (isalpha(q[i]) && isalpha(s[i])) { front = i; break;  }
71                 }
72 //cout << endl << endl;         
73                 int back = 0;           
74                 for (int i = q.length(); i >= 0; i--) {
75 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
76                         if (isalpha(q[i]) && isalpha(s[i])) { back = i; break;  }
77                 }
78                 
79                 trim[front] = back;
80                 
81         }
82         catch(exception& e) {
83                 errorOut(e, "DeCalculator", "trimSeqs");
84                 exit(1);
85         }
86 }
87 //***************************************************************************************************************
88 vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
89         try {
90                 
91                 vector<int> win; 
92                 
93                 int cutoff = back - front;  //back - front 
94                         
95                 //if window is set to default
96                 if (size == 0) {  if (cutoff > 1200) {  size = 300; }
97                                                         else{  size = (cutoff / 4); }  } 
98                 else if (size > (cutoff / 4)) { 
99                                 mothurOut("You have selected to large a window size for sequence " + query->getName() + ".  I will choose an appropriate window size."); mothurOutEndLine();
100                                 size = (cutoff / 4); 
101                 }
102         
103         /*      string seq = query->getAligned().substr(front, cutoff);
104                         
105                 //count bases
106                 int numBases = 0;
107                 for (int l = 0; l < seq.length(); l++) {  if (isalpha(seq[l])) { numBases++; }  }
108 //cout << "num Bases = " << numBases << endl;                   
109                 //save start of seq
110                 win.push_back(front);
111 //cout << front << '\t';                
112                 //move ahead increment bases at a time until all bases are in a window
113                 int countBases = 0;
114                 int totalBases = 0;  //used to eliminate window of blanks at end of sequence
115                         
116                 seq = query->getAligned();
117                 for (int m = front; m < (back - size) ; m++) {
118                                 
119                         //count number of bases you see
120                         if (isalpha(seq[m])) { countBases++;  }
121                                 
122                         //if you have seen enough bases to make a new window
123                         if (countBases >= increment) {
124                                 //total bases is the number of bases in a window already.
125                                 totalBases += countBases;
126 //cout << "total bases = " << totalBases << endl;
127                                 win.push_back(m);  //save spot in alignment
128 //cout << m << '\t';
129                                 countBases = 0;                         //reset bases you've seen in this window
130                         }
131                                 
132                         //no need to continue if all your bases are in a window
133                         if (totalBases == numBases) {   break;  }
134                 }
135         
136
137                 //get last window if needed
138                 if (totalBases < numBases) {   win.push_back(back-size);  }
139 //cout << endl << endl;
140 */      
141                 
142                 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
143                 for (int m = front; m < (back - size) ; m+=increment) {  win.push_back(m);  }
144
145
146         
147                 return win;
148         
149         }
150         catch(exception& e) {
151                 errorOut(e, "DeCalculator", "findWindows");
152                 exit(1);
153         }
154 }
155
156 //***************************************************************************************************************
157 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
158         try {
159                 
160                 vector<float> temp;     
161                 //int gaps = 0;         
162                 for (int m = 0; m < window.size(); m++) {
163                                                 
164                         string seqFrag = query->getAligned().substr(window[m], size);
165                         string seqFragsub = subject->getAligned().substr(window[m], size);
166                                 
167                         int diff = 0;
168                         for (int b = 0; b < seqFrag.length(); b++) {
169                                 //if at least one is a base and they are not equal
170                                 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
171                                 
172                                 //ignore gaps
173                                 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
174                         }
175                
176                         //percentage of mismatched bases
177                         float dist;
178                         
179                         //if the whole fragment is 0 distance = 0
180                         //if ((seqFrag.length()-gaps) == 0)  { dist =  0.0; }
181                
182                         //percentage of mismatched bases
183                         //else {  dist = diff / (float) (seqFrag.length()-gaps) * 100;   } 
184                         
185                         dist = diff / (float) (seqFrag.length()) * 100; 
186                         
187                         temp.push_back(dist);
188                 }
189                         
190                 return temp;
191         }
192         catch(exception& e) {
193                 errorOut(e, "DeCalculator", "calcObserved");
194                 exit(1);
195         }
196 }
197 //***************************************************************************************************************
198 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
199         try {
200                 
201                 //so you only look at the trimmed part of the sequence
202                 int cutoff = back - front;  
203                 int gaps = 0;
204                         
205                 //from first startpoint with length back-front
206                 string seqFrag = query->getAligned().substr(front, cutoff);
207                 string seqFragsub = subject->getAligned().substr(front, cutoff);
208                                                                                                                 
209                 int diff = 0;
210                 for (int b = 0; b < seqFrag.length(); b++) {
211                         //ignore gaps
212                         if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
213                         if (seqFrag[b] != seqFragsub[b]) { diff++; }
214                 }
215                 
216                 //if the whole fragment is 0 distance = 0
217                 if ((seqFrag.length()-gaps) == 0)  { return 0.0; }
218                
219                 //percentage of mismatched bases
220                 float dist = diff / (float) (seqFrag.length()-gaps) * 100;       
221                                 
222                 return dist;
223         }
224         catch(exception& e) {
225                 errorOut(e, "DeCalculator", "calcDist");
226                 exit(1);
227         }
228 }
229
230 //***************************************************************************************************************
231 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
232         try {
233                 
234                 //for each window
235                 vector<float> queryExpected;
236                         
237                 for (int m = 0; m < qav.size(); m++) {          
238                                 
239                         float expected = qav[m] * coef;
240                                 
241                         queryExpected.push_back(expected);      
242                 }
243                         
244                 return queryExpected;
245                                 
246         }
247         catch(exception& e) {
248                 errorOut(e, "DeCalculator", "calcExpected");
249                 exit(1);
250         }
251 }
252 //***************************************************************************************************************
253 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
254         try {
255                 
256                 //for each window
257                 float sum = 0.0;  //sum = sum from 1 to m of (oi-ei)^2
258                 int numZeros = 0;
259                 for (int m = 0; m < obs.size(); m++) {          
260                         
261                         //if (obs[m] != 0.0) {
262                                 sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); 
263                         //}else {  numZeros++;   }
264                         
265                 }
266                         
267                 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
268                         
269                 return de;
270         }
271         catch(exception& e) {
272                 errorOut(e, "DeCalculator", "calcDE");
273                 exit(1);
274         }
275 }
276
277 //***************************************************************************************************************
278
279 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
280         try {
281
282                 vector<float> prob;
283                 string freqfile = getRootName(filename) + "freq";
284                 ofstream outFreq;
285                 
286                 openOutputFile(freqfile, outFreq);
287                 
288                 string length = toString(seqs.size());  //if there are 5000 seqs in the template then set precision to 3
289                 int precision = length.length() - 1;
290                 
291                 //format output
292                 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
293                 
294                 //at each position in the sequence
295                 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
296                         
297                         vector<int> freq;   freq.resize(4,0);
298                         int gaps = 0;
299                         
300                         //find the frequency of each nucleotide
301                         for (int j = 0; j < seqs.size(); j++) {
302                                 
303                                 char value = seqs[j]->getAligned()[i];
304                                 
305                                 if(toupper(value) == 'A')                                                                       {       freq[0]++;      }
306                                 else if(toupper(value) == 'T' || toupper(value) == 'U')         {       freq[1]++;      }
307                                 else if(toupper(value) == 'G')                                                          {       freq[2]++;      }
308                                 else if(toupper(value) == 'C')                                                          {       freq[3]++;      }
309                                 else { gaps++; }
310                         }
311                         
312                         //find base with highest frequency
313                         int highest = 0;
314                         for (int m = 0; m < freq.size(); m++) {   if (freq[m] > highest) {  highest = freq[m];  }               }
315                         
316                         float highFreq = highest / (float) (seqs.size());       
317                         
318                         float Pi;
319                         Pi =  (highFreq - 0.25) / 0.75; 
320                         
321                         //cannot have probability less than 0.
322                         if (Pi < 0) { Pi = 0.0; }
323                         
324                         //saves this for later
325                         outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
326         
327                         if (h.count(i) > 0) {
328                                 prob.push_back(Pi); 
329                         }
330                 }
331                 
332                 outFreq.close();
333                 
334                 return prob;
335                                 
336         }
337         catch(exception& e) {
338                 errorOut(e, "DeCalculator", "calcFreq");
339                 exit(1);
340         }
341 }
342 //***************************************************************************************************************
343 vector<float>  DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
344         try {
345                 vector<float>  averages; 
346                                 
347                 //for each window find average
348                 for (int m = 0; m < window.size(); m++) {
349                                 
350                         float average = 0.0;
351                                 
352                         //while you are in the window for this sequence
353                         int count = 0;
354                         for (int j = window[m]; j < (window[m]+size); j++) {   
355                                 average += probabilityProfile[j];
356                                 count++;
357                         }
358                                 
359                         average = average / count;
360         
361                         //save this windows average
362                         averages.push_back(average);
363                 }
364                                 
365                 return averages;
366         }
367         catch(exception& e) {
368                 errorOut(e, "DeCalculator", "findQav");
369                 exit(1);
370         }
371 }
372 //***************************************************************************************************************
373 //seqs have already been masked
374 vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
375         try {
376                 vector< vector<quanMember> > quan; 
377                 
378                 //percentage of mismatched pairs 1 to 100
379                 quan.resize(100);
380 //ofstream o;
381 //string out = "getQuantiles.out";
382 //openOutputFile(out, o);
383                 
384                 //for each sequence
385                 for(int i = start; i < end; i++){
386                 
387                         mothurOut("Processing sequence " + toString(i)); mothurOutEndLine();
388                         Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
389                         
390                         //compare to every other sequence in template
391                         for(int j = 0; j < i; j++){
392                                 
393                                 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
394                                 
395                                 map<int, int> trim;
396                                 map<int, int>::iterator it;
397                                 
398                                 trimSeqs(query, subject, trim);
399                                 
400                                 it = trim.begin();
401                                 int front = it->first; int back = it->second;
402                                 
403                                 //reset window for each new comparison
404                                 windowSizesTemplate[i] = window;
405                                 
406                                 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
407                                 
408                                 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
409                                 
410                                 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
411                                                                         
412                                 float alpha = getCoef(obsi, q);
413                                                 
414                                 vector<float> exp = calcExpected(q, alpha);
415                                 
416                                 float de = calcDE(obsi, exp);
417                                                                 
418                                 float dist = calcDist(query, subject, front, back); 
419         //o << i << '\t' <<  j << '\t' << dist << '\t' << de << endl;                   
420                                 dist = ceil(dist);
421                                 
422                                 quanMember newScore(de, i, j);
423                                 
424                                 //dist-1 because vector indexes start at 0.
425                                 quan[dist-1].push_back(newScore);
426                                 
427                                 delete subject;
428                         }
429                         
430                         delete query;
431                 }
432
433                 return quan;
434                                                 
435         }
436         catch(exception& e) {
437                 errorOut(e, "DeCalculator", "getQuantiles");
438                 exit(1);
439         }
440 }
441 //********************************************************************************************************************
442 //sorts lowest to highest
443 inline bool compareQuanMembers(quanMember left, quanMember right){
444         return (left.score < right.score);      
445
446 //***************************************************************************************************************
447 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right.  may want to revisit in the future...
448 void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
449         try {
450                                                 
451                 for (int i = 0; i < quantiles.size(); i++) {
452                 
453                         //find mean of this quantile score
454                         sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
455                         
456                         float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
457                         float low =  quantiles[i][int(quantiles[i].size() * 0.01)].score;
458                         
459                         vector<quanMember> temp;
460                         
461                         //look at each value in quantiles to see if it is an outlier
462                         for (int j = 0; j < quantiles[i].size(); j++) {
463                                 //is this score between 1 and 99%
464                                 if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
465                                         temp.push_back(quantiles[i][j]);
466                                 }
467                         }
468                         
469                         quantiles[i] = temp;
470                 }
471
472 /*
473                 //find contributer with most offending score related to it
474                 int largestContrib = findLargestContrib(seen);
475         
476                 //while you still have guys to eliminate
477                 while (contributions.size() > 0) {
478                 
479                         mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); mothurOutEndLine();
480                         
481                         //remove from quantiles all scores that were made using this largestContrib
482                         for (int i = 0; i < quantiles.size(); i++) {
483 //cout << "before remove " << quantiles[i].size() << '\t';
484                                 removeContrib(largestContrib, quantiles[i]);
485 //cout << "after remove " << quantiles[i].size() << endl;
486                         }
487 //cout << count << " largest contrib = " << largestContrib << endl;  count++;
488                         //remove from contributions all scores that were made using this largestContrib
489                         removeContrib(largestContrib, contributions);
490                         
491                         //"erase" largestContrib
492                         seen[largestContrib] = -1;
493                         
494                         //get next largestContrib
495                         largestContrib = findLargestContrib(seen);
496                 }
497 ABOVE IS ATTEMPT #1             
498 **************************************************************************************************
499 BELOW IS ATTEMPT #2             
500                 vector<int> marked = returnObviousOutliers(quantiles, num);
501                 vector<int> copyMarked = marked;
502                 
503                 //find the 99% of marked
504                 sort(copyMarked.begin(), copyMarked.end());
505                 int high = copyMarked[int(copyMarked.size() * 0.99)];
506 cout << "high = " << high << endl;              
507                 
508                 for(int i = 0; i < marked.size(); i++) {
509                         if (marked[i] > high) { 
510                                 mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); mothurOutEndLine();
511                                 for (int i = 0; i < quantiles.size(); i++) {
512                                         removeContrib(marked[i], quantiles[i]);
513                                 }
514                         }
515
516                 }
517                 
518                 
519                 //adjust quantiles
520                 for (int i = 0; i < quantiles.size(); i++) {
521                         vector<float> temp;
522                         
523                         if (quantiles[i].size() == 0) {
524                                 //in case this is not a distance found in your template files
525                                 for (int g = 0; g < 6; g++) {
526                                         temp.push_back(0.0);
527                                 }
528                         }else{
529                                 
530                                 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
531                                 
532                                 //save 10%
533                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
534                                 //save 25%
535                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
536                                 //save 50%
537                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
538                                 //save 75%
539                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
540                                 //save 95%
541                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
542                                 //save 99%
543                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
544                                 
545                         }
546                         
547                         quan[i] = temp;
548                         
549                 }
550 */
551                 
552         }
553         catch(exception& e) {
554                 errorOut(e, "DeCalculator", "removeObviousOutliers");
555                 exit(1);
556         }
557 }
558 //***************************************************************************************************************
559 //put quanMember in the vector based on how far they are from the 99% or 1%.  Biggest offenders in front.
560 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
561         try{
562                 
563                 vector<quanMember> newQuan;
564                 map<quanMember*, float>::iterator it;
565                 
566                 while (quan.size() > 0) {
567                         
568                          map<quanMember*, float>::iterator largest = quan.begin(); 
569                           
570                         //find biggest member
571                         for (it = quan.begin(); it != quan.end(); it++) {
572                                 if (it->second > largest->second) {  largest = it;  }
573                         }
574 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
575                         //save it 
576                         newQuan.push_back(*(largest->first));
577                 
578                         //erase from quan
579                         quan.erase(largest);
580                 }
581                 
582                 return newQuan;
583                 
584         }
585         catch(exception& e) {
586                 errorOut(e, "DeCalculator", "sortContrib");
587                 exit(1);
588         }
589 }
590
591 //***************************************************************************************************************
592 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
593 int DeCalculator::findLargestContrib(vector<int> seen) {
594         try{
595                 
596                 int largest = 0;
597                 
598                 int largestContribs;
599                 
600                 for (int i = 0; i < seen.size(); i++)  {  
601                         
602                         if (seen[i] > largest) {
603                                 largestContribs = i;
604                                 largest = seen[i];
605                         }
606                 }
607                 
608                 return largestContribs;
609                 
610         }
611         catch(exception& e) {
612                 errorOut(e, "DeCalculator", "findLargestContribs");
613                 exit(1);
614         }
615 }
616 //***************************************************************************************************************
617 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
618         try{
619         
620                 vector<quanMember> newQuan;
621                 for (int i = 0; i < quan.size(); i++)  {  
622                         if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {  
623                                 newQuan.push_back(quan[i]);  
624                         }
625                 }
626                 
627                 quan = newQuan;
628                 
629         }
630         catch(exception& e) {
631                 errorOut(e, "DeCalculator", "removeContrib");
632                 exit(1);
633         }
634 }
635 */
636 //***************************************************************************************************************
637 float DeCalculator::findAverage(vector<float> myVector) {
638         try{
639                 
640                 float total = 0.0;
641                 for (int i = 0; i < myVector.size(); i++)  {  total += myVector[i];  }
642                 
643                 float average = total / (float) myVector.size();
644                 
645                 return average;
646                 
647         }
648         catch(exception& e) {
649                 errorOut(e, "DeCalculator", "findAverage");
650                 exit(1);
651         }
652 }
653
654 //***************************************************************************************************************
655 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
656         try {
657         
658                 //find average prob for this seqs windows
659                 float probAverage = findAverage(qav);
660                                 
661                 //find observed average 
662                 float obsAverage = findAverage(obs);
663                         
664                 float coef = obsAverage / probAverage;
665                                                 
666                 return coef;
667         }
668         catch(exception& e) {
669                 errorOut(e, "DeCalculator", "getCoef");
670                 exit(1);
671         }
672 }
673 //***************************************************************************************************************
674
675