]> git.donarmstrong.com Git - mothur.git/blob - decalc.cpp
fixed pintail bug
[mothur.git] / decalc.cpp
1 /*
2  *  decalc.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/22/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "decalc.h"
11 #include "chimera.h"
12 #include "dist.h"
13 #include "eachgapdist.h"
14 #include "ignoregaps.h"
15
16
17 //***************************************************************************************************************
18 void DeCalculator::setMask(string ms) { 
19         try {
20                 seqMask = ms; 
21                 int count = 0;
22                 maskMap.clear();
23                 
24                 if (seqMask.length() != 0) {
25                         //whereever there is a base in the mask, save that value is query and subject
26                         for (int i = 0; i < seqMask.length(); i++) {
27                                 if (isalpha(seqMask[i])) {
28                                         h.insert(i);
29                                         maskMap[count] = i;
30                                         count++;
31                                         
32                                 }
33                         }
34                 }else {
35                         for (int i = 0; i < alignLength; i++) {   
36                                 h.insert(i); 
37                                 maskMap[count] = i;
38                                 count++;
39                         }
40                 }
41         }
42         catch(exception& e) {
43                 m->errorOut(e, "DeCalculator", "setMask");
44                 exit(1);
45         } 
46 }
47 //***************************************************************************************************************
48 void DeCalculator::runMask(Sequence* seq) {
49         try{
50                 
51                 string q = seq->getAligned();
52                 string tempQuery = "";
53                 
54                 //whereever there is a base in the mask, save that value is query and subject
55                 set<int>::iterator setit;
56                 for ( setit=h.begin() ; setit != h.end(); setit++ )  {
57                         tempQuery += q[*setit];
58                 }
59                 
60                 //save masked values
61                 seq->setAligned(tempQuery);
62                 seq->setUnaligned(tempQuery);
63         }
64         catch(exception& e) {
65                 m->errorOut(e, "DeCalculator", "runMask");
66                 exit(1);
67         }
68 }
69 //***************************************************************************************************************
70 //num is query's spot in querySeqs
71 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
72         try {
73                 
74                 string q = query->getAligned();
75                 string s = subject->getAligned();
76                 
77                 int front = 0;
78                 for (int i = 0; i < q.length(); i++) {
79 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
80                         if (isalpha(q[i]) && isalpha(s[i])) { front = i; break;  }
81                 }
82 //cout << endl << endl;         
83                 int back = 0;           
84                 for (int i = q.length(); i >= 0; i--) {
85 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
86                         if (isalpha(q[i]) && isalpha(s[i])) { back = i; break;  }
87                 }
88                 
89                 trim[front] = back;
90                 
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "DeCalculator", "trimSeqs");
94                 exit(1);
95         }
96 }
97 //***************************************************************************************************************
98 vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
99         try {
100                 
101                 vector<int> win; 
102                 
103                 int cutoff = back - front;  //back - front 
104                         
105                 //if window is set to default
106                 if (size == 0) {  if (cutoff > 1200) {  size = 300; }
107                                                         else{  size = (cutoff / 4); }  } 
108                 else if (size > (cutoff / 4)) { 
109                                 m->mothurOut("You have selected too large a window size for sequence " + query->getName() + ".  I will choose an appropriate window size."); m->mothurOutEndLine();
110                                 size = (cutoff / 4); 
111                 }
112         
113         /*      string seq = query->getAligned().substr(front, cutoff);
114                         
115                 //count bases
116                 int numBases = 0;
117                 for (int l = 0; l < seq.length(); l++) {  if (isalpha(seq[l])) { numBases++; }  }
118 //cout << "num Bases = " << numBases << endl;                   
119                 //save start of seq
120                 win.push_back(front);
121 //cout << front << '\t';                
122                 //move ahead increment bases at a time until all bases are in a window
123                 int countBases = 0;
124                 int totalBases = 0;  //used to eliminate window of blanks at end of sequence
125                         
126                 seq = query->getAligned();
127                 for (int m = front; m < (back - size) ; m++) {
128                                 
129                         //count number of bases you see
130                         if (isalpha(seq[m])) { countBases++;  }
131                                 
132                         //if you have seen enough bases to make a new window
133                         if (countBases >= increment) {
134                                 //total bases is the number of bases in a window already.
135                                 totalBases += countBases;
136 //cout << "total bases = " << totalBases << endl;
137                                 win.push_back(m);  //save spot in alignment
138 //cout << m << '\t';
139                                 countBases = 0;                         //reset bases you've seen in this window
140                         }
141                                 
142                         //no need to continue if all your bases are in a window
143                         if (totalBases == numBases) {   break;  }
144                 }
145         
146
147                 //get last window if needed
148                 if (totalBases < numBases) {   win.push_back(back-size);  }
149 //cout << endl << endl;
150 */      
151                 
152                 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
153                 for (int i = front; i < (back - size) ; i+=increment) {  win.push_back(i);  }
154
155
156         
157                 return win;
158         
159         }
160         catch(exception& e) {
161                 m->errorOut(e, "DeCalculator", "findWindows");
162                 exit(1);
163         }
164 }
165
166 //***************************************************************************************************************
167 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
168         try {
169                 
170                 vector<float> temp;     
171                 //int gaps = 0;         
172                 for (int i = 0; i < window.size(); i++) {
173                                                 
174                         string seqFrag = query->getAligned().substr(window[i], size);
175                         string seqFragsub = subject->getAligned().substr(window[i], size);
176                                 
177                         int diff = 0;
178                         for (int b = 0; b < seqFrag.length(); b++) {
179                                 //if at least one is a base and they are not equal
180                                 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
181                                 
182                                 //ignore gaps
183                                 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
184                         }
185                
186                         //percentage of mismatched bases
187                         float dist;
188                         
189                         //if the whole fragment is 0 distance = 0
190                         //if ((seqFrag.length()-gaps) == 0)  { dist =  0.0; }
191                
192                         //percentage of mismatched bases
193                         //else {  dist = diff / (float) (seqFrag.length()-gaps) * 100;   } 
194                         
195                         dist = diff / (float) (seqFrag.length()) * 100; 
196                         
197                         temp.push_back(dist);
198                 }
199                         
200                 return temp;
201         }
202         catch(exception& e) {
203                 m->errorOut(e, "DeCalculator", "calcObserved");
204                 exit(1);
205         }
206 }
207 //***************************************************************************************************************
208 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
209         try {
210                 
211                 //so you only look at the trimmed part of the sequence
212                 int cutoff = back - front;  
213                 int gaps = 0;
214                         
215                 //from first startpoint with length back-front
216                 string seqFrag = query->getAligned().substr(front, cutoff);
217                 string seqFragsub = subject->getAligned().substr(front, cutoff);
218                                                                                                                 
219                 int diff = 0;
220                 for (int b = 0; b < seqFrag.length(); b++) {
221                         //ignore gaps
222                         if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
223                         if (seqFrag[b] != seqFragsub[b]) { diff++; }
224                 }
225                 
226                 //if the whole fragment is 0 distance = 0
227                 if ((seqFrag.length()-gaps) == 0)  { return 0.0; }
228                
229                 //percentage of mismatched bases
230                 float dist = diff / (float) (seqFrag.length()-gaps) * 100;       
231                                 
232                 return dist;
233         }
234         catch(exception& e) {
235                 m->errorOut(e, "DeCalculator", "calcDist");
236                 exit(1);
237         }
238 }
239
240 //***************************************************************************************************************
241 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
242         try {
243                 
244                 //for each window
245                 vector<float> queryExpected;
246                         
247                 for (int j = 0; j < qav.size(); j++) {          
248                                 
249                         float expected = qav[j] * coef;
250                                 
251                         queryExpected.push_back(expected);      
252                 }
253                         
254                 return queryExpected;
255                                 
256         }
257         catch(exception& e) {
258                 m->errorOut(e, "DeCalculator", "calcExpected");
259                 exit(1);
260         }
261 }
262 //***************************************************************************************************************
263 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
264         try {
265                 
266                 //for each window
267                 float sum = 0.0;  //sum = sum from 1 to i of (oi-ei)^2
268                 int numZeros = 0;
269                 for (int j = 0; j < obs.size(); j++) {          
270                         
271                         //if (obs[j] != 0.0) {
272                                 sum += ((obs[j] - exp[j]) * (obs[j] - exp[j])); 
273                         //}else {  numZeros++;   }
274                         
275                 }
276                         
277                 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
278                         
279                 return de;
280         }
281         catch(exception& e) {
282                 m->errorOut(e, "DeCalculator", "calcDE");
283                 exit(1);
284         }
285 }
286
287 //***************************************************************************************************************
288
289 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
290         try {
291
292                 vector<float> prob;
293                 string freqfile = getRootName(filename) + "freq";
294                 ofstream outFreq;
295                 
296                 openOutputFile(freqfile, outFreq);
297                 
298                 string length = toString(seqs.size());  //if there are 5000 seqs in the template then set precision to 3
299                 int precision = length.length() - 1;
300                 
301                 //format output
302                 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
303                 
304                 //at each position in the sequence
305                 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
306                         
307                         vector<int> freq;   freq.resize(4,0);
308                         int gaps = 0;
309                         
310                         //find the frequency of each nucleotide
311                         for (int j = 0; j < seqs.size(); j++) {
312                                 
313                                 char value = seqs[j]->getAligned()[i];
314                                 
315                                 if(toupper(value) == 'A')                                                                       {       freq[0]++;      }
316                                 else if(toupper(value) == 'T' || toupper(value) == 'U')         {       freq[1]++;      }
317                                 else if(toupper(value) == 'G')                                                          {       freq[2]++;      }
318                                 else if(toupper(value) == 'C')                                                          {       freq[3]++;      }
319                                 else { gaps++; }
320                         }
321                         
322                         //find base with highest frequency
323                         int highest = 0;
324                         for (int j = 0; j < freq.size(); j++) {   if (freq[j] > highest) {  highest = freq[j];  }               }
325                         
326                         float highFreq = highest / (float) (seqs.size());       
327                         
328                         float Pi;
329                         Pi =  (highFreq - 0.25) / 0.75; 
330                         
331                         //cannot have probability less than 0.
332                         if (Pi < 0) { Pi = 0.0; }
333                         
334                         //saves this for later
335                         outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
336         
337                         if (h.count(i) > 0) {
338                                 prob.push_back(Pi); 
339                         }
340                 }
341                 
342                 outFreq.close();
343                 
344                 return prob;
345                                 
346         }
347         catch(exception& e) {
348                 m->errorOut(e, "DeCalculator", "calcFreq");
349                 exit(1);
350         }
351 }
352 //***************************************************************************************************************
353 vector<float>  DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
354         try {
355                 vector<float>  averages; 
356                                 
357                 //for each window find average
358                 for (int i = 0; i < window.size(); i++) {
359                                 
360                         float average = 0.0;
361                                 
362                         //while you are in the window for this sequence
363                         int count = 0;
364                         for (int j = window[i]; j < (window[i]+size); j++) {   
365                                 average += probabilityProfile[j];
366                                 count++;
367                         }
368                                 
369                         average = average / count;
370         
371                         //save this windows average
372                         averages.push_back(average);
373                 }
374                                 
375                 return averages;
376         }
377         catch(exception& e) {
378                 m->errorOut(e, "DeCalculator", "findQav");
379                 exit(1);
380         }
381 }
382 //***************************************************************************************************************
383 //seqs have already been masked
384 vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
385         try {
386                 vector< vector<quanMember> > quan; 
387                 
388                 //percentage of mismatched pairs 1 to 100
389                 quan.resize(100);
390
391                 //for each sequence
392                 for(int i = start; i < end; i++){
393                 
394                         m->mothurOut("Processing sequence " + toString(i)); m->mothurOutEndLine();
395                         Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
396                         
397                         //compare to every other sequence in template
398                         for(int j = 0; j < i; j++){
399                                 
400                                 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
401                                 
402                                 if (m->control_pressed) { delete query; delete subject; return quan; }
403                                 
404                                 map<int, int> trim;
405                                 map<int, int>::iterator it;
406                                 
407                                 trimSeqs(query, subject, trim);
408                                 
409                                 it = trim.begin();
410                                 int front = it->first; int back = it->second;
411                                 
412                                 //reset window for each new comparison
413                                 windowSizesTemplate[i] = window;
414                                 
415                                 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
416                                 
417                                 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
418                                 
419                                 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
420                                                                         
421                                 float alpha = getCoef(obsi, q);
422                                                 
423                                 vector<float> exp = calcExpected(q, alpha);
424                                 
425                                 float de = calcDE(obsi, exp);
426                                                                 
427                                 float dist = calcDist(query, subject, front, back); 
428         //cout << i << '\t' <<  j << '\t' << dist << '\t' << de << endl;                        
429                                 dist = ceil(dist);
430                                 
431                                 quanMember newScore(de, i, j);
432                                 
433                                 quan[dist].push_back(newScore);
434
435                                 delete subject;
436                         }
437                         
438                         delete query;
439                 }
440         
441
442                 return quan;
443                                                 
444         }
445         catch(exception& e) {
446                 m->errorOut(e, "DeCalculator", "getQuantiles");
447                 exit(1);
448         }
449 }
450 //********************************************************************************************************************
451 //sorts lowest to highest
452 inline bool compareQuanMembers(quanMember left, quanMember right){
453         return (left.score < right.score);      
454
455 //***************************************************************************************************************
456 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right.  may want to revisit in the future...
457 void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
458         try {
459                                                 
460                 for (int i = 0; i < quantiles.size(); i++) {
461                         
462                         //find mean of this quantile score
463                         sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
464                         
465                         vector<quanMember> temp;
466                         if (quantiles[i].size() != 0) {
467                                 float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
468                                 float low =  quantiles[i][int(quantiles[i].size() * 0.01)].score;
469                         
470                                 //look at each value in quantiles to see if it is an outlier
471                                 for (int j = 0; j < quantiles[i].size(); j++) {
472                                         //is this score between 1 and 99%
473                                         if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
474                                                 temp.push_back(quantiles[i][j]);
475                                         }
476                                 }
477                         }
478                         quantiles[i] = temp;
479                 }
480
481 /*
482                 //find contributer with most offending score related to it
483                 int largestContrib = findLargestContrib(seen);
484         
485                 //while you still have guys to eliminate
486                 while (contributions.size() > 0) {
487                 
488                         m->mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); m->mothurOutEndLine();
489                         
490                         //remove from quantiles all scores that were made using this largestContrib
491                         for (int i = 0; i < quantiles.size(); i++) {
492 //cout << "before remove " << quantiles[i].size() << '\t';
493                                 removeContrib(largestContrib, quantiles[i]);
494 //cout << "after remove " << quantiles[i].size() << endl;
495                         }
496 //cout << count << " largest contrib = " << largestContrib << endl;  count++;
497                         //remove from contributions all scores that were made using this largestContrib
498                         removeContrib(largestContrib, contributions);
499                         
500                         //"erase" largestContrib
501                         seen[largestContrib] = -1;
502                         
503                         //get next largestContrib
504                         largestContrib = findLargestContrib(seen);
505                 }
506 ABOVE IS ATTEMPT #1             
507 **************************************************************************************************
508 BELOW IS ATTEMPT #2             
509                 vector<int> marked = returnObviousOutliers(quantiles, num);
510                 vector<int> copyMarked = marked;
511                 
512                 //find the 99% of marked
513                 sort(copyMarked.begin(), copyMarked.end());
514                 int high = copyMarked[int(copyMarked.size() * 0.99)];
515 cout << "high = " << high << endl;              
516                 
517                 for(int i = 0; i < marked.size(); i++) {
518                         if (marked[i] > high) { 
519                                 m->mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); m->mothurOutEndLine();
520                                 for (int i = 0; i < quantiles.size(); i++) {
521                                         removeContrib(marked[i], quantiles[i]);
522                                 }
523                         }
524
525                 }
526                 
527                 
528                 //adjust quantiles
529                 for (int i = 0; i < quantiles.size(); i++) {
530                         vector<float> temp;
531                         
532                         if (quantiles[i].size() == 0) {
533                                 //in case this is not a distance found in your template files
534                                 for (int g = 0; g < 6; g++) {
535                                         temp.push_back(0.0);
536                                 }
537                         }else{
538                                 
539                                 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
540                                 
541                                 //save 10%
542                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
543                                 //save 25%
544                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
545                                 //save 50%
546                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
547                                 //save 75%
548                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
549                                 //save 95%
550                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
551                                 //save 99%
552                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
553                                 
554                         }
555                         
556                         quan[i] = temp;
557                         
558                 }
559 */
560                 
561         }
562         catch(exception& e) {
563                 m->errorOut(e, "DeCalculator", "removeObviousOutliers");
564                 exit(1);
565         }
566 }
567 //***************************************************************************************************************
568 //put quanMember in the vector based on how far they are from the 99% or 1%.  Biggest offenders in front.
569 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
570         try{
571                 
572                 vector<quanMember> newQuan;
573                 map<quanMember*, float>::iterator it;
574                 
575                 while (quan.size() > 0) {
576                         
577                          map<quanMember*, float>::iterator largest = quan.begin(); 
578                           
579                         //find biggest member
580                         for (it = quan.begin(); it != quan.end(); it++) {
581                                 if (it->second > largest->second) {  largest = it;  }
582                         }
583 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
584                         //save it 
585                         newQuan.push_back(*(largest->first));
586                 
587                         //erase from quan
588                         quan.erase(largest);
589                 }
590                 
591                 return newQuan;
592                 
593         }
594         catch(exception& e) {
595                 m->errorOut(e, "DeCalculator", "sortContrib");
596                 exit(1);
597         }
598 }
599
600 //***************************************************************************************************************
601 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
602 int DeCalculator::findLargestContrib(vector<int> seen) {
603         try{
604                 
605                 int largest = 0;
606                 
607                 int largestContribs;
608                 
609                 for (int i = 0; i < seen.size(); i++)  {  
610                         
611                         if (seen[i] > largest) {
612                                 largestContribs = i;
613                                 largest = seen[i];
614                         }
615                 }
616                 
617                 return largestContribs;
618                 
619         }
620         catch(exception& e) {
621                 m->errorOut(e, "DeCalculator", "findLargestContribs");
622                 exit(1);
623         }
624 }
625 //***************************************************************************************************************
626 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
627         try{
628         
629                 vector<quanMember> newQuan;
630                 for (int i = 0; i < quan.size(); i++)  {  
631                         if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {  
632                                 newQuan.push_back(quan[i]);  
633                         }
634                 }
635                 
636                 quan = newQuan;
637                 
638         }
639         catch(exception& e) {
640                 m->errorOut(e, "DeCalculator", "removeContrib");
641                 exit(1);
642         }
643 }
644 */
645 //***************************************************************************************************************
646 float DeCalculator::findAverage(vector<float> myVector) {
647         try{
648                 
649                 float total = 0.0;
650                 for (int i = 0; i < myVector.size(); i++)  {  total += myVector[i];  }
651                 
652                 float average = total / (float) myVector.size();
653                 
654                 return average;
655                 
656         }
657         catch(exception& e) {
658                 m->errorOut(e, "DeCalculator", "findAverage");
659                 exit(1);
660         }
661 }
662
663 //***************************************************************************************************************
664 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
665         try {
666         
667                 //find average prob for this seqs windows
668                 float probAverage = findAverage(qav);
669                                 
670                 //find observed average 
671                 float obsAverage = findAverage(obs);
672                         
673                 float coef = obsAverage / probAverage;
674                                                 
675                 return coef;
676         }
677         catch(exception& e) {
678                 m->errorOut(e, "DeCalculator", "getCoef");
679                 exit(1);
680         }
681 }
682 //***************************************************************************************************************
683 //gets closest matches to each end, since chimeras will most likely have different parents on each end
684 vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db, int& numWanted, vector<int>& indexes) {
685         try {
686                 indexes.clear();
687                 
688                 vector<Sequence*> seqsMatches;  
689                 vector<SeqDist> distsLeft;
690                 vector<SeqDist> distsRight;
691                 
692                 Dist* distcalculator = new eachGapDist();
693                 
694                 string queryUnAligned = querySeq->getUnaligned();
695                 int numBases = int(queryUnAligned.length() * 0.33);
696                 
697                 string leftQuery = ""; //first 1/3 of the sequence
698                 string rightQuery = ""; //last 1/3 of the sequence
699                 string queryAligned = querySeq->getAligned();
700                 
701                 //left side
702                 bool foundFirstBase = false;
703                 int baseCount = 0;
704                 int leftSpot = 0;
705                 int firstBaseSpot = 0;
706                 for (int i = 0; i < queryAligned.length(); i++) {
707                         //if you are a base
708                         if (isalpha(queryAligned[i])) {         
709                                 baseCount++; 
710                                 if (!foundFirstBase) {   foundFirstBase = true;  firstBaseSpot = i;  }
711                         }
712                         
713                         //eliminate opening .'s
714                         if (foundFirstBase) {   leftQuery += queryAligned[i];  }
715                         //if you have 1/3
716                         if (baseCount >= numBases) {  leftSpot = i; break; } //first 1/3
717                 }
718                 
719                 //right side - count through another 1/3, so you are at last third
720                 baseCount = 0;
721                 int rightSpot = 0;
722                 for (int i = leftSpot; i < queryAligned.length(); i++) {
723                         //if you are a base
724                         if (isalpha(queryAligned[i])) {         baseCount++;    }
725                         //if you have 1/3
726                         if (baseCount >= numBases) { rightSpot = i;  break; } //last 1/3
727                 }
728                 
729                 //trim end
730                 //find last position in query that is a non gap character
731                 int lastBaseSpot = queryAligned.length()-1;
732                 for (int j = queryAligned.length()-1; j >= 0; j--) {
733                         if (isalpha(queryAligned[j])) {
734                                 lastBaseSpot = j;
735                                 break;
736                         }
737                 }
738                 rightQuery = queryAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //sequence from pos spot to end
739                 
740                 Sequence queryLeft(querySeq->getName(), leftQuery);
741                 Sequence queryRight(querySeq->getName(), rightQuery);
742 //cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << '\t' << firstBaseSpot << '\t' << lastBaseSpot << endl;
743 //cout << queryUnAligned.length() << '\t' << queryLeft.getUnaligned().length() << '\t' << queryRight.getUnaligned().length() << endl;
744                 for(int j = 0; j < db.size(); j++){
745                         
746                         string dbAligned = db[j]->getAligned();
747                         string leftDB = dbAligned.substr(firstBaseSpot, (leftSpot-firstBaseSpot+1)); //first 1/3 of the sequence
748                         string rightDB = dbAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //last 1/3 of the sequence
749                         
750                         Sequence dbLeft(db[j]->getName(), leftDB);
751                         Sequence dbRight(db[j]->getName(), rightDB);
752
753                         distcalculator->calcDist(queryLeft, dbLeft);
754                         float distLeft = distcalculator->getDist();
755                         
756                         distcalculator->calcDist(queryRight, dbRight);
757                         float distRight = distcalculator->getDist();
758                         
759                         SeqDist subjectLeft;
760                         subjectLeft.seq = db[j];
761                         subjectLeft.dist = distLeft;
762                         subjectLeft.index = j;
763                         
764                         distsLeft.push_back(subjectLeft);
765                         
766                         SeqDist subjectRight;
767                         subjectRight.seq = db[j];
768                         subjectRight.dist = distRight;
769                         subjectRight.index = j;
770                         
771                         distsRight.push_back(subjectRight);
772
773                 }
774                 
775                 delete distcalculator;
776                 
777                 //sort by smallest distance
778                 sort(distsRight.begin(), distsRight.end(), compareSeqDist);
779                 sort(distsLeft.begin(), distsLeft.end(), compareSeqDist);
780                 
781                 //merge results         
782                 map<string, string> seen;
783                 map<string, string>::iterator it;
784                 
785                 vector<SeqDist> dists;
786                 float lastRight = distsRight[0].dist;
787                 float lastLeft = distsLeft[0].dist;
788                 int lasti = 0;
789                 for (int i = 0; i < distsLeft.size(); i++) {
790                         //add left if you havent already
791                         it = seen.find(distsLeft[i].seq->getName());
792                         if (it == seen.end()) {  
793                                 dists.push_back(distsLeft[i]);
794                                 seen[distsLeft[i].seq->getName()] = distsLeft[i].seq->getName();
795                                 lastLeft =  distsLeft[i].dist;
796                         }
797
798                         //add right if you havent already
799                         it = seen.find(distsRight[i].seq->getName());
800                         if (it == seen.end()) {  
801                                 dists.push_back(distsRight[i]);
802                                 seen[distsRight[i].seq->getName()] = distsRight[i].seq->getName();
803                                 lastRight =  distsRight[i].dist;
804                         }
805                         
806                         if (dists.size() > numWanted) { lasti = i; break; } //you have enough results
807                 }
808                 
809                 //add in dups
810                 lasti++;
811                 int i = lasti;
812                 while (i < distsLeft.size()) {  
813                         if (distsLeft[i].dist == lastLeft) {  dists.push_back(distsLeft[i]);  numWanted++; }
814                         else { break; }
815                         i++;
816                 }
817                 
818                 i = lasti;
819                 while (i < distsRight.size()) {  
820                         if (distsRight[i].dist == lastRight) {  dists.push_back(distsRight[i]);  numWanted++; }
821                         else { break; }
822                         i++;
823                 }
824
825                 if (numWanted > dists.size()) { m->mothurOut("numwanted is larger than the number of template sequences, adjusting numwanted."); m->mothurOutEndLine(); numWanted = dists.size(); }
826
827 //cout << numWanted << endl;
828                 for (int i = 0; i < numWanted; i++) {
829 //cout << dists[i].seq->getName() << '\t' << dists[i].dist << endl;
830                         Sequence* temp = new Sequence(dists[i].seq->getName(), dists[i].seq->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
831                         seqsMatches.push_back(temp);
832                         indexes.push_back(dists[i].index);
833                 }
834                 
835                 return seqsMatches;
836         }
837         catch(exception& e) {
838                 m->errorOut(e, "DeCalculator", "findClosest");
839                 exit(1);
840         }
841 }
842 //***************************************************************************************************************
843 Sequence* DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db) {
844         try {
845                 
846                 Sequence* seqsMatch;  
847                 
848                 Dist* distcalculator = new eachGapDist();
849                 int index = 0;
850                 int smallest = 1000000;
851                 
852                 for(int j = 0; j < db.size(); j++){
853                         
854                         distcalculator->calcDist(*querySeq, *db[j]);
855                         float dist = distcalculator->getDist();
856                         
857                         if (dist < smallest) { 
858                                 smallest = dist;
859                                 index = j;
860                         }
861                 }
862                 
863                 delete distcalculator;
864                 
865                 seqsMatch = new Sequence(db[index]->getName(), db[index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
866                         
867                 return seqsMatch;
868         }
869         catch(exception& e) {
870                 m->errorOut(e, "DeCalculator", "findClosest");
871                 exit(1);
872         }
873 }
874 /***************************************************************************************************************/
875 map<int, int> DeCalculator::trimSeqs(Sequence* query, vector<Sequence*> topMatches) {
876         try {
877                 
878                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
879                 int rearPos = query->getAligned().length();
880                 
881                 //********find first position in topMatches that is a non gap character***********//
882                 //find first position all query seqs that is a non gap character
883                 for (int i = 0; i < topMatches.size(); i++) {
884                         
885                         string aligned = topMatches[i]->getAligned();
886                         int pos = 0;
887                         
888                         //find first spot in this seq
889                         for (int j = 0; j < aligned.length(); j++) {
890                                 if (isalpha(aligned[j])) {
891                                         pos = j;
892                                         break;
893                                 }
894                         }
895                         
896                         //save this spot if it is the farthest
897                         if (pos > frontPos) { frontPos = pos; }
898                 }
899                 
900                 
901                 string aligned = query->getAligned();
902                 int pos = 0;
903                         
904                 //find first position in query that is a non gap character
905                 for (int j = 0; j < aligned.length(); j++) {
906                         if (isalpha(aligned[j])) {
907                                 pos = j;
908                                 break;
909                         }
910                 }
911                 
912                 //save this spot if it is the farthest
913                 if (pos > frontPos) { frontPos = pos; }
914                 
915                 
916                 //********find last position in topMatches that is a non gap character***********//
917                 for (int i = 0; i < topMatches.size(); i++) {
918                         
919                         string aligned = topMatches[i]->getAligned();
920                         int pos = aligned.length();
921                         
922                         //find first spot in this seq
923                         for (int j = aligned.length()-1; j >= 0; j--) {
924                                 if (isalpha(aligned[j])) {
925                                         pos = j;
926                                         break;
927                                 }
928                         }
929                         
930                         //save this spot if it is the farthest
931                         if (pos < rearPos) { rearPos = pos; }
932                 }
933                 
934                 
935                 aligned = query->getAligned();
936                 pos = aligned.length();
937                 
938                 //find last position in query that is a non gap character
939                 for (int j = aligned.length()-1; j >= 0; j--) {
940                         if (isalpha(aligned[j])) {
941                                 pos = j;
942                                 break;
943                         }
944                 }
945                 
946                 //save this spot if it is the farthest
947                 if (pos < rearPos) { rearPos = pos; }
948
949                 //check to make sure that is not whole seq
950                 if ((rearPos - frontPos - 1) <= 0) {  m->mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); m->mothurOutEndLine(); exit(1);  }
951 //cout << query->getName() << " front = " << frontPos << " rear = " << rearPos << endl;         
952                 //trim query
953                 string newAligned = query->getAligned();
954                 newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
955                 query->setAligned(newAligned);
956                 
957                 //trim topMatches
958                 for (int i = 0; i < topMatches.size(); i++) {
959                         newAligned = topMatches[i]->getAligned();
960                         newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
961                         topMatches[i]->setAligned(newAligned);
962                 }
963                 
964                 map<int, int> trimmedPos;
965                 
966                 for (int i = 0; i < newAligned.length(); i++) {
967                         trimmedPos[i] = i+frontPos;
968                 }
969                 
970                 return trimmedPos;
971         }
972         catch(exception& e) {
973                 m->errorOut(e, "DeCalculator", "trimSequences");
974                 exit(1);
975         }
976
977 }
978 //***************************************************************************************************************
979
980
981