]> git.donarmstrong.com Git - mothur.git/blob - decalc.cpp
chimera.slayer fix
[mothur.git] / decalc.cpp
1 /*
2  *  decalc.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/22/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "decalc.h"
11 #include "chimera.h"
12 #include "dist.h"
13 #include "eachgapdist.h"
14 #include "ignoregaps.h"
15
16
17 //***************************************************************************************************************
18 void DeCalculator::setMask(string ms) { 
19         try {
20                 seqMask = ms; 
21                 int count = 0;
22                 maskMap.clear();
23                 
24                 if (seqMask.length() != 0) {
25                         //whereever there is a base in the mask, save that value is query and subject
26                         for (int i = 0; i < seqMask.length(); i++) {
27                                 if (isalpha(seqMask[i])) {
28                                         h.insert(i);
29                                         maskMap[count] = i;
30                                         count++;
31                                         
32                                 }
33                         }
34                 }else {
35                         for (int i = 0; i < alignLength; i++) {   
36                                 h.insert(i); 
37                                 maskMap[count] = i;
38                                 count++;
39                         }
40                 }
41         }
42         catch(exception& e) {
43                 m->errorOut(e, "DeCalculator", "setMask");
44                 exit(1);
45         } 
46 }
47 //***************************************************************************************************************
48 void DeCalculator::runMask(Sequence* seq) {
49         try{
50                 
51                 string q = seq->getAligned();
52                 string tempQuery = "";
53                 
54                 //whereever there is a base in the mask, save that value is query and subject
55                 set<int>::iterator setit;
56                 for ( setit=h.begin() ; setit != h.end(); setit++ )  {
57                         tempQuery += q[*setit];
58                 }
59                 
60                 //save masked values
61                 seq->setAligned(tempQuery);
62                 seq->setUnaligned(tempQuery);
63         }
64         catch(exception& e) {
65                 m->errorOut(e, "DeCalculator", "runMask");
66                 exit(1);
67         }
68 }
69 //***************************************************************************************************************
70 //num is query's spot in querySeqs
71 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
72         try {
73                 
74                 string q = query->getAligned();
75                 string s = subject->getAligned();
76                 
77                 int front = 0;
78                 for (int i = 0; i < q.length(); i++) {
79 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
80                         if (isalpha(q[i]) && isalpha(s[i])) { front = i; break;  }
81                 }
82 //cout << endl << endl;         
83                 int back = 0;           
84                 for (int i = q.length(); i >= 0; i--) {
85 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
86                         if (isalpha(q[i]) && isalpha(s[i])) { back = i; break;  }
87                 }
88                 
89                 trim[front] = back;
90                 
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "DeCalculator", "trimSeqs");
94                 exit(1);
95         }
96 }
97 //***************************************************************************************************************
98 vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
99         try {
100                 
101                 vector<int> win; 
102                 
103                 int cutoff = back - front;  //back - front 
104                         
105                 //if window is set to default
106                 if (size == 0) {  if (cutoff > 1200) {  size = 300; }
107                                                         else{  size = (cutoff / 4); }  } 
108                 else if (size > (cutoff / 4)) { 
109                                 m->mothurOut("You have selected too large a window size for sequence " + query->getName() + ".  I will choose an appropriate window size."); m->mothurOutEndLine();
110                                 size = (cutoff / 4); 
111                 }
112         
113         /*      string seq = query->getAligned().substr(front, cutoff);
114                         
115                 //count bases
116                 int numBases = 0;
117                 for (int l = 0; l < seq.length(); l++) {  if (isalpha(seq[l])) { numBases++; }  }
118 //cout << "num Bases = " << numBases << endl;                   
119                 //save start of seq
120                 win.push_back(front);
121 //cout << front << '\t';                
122                 //move ahead increment bases at a time until all bases are in a window
123                 int countBases = 0;
124                 int totalBases = 0;  //used to eliminate window of blanks at end of sequence
125                         
126                 seq = query->getAligned();
127                 for (int m = front; m < (back - size) ; m++) {
128                                 
129                         //count number of bases you see
130                         if (isalpha(seq[m])) { countBases++;  }
131                                 
132                         //if you have seen enough bases to make a new window
133                         if (countBases >= increment) {
134                                 //total bases is the number of bases in a window already.
135                                 totalBases += countBases;
136 //cout << "total bases = " << totalBases << endl;
137                                 win.push_back(m);  //save spot in alignment
138 //cout << m << '\t';
139                                 countBases = 0;                         //reset bases you've seen in this window
140                         }
141                                 
142                         //no need to continue if all your bases are in a window
143                         if (totalBases == numBases) {   break;  }
144                 }
145         
146
147                 //get last window if needed
148                 if (totalBases < numBases) {   win.push_back(back-size);  }
149 //cout << endl << endl;
150 */      
151                 
152                 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
153                 for (int i = front; i < (back - size) ; i+=increment) {  win.push_back(i);  }
154
155
156         
157                 return win;
158         
159         }
160         catch(exception& e) {
161                 m->errorOut(e, "DeCalculator", "findWindows");
162                 exit(1);
163         }
164 }
165
166 //***************************************************************************************************************
167 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
168         try {
169                 
170                 vector<float> temp;     
171                 //int gaps = 0;         
172                 for (int i = 0; i < window.size(); i++) {
173                                                 
174                         string seqFrag = query->getAligned().substr(window[i], size);
175                         string seqFragsub = subject->getAligned().substr(window[i], size);
176                                 
177                         int diff = 0;
178                         for (int b = 0; b < seqFrag.length(); b++) {
179                                 //if at least one is a base and they are not equal
180                                 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
181                                 
182                                 //ignore gaps
183                                 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
184                         }
185                
186                         //percentage of mismatched bases
187                         float dist;
188                         
189                         //if the whole fragment is 0 distance = 0
190                         //if ((seqFrag.length()-gaps) == 0)  { dist =  0.0; }
191                
192                         //percentage of mismatched bases
193                         //else {  dist = diff / (float) (seqFrag.length()-gaps) * 100;   } 
194                         
195                         dist = diff / (float) (seqFrag.length()) * 100; 
196                         
197                         temp.push_back(dist);
198                 }
199                         
200                 return temp;
201         }
202         catch(exception& e) {
203                 m->errorOut(e, "DeCalculator", "calcObserved");
204                 exit(1);
205         }
206 }
207 //***************************************************************************************************************
208 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
209         try {
210                 
211                 //so you only look at the trimmed part of the sequence
212                 int cutoff = back - front;  
213                 int gaps = 0;
214                         
215                 //from first startpoint with length back-front
216                 string seqFrag = query->getAligned().substr(front, cutoff);
217                 string seqFragsub = subject->getAligned().substr(front, cutoff);
218                                                                                                                 
219                 int diff = 0;
220                 for (int b = 0; b < seqFrag.length(); b++) {
221                         //ignore gaps
222                         if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
223                         if (seqFrag[b] != seqFragsub[b]) { diff++; }
224                 }
225                 
226                 //if the whole fragment is 0 distance = 0
227                 if ((seqFrag.length()-gaps) == 0)  { return 0.0; }
228                
229                 //percentage of mismatched bases
230                 float dist = diff / (float) (seqFrag.length()-gaps) * 100;       
231                                 
232                 return dist;
233         }
234         catch(exception& e) {
235                 m->errorOut(e, "DeCalculator", "calcDist");
236                 exit(1);
237         }
238 }
239
240 //***************************************************************************************************************
241 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
242         try {
243                 
244                 //for each window
245                 vector<float> queryExpected;
246                         
247                 for (int j = 0; j < qav.size(); j++) {          
248                                 
249                         float expected = qav[j] * coef;
250                                 
251                         queryExpected.push_back(expected);      
252                 }
253                         
254                 return queryExpected;
255                                 
256         }
257         catch(exception& e) {
258                 m->errorOut(e, "DeCalculator", "calcExpected");
259                 exit(1);
260         }
261 }
262 //***************************************************************************************************************
263 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
264         try {
265                 
266                 //for each window
267                 float sum = 0.0;  //sum = sum from 1 to i of (oi-ei)^2
268                 int numZeros = 0;
269                 for (int j = 0; j < obs.size(); j++) {          
270                         
271                         //if (obs[j] != 0.0) {
272                                 sum += ((obs[j] - exp[j]) * (obs[j] - exp[j])); 
273                         //}else {  numZeros++;   }
274                         
275                 }
276                         
277                 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
278                         
279                 return de;
280         }
281         catch(exception& e) {
282                 m->errorOut(e, "DeCalculator", "calcDE");
283                 exit(1);
284         }
285 }
286
287 //***************************************************************************************************************
288
289 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
290         try {
291
292                 vector<float> prob;
293                 string freqfile = m->getRootName(filename) + "freq";
294                 ofstream outFreq;
295                 
296                 m->openOutputFile(freqfile, outFreq);
297                 
298                 outFreq << "#" << m->getVersion() << endl;
299                 
300                 string length = toString(seqs.size());  //if there are 5000 seqs in the template then set precision to 3
301                 int precision = length.length() - 1;
302                 
303                 //format output
304                 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
305                 
306                 //at each position in the sequence
307                 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
308                         
309                         vector<int> freq;   freq.resize(4,0);
310                         int gaps = 0;
311                         
312                         //find the frequency of each nucleotide
313                         for (int j = 0; j < seqs.size(); j++) {
314                                 
315                                 char value = seqs[j]->getAligned()[i];
316                                 
317                                 if(toupper(value) == 'A')                                                                       {       freq[0]++;      }
318                                 else if(toupper(value) == 'T' || toupper(value) == 'U')         {       freq[1]++;      }
319                                 else if(toupper(value) == 'G')                                                          {       freq[2]++;      }
320                                 else if(toupper(value) == 'C')                                                          {       freq[3]++;      }
321                                 else { gaps++; }
322                         }
323                         
324                         //find base with highest frequency
325                         int highest = 0;
326                         for (int j = 0; j < freq.size(); j++) {   if (freq[j] > highest) {  highest = freq[j];  }               }
327                         
328                         float highFreq = highest / (float) (seqs.size());       
329                         
330                         float Pi;
331                         Pi =  (highFreq - 0.25) / 0.75; 
332                         
333                         //cannot have probability less than 0.
334                         if (Pi < 0) { Pi = 0.0; }
335                         
336                         //saves this for later
337                         outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
338         
339                         if (h.count(i) > 0) {
340                                 prob.push_back(Pi); 
341                         }
342                 }
343                 
344                 outFreq.close();
345                 
346                 return prob;
347                                 
348         }
349         catch(exception& e) {
350                 m->errorOut(e, "DeCalculator", "calcFreq");
351                 exit(1);
352         }
353 }
354 //***************************************************************************************************************
355 vector<float>  DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
356         try {
357                 vector<float>  averages; 
358                                 
359                 //for each window find average
360                 for (int i = 0; i < window.size(); i++) {
361                                 
362                         float average = 0.0;
363                                 
364                         //while you are in the window for this sequence
365                         int count = 0;
366                         for (int j = window[i]; j < (window[i]+size); j++) {   
367                                 average += probabilityProfile[j];
368                                 count++;
369                         }
370                                 
371                         average = average / count;
372         
373                         //save this windows average
374                         averages.push_back(average);
375                 }
376                                 
377                 return averages;
378         }
379         catch(exception& e) {
380                 m->errorOut(e, "DeCalculator", "findQav");
381                 exit(1);
382         }
383 }
384 //***************************************************************************************************************
385 //seqs have already been masked
386 vector< vector<float> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
387         try {
388                 vector< vector<float> > quan; 
389                 
390                 //percentage of mismatched pairs 1 to 100
391                 quan.resize(100);
392
393                 //for each sequence
394                 for(int i = start; i < end; i++){
395                 
396                         m->mothurOut("Processing sequence " + toString(i)); m->mothurOutEndLine();
397                         Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
398                         
399                         //compare to every other sequence in template
400                         for(int j = 0; j < i; j++){
401                                 
402                                 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
403                                 
404                                 if (m->control_pressed) { delete query; delete subject; return quan; }
405                                 
406                                 map<int, int> trim;
407                                 map<int, int>::iterator it;
408                                 
409                                 trimSeqs(query, subject, trim);
410                                 
411                                 it = trim.begin();
412                                 int front = it->first; int back = it->second;
413                                 
414                                 //reset window for each new comparison
415                                 windowSizesTemplate[i] = window;
416                                 
417                                 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
418                                 
419                                 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
420                                 
421                                 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
422                                                                         
423                                 float alpha = getCoef(obsi, q);
424                                                 
425                                 vector<float> exp = calcExpected(q, alpha);
426                                 
427                                 float de = calcDE(obsi, exp);
428                                                                 
429                                 float dist = calcDist(query, subject, front, back); 
430         //cout << i << '\t' <<  j << '\t' << dist << '\t' << de << endl;                        
431                                 dist = ceil(dist);
432                                 
433                                 //quanMember newScore(de, i, j);
434                                 
435                                 quan[dist].push_back(de);
436
437                                 delete subject;
438                         }
439                         
440                         delete query;
441                 }
442         
443
444                 return quan;
445                                                 
446         }
447         catch(exception& e) {
448                 m->errorOut(e, "DeCalculator", "getQuantiles");
449                 exit(1);
450         }
451 }
452 //********************************************************************************************************************
453 //sorts lowest to highest
454 inline bool compareQuanMembers(quanMember left, quanMember right){
455         return (left.score < right.score);      
456
457 //***************************************************************************************************************
458 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right.  may want to revisit in the future...
459 void DeCalculator::removeObviousOutliers(vector< vector<float> >& quantiles, int num) {
460         try {
461                                                 
462                 for (int i = 0; i < quantiles.size(); i++) {
463                         
464                         //find mean of this quantile score
465                         sort(quantiles[i].begin(), quantiles[i].end());
466                         
467                         vector<float> temp;
468                         if (quantiles[i].size() != 0) {
469                                 float high = quantiles[i][int(quantiles[i].size() * 0.99)];
470                                 float low =  quantiles[i][int(quantiles[i].size() * 0.01)];
471                         
472                                 //look at each value in quantiles to see if it is an outlier
473                                 for (int j = 0; j < quantiles[i].size(); j++) {
474                                         //is this score between 1 and 99%
475                                         if ((quantiles[i][j] > low) && (quantiles[i][j] < high)) {
476                                                 temp.push_back(quantiles[i][j]);
477                                         }
478                                 }
479                         }
480                         quantiles[i] = temp;
481                 }
482
483 /*
484                 //find contributer with most offending score related to it
485                 int largestContrib = findLargestContrib(seen);
486         
487                 //while you still have guys to eliminate
488                 while (contributions.size() > 0) {
489                 
490                         m->mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); m->mothurOutEndLine();
491                         
492                         //remove from quantiles all scores that were made using this largestContrib
493                         for (int i = 0; i < quantiles.size(); i++) {
494 //cout << "before remove " << quantiles[i].size() << '\t';
495                                 removeContrib(largestContrib, quantiles[i]);
496 //cout << "after remove " << quantiles[i].size() << endl;
497                         }
498 //cout << count << " largest contrib = " << largestContrib << endl;  count++;
499                         //remove from contributions all scores that were made using this largestContrib
500                         removeContrib(largestContrib, contributions);
501                         
502                         //"erase" largestContrib
503                         seen[largestContrib] = -1;
504                         
505                         //get next largestContrib
506                         largestContrib = findLargestContrib(seen);
507                 }
508 ABOVE IS ATTEMPT #1             
509 **************************************************************************************************
510 BELOW IS ATTEMPT #2             
511                 vector<int> marked = returnObviousOutliers(quantiles, num);
512                 vector<int> copyMarked = marked;
513                 
514                 //find the 99% of marked
515                 sort(copyMarked.begin(), copyMarked.end());
516                 int high = copyMarked[int(copyMarked.size() * 0.99)];
517 cout << "high = " << high << endl;              
518                 
519                 for(int i = 0; i < marked.size(); i++) {
520                         if (marked[i] > high) { 
521                                 m->mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); m->mothurOutEndLine();
522                                 for (int i = 0; i < quantiles.size(); i++) {
523                                         removeContrib(marked[i], quantiles[i]);
524                                 }
525                         }
526
527                 }
528                 
529                 
530                 //adjust quantiles
531                 for (int i = 0; i < quantiles.size(); i++) {
532                         vector<float> temp;
533                         
534                         if (quantiles[i].size() == 0) {
535                                 //in case this is not a distance found in your template files
536                                 for (int g = 0; g < 6; g++) {
537                                         temp.push_back(0.0);
538                                 }
539                         }else{
540                                 
541                                 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
542                                 
543                                 //save 10%
544                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
545                                 //save 25%
546                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
547                                 //save 50%
548                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
549                                 //save 75%
550                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
551                                 //save 95%
552                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
553                                 //save 99%
554                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
555                                 
556                         }
557                         
558                         quan[i] = temp;
559                         
560                 }
561 */
562                 
563         }
564         catch(exception& e) {
565                 m->errorOut(e, "DeCalculator", "removeObviousOutliers");
566                 exit(1);
567         }
568 }
569 //***************************************************************************************************************
570 //put quanMember in the vector based on how far they are from the 99% or 1%.  Biggest offenders in front.
571 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
572         try{
573                 
574                 vector<quanMember> newQuan;
575                 map<quanMember*, float>::iterator it;
576                 
577                 while (quan.size() > 0) {
578                         
579                          map<quanMember*, float>::iterator largest = quan.begin(); 
580                           
581                         //find biggest member
582                         for (it = quan.begin(); it != quan.end(); it++) {
583                                 if (it->second > largest->second) {  largest = it;  }
584                         }
585 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
586                         //save it 
587                         newQuan.push_back(*(largest->first));
588                 
589                         //erase from quan
590                         quan.erase(largest);
591                 }
592                 
593                 return newQuan;
594                 
595         }
596         catch(exception& e) {
597                 m->errorOut(e, "DeCalculator", "sortContrib");
598                 exit(1);
599         }
600 }
601
602 //***************************************************************************************************************
603 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
604 int DeCalculator::findLargestContrib(vector<int> seen) {
605         try{
606                 
607                 int largest = 0;
608                 
609                 int largestContribs;
610                 
611                 for (int i = 0; i < seen.size(); i++)  {  
612                         
613                         if (seen[i] > largest) {
614                                 largestContribs = i;
615                                 largest = seen[i];
616                         }
617                 }
618                 
619                 return largestContribs;
620                 
621         }
622         catch(exception& e) {
623                 m->errorOut(e, "DeCalculator", "findLargestContribs");
624                 exit(1);
625         }
626 }
627 //***************************************************************************************************************
628 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
629         try{
630         
631                 vector<quanMember> newQuan;
632                 for (int i = 0; i < quan.size(); i++)  {  
633                         if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {  
634                                 newQuan.push_back(quan[i]);  
635                         }
636                 }
637                 
638                 quan = newQuan;
639                 
640         }
641         catch(exception& e) {
642                 m->errorOut(e, "DeCalculator", "removeContrib");
643                 exit(1);
644         }
645 }
646 */
647 //***************************************************************************************************************
648 float DeCalculator::findAverage(vector<float> myVector) {
649         try{
650                 
651                 float total = 0.0;
652                 for (int i = 0; i < myVector.size(); i++)  {  total += myVector[i];  }
653                 
654                 float average = total / (float) myVector.size();
655                 
656                 return average;
657                 
658         }
659         catch(exception& e) {
660                 m->errorOut(e, "DeCalculator", "findAverage");
661                 exit(1);
662         }
663 }
664
665 //***************************************************************************************************************
666 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
667         try {
668         
669                 //find average prob for this seqs windows
670                 float probAverage = findAverage(qav);
671                                 
672                 //find observed average 
673                 float obsAverage = findAverage(obs);
674                         
675                 float coef = obsAverage / probAverage;
676                                                 
677                 return coef;
678         }
679         catch(exception& e) {
680                 m->errorOut(e, "DeCalculator", "getCoef");
681                 exit(1);
682         }
683 }
684 //***************************************************************************************************************
685 //gets closest matches to each end, since chimeras will most likely have different parents on each end
686 vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db, int& numWanted, vector<int>& indexes) {
687         try {
688                 indexes.clear();
689                 
690                 vector<Sequence*> seqsMatches;  
691                 vector<SeqDist> distsLeft;
692                 vector<SeqDist> distsRight;
693                 
694                 Dist* distcalculator = new eachGapDist();
695                 
696                 string queryUnAligned = querySeq->getUnaligned();
697                 int numBases = int(queryUnAligned.length() * 0.33);
698                 
699                 string leftQuery = ""; //first 1/3 of the sequence
700                 string rightQuery = ""; //last 1/3 of the sequence
701                 string queryAligned = querySeq->getAligned();
702                 
703                 //left side
704                 bool foundFirstBase = false;
705                 int baseCount = 0;
706                 int leftSpot = 0;
707                 int firstBaseSpot = 0;
708                 for (int i = 0; i < queryAligned.length(); i++) {
709                         //if you are a base
710                         if (isalpha(queryAligned[i])) {         
711                                 baseCount++; 
712                                 if (!foundFirstBase) {   foundFirstBase = true;  firstBaseSpot = i;  }
713                         }
714                         
715                         //eliminate opening .'s
716                         if (foundFirstBase) {   leftQuery += queryAligned[i];  }
717                         //if you have 1/3
718                         if (baseCount >= numBases) {  leftSpot = i; break; } //first 1/3
719                 }
720                 
721                 //right side - count through another 1/3, so you are at last third
722                 baseCount = 0;
723                 int rightSpot = 0;
724                 for (int i = leftSpot; i < queryAligned.length(); i++) {
725                         //if you are a base
726                         if (isalpha(queryAligned[i])) {         baseCount++;    }
727                         //if you have 1/3
728                         if (baseCount >= numBases) { rightSpot = i;  break; } //last 1/3
729                 }
730                 
731                 //trim end
732                 //find last position in query that is a non gap character
733                 int lastBaseSpot = queryAligned.length()-1;
734                 for (int j = queryAligned.length()-1; j >= 0; j--) {
735                         if (isalpha(queryAligned[j])) {
736                                 lastBaseSpot = j;
737                                 break;
738                         }
739                 }
740                 rightQuery = queryAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //sequence from pos spot to end
741                 
742                 Sequence queryLeft(querySeq->getName(), leftQuery);
743                 Sequence queryRight(querySeq->getName(), rightQuery);
744 //cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << '\t' << firstBaseSpot << '\t' << lastBaseSpot << endl;
745 //cout << queryUnAligned.length() << '\t' << queryLeft.getUnaligned().length() << '\t' << queryRight.getUnaligned().length() << endl;
746                 for(int j = 0; j < db.size(); j++){
747                         
748                         string dbAligned = db[j]->getAligned();
749                         string leftDB = dbAligned.substr(firstBaseSpot, (leftSpot-firstBaseSpot+1)); //first 1/3 of the sequence
750                         string rightDB = dbAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //last 1/3 of the sequence
751                         
752                         Sequence dbLeft(db[j]->getName(), leftDB);
753                         Sequence dbRight(db[j]->getName(), rightDB);
754
755                         distcalculator->calcDist(queryLeft, dbLeft);
756                         float distLeft = distcalculator->getDist();
757                         
758                         distcalculator->calcDist(queryRight, dbRight);
759                         float distRight = distcalculator->getDist();
760                         
761                         SeqDist subjectLeft;
762                         subjectLeft.seq = db[j];
763                         subjectLeft.dist = distLeft;
764                         subjectLeft.index = j;
765                         
766                         distsLeft.push_back(subjectLeft);
767                         
768                         SeqDist subjectRight;
769                         subjectRight.seq = db[j];
770                         subjectRight.dist = distRight;
771                         subjectRight.index = j;
772                         
773                         distsRight.push_back(subjectRight);
774
775                 }
776                 
777                 delete distcalculator;
778                 
779                 //sort by smallest distance
780                 sort(distsRight.begin(), distsRight.end(), compareSeqDist);
781                 sort(distsLeft.begin(), distsLeft.end(), compareSeqDist);
782                 
783                 //merge results         
784                 map<string, string> seen;
785                 map<string, string>::iterator it;
786                 
787                 vector<SeqDist> dists;
788                 float lastRight = distsRight[0].dist;
789                 float lastLeft = distsLeft[0].dist;
790                 int lasti = 0;
791                 for (int i = 0; i < distsLeft.size(); i++) {
792                         //add left if you havent already
793                         it = seen.find(distsLeft[i].seq->getName());
794                         if (it == seen.end()) {  
795                                 dists.push_back(distsLeft[i]);
796                                 seen[distsLeft[i].seq->getName()] = distsLeft[i].seq->getName();
797                                 lastLeft =  distsLeft[i].dist;
798                         }
799
800                         //add right if you havent already
801                         it = seen.find(distsRight[i].seq->getName());
802                         if (it == seen.end()) {  
803                                 dists.push_back(distsRight[i]);
804                                 seen[distsRight[i].seq->getName()] = distsRight[i].seq->getName();
805                                 lastRight =  distsRight[i].dist;
806                         }
807                         
808                         if (dists.size() > numWanted) { lasti = i; break; } //you have enough results
809                 }
810                 
811                 //add in dups
812                 lasti++;
813                 int i = lasti;
814                 while (i < distsLeft.size()) {  
815                         if (distsLeft[i].dist == lastLeft) {  dists.push_back(distsLeft[i]);  numWanted++; }
816                         else { break; }
817                         i++;
818                 }
819                 
820                 i = lasti;
821                 while (i < distsRight.size()) {  
822                         if (distsRight[i].dist == lastRight) {  dists.push_back(distsRight[i]);  numWanted++; }
823                         else { break; }
824                         i++;
825                 }
826
827                 if (numWanted > dists.size()) { m->mothurOut("numwanted is larger than the number of template sequences, adjusting numwanted."); m->mothurOutEndLine(); numWanted = dists.size(); }
828
829 //cout << numWanted << endl;
830                 for (int i = 0; i < numWanted; i++) {
831 //cout << dists[i].seq->getName() << '\t' << dists[i].dist << endl;
832                         Sequence* temp = new Sequence(dists[i].seq->getName(), dists[i].seq->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
833                         seqsMatches.push_back(temp);
834                         indexes.push_back(dists[i].index);
835                 }
836                 
837                 return seqsMatches;
838         }
839         catch(exception& e) {
840                 m->errorOut(e, "DeCalculator", "findClosest");
841                 exit(1);
842         }
843 }
844 //***************************************************************************************************************
845 Sequence* DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db) {
846         try {
847                 
848                 Sequence* seqsMatch;  
849                 
850                 Dist* distcalculator = new eachGapDist();
851                 int index = 0;
852                 int smallest = 1000000;
853                 
854                 for(int j = 0; j < db.size(); j++){
855                         
856                         distcalculator->calcDist(*querySeq, *db[j]);
857                         float dist = distcalculator->getDist();
858                         
859                         if (dist < smallest) { 
860                                 smallest = dist;
861                                 index = j;
862                         }
863                 }
864                 
865                 delete distcalculator;
866                 
867                 seqsMatch = new Sequence(db[index]->getName(), db[index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
868                         
869                 return seqsMatch;
870         }
871         catch(exception& e) {
872                 m->errorOut(e, "DeCalculator", "findClosest");
873                 exit(1);
874         }
875 }
876 /***************************************************************************************************************/
877 map<int, int> DeCalculator::trimSeqs(Sequence* query, vector<Sequence*> topMatches) {
878         try {
879                 
880                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
881                 int rearPos = query->getAligned().length();
882                 
883                 //********find first position in topMatches that is a non gap character***********//
884                 //find first position all query seqs that is a non gap character
885                 for (int i = 0; i < topMatches.size(); i++) {
886                         
887                         string aligned = topMatches[i]->getAligned();
888                         int pos = 0;
889                         
890                         //find first spot in this seq
891                         for (int j = 0; j < aligned.length(); j++) {
892                                 if (isalpha(aligned[j])) {
893                                         pos = j;
894                                         break;
895                                 }
896                         }
897                         
898                         //save this spot if it is the farthest
899                         if (pos > frontPos) { frontPos = pos; }
900                 }
901                 
902                 
903                 string aligned = query->getAligned();
904                 int pos = 0;
905                         
906                 //find first position in query that is a non gap character
907                 for (int j = 0; j < aligned.length(); j++) {
908                         if (isalpha(aligned[j])) {
909                                 pos = j;
910                                 break;
911                         }
912                 }
913                 
914                 //save this spot if it is the farthest
915                 if (pos > frontPos) { frontPos = pos; }
916                 
917                 
918                 //********find last position in topMatches that is a non gap character***********//
919                 for (int i = 0; i < topMatches.size(); i++) {
920                         
921                         string aligned = topMatches[i]->getAligned();
922                         int pos = aligned.length();
923                         
924                         //find first spot in this seq
925                         for (int j = aligned.length()-1; j >= 0; j--) {
926                                 if (isalpha(aligned[j])) {
927                                         pos = j;
928                                         break;
929                                 }
930                         }
931                         
932                         //save this spot if it is the farthest
933                         if (pos < rearPos) { rearPos = pos; }
934                 }
935                 
936                 
937                 aligned = query->getAligned();
938                 pos = aligned.length();
939                 
940                 //find last position in query that is a non gap character
941                 for (int j = aligned.length()-1; j >= 0; j--) {
942                         if (isalpha(aligned[j])) {
943                                 pos = j;
944                                 break;
945                         }
946                 }
947                 
948                 //save this spot if it is the farthest
949                 if (pos < rearPos) { rearPos = pos; }
950                 
951                 map<int, int> trimmedPos;
952                 //check to make sure that is not whole seq
953                 if ((rearPos - frontPos - 1) <= 0) {  
954                         m->mothurOut("[ERROR]: when I trim your sequences, the entire sequence is trimmed."); m->mothurOutEndLine();  
955                         query->setAligned("");
956                         //trim topMatches
957                         for (int i = 0; i < topMatches.size(); i++) {
958                                 topMatches[i]->setAligned("");
959                         }
960                         
961                 }else {
962
963                         //trim query
964                         string newAligned = query->getAligned();
965                         newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
966                         query->setAligned(newAligned);
967                         
968                         //trim topMatches
969                         for (int i = 0; i < topMatches.size(); i++) {
970                                 newAligned = topMatches[i]->getAligned();
971                                 newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
972                                 topMatches[i]->setAligned(newAligned);
973                         }
974                         
975                         for (int i = 0; i < newAligned.length(); i++) {
976                                 trimmedPos[i] = i+frontPos;
977                         }
978                 }
979                 return trimmedPos;
980         }
981         catch(exception& e) {
982                 m->errorOut(e, "DeCalculator", "trimSequences");
983                 exit(1);
984         }
985
986 }
987 //***************************************************************************************************************
988
989
990