]> git.donarmstrong.com Git - mothur.git/blob - decalc.cpp
created mothurOut class to handle logfiles
[mothur.git] / decalc.cpp
1 /*
2  *  decalc.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/22/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "decalc.h"
11 #include "chimera.h"
12 #include "dist.h"
13 #include "eachgapdist.h"
14 #include "ignoregaps.h"
15
16
17 //***************************************************************************************************************
18 void DeCalculator::setMask(string ms) { 
19         try {
20                 seqMask = ms; 
21                 int count = 0;
22                 maskMap.clear();
23                 
24                 if (seqMask.length() != 0) {
25                         //whereever there is a base in the mask, save that value is query and subject
26                         for (int i = 0; i < seqMask.length(); i++) {
27                                 if (isalpha(seqMask[i])) {
28                                         h.insert(i);
29                                         maskMap[count] = i;
30                                         count++;
31                                         
32                                 }
33                         }
34                 }else {
35                         for (int i = 0; i < alignLength; i++) {   
36                                 h.insert(i); 
37                                 maskMap[count] = i;
38                                 count++;
39                         }
40                 }
41         }
42         catch(exception& e) {
43                 m->errorOut(e, "DeCalculator", "setMask");
44                 exit(1);
45         } 
46 }
47 //***************************************************************************************************************
48 void DeCalculator::runMask(Sequence* seq) {
49         try{
50                 
51                 string q = seq->getAligned();
52                 string tempQuery = "";
53                 
54                 //whereever there is a base in the mask, save that value is query and subject
55                 set<int>::iterator setit;
56                 for ( setit=h.begin() ; setit != h.end(); setit++ )  {
57                         tempQuery += q[*setit];
58                 }
59                 
60                 //save masked values
61                 seq->setAligned(tempQuery);
62                 seq->setUnaligned(tempQuery);
63         }
64         catch(exception& e) {
65                 m->errorOut(e, "DeCalculator", "runMask");
66                 exit(1);
67         }
68 }
69 //***************************************************************************************************************
70 //num is query's spot in querySeqs
71 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
72         try {
73                 
74                 string q = query->getAligned();
75                 string s = subject->getAligned();
76                 
77                 int front = 0;
78                 for (int i = 0; i < q.length(); i++) {
79 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
80                         if (isalpha(q[i]) && isalpha(s[i])) { front = i; break;  }
81                 }
82 //cout << endl << endl;         
83                 int back = 0;           
84                 for (int i = q.length(); i >= 0; i--) {
85 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
86                         if (isalpha(q[i]) && isalpha(s[i])) { back = i; break;  }
87                 }
88                 
89                 trim[front] = back;
90                 
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "DeCalculator", "trimSeqs");
94                 exit(1);
95         }
96 }
97 //***************************************************************************************************************
98 vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
99         try {
100                 
101                 vector<int> win; 
102                 
103                 int cutoff = back - front;  //back - front 
104                         
105                 //if window is set to default
106                 if (size == 0) {  if (cutoff > 1200) {  size = 300; }
107                                                         else{  size = (cutoff / 4); }  } 
108                 else if (size > (cutoff / 4)) { 
109                                 m->mothurOut("You have selected too large a window size for sequence " + query->getName() + ".  I will choose an appropriate window size."); m->mothurOutEndLine();
110                                 size = (cutoff / 4); 
111                 }
112         
113         /*      string seq = query->getAligned().substr(front, cutoff);
114                         
115                 //count bases
116                 int numBases = 0;
117                 for (int l = 0; l < seq.length(); l++) {  if (isalpha(seq[l])) { numBases++; }  }
118 //cout << "num Bases = " << numBases << endl;                   
119                 //save start of seq
120                 win.push_back(front);
121 //cout << front << '\t';                
122                 //move ahead increment bases at a time until all bases are in a window
123                 int countBases = 0;
124                 int totalBases = 0;  //used to eliminate window of blanks at end of sequence
125                         
126                 seq = query->getAligned();
127                 for (int m = front; m < (back - size) ; m++) {
128                                 
129                         //count number of bases you see
130                         if (isalpha(seq[m])) { countBases++;  }
131                                 
132                         //if you have seen enough bases to make a new window
133                         if (countBases >= increment) {
134                                 //total bases is the number of bases in a window already.
135                                 totalBases += countBases;
136 //cout << "total bases = " << totalBases << endl;
137                                 win.push_back(m);  //save spot in alignment
138 //cout << m << '\t';
139                                 countBases = 0;                         //reset bases you've seen in this window
140                         }
141                                 
142                         //no need to continue if all your bases are in a window
143                         if (totalBases == numBases) {   break;  }
144                 }
145         
146
147                 //get last window if needed
148                 if (totalBases < numBases) {   win.push_back(back-size);  }
149 //cout << endl << endl;
150 */      
151                 
152                 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
153                 for (int m = front; m < (back - size) ; m+=increment) {  win.push_back(m);  }
154
155
156         
157                 return win;
158         
159         }
160         catch(exception& e) {
161                 m->errorOut(e, "DeCalculator", "findWindows");
162                 exit(1);
163         }
164 }
165
166 //***************************************************************************************************************
167 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
168         try {
169                 
170                 vector<float> temp;     
171                 //int gaps = 0;         
172                 for (int m = 0; m < window.size(); m++) {
173                                                 
174                         string seqFrag = query->getAligned().substr(window[m], size);
175                         string seqFragsub = subject->getAligned().substr(window[m], size);
176                                 
177                         int diff = 0;
178                         for (int b = 0; b < seqFrag.length(); b++) {
179                                 //if at least one is a base and they are not equal
180                                 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
181                                 
182                                 //ignore gaps
183                                 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
184                         }
185                
186                         //percentage of mismatched bases
187                         float dist;
188                         
189                         //if the whole fragment is 0 distance = 0
190                         //if ((seqFrag.length()-gaps) == 0)  { dist =  0.0; }
191                
192                         //percentage of mismatched bases
193                         //else {  dist = diff / (float) (seqFrag.length()-gaps) * 100;   } 
194                         
195                         dist = diff / (float) (seqFrag.length()) * 100; 
196                         
197                         temp.push_back(dist);
198                 }
199                         
200                 return temp;
201         }
202         catch(exception& e) {
203                 m->errorOut(e, "DeCalculator", "calcObserved");
204                 exit(1);
205         }
206 }
207 //***************************************************************************************************************
208 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
209         try {
210                 
211                 //so you only look at the trimmed part of the sequence
212                 int cutoff = back - front;  
213                 int gaps = 0;
214                         
215                 //from first startpoint with length back-front
216                 string seqFrag = query->getAligned().substr(front, cutoff);
217                 string seqFragsub = subject->getAligned().substr(front, cutoff);
218                                                                                                                 
219                 int diff = 0;
220                 for (int b = 0; b < seqFrag.length(); b++) {
221                         //ignore gaps
222                         if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
223                         if (seqFrag[b] != seqFragsub[b]) { diff++; }
224                 }
225                 
226                 //if the whole fragment is 0 distance = 0
227                 if ((seqFrag.length()-gaps) == 0)  { return 0.0; }
228                
229                 //percentage of mismatched bases
230                 float dist = diff / (float) (seqFrag.length()-gaps) * 100;       
231                                 
232                 return dist;
233         }
234         catch(exception& e) {
235                 m->errorOut(e, "DeCalculator", "calcDist");
236                 exit(1);
237         }
238 }
239
240 //***************************************************************************************************************
241 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
242         try {
243                 
244                 //for each window
245                 vector<float> queryExpected;
246                         
247                 for (int m = 0; m < qav.size(); m++) {          
248                                 
249                         float expected = qav[m] * coef;
250                                 
251                         queryExpected.push_back(expected);      
252                 }
253                         
254                 return queryExpected;
255                                 
256         }
257         catch(exception& e) {
258                 m->errorOut(e, "DeCalculator", "calcExpected");
259                 exit(1);
260         }
261 }
262 //***************************************************************************************************************
263 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
264         try {
265                 
266                 //for each window
267                 float sum = 0.0;  //sum = sum from 1 to m of (oi-ei)^2
268                 int numZeros = 0;
269                 for (int m = 0; m < obs.size(); m++) {          
270                         
271                         //if (obs[m] != 0.0) {
272                                 sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); 
273                         //}else {  numZeros++;   }
274                         
275                 }
276                         
277                 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
278                         
279                 return de;
280         }
281         catch(exception& e) {
282                 m->errorOut(e, "DeCalculator", "calcDE");
283                 exit(1);
284         }
285 }
286
287 //***************************************************************************************************************
288
289 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
290         try {
291
292                 vector<float> prob;
293                 string freqfile = getRootName(filename) + "freq";
294                 ofstream outFreq;
295                 
296                 openOutputFile(freqfile, outFreq);
297                 
298                 string length = toString(seqs.size());  //if there are 5000 seqs in the template then set precision to 3
299                 int precision = length.length() - 1;
300                 
301                 //format output
302                 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
303                 
304                 //at each position in the sequence
305                 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
306                         
307                         vector<int> freq;   freq.resize(4,0);
308                         int gaps = 0;
309                         
310                         //find the frequency of each nucleotide
311                         for (int j = 0; j < seqs.size(); j++) {
312                                 
313                                 char value = seqs[j]->getAligned()[i];
314                                 
315                                 if(toupper(value) == 'A')                                                                       {       freq[0]++;      }
316                                 else if(toupper(value) == 'T' || toupper(value) == 'U')         {       freq[1]++;      }
317                                 else if(toupper(value) == 'G')                                                          {       freq[2]++;      }
318                                 else if(toupper(value) == 'C')                                                          {       freq[3]++;      }
319                                 else { gaps++; }
320                         }
321                         
322                         //find base with highest frequency
323                         int highest = 0;
324                         for (int m = 0; m < freq.size(); m++) {   if (freq[m] > highest) {  highest = freq[m];  }               }
325                         
326                         float highFreq = highest / (float) (seqs.size());       
327                         
328                         float Pi;
329                         Pi =  (highFreq - 0.25) / 0.75; 
330                         
331                         //cannot have probability less than 0.
332                         if (Pi < 0) { Pi = 0.0; }
333                         
334                         //saves this for later
335                         outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
336         
337                         if (h.count(i) > 0) {
338                                 prob.push_back(Pi); 
339                         }
340                 }
341                 
342                 outFreq.close();
343                 
344                 return prob;
345                                 
346         }
347         catch(exception& e) {
348                 m->errorOut(e, "DeCalculator", "calcFreq");
349                 exit(1);
350         }
351 }
352 //***************************************************************************************************************
353 vector<float>  DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
354         try {
355                 vector<float>  averages; 
356                                 
357                 //for each window find average
358                 for (int m = 0; m < window.size(); m++) {
359                                 
360                         float average = 0.0;
361                                 
362                         //while you are in the window for this sequence
363                         int count = 0;
364                         for (int j = window[m]; j < (window[m]+size); j++) {   
365                                 average += probabilityProfile[j];
366                                 count++;
367                         }
368                                 
369                         average = average / count;
370         
371                         //save this windows average
372                         averages.push_back(average);
373                 }
374                                 
375                 return averages;
376         }
377         catch(exception& e) {
378                 m->errorOut(e, "DeCalculator", "findQav");
379                 exit(1);
380         }
381 }
382 //***************************************************************************************************************
383 //seqs have already been masked
384 vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
385         try {
386                 vector< vector<quanMember> > quan; 
387                 
388                 //percentage of mismatched pairs 1 to 100
389                 quan.resize(100);
390 //ofstream o;
391 //string out = "getQuantiles.out";
392 //openOutputFile(out, o);
393                 
394                 //for each sequence
395                 for(int i = start; i < end; i++){
396                 
397                         m->mothurOut("Processing sequence " + toString(i)); m->mothurOutEndLine();
398                         Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
399                         
400                         //compare to every other sequence in template
401                         for(int j = 0; j < i; j++){
402                                 
403                                 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
404                                 
405                                 map<int, int> trim;
406                                 map<int, int>::iterator it;
407                                 
408                                 trimSeqs(query, subject, trim);
409                                 
410                                 it = trim.begin();
411                                 int front = it->first; int back = it->second;
412                                 
413                                 //reset window for each new comparison
414                                 windowSizesTemplate[i] = window;
415                                 
416                                 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
417                                 
418                                 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
419                                 
420                                 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
421                                                                         
422                                 float alpha = getCoef(obsi, q);
423                                                 
424                                 vector<float> exp = calcExpected(q, alpha);
425                                 
426                                 float de = calcDE(obsi, exp);
427                                                                 
428                                 float dist = calcDist(query, subject, front, back); 
429         //o << i << '\t' <<  j << '\t' << dist << '\t' << de << endl;                   
430                                 dist = ceil(dist);
431                                 
432                                 quanMember newScore(de, i, j);
433                                 
434                                 quan[dist].push_back(newScore);
435                                 
436                                 delete subject;
437                         }
438                         
439                         delete query;
440                 }
441
442                 return quan;
443                                                 
444         }
445         catch(exception& e) {
446                 m->errorOut(e, "DeCalculator", "getQuantiles");
447                 exit(1);
448         }
449 }
450 //********************************************************************************************************************
451 //sorts lowest to highest
452 inline bool compareQuanMembers(quanMember left, quanMember right){
453         return (left.score < right.score);      
454
455 //***************************************************************************************************************
456 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right.  may want to revisit in the future...
457 void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
458         try {
459                                                 
460                 for (int i = 0; i < quantiles.size(); i++) {
461                 
462                         //find mean of this quantile score
463                         sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
464                         
465                         float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
466                         float low =  quantiles[i][int(quantiles[i].size() * 0.01)].score;
467                         
468                         vector<quanMember> temp;
469                         
470                         //look at each value in quantiles to see if it is an outlier
471                         for (int j = 0; j < quantiles[i].size(); j++) {
472                                 //is this score between 1 and 99%
473                                 if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
474                                         temp.push_back(quantiles[i][j]);
475                                 }
476                         }
477                         
478                         quantiles[i] = temp;
479                 }
480
481 /*
482                 //find contributer with most offending score related to it
483                 int largestContrib = findLargestContrib(seen);
484         
485                 //while you still have guys to eliminate
486                 while (contributions.size() > 0) {
487                 
488                         m->mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); m->mothurOutEndLine();
489                         
490                         //remove from quantiles all scores that were made using this largestContrib
491                         for (int i = 0; i < quantiles.size(); i++) {
492 //cout << "before remove " << quantiles[i].size() << '\t';
493                                 removeContrib(largestContrib, quantiles[i]);
494 //cout << "after remove " << quantiles[i].size() << endl;
495                         }
496 //cout << count << " largest contrib = " << largestContrib << endl;  count++;
497                         //remove from contributions all scores that were made using this largestContrib
498                         removeContrib(largestContrib, contributions);
499                         
500                         //"erase" largestContrib
501                         seen[largestContrib] = -1;
502                         
503                         //get next largestContrib
504                         largestContrib = findLargestContrib(seen);
505                 }
506 ABOVE IS ATTEMPT #1             
507 **************************************************************************************************
508 BELOW IS ATTEMPT #2             
509                 vector<int> marked = returnObviousOutliers(quantiles, num);
510                 vector<int> copyMarked = marked;
511                 
512                 //find the 99% of marked
513                 sort(copyMarked.begin(), copyMarked.end());
514                 int high = copyMarked[int(copyMarked.size() * 0.99)];
515 cout << "high = " << high << endl;              
516                 
517                 for(int i = 0; i < marked.size(); i++) {
518                         if (marked[i] > high) { 
519                                 m->mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); m->mothurOutEndLine();
520                                 for (int i = 0; i < quantiles.size(); i++) {
521                                         removeContrib(marked[i], quantiles[i]);
522                                 }
523                         }
524
525                 }
526                 
527                 
528                 //adjust quantiles
529                 for (int i = 0; i < quantiles.size(); i++) {
530                         vector<float> temp;
531                         
532                         if (quantiles[i].size() == 0) {
533                                 //in case this is not a distance found in your template files
534                                 for (int g = 0; g < 6; g++) {
535                                         temp.push_back(0.0);
536                                 }
537                         }else{
538                                 
539                                 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
540                                 
541                                 //save 10%
542                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
543                                 //save 25%
544                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
545                                 //save 50%
546                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
547                                 //save 75%
548                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
549                                 //save 95%
550                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
551                                 //save 99%
552                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
553                                 
554                         }
555                         
556                         quan[i] = temp;
557                         
558                 }
559 */
560                 
561         }
562         catch(exception& e) {
563                 m->errorOut(e, "DeCalculator", "removeObviousOutliers");
564                 exit(1);
565         }
566 }
567 //***************************************************************************************************************
568 //put quanMember in the vector based on how far they are from the 99% or 1%.  Biggest offenders in front.
569 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
570         try{
571                 
572                 vector<quanMember> newQuan;
573                 map<quanMember*, float>::iterator it;
574                 
575                 while (quan.size() > 0) {
576                         
577                          map<quanMember*, float>::iterator largest = quan.begin(); 
578                           
579                         //find biggest member
580                         for (it = quan.begin(); it != quan.end(); it++) {
581                                 if (it->second > largest->second) {  largest = it;  }
582                         }
583 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
584                         //save it 
585                         newQuan.push_back(*(largest->first));
586                 
587                         //erase from quan
588                         quan.erase(largest);
589                 }
590                 
591                 return newQuan;
592                 
593         }
594         catch(exception& e) {
595                 m->errorOut(e, "DeCalculator", "sortContrib");
596                 exit(1);
597         }
598 }
599
600 //***************************************************************************************************************
601 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
602 int DeCalculator::findLargestContrib(vector<int> seen) {
603         try{
604                 
605                 int largest = 0;
606                 
607                 int largestContribs;
608                 
609                 for (int i = 0; i < seen.size(); i++)  {  
610                         
611                         if (seen[i] > largest) {
612                                 largestContribs = i;
613                                 largest = seen[i];
614                         }
615                 }
616                 
617                 return largestContribs;
618                 
619         }
620         catch(exception& e) {
621                 m->errorOut(e, "DeCalculator", "findLargestContribs");
622                 exit(1);
623         }
624 }
625 //***************************************************************************************************************
626 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
627         try{
628         
629                 vector<quanMember> newQuan;
630                 for (int i = 0; i < quan.size(); i++)  {  
631                         if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {  
632                                 newQuan.push_back(quan[i]);  
633                         }
634                 }
635                 
636                 quan = newQuan;
637                 
638         }
639         catch(exception& e) {
640                 m->errorOut(e, "DeCalculator", "removeContrib");
641                 exit(1);
642         }
643 }
644 */
645 //***************************************************************************************************************
646 float DeCalculator::findAverage(vector<float> myVector) {
647         try{
648                 
649                 float total = 0.0;
650                 for (int i = 0; i < myVector.size(); i++)  {  total += myVector[i];  }
651                 
652                 float average = total / (float) myVector.size();
653                 
654                 return average;
655                 
656         }
657         catch(exception& e) {
658                 m->errorOut(e, "DeCalculator", "findAverage");
659                 exit(1);
660         }
661 }
662
663 //***************************************************************************************************************
664 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
665         try {
666         
667                 //find average prob for this seqs windows
668                 float probAverage = findAverage(qav);
669                                 
670                 //find observed average 
671                 float obsAverage = findAverage(obs);
672                         
673                 float coef = obsAverage / probAverage;
674                                                 
675                 return coef;
676         }
677         catch(exception& e) {
678                 m->errorOut(e, "DeCalculator", "getCoef");
679                 exit(1);
680         }
681 }
682 //***************************************************************************************************************
683 //gets closest matches to each end, since chimeras will most likely have different parents on each end
684 vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db, int& numWanted, vector<int>& indexes) {
685         try {
686                 indexes.clear();
687                 
688                 vector<Sequence*> seqsMatches;  
689                 vector<SeqDist> distsLeft;
690                 vector<SeqDist> distsRight;
691                 
692                 Dist* distcalculator = new eachGapDist();
693                 
694                 string queryUnAligned = querySeq->getUnaligned();
695                 int numBases = int(queryUnAligned.length() * 0.33);
696                 
697                 string leftQuery = ""; //first 1/3 of the sequence
698                 string rightQuery = ""; //last 1/3 of the sequence
699                 string queryAligned = querySeq->getAligned();
700                 
701                 //left side
702                 bool foundFirstBase = false;
703                 int baseCount = 0;
704                 int leftSpot = 0;
705                 int firstBaseSpot = 0;
706                 for (int i = 0; i < queryAligned.length(); i++) {
707                         //if you are a base
708                         if (isalpha(queryAligned[i])) {         
709                                 baseCount++; 
710                                 if (!foundFirstBase) {   foundFirstBase = true;  firstBaseSpot = i;  }
711                         }
712                         
713                         //eliminate opening .'s
714                         if (foundFirstBase) {   leftQuery += queryAligned[i];  }
715                         //if you have 1/3
716                         if (baseCount >= numBases) {  leftSpot = i; break; } //first 1/3
717                 }
718                 
719                 //right side - count through another 1/3, so you are at last third
720                 baseCount = 0;
721                 int rightSpot = 0;
722                 for (int i = leftSpot; i < queryAligned.length(); i++) {
723                         //if you are a base
724                         if (isalpha(queryAligned[i])) {         baseCount++;    }
725                         //if you have 1/3
726                         if (baseCount >= numBases) { rightSpot = i;  break; } //last 1/3
727                 }
728                 
729                 //trim end
730                 //find last position in query that is a non gap character
731                 int lastBaseSpot = queryAligned.length()-1;
732                 for (int j = queryAligned.length()-1; j >= 0; j--) {
733                         if (isalpha(queryAligned[j])) {
734                                 lastBaseSpot = j;
735                                 break;
736                         }
737                 }
738                 rightQuery = queryAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //sequence from pos spot to end
739                 
740                 Sequence queryLeft(querySeq->getName(), leftQuery);
741                 Sequence queryRight(querySeq->getName(), rightQuery);
742 //cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << '\t' << firstBaseSpot << '\t' << lastBaseSpot << endl;
743 //cout << queryUnAligned.length() << '\t' << queryLeft.getUnaligned().length() << '\t' << queryRight.getUnaligned().length() << endl;
744                 for(int j = 0; j < db.size(); j++){
745                         
746                         string dbAligned = db[j]->getAligned();
747                         string leftDB = dbAligned.substr(firstBaseSpot, (leftSpot-firstBaseSpot+1)); //first 1/3 of the sequence
748                         string rightDB = dbAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //last 1/3 of the sequence
749                         
750                         Sequence dbLeft(db[j]->getName(), leftDB);
751                         Sequence dbRight(db[j]->getName(), rightDB);
752
753                         distcalculator->calcDist(queryLeft, dbLeft);
754                         float distLeft = distcalculator->getDist();
755                         
756                         distcalculator->calcDist(queryRight, dbRight);
757                         float distRight = distcalculator->getDist();
758                         
759                         SeqDist subjectLeft;
760                         subjectLeft.seq = db[j];
761                         subjectLeft.dist = distLeft;
762                         subjectLeft.index = j;
763                         
764                         distsLeft.push_back(subjectLeft);
765                         
766                         SeqDist subjectRight;
767                         subjectRight.seq = db[j];
768                         subjectRight.dist = distRight;
769                         subjectRight.index = j;
770                         
771                         distsRight.push_back(subjectRight);
772
773                 }
774                 
775                 delete distcalculator;
776                 
777                 //sort by smallest distance
778                 sort(distsRight.begin(), distsRight.end(), compareSeqDist);
779                 sort(distsLeft.begin(), distsLeft.end(), compareSeqDist);
780                 
781                 //merge results         
782                 map<string, string> seen;
783                 map<string, string>::iterator it;
784                 
785                 vector<SeqDist> dists;
786                 float lastRight = distsRight[0].dist;
787                 float lastLeft = distsLeft[0].dist;
788                 int lasti = 0;
789                 for (int i = 0; i < distsLeft.size(); i++) {
790                         //add left if you havent already
791                         it = seen.find(distsLeft[i].seq->getName());
792                         if (it == seen.end()) {  
793                                 dists.push_back(distsLeft[i]);
794                                 seen[distsLeft[i].seq->getName()] = distsLeft[i].seq->getName();
795                                 lastLeft =  distsLeft[i].dist;
796                         }
797
798                         //add right if you havent already
799                         it = seen.find(distsRight[i].seq->getName());
800                         if (it == seen.end()) {  
801                                 dists.push_back(distsRight[i]);
802                                 seen[distsRight[i].seq->getName()] = distsRight[i].seq->getName();
803                                 lastRight =  distsRight[i].dist;
804                         }
805                         
806                         if (dists.size() > numWanted) { lasti = i; break; } //you have enough results
807                 }
808                 
809                 //add in dups
810                 lasti++;
811                 int i = lasti;
812                 while (i < distsLeft.size()) {  
813                         if (distsLeft[i].dist == lastLeft) {  dists.push_back(distsLeft[i]);  numWanted++; }
814                         else { break; }
815                         i++;
816                 }
817                 
818                 i = lasti;
819                 while (i < distsRight.size()) {  
820                         if (distsRight[i].dist == lastRight) {  dists.push_back(distsRight[i]);  numWanted++; }
821                         else { break; }
822                         i++;
823                 }
824
825 //cout << numWanted << endl;
826                 for (int i = 0; i < numWanted; i++) {
827 //cout << dists[i].seq->getName() << '\t' << dists[i].dist << endl;
828                         Sequence* temp = new Sequence(dists[i].seq->getName(), dists[i].seq->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
829                         seqsMatches.push_back(temp);
830                         indexes.push_back(dists[i].index);
831                 }
832                 
833                 return seqsMatches;
834         }
835         catch(exception& e) {
836                 m->errorOut(e, "DeCalculator", "findClosest");
837                 exit(1);
838         }
839 }
840 //***************************************************************************************************************
841 Sequence* DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db) {
842         try {
843                 
844                 Sequence* seqsMatch;  
845                 
846                 Dist* distcalculator = new eachGapDist();
847                 int index = 0;
848                 int smallest = 1000000;
849                 
850                 for(int j = 0; j < db.size(); j++){
851                         
852                         distcalculator->calcDist(*querySeq, *db[j]);
853                         float dist = distcalculator->getDist();
854                         
855                         if (dist < smallest) { 
856                                 smallest = dist;
857                                 index = j;
858                         }
859                 }
860                 
861                 delete distcalculator;
862                 
863                 seqsMatch = new Sequence(db[index]->getName(), db[index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
864                         
865                 return seqsMatch;
866         }
867         catch(exception& e) {
868                 m->errorOut(e, "DeCalculator", "findClosest");
869                 exit(1);
870         }
871 }
872 /***************************************************************************************************************/
873 map<int, int> DeCalculator::trimSeqs(Sequence* query, vector<Sequence*> topMatches) {
874         try {
875                 
876                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
877                 int rearPos = query->getAligned().length();
878                 
879                 //********find first position in topMatches that is a non gap character***********//
880                 //find first position all query seqs that is a non gap character
881                 for (int i = 0; i < topMatches.size(); i++) {
882                         
883                         string aligned = topMatches[i]->getAligned();
884                         int pos = 0;
885                         
886                         //find first spot in this seq
887                         for (int j = 0; j < aligned.length(); j++) {
888                                 if (isalpha(aligned[j])) {
889                                         pos = j;
890                                         break;
891                                 }
892                         }
893                         
894                         //save this spot if it is the farthest
895                         if (pos > frontPos) { frontPos = pos; }
896                 }
897                 
898                 
899                 string aligned = query->getAligned();
900                 int pos = 0;
901                         
902                 //find first position in query that is a non gap character
903                 for (int j = 0; j < aligned.length(); j++) {
904                         if (isalpha(aligned[j])) {
905                                 pos = j;
906                                 break;
907                         }
908                 }
909                 
910                 //save this spot if it is the farthest
911                 if (pos > frontPos) { frontPos = pos; }
912                 
913                 
914                 //********find last position in topMatches that is a non gap character***********//
915                 for (int i = 0; i < topMatches.size(); i++) {
916                         
917                         string aligned = topMatches[i]->getAligned();
918                         int pos = aligned.length();
919                         
920                         //find first spot in this seq
921                         for (int j = aligned.length()-1; j >= 0; j--) {
922                                 if (isalpha(aligned[j])) {
923                                         pos = j;
924                                         break;
925                                 }
926                         }
927                         
928                         //save this spot if it is the farthest
929                         if (pos < rearPos) { rearPos = pos; }
930                 }
931                 
932                 
933                 aligned = query->getAligned();
934                 pos = aligned.length();
935                 
936                 //find last position in query that is a non gap character
937                 for (int j = aligned.length()-1; j >= 0; j--) {
938                         if (isalpha(aligned[j])) {
939                                 pos = j;
940                                 break;
941                         }
942                 }
943                 
944                 //save this spot if it is the farthest
945                 if (pos < rearPos) { rearPos = pos; }
946
947                 //check to make sure that is not whole seq
948                 if ((rearPos - frontPos - 1) <= 0) {  m->mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); m->mothurOutEndLine(); exit(1);  }
949 //cout << query->getName() << " front = " << frontPos << " rear = " << rearPos << endl;         
950                 //trim query
951                 string newAligned = query->getAligned();
952                 newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
953                 query->setAligned(newAligned);
954                 
955                 //trim topMatches
956                 for (int i = 0; i < topMatches.size(); i++) {
957                         newAligned = topMatches[i]->getAligned();
958                         newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
959                         topMatches[i]->setAligned(newAligned);
960                 }
961                 
962                 map<int, int> trimmedPos;
963                 
964                 for (int i = 0; i < newAligned.length(); i++) {
965                         trimmedPos[i] = i+frontPos;
966                 }
967                 
968                 return trimmedPos;
969         }
970         catch(exception& e) {
971                 m->errorOut(e, "DeCalculator", "trimSequences");
972                 exit(1);
973         }
974
975 }
976 //***************************************************************************************************************
977
978
979