]> git.donarmstrong.com Git - mothur.git/blob - decalc.cpp
committing so I can work on the other machine
[mothur.git] / decalc.cpp
1 /*
2  *  decalc.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/22/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "decalc.h"
11 #include "chimera.h"
12 #include "dist.h"
13 #include "eachgapdist.h"
14 #include "ignoregaps.h"
15
16
17 //***************************************************************************************************************
18 void DeCalculator::setMask(string ms) { 
19         try {
20                 seqMask = ms; 
21                 int count = 0;
22                 maskMap.clear();
23                 
24                 if (seqMask.length() != 0) {
25                         //whereever there is a base in the mask, save that value is query and subject
26                         for (int i = 0; i < seqMask.length(); i++) {
27                                 if (isalpha(seqMask[i])) {
28                                         h.insert(i);
29                                         maskMap[count] = i;
30                                         count++;
31                                         
32                                 }
33                         }
34                 }else {
35                         for (int i = 0; i < alignLength; i++) {   
36                                 h.insert(i); 
37                                 maskMap[count] = i;
38                                 count++;
39                         }
40                 }
41         }
42         catch(exception& e) {
43                 m->errorOut(e, "DeCalculator", "setMask");
44                 exit(1);
45         } 
46 }
47 //***************************************************************************************************************
48 void DeCalculator::runMask(Sequence* seq) {
49         try{
50                 
51                 string q = seq->getAligned();
52                 string tempQuery = "";
53                 
54                 //whereever there is a base in the mask, save that value is query and subject
55                 set<int>::iterator setit;
56                 for ( setit=h.begin() ; setit != h.end(); setit++ )  {
57                         tempQuery += q[*setit];
58                 }
59                 
60                 //save masked values
61                 seq->setAligned(tempQuery);
62                 seq->setUnaligned(tempQuery);
63         }
64         catch(exception& e) {
65                 m->errorOut(e, "DeCalculator", "runMask");
66                 exit(1);
67         }
68 }
69 //***************************************************************************************************************
70 //num is query's spot in querySeqs
71 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
72         try {
73                 
74                 string q = query->getAligned();
75                 string s = subject->getAligned();
76                 
77                 int front = 0;
78                 for (int i = 0; i < q.length(); i++) {
79 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
80                         if (isalpha(q[i]) && isalpha(s[i])) { front = i; break;  }
81                 }
82 //cout << endl << endl;         
83                 int back = 0;           
84                 for (int i = q.length(); i >= 0; i--) {
85 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
86                         if (isalpha(q[i]) && isalpha(s[i])) { back = i; break;  }
87                 }
88                 
89                 trim[front] = back;
90                 
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "DeCalculator", "trimSeqs");
94                 exit(1);
95         }
96 }
97 //***************************************************************************************************************
98 vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
99         try {
100                 
101                 vector<int> win; 
102                 
103                 int cutoff = back - front;  //back - front 
104                         
105                 //if window is set to default
106                 if (size == 0) {  if (cutoff > 1200) {  size = 300; }
107                                                         else{  size = (cutoff / 4); }  } 
108                 else if (size > (cutoff / 4)) { 
109                                 m->mothurOut("You have selected too large a window size for sequence " + query->getName() + ".  I will choose an appropriate window size."); m->mothurOutEndLine();
110                                 size = (cutoff / 4); 
111                 }
112         
113         /*      string seq = query->getAligned().substr(front, cutoff);
114                         
115                 //count bases
116                 int numBases = 0;
117                 for (int l = 0; l < seq.length(); l++) {  if (isalpha(seq[l])) { numBases++; }  }
118 //cout << "num Bases = " << numBases << endl;                   
119                 //save start of seq
120                 win.push_back(front);
121 //cout << front << '\t';                
122                 //move ahead increment bases at a time until all bases are in a window
123                 int countBases = 0;
124                 int totalBases = 0;  //used to eliminate window of blanks at end of sequence
125                         
126                 seq = query->getAligned();
127                 for (int m = front; m < (back - size) ; m++) {
128                                 
129                         //count number of bases you see
130                         if (isalpha(seq[m])) { countBases++;  }
131                                 
132                         //if you have seen enough bases to make a new window
133                         if (countBases >= increment) {
134                                 //total bases is the number of bases in a window already.
135                                 totalBases += countBases;
136 //cout << "total bases = " << totalBases << endl;
137                                 win.push_back(m);  //save spot in alignment
138 //cout << m << '\t';
139                                 countBases = 0;                         //reset bases you've seen in this window
140                         }
141                                 
142                         //no need to continue if all your bases are in a window
143                         if (totalBases == numBases) {   break;  }
144                 }
145         
146
147                 //get last window if needed
148                 if (totalBases < numBases) {   win.push_back(back-size);  }
149 //cout << endl << endl;
150 */      
151                 
152                 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
153                 for (int m = front; m < (back - size) ; m+=increment) {  win.push_back(m);  }
154
155
156         
157                 return win;
158         
159         }
160         catch(exception& e) {
161                 m->errorOut(e, "DeCalculator", "findWindows");
162                 exit(1);
163         }
164 }
165
166 //***************************************************************************************************************
167 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
168         try {
169                 
170                 vector<float> temp;     
171                 //int gaps = 0;         
172                 for (int m = 0; m < window.size(); m++) {
173                                                 
174                         string seqFrag = query->getAligned().substr(window[m], size);
175                         string seqFragsub = subject->getAligned().substr(window[m], size);
176                                 
177                         int diff = 0;
178                         for (int b = 0; b < seqFrag.length(); b++) {
179                                 //if at least one is a base and they are not equal
180                                 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
181                                 
182                                 //ignore gaps
183                                 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
184                         }
185                
186                         //percentage of mismatched bases
187                         float dist;
188                         
189                         //if the whole fragment is 0 distance = 0
190                         //if ((seqFrag.length()-gaps) == 0)  { dist =  0.0; }
191                
192                         //percentage of mismatched bases
193                         //else {  dist = diff / (float) (seqFrag.length()-gaps) * 100;   } 
194                         
195                         dist = diff / (float) (seqFrag.length()) * 100; 
196                         
197                         temp.push_back(dist);
198                 }
199                         
200                 return temp;
201         }
202         catch(exception& e) {
203                 m->errorOut(e, "DeCalculator", "calcObserved");
204                 exit(1);
205         }
206 }
207 //***************************************************************************************************************
208 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
209         try {
210                 
211                 //so you only look at the trimmed part of the sequence
212                 int cutoff = back - front;  
213                 int gaps = 0;
214                         
215                 //from first startpoint with length back-front
216                 string seqFrag = query->getAligned().substr(front, cutoff);
217                 string seqFragsub = subject->getAligned().substr(front, cutoff);
218                                                                                                                 
219                 int diff = 0;
220                 for (int b = 0; b < seqFrag.length(); b++) {
221                         //ignore gaps
222                         if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
223                         if (seqFrag[b] != seqFragsub[b]) { diff++; }
224                 }
225                 
226                 //if the whole fragment is 0 distance = 0
227                 if ((seqFrag.length()-gaps) == 0)  { return 0.0; }
228                
229                 //percentage of mismatched bases
230                 float dist = diff / (float) (seqFrag.length()-gaps) * 100;       
231                                 
232                 return dist;
233         }
234         catch(exception& e) {
235                 m->errorOut(e, "DeCalculator", "calcDist");
236                 exit(1);
237         }
238 }
239
240 //***************************************************************************************************************
241 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
242         try {
243                 
244                 //for each window
245                 vector<float> queryExpected;
246                         
247                 for (int m = 0; m < qav.size(); m++) {          
248                                 
249                         float expected = qav[m] * coef;
250                                 
251                         queryExpected.push_back(expected);      
252                 }
253                         
254                 return queryExpected;
255                                 
256         }
257         catch(exception& e) {
258                 m->errorOut(e, "DeCalculator", "calcExpected");
259                 exit(1);
260         }
261 }
262 //***************************************************************************************************************
263 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
264         try {
265                 
266                 //for each window
267                 float sum = 0.0;  //sum = sum from 1 to m of (oi-ei)^2
268                 int numZeros = 0;
269                 for (int m = 0; m < obs.size(); m++) {          
270                         
271                         //if (obs[m] != 0.0) {
272                                 sum += ((obs[m] - exp[m]) * (obs[m] - exp[m])); 
273                         //}else {  numZeros++;   }
274                         
275                 }
276                         
277                 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
278                         
279                 return de;
280         }
281         catch(exception& e) {
282                 m->errorOut(e, "DeCalculator", "calcDE");
283                 exit(1);
284         }
285 }
286
287 //***************************************************************************************************************
288
289 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
290         try {
291
292                 vector<float> prob;
293                 string freqfile = getRootName(filename) + "freq";
294                 ofstream outFreq;
295                 
296                 openOutputFile(freqfile, outFreq);
297                 
298                 string length = toString(seqs.size());  //if there are 5000 seqs in the template then set precision to 3
299                 int precision = length.length() - 1;
300                 
301                 //format output
302                 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
303                 
304                 //at each position in the sequence
305                 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
306                         
307                         vector<int> freq;   freq.resize(4,0);
308                         int gaps = 0;
309                         
310                         //find the frequency of each nucleotide
311                         for (int j = 0; j < seqs.size(); j++) {
312                                 
313                                 char value = seqs[j]->getAligned()[i];
314                                 
315                                 if(toupper(value) == 'A')                                                                       {       freq[0]++;      }
316                                 else if(toupper(value) == 'T' || toupper(value) == 'U')         {       freq[1]++;      }
317                                 else if(toupper(value) == 'G')                                                          {       freq[2]++;      }
318                                 else if(toupper(value) == 'C')                                                          {       freq[3]++;      }
319                                 else { gaps++; }
320                         }
321                         
322                         //find base with highest frequency
323                         int highest = 0;
324                         for (int m = 0; m < freq.size(); m++) {   if (freq[m] > highest) {  highest = freq[m];  }               }
325                         
326                         float highFreq = highest / (float) (seqs.size());       
327                         
328                         float Pi;
329                         Pi =  (highFreq - 0.25) / 0.75; 
330                         
331                         //cannot have probability less than 0.
332                         if (Pi < 0) { Pi = 0.0; }
333                         
334                         //saves this for later
335                         outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
336         
337                         if (h.count(i) > 0) {
338                                 prob.push_back(Pi); 
339                         }
340                 }
341                 
342                 outFreq.close();
343                 
344                 return prob;
345                                 
346         }
347         catch(exception& e) {
348                 m->errorOut(e, "DeCalculator", "calcFreq");
349                 exit(1);
350         }
351 }
352 //***************************************************************************************************************
353 vector<float>  DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
354         try {
355                 vector<float>  averages; 
356                                 
357                 //for each window find average
358                 for (int m = 0; m < window.size(); m++) {
359                                 
360                         float average = 0.0;
361                                 
362                         //while you are in the window for this sequence
363                         int count = 0;
364                         for (int j = window[m]; j < (window[m]+size); j++) {   
365                                 average += probabilityProfile[j];
366                                 count++;
367                         }
368                                 
369                         average = average / count;
370         
371                         //save this windows average
372                         averages.push_back(average);
373                 }
374                                 
375                 return averages;
376         }
377         catch(exception& e) {
378                 m->errorOut(e, "DeCalculator", "findQav");
379                 exit(1);
380         }
381 }
382 //***************************************************************************************************************
383 //seqs have already been masked
384 vector< vector<quanMember> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
385         try {
386                 vector< vector<quanMember> > quan; 
387                 
388                 //percentage of mismatched pairs 1 to 100
389                 quan.resize(100);
390 //ofstream o;
391 //string out = "getQuantiles.out";
392 //openOutputFile(out, o);
393                 
394                 //for each sequence
395                 for(int i = start; i < end; i++){
396                 
397                         m->mothurOut("Processing sequence " + toString(i)); m->mothurOutEndLine();
398                         Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
399                         
400                         //compare to every other sequence in template
401                         for(int j = 0; j < i; j++){
402                                 
403                                 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
404                                 
405                                 if (m->control_pressed) { delete query; delete subject; return quan; }
406                                 
407                                 map<int, int> trim;
408                                 map<int, int>::iterator it;
409                                 
410                                 trimSeqs(query, subject, trim);
411                                 
412                                 it = trim.begin();
413                                 int front = it->first; int back = it->second;
414                                 
415                                 //reset window for each new comparison
416                                 windowSizesTemplate[i] = window;
417                                 
418                                 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
419                                 
420                                 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
421                                 
422                                 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
423                                                                         
424                                 float alpha = getCoef(obsi, q);
425                                                 
426                                 vector<float> exp = calcExpected(q, alpha);
427                                 
428                                 float de = calcDE(obsi, exp);
429                                                                 
430                                 float dist = calcDist(query, subject, front, back); 
431         //o << i << '\t' <<  j << '\t' << dist << '\t' << de << endl;                   
432                                 dist = ceil(dist);
433                                 
434                                 quanMember newScore(de, i, j);
435                                 
436                                 quan[dist].push_back(newScore);
437                                 
438                                 delete subject;
439                         }
440                         
441                         delete query;
442                 }
443
444                 return quan;
445                                                 
446         }
447         catch(exception& e) {
448                 m->errorOut(e, "DeCalculator", "getQuantiles");
449                 exit(1);
450         }
451 }
452 //********************************************************************************************************************
453 //sorts lowest to highest
454 inline bool compareQuanMembers(quanMember left, quanMember right){
455         return (left.score < right.score);      
456
457 //***************************************************************************************************************
458 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right.  may want to revisit in the future...
459 void DeCalculator::removeObviousOutliers(vector< vector<quanMember> >& quantiles, int num) {
460         try {
461                                                 
462                 for (int i = 0; i < quantiles.size(); i++) {
463                 
464                         //find mean of this quantile score
465                         sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
466                         
467                         float high = quantiles[i][int(quantiles[i].size() * 0.99)].score;
468                         float low =  quantiles[i][int(quantiles[i].size() * 0.01)].score;
469                         
470                         vector<quanMember> temp;
471                         
472                         //look at each value in quantiles to see if it is an outlier
473                         for (int j = 0; j < quantiles[i].size(); j++) {
474                                 //is this score between 1 and 99%
475                                 if ((quantiles[i][j].score > low) && (quantiles[i][j].score < high)) {
476                                         temp.push_back(quantiles[i][j]);
477                                 }
478                         }
479                         
480                         quantiles[i] = temp;
481                 }
482
483 /*
484                 //find contributer with most offending score related to it
485                 int largestContrib = findLargestContrib(seen);
486         
487                 //while you still have guys to eliminate
488                 while (contributions.size() > 0) {
489                 
490                         m->mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); m->mothurOutEndLine();
491                         
492                         //remove from quantiles all scores that were made using this largestContrib
493                         for (int i = 0; i < quantiles.size(); i++) {
494 //cout << "before remove " << quantiles[i].size() << '\t';
495                                 removeContrib(largestContrib, quantiles[i]);
496 //cout << "after remove " << quantiles[i].size() << endl;
497                         }
498 //cout << count << " largest contrib = " << largestContrib << endl;  count++;
499                         //remove from contributions all scores that were made using this largestContrib
500                         removeContrib(largestContrib, contributions);
501                         
502                         //"erase" largestContrib
503                         seen[largestContrib] = -1;
504                         
505                         //get next largestContrib
506                         largestContrib = findLargestContrib(seen);
507                 }
508 ABOVE IS ATTEMPT #1             
509 **************************************************************************************************
510 BELOW IS ATTEMPT #2             
511                 vector<int> marked = returnObviousOutliers(quantiles, num);
512                 vector<int> copyMarked = marked;
513                 
514                 //find the 99% of marked
515                 sort(copyMarked.begin(), copyMarked.end());
516                 int high = copyMarked[int(copyMarked.size() * 0.99)];
517 cout << "high = " << high << endl;              
518                 
519                 for(int i = 0; i < marked.size(); i++) {
520                         if (marked[i] > high) { 
521                                 m->mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); m->mothurOutEndLine();
522                                 for (int i = 0; i < quantiles.size(); i++) {
523                                         removeContrib(marked[i], quantiles[i]);
524                                 }
525                         }
526
527                 }
528                 
529                 
530                 //adjust quantiles
531                 for (int i = 0; i < quantiles.size(); i++) {
532                         vector<float> temp;
533                         
534                         if (quantiles[i].size() == 0) {
535                                 //in case this is not a distance found in your template files
536                                 for (int g = 0; g < 6; g++) {
537                                         temp.push_back(0.0);
538                                 }
539                         }else{
540                                 
541                                 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
542                                 
543                                 //save 10%
544                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
545                                 //save 25%
546                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
547                                 //save 50%
548                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
549                                 //save 75%
550                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
551                                 //save 95%
552                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
553                                 //save 99%
554                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
555                                 
556                         }
557                         
558                         quan[i] = temp;
559                         
560                 }
561 */
562                 
563         }
564         catch(exception& e) {
565                 m->errorOut(e, "DeCalculator", "removeObviousOutliers");
566                 exit(1);
567         }
568 }
569 //***************************************************************************************************************
570 //put quanMember in the vector based on how far they are from the 99% or 1%.  Biggest offenders in front.
571 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
572         try{
573                 
574                 vector<quanMember> newQuan;
575                 map<quanMember*, float>::iterator it;
576                 
577                 while (quan.size() > 0) {
578                         
579                          map<quanMember*, float>::iterator largest = quan.begin(); 
580                           
581                         //find biggest member
582                         for (it = quan.begin(); it != quan.end(); it++) {
583                                 if (it->second > largest->second) {  largest = it;  }
584                         }
585 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
586                         //save it 
587                         newQuan.push_back(*(largest->first));
588                 
589                         //erase from quan
590                         quan.erase(largest);
591                 }
592                 
593                 return newQuan;
594                 
595         }
596         catch(exception& e) {
597                 m->errorOut(e, "DeCalculator", "sortContrib");
598                 exit(1);
599         }
600 }
601
602 //***************************************************************************************************************
603 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
604 int DeCalculator::findLargestContrib(vector<int> seen) {
605         try{
606                 
607                 int largest = 0;
608                 
609                 int largestContribs;
610                 
611                 for (int i = 0; i < seen.size(); i++)  {  
612                         
613                         if (seen[i] > largest) {
614                                 largestContribs = i;
615                                 largest = seen[i];
616                         }
617                 }
618                 
619                 return largestContribs;
620                 
621         }
622         catch(exception& e) {
623                 m->errorOut(e, "DeCalculator", "findLargestContribs");
624                 exit(1);
625         }
626 }
627 //***************************************************************************************************************
628 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
629         try{
630         
631                 vector<quanMember> newQuan;
632                 for (int i = 0; i < quan.size(); i++)  {  
633                         if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {  
634                                 newQuan.push_back(quan[i]);  
635                         }
636                 }
637                 
638                 quan = newQuan;
639                 
640         }
641         catch(exception& e) {
642                 m->errorOut(e, "DeCalculator", "removeContrib");
643                 exit(1);
644         }
645 }
646 */
647 //***************************************************************************************************************
648 float DeCalculator::findAverage(vector<float> myVector) {
649         try{
650                 
651                 float total = 0.0;
652                 for (int i = 0; i < myVector.size(); i++)  {  total += myVector[i];  }
653                 
654                 float average = total / (float) myVector.size();
655                 
656                 return average;
657                 
658         }
659         catch(exception& e) {
660                 m->errorOut(e, "DeCalculator", "findAverage");
661                 exit(1);
662         }
663 }
664
665 //***************************************************************************************************************
666 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
667         try {
668         
669                 //find average prob for this seqs windows
670                 float probAverage = findAverage(qav);
671                                 
672                 //find observed average 
673                 float obsAverage = findAverage(obs);
674                         
675                 float coef = obsAverage / probAverage;
676                                                 
677                 return coef;
678         }
679         catch(exception& e) {
680                 m->errorOut(e, "DeCalculator", "getCoef");
681                 exit(1);
682         }
683 }
684 //***************************************************************************************************************
685 //gets closest matches to each end, since chimeras will most likely have different parents on each end
686 vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db, int& numWanted, vector<int>& indexes) {
687         try {
688                 indexes.clear();
689                 
690                 vector<Sequence*> seqsMatches;  
691                 vector<SeqDist> distsLeft;
692                 vector<SeqDist> distsRight;
693                 
694                 Dist* distcalculator = new eachGapDist();
695                 
696                 string queryUnAligned = querySeq->getUnaligned();
697                 int numBases = int(queryUnAligned.length() * 0.33);
698                 
699                 string leftQuery = ""; //first 1/3 of the sequence
700                 string rightQuery = ""; //last 1/3 of the sequence
701                 string queryAligned = querySeq->getAligned();
702                 
703                 //left side
704                 bool foundFirstBase = false;
705                 int baseCount = 0;
706                 int leftSpot = 0;
707                 int firstBaseSpot = 0;
708                 for (int i = 0; i < queryAligned.length(); i++) {
709                         //if you are a base
710                         if (isalpha(queryAligned[i])) {         
711                                 baseCount++; 
712                                 if (!foundFirstBase) {   foundFirstBase = true;  firstBaseSpot = i;  }
713                         }
714                         
715                         //eliminate opening .'s
716                         if (foundFirstBase) {   leftQuery += queryAligned[i];  }
717                         //if you have 1/3
718                         if (baseCount >= numBases) {  leftSpot = i; break; } //first 1/3
719                 }
720                 
721                 //right side - count through another 1/3, so you are at last third
722                 baseCount = 0;
723                 int rightSpot = 0;
724                 for (int i = leftSpot; i < queryAligned.length(); i++) {
725                         //if you are a base
726                         if (isalpha(queryAligned[i])) {         baseCount++;    }
727                         //if you have 1/3
728                         if (baseCount >= numBases) { rightSpot = i;  break; } //last 1/3
729                 }
730                 
731                 //trim end
732                 //find last position in query that is a non gap character
733                 int lastBaseSpot = queryAligned.length()-1;
734                 for (int j = queryAligned.length()-1; j >= 0; j--) {
735                         if (isalpha(queryAligned[j])) {
736                                 lastBaseSpot = j;
737                                 break;
738                         }
739                 }
740                 rightQuery = queryAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //sequence from pos spot to end
741                 
742                 Sequence queryLeft(querySeq->getName(), leftQuery);
743                 Sequence queryRight(querySeq->getName(), rightQuery);
744 //cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << '\t' << firstBaseSpot << '\t' << lastBaseSpot << endl;
745 //cout << queryUnAligned.length() << '\t' << queryLeft.getUnaligned().length() << '\t' << queryRight.getUnaligned().length() << endl;
746                 for(int j = 0; j < db.size(); j++){
747                         
748                         string dbAligned = db[j]->getAligned();
749                         string leftDB = dbAligned.substr(firstBaseSpot, (leftSpot-firstBaseSpot+1)); //first 1/3 of the sequence
750                         string rightDB = dbAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //last 1/3 of the sequence
751                         
752                         Sequence dbLeft(db[j]->getName(), leftDB);
753                         Sequence dbRight(db[j]->getName(), rightDB);
754
755                         distcalculator->calcDist(queryLeft, dbLeft);
756                         float distLeft = distcalculator->getDist();
757                         
758                         distcalculator->calcDist(queryRight, dbRight);
759                         float distRight = distcalculator->getDist();
760                         
761                         SeqDist subjectLeft;
762                         subjectLeft.seq = db[j];
763                         subjectLeft.dist = distLeft;
764                         subjectLeft.index = j;
765                         
766                         distsLeft.push_back(subjectLeft);
767                         
768                         SeqDist subjectRight;
769                         subjectRight.seq = db[j];
770                         subjectRight.dist = distRight;
771                         subjectRight.index = j;
772                         
773                         distsRight.push_back(subjectRight);
774
775                 }
776                 
777                 delete distcalculator;
778                 
779                 //sort by smallest distance
780                 sort(distsRight.begin(), distsRight.end(), compareSeqDist);
781                 sort(distsLeft.begin(), distsLeft.end(), compareSeqDist);
782                 
783                 //merge results         
784                 map<string, string> seen;
785                 map<string, string>::iterator it;
786                 
787                 vector<SeqDist> dists;
788                 float lastRight = distsRight[0].dist;
789                 float lastLeft = distsLeft[0].dist;
790                 int lasti = 0;
791                 for (int i = 0; i < distsLeft.size(); i++) {
792                         //add left if you havent already
793                         it = seen.find(distsLeft[i].seq->getName());
794                         if (it == seen.end()) {  
795                                 dists.push_back(distsLeft[i]);
796                                 seen[distsLeft[i].seq->getName()] = distsLeft[i].seq->getName();
797                                 lastLeft =  distsLeft[i].dist;
798                         }
799
800                         //add right if you havent already
801                         it = seen.find(distsRight[i].seq->getName());
802                         if (it == seen.end()) {  
803                                 dists.push_back(distsRight[i]);
804                                 seen[distsRight[i].seq->getName()] = distsRight[i].seq->getName();
805                                 lastRight =  distsRight[i].dist;
806                         }
807                         
808                         if (dists.size() > numWanted) { lasti = i; break; } //you have enough results
809                 }
810                 
811                 //add in dups
812                 lasti++;
813                 int i = lasti;
814                 while (i < distsLeft.size()) {  
815                         if (distsLeft[i].dist == lastLeft) {  dists.push_back(distsLeft[i]);  numWanted++; }
816                         else { break; }
817                         i++;
818                 }
819                 
820                 i = lasti;
821                 while (i < distsRight.size()) {  
822                         if (distsRight[i].dist == lastRight) {  dists.push_back(distsRight[i]);  numWanted++; }
823                         else { break; }
824                         i++;
825                 }
826
827                 if (numWanted > dists.size()) { m->mothurOut("numwanted is larger than the number of template sequences, adjusting numwanted."); m->mothurOutEndLine(); numWanted = dists.size(); }
828
829 //cout << numWanted << endl;
830                 for (int i = 0; i < numWanted; i++) {
831 //cout << dists[i].seq->getName() << '\t' << dists[i].dist << endl;
832                         Sequence* temp = new Sequence(dists[i].seq->getName(), dists[i].seq->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
833                         seqsMatches.push_back(temp);
834                         indexes.push_back(dists[i].index);
835                 }
836                 
837                 return seqsMatches;
838         }
839         catch(exception& e) {
840                 m->errorOut(e, "DeCalculator", "findClosest");
841                 exit(1);
842         }
843 }
844 //***************************************************************************************************************
845 Sequence* DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db) {
846         try {
847                 
848                 Sequence* seqsMatch;  
849                 
850                 Dist* distcalculator = new eachGapDist();
851                 int index = 0;
852                 int smallest = 1000000;
853                 
854                 for(int j = 0; j < db.size(); j++){
855                         
856                         distcalculator->calcDist(*querySeq, *db[j]);
857                         float dist = distcalculator->getDist();
858                         
859                         if (dist < smallest) { 
860                                 smallest = dist;
861                                 index = j;
862                         }
863                 }
864                 
865                 delete distcalculator;
866                 
867                 seqsMatch = new Sequence(db[index]->getName(), db[index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
868                         
869                 return seqsMatch;
870         }
871         catch(exception& e) {
872                 m->errorOut(e, "DeCalculator", "findClosest");
873                 exit(1);
874         }
875 }
876 /***************************************************************************************************************/
877 map<int, int> DeCalculator::trimSeqs(Sequence* query, vector<Sequence*> topMatches) {
878         try {
879                 
880                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
881                 int rearPos = query->getAligned().length();
882                 
883                 //********find first position in topMatches that is a non gap character***********//
884                 //find first position all query seqs that is a non gap character
885                 for (int i = 0; i < topMatches.size(); i++) {
886                         
887                         string aligned = topMatches[i]->getAligned();
888                         int pos = 0;
889                         
890                         //find first spot in this seq
891                         for (int j = 0; j < aligned.length(); j++) {
892                                 if (isalpha(aligned[j])) {
893                                         pos = j;
894                                         break;
895                                 }
896                         }
897                         
898                         //save this spot if it is the farthest
899                         if (pos > frontPos) { frontPos = pos; }
900                 }
901                 
902                 
903                 string aligned = query->getAligned();
904                 int pos = 0;
905                         
906                 //find first position in query that is a non gap character
907                 for (int j = 0; j < aligned.length(); j++) {
908                         if (isalpha(aligned[j])) {
909                                 pos = j;
910                                 break;
911                         }
912                 }
913                 
914                 //save this spot if it is the farthest
915                 if (pos > frontPos) { frontPos = pos; }
916                 
917                 
918                 //********find last position in topMatches that is a non gap character***********//
919                 for (int i = 0; i < topMatches.size(); i++) {
920                         
921                         string aligned = topMatches[i]->getAligned();
922                         int pos = aligned.length();
923                         
924                         //find first spot in this seq
925                         for (int j = aligned.length()-1; j >= 0; j--) {
926                                 if (isalpha(aligned[j])) {
927                                         pos = j;
928                                         break;
929                                 }
930                         }
931                         
932                         //save this spot if it is the farthest
933                         if (pos < rearPos) { rearPos = pos; }
934                 }
935                 
936                 
937                 aligned = query->getAligned();
938                 pos = aligned.length();
939                 
940                 //find last position in query that is a non gap character
941                 for (int j = aligned.length()-1; j >= 0; j--) {
942                         if (isalpha(aligned[j])) {
943                                 pos = j;
944                                 break;
945                         }
946                 }
947                 
948                 //save this spot if it is the farthest
949                 if (pos < rearPos) { rearPos = pos; }
950
951                 //check to make sure that is not whole seq
952                 if ((rearPos - frontPos - 1) <= 0) {  m->mothurOut("Error, when I trim your sequences, the entire sequence is trimmed."); m->mothurOutEndLine(); exit(1);  }
953 //cout << query->getName() << " front = " << frontPos << " rear = " << rearPos << endl;         
954                 //trim query
955                 string newAligned = query->getAligned();
956                 newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
957                 query->setAligned(newAligned);
958                 
959                 //trim topMatches
960                 for (int i = 0; i < topMatches.size(); i++) {
961                         newAligned = topMatches[i]->getAligned();
962                         newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
963                         topMatches[i]->setAligned(newAligned);
964                 }
965                 
966                 map<int, int> trimmedPos;
967                 
968                 for (int i = 0; i < newAligned.length(); i++) {
969                         trimmedPos[i] = i+frontPos;
970                 }
971                 
972                 return trimmedPos;
973         }
974         catch(exception& e) {
975                 m->errorOut(e, "DeCalculator", "trimSequences");
976                 exit(1);
977         }
978
979 }
980 //***************************************************************************************************************
981
982
983