]> git.donarmstrong.com Git - mothur.git/blob - decalc.cpp
chimera.slayer
[mothur.git] / decalc.cpp
1 /*
2  *  decalc.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/22/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "decalc.h"
11 #include "chimera.h"
12 #include "dist.h"
13 #include "eachgapdist.h"
14 #include "ignoregaps.h"
15 #include "eachgapdist.h"
16
17 //***************************************************************************************************************
18 void DeCalculator::setMask(string ms) { 
19         try {
20                 seqMask = ms; 
21                 int count = 0;
22                 maskMap.clear();
23                 
24                 if (seqMask.length() != 0) {
25                         //whereever there is a base in the mask, save that value is query and subject
26                         for (int i = 0; i < seqMask.length(); i++) {
27                                 if (isalpha(seqMask[i])) {
28                                         h.insert(i);
29                                         maskMap[count] = i;
30                                         count++;
31                                         
32                                 }
33                         }
34                 }else {
35                         for (int i = 0; i < alignLength; i++) {   
36                                 h.insert(i); 
37                                 maskMap[count] = i;
38                                 count++;
39                         }
40                 }
41         }
42         catch(exception& e) {
43                 m->errorOut(e, "DeCalculator", "setMask");
44                 exit(1);
45         } 
46 }
47 //***************************************************************************************************************
48 void DeCalculator::runMask(Sequence* seq) {
49         try{
50                 
51                 string q = seq->getAligned();
52                 string tempQuery = "";
53                 
54                 //whereever there is a base in the mask, save that value is query and subject
55                 set<int>::iterator setit;
56                 for ( setit=h.begin() ; setit != h.end(); setit++ )  {
57                         tempQuery += q[*setit];
58                 }
59                 
60                 //save masked values
61                 seq->setAligned(tempQuery);
62                 seq->setUnaligned(tempQuery);
63         }
64         catch(exception& e) {
65                 m->errorOut(e, "DeCalculator", "runMask");
66                 exit(1);
67         }
68 }
69 //***************************************************************************************************************
70 //num is query's spot in querySeqs
71 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
72         try {
73                 
74                 string q = query->getAligned();
75                 string s = subject->getAligned();
76                 
77                 int front = 0;
78                 for (int i = 0; i < q.length(); i++) {
79 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
80                         if (isalpha(q[i]) && isalpha(s[i])) { front = i; break;  }
81                 }
82 //cout << endl << endl;         
83                 int back = 0;           
84                 for (int i = q.length(); i >= 0; i--) {
85 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
86                         if (isalpha(q[i]) && isalpha(s[i])) { back = i; break;  }
87                 }
88                 
89                 trim[front] = back;
90                 
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "DeCalculator", "trimSeqs");
94                 exit(1);
95         }
96 }
97 //***************************************************************************************************************
98 vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
99         try {
100                 
101                 vector<int> win; 
102                 
103                 int cutoff = back - front;  //back - front 
104                         
105                 //if window is set to default
106                 if (size == 0) {  if (cutoff > 1200) {  size = 300; }
107                                                         else{  size = (cutoff / 4); }  } 
108                 else if (size > (cutoff / 4)) { 
109                                 m->mothurOut("You have selected too large a window size for sequence " + query->getName() + ".  I will choose an appropriate window size."); m->mothurOutEndLine();
110                                 size = (cutoff / 4); 
111                 }
112         
113         /*      string seq = query->getAligned().substr(front, cutoff);
114                         
115                 //count bases
116                 int numBases = 0;
117                 for (int l = 0; l < seq.length(); l++) {  if (isalpha(seq[l])) { numBases++; }  }
118 //cout << "num Bases = " << numBases << endl;                   
119                 //save start of seq
120                 win.push_back(front);
121 //cout << front << '\t';                
122                 //move ahead increment bases at a time until all bases are in a window
123                 int countBases = 0;
124                 int totalBases = 0;  //used to eliminate window of blanks at end of sequence
125                         
126                 seq = query->getAligned();
127                 for (int m = front; m < (back - size) ; m++) {
128                                 
129                         //count number of bases you see
130                         if (isalpha(seq[m])) { countBases++;  }
131                                 
132                         //if you have seen enough bases to make a new window
133                         if (countBases >= increment) {
134                                 //total bases is the number of bases in a window already.
135                                 totalBases += countBases;
136 //cout << "total bases = " << totalBases << endl;
137                                 win.push_back(m);  //save spot in alignment
138 //cout << m << '\t';
139                                 countBases = 0;                         //reset bases you've seen in this window
140                         }
141                                 
142                         //no need to continue if all your bases are in a window
143                         if (totalBases == numBases) {   break;  }
144                 }
145         
146
147                 //get last window if needed
148                 if (totalBases < numBases) {   win.push_back(back-size);  }
149 //cout << endl << endl;
150 */      
151                 
152                 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
153                 for (int i = front; i < (back - size) ; i+=increment) {  win.push_back(i);  }
154
155
156         
157                 return win;
158         
159         }
160         catch(exception& e) {
161                 m->errorOut(e, "DeCalculator", "findWindows");
162                 exit(1);
163         }
164 }
165
166 //***************************************************************************************************************
167 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
168         try {
169                 
170                 vector<float> temp;     
171                 //int gaps = 0;         
172                 for (int i = 0; i < window.size(); i++) {
173                                                 
174                         string seqFrag = query->getAligned().substr(window[i], size);
175                         string seqFragsub = subject->getAligned().substr(window[i], size);
176                                 
177                         int diff = 0;
178                         for (int b = 0; b < seqFrag.length(); b++) {
179                                 //if at least one is a base and they are not equal
180                                 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
181                                 
182                                 //ignore gaps
183                                 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
184                         }
185                
186                         //percentage of mismatched bases
187                         float dist;
188                         
189                         //if the whole fragment is 0 distance = 0
190                         //if ((seqFrag.length()-gaps) == 0)  { dist =  0.0; }
191                
192                         //percentage of mismatched bases
193                         //else {  dist = diff / (float) (seqFrag.length()-gaps) * 100;   } 
194                         
195                         dist = diff / (float) (seqFrag.length()) * 100; 
196                         
197                         temp.push_back(dist);
198                 }
199                         
200                 return temp;
201         }
202         catch(exception& e) {
203                 m->errorOut(e, "DeCalculator", "calcObserved");
204                 exit(1);
205         }
206 }
207 //***************************************************************************************************************
208 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
209         try {
210                 
211                 //so you only look at the trimmed part of the sequence
212                 int cutoff = back - front;  
213                 int gaps = 0;
214                         
215                 //from first startpoint with length back-front
216                 string seqFrag = query->getAligned().substr(front, cutoff);
217                 string seqFragsub = subject->getAligned().substr(front, cutoff);
218                                                                                                                 
219                 int diff = 0;
220                 for (int b = 0; b < seqFrag.length(); b++) {
221                         //ignore gaps
222                         if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
223                         if (seqFrag[b] != seqFragsub[b]) { diff++; }
224                 }
225                 
226                 //if the whole fragment is 0 distance = 0
227                 if ((seqFrag.length()-gaps) == 0)  { return 0.0; }
228                
229                 //percentage of mismatched bases
230                 float dist = diff / (float) (seqFrag.length()-gaps) * 100;       
231                                 
232                 return dist;
233         }
234         catch(exception& e) {
235                 m->errorOut(e, "DeCalculator", "calcDist");
236                 exit(1);
237         }
238 }
239
240 //***************************************************************************************************************
241 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
242         try {
243                 
244                 //for each window
245                 vector<float> queryExpected;
246                         
247                 for (int j = 0; j < qav.size(); j++) {          
248                                 
249                         float expected = qav[j] * coef;
250                                 
251                         queryExpected.push_back(expected);      
252                 }
253                         
254                 return queryExpected;
255                                 
256         }
257         catch(exception& e) {
258                 m->errorOut(e, "DeCalculator", "calcExpected");
259                 exit(1);
260         }
261 }
262 //***************************************************************************************************************
263 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
264         try {
265                 
266                 //for each window
267                 float sum = 0.0;  //sum = sum from 1 to i of (oi-ei)^2
268                 int numZeros = 0;
269                 for (int j = 0; j < obs.size(); j++) {          
270                         
271                         //if (obs[j] != 0.0) {
272                                 sum += ((obs[j] - exp[j]) * (obs[j] - exp[j])); 
273                         //}else {  numZeros++;   }
274                         
275                 }
276                         
277                 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
278                         
279                 return de;
280         }
281         catch(exception& e) {
282                 m->errorOut(e, "DeCalculator", "calcDE");
283                 exit(1);
284         }
285 }
286
287 //***************************************************************************************************************
288
289 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
290         try {
291
292                 vector<float> prob;
293                 string freqfile = m->getRootName(filename) + "freq";
294                 ofstream outFreq;
295                 
296                 m->openOutputFile(freqfile, outFreq);
297                 
298                 outFreq << "#" << m->getVersion() << endl;
299                 
300                 string length = toString(seqs.size());  //if there are 5000 seqs in the template then set precision to 3
301                 int precision = length.length() - 1;
302                 
303                 //format output
304                 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
305                 
306                 //at each position in the sequence
307                 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
308                         
309                         vector<int> freq;   freq.resize(4,0);
310                         int gaps = 0;
311                         
312                         //find the frequency of each nucleotide
313                         for (int j = 0; j < seqs.size(); j++) {
314                                 
315                                 char value = seqs[j]->getAligned()[i];
316                                 
317                                 if(toupper(value) == 'A')                                                                       {       freq[0]++;      }
318                                 else if(toupper(value) == 'T' || toupper(value) == 'U')         {       freq[1]++;      }
319                                 else if(toupper(value) == 'G')                                                          {       freq[2]++;      }
320                                 else if(toupper(value) == 'C')                                                          {       freq[3]++;      }
321                                 else { gaps++; }
322                         }
323                         
324                         //find base with highest frequency
325                         int highest = 0;
326                         for (int j = 0; j < freq.size(); j++) {   if (freq[j] > highest) {  highest = freq[j];  }               }
327                         
328                         float highFreq = highest / (float) (seqs.size());       
329                         
330                         float Pi;
331                         Pi =  (highFreq - 0.25) / 0.75; 
332                         
333                         //cannot have probability less than 0.
334                         if (Pi < 0) { Pi = 0.0; }
335                         
336                         //saves this for later
337                         outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
338         
339                         if (h.count(i) > 0) {
340                                 prob.push_back(Pi); 
341                         }
342                 }
343                 
344                 outFreq.close();
345                 
346                 return prob;
347                                 
348         }
349         catch(exception& e) {
350                 m->errorOut(e, "DeCalculator", "calcFreq");
351                 exit(1);
352         }
353 }
354 //***************************************************************************************************************
355 vector<float>  DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
356         try {
357                 vector<float>  averages; 
358                                 
359                 //for each window find average
360                 for (int i = 0; i < window.size(); i++) {
361                                 
362                         float average = 0.0;
363                                 
364                         //while you are in the window for this sequence
365                         int count = 0;
366                         for (int j = window[i]; j < (window[i]+size); j++) {   
367                                 average += probabilityProfile[j];
368                                 count++;
369                         }
370                                 
371                         average = average / count;
372         
373                         //save this windows average
374                         averages.push_back(average);
375                 }
376                                 
377                 return averages;
378         }
379         catch(exception& e) {
380                 m->errorOut(e, "DeCalculator", "findQav");
381                 exit(1);
382         }
383 }
384 //***************************************************************************************************************
385 //seqs have already been masked
386 vector< vector<float> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
387         try {
388                 vector< vector<float> > quan; 
389                 
390                 //percentage of mismatched pairs 1 to 100
391                 quan.resize(100);
392
393                 //for each sequence
394                 for(int i = start; i < end; i++){
395                 
396                         m->mothurOut("Processing sequence " + toString(i)); m->mothurOutEndLine();
397                         Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
398                         
399                         //compare to every other sequence in template
400                         for(int j = 0; j < i; j++){
401                                 
402                                 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
403                                 
404                                 if (m->control_pressed) { delete query; delete subject; return quan; }
405                                 
406                                 map<int, int> trim;
407                                 map<int, int>::iterator it;
408                                 
409                                 trimSeqs(query, subject, trim);
410                                 
411                                 it = trim.begin();
412                                 int front = it->first; int back = it->second;
413                                 
414                                 //reset window for each new comparison
415                                 windowSizesTemplate[i] = window;
416                                 
417                                 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
418                                 
419                                 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
420                                 
421                                 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
422                                                                         
423                                 float alpha = getCoef(obsi, q);
424                                                 
425                                 vector<float> exp = calcExpected(q, alpha);
426                                 
427                                 float de = calcDE(obsi, exp);
428                                                                 
429                                 float dist = calcDist(query, subject, front, back); 
430         //cout << i << '\t' <<  j << '\t' << dist << '\t' << de << endl;                        
431                                 dist = ceil(dist);
432                                 
433                                 //quanMember newScore(de, i, j);
434                                 
435                                 quan[dist].push_back(de);
436
437                                 delete subject;
438                         }
439                         
440                         delete query;
441                 }
442         
443
444                 return quan;
445                                                 
446         }
447         catch(exception& e) {
448                 m->errorOut(e, "DeCalculator", "getQuantiles");
449                 exit(1);
450         }
451 }
452 //********************************************************************************************************************
453 //sorts lowest to highest
454 inline bool compareQuanMembers(quanMember left, quanMember right){
455         return (left.score < right.score);      
456
457 //***************************************************************************************************************
458 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right.  may want to revisit in the future...
459 void DeCalculator::removeObviousOutliers(vector< vector<float> >& quantiles, int num) {
460         try {
461                                                 
462                 for (int i = 0; i < quantiles.size(); i++) {
463                         
464                         //find mean of this quantile score
465                         sort(quantiles[i].begin(), quantiles[i].end());
466                         
467                         vector<float> temp;
468                         if (quantiles[i].size() != 0) {
469                                 float high = quantiles[i][int(quantiles[i].size() * 0.99)];
470                                 float low =  quantiles[i][int(quantiles[i].size() * 0.01)];
471                         
472                                 //look at each value in quantiles to see if it is an outlier
473                                 for (int j = 0; j < quantiles[i].size(); j++) {
474                                         //is this score between 1 and 99%
475                                         if ((quantiles[i][j] > low) && (quantiles[i][j] < high)) {
476                                                 temp.push_back(quantiles[i][j]);
477                                         }
478                                 }
479                         }
480                         quantiles[i] = temp;
481                 }
482
483 /*
484                 //find contributer with most offending score related to it
485                 int largestContrib = findLargestContrib(seen);
486         
487                 //while you still have guys to eliminate
488                 while (contributions.size() > 0) {
489                 
490                         m->mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); m->mothurOutEndLine();
491                         
492                         //remove from quantiles all scores that were made using this largestContrib
493                         for (int i = 0; i < quantiles.size(); i++) {
494 //cout << "before remove " << quantiles[i].size() << '\t';
495                                 removeContrib(largestContrib, quantiles[i]);
496 //cout << "after remove " << quantiles[i].size() << endl;
497                         }
498 //cout << count << " largest contrib = " << largestContrib << endl;  count++;
499                         //remove from contributions all scores that were made using this largestContrib
500                         removeContrib(largestContrib, contributions);
501                         
502                         //"erase" largestContrib
503                         seen[largestContrib] = -1;
504                         
505                         //get next largestContrib
506                         largestContrib = findLargestContrib(seen);
507                 }
508 ABOVE IS ATTEMPT #1             
509 **************************************************************************************************
510 BELOW IS ATTEMPT #2             
511                 vector<int> marked = returnObviousOutliers(quantiles, num);
512                 vector<int> copyMarked = marked;
513                 
514                 //find the 99% of marked
515                 sort(copyMarked.begin(), copyMarked.end());
516                 int high = copyMarked[int(copyMarked.size() * 0.99)];
517 cout << "high = " << high << endl;              
518                 
519                 for(int i = 0; i < marked.size(); i++) {
520                         if (marked[i] > high) { 
521                                 m->mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); m->mothurOutEndLine();
522                                 for (int i = 0; i < quantiles.size(); i++) {
523                                         removeContrib(marked[i], quantiles[i]);
524                                 }
525                         }
526
527                 }
528                 
529                 
530                 //adjust quantiles
531                 for (int i = 0; i < quantiles.size(); i++) {
532                         vector<float> temp;
533                         
534                         if (quantiles[i].size() == 0) {
535                                 //in case this is not a distance found in your template files
536                                 for (int g = 0; g < 6; g++) {
537                                         temp.push_back(0.0);
538                                 }
539                         }else{
540                                 
541                                 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
542                                 
543                                 //save 10%
544                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
545                                 //save 25%
546                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
547                                 //save 50%
548                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
549                                 //save 75%
550                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
551                                 //save 95%
552                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
553                                 //save 99%
554                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
555                                 
556                         }
557                         
558                         quan[i] = temp;
559                         
560                 }
561 */
562                 
563         }
564         catch(exception& e) {
565                 m->errorOut(e, "DeCalculator", "removeObviousOutliers");
566                 exit(1);
567         }
568 }
569 //***************************************************************************************************************
570 //put quanMember in the vector based on how far they are from the 99% or 1%.  Biggest offenders in front.
571 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
572         try{
573                 
574                 vector<quanMember> newQuan;
575                 map<quanMember*, float>::iterator it;
576                 
577                 while (quan.size() > 0) {
578                         
579                          map<quanMember*, float>::iterator largest = quan.begin(); 
580                           
581                         //find biggest member
582                         for (it = quan.begin(); it != quan.end(); it++) {
583                                 if (it->second > largest->second) {  largest = it;  }
584                         }
585 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
586                         //save it 
587                         newQuan.push_back(*(largest->first));
588                 
589                         //erase from quan
590                         quan.erase(largest);
591                 }
592                 
593                 return newQuan;
594                 
595         }
596         catch(exception& e) {
597                 m->errorOut(e, "DeCalculator", "sortContrib");
598                 exit(1);
599         }
600 }
601
602 //***************************************************************************************************************
603 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
604 int DeCalculator::findLargestContrib(vector<int> seen) {
605         try{
606                 
607                 int largest = 0;
608                 
609                 int largestContribs;
610                 
611                 for (int i = 0; i < seen.size(); i++)  {  
612                         
613                         if (seen[i] > largest) {
614                                 largestContribs = i;
615                                 largest = seen[i];
616                         }
617                 }
618                 
619                 return largestContribs;
620                 
621         }
622         catch(exception& e) {
623                 m->errorOut(e, "DeCalculator", "findLargestContribs");
624                 exit(1);
625         }
626 }
627 //***************************************************************************************************************
628 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
629         try{
630         
631                 vector<quanMember> newQuan;
632                 for (int i = 0; i < quan.size(); i++)  {  
633                         if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {  
634                                 newQuan.push_back(quan[i]);  
635                         }
636                 }
637                 
638                 quan = newQuan;
639                 
640         }
641         catch(exception& e) {
642                 m->errorOut(e, "DeCalculator", "removeContrib");
643                 exit(1);
644         }
645 }
646 */
647 //***************************************************************************************************************
648 float DeCalculator::findAverage(vector<float> myVector) {
649         try{
650                 
651                 float total = 0.0;
652                 for (int i = 0; i < myVector.size(); i++)  {  total += myVector[i];  }
653                 
654                 float average = total / (float) myVector.size();
655                 
656                 return average;
657                 
658         }
659         catch(exception& e) {
660                 m->errorOut(e, "DeCalculator", "findAverage");
661                 exit(1);
662         }
663 }
664
665 //***************************************************************************************************************
666 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
667         try {
668         
669                 //find average prob for this seqs windows
670                 float probAverage = findAverage(qav);
671                                 
672                 //find observed average 
673                 float obsAverage = findAverage(obs);
674                         
675                 float coef = obsAverage / probAverage;
676                                                 
677                 return coef;
678         }
679         catch(exception& e) {
680                 m->errorOut(e, "DeCalculator", "getCoef");
681                 exit(1);
682         }
683 }
684 //***************************************************************************************************************
685 //gets closest matches to each end, since chimeras will most likely have different parents on each end
686 vector<Sequence*> DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*>& thisTemplate, vector<Sequence*>& thisFilteredTemplate, int numWanted, int minSim) {
687         try {
688                 //indexes.clear();
689                 
690                 vector<Sequence*> seqsMatches;  
691                 
692                 vector<SeqDist> distsLeft;
693                 vector<SeqDist> distsRight;
694                 
695                 Dist* distcalculator = new eachGapDist();
696                 
697                 string queryUnAligned = querySeq->getUnaligned();
698                 int numBases = int(queryUnAligned.length() * 0.33);
699                 
700                 string leftQuery = ""; //first 1/3 of the sequence
701                 string rightQuery = ""; //last 1/3 of the sequence
702                 string queryAligned = querySeq->getAligned();
703                 
704                 //left side
705                 bool foundFirstBase = false;
706                 int baseCount = 0;
707                 int leftSpot = 0;
708                 int firstBaseSpot = 0;
709                 for (int i = 0; i < queryAligned.length(); i++) {
710                         //if you are a base
711                         if (isalpha(queryAligned[i])) {         
712                                 baseCount++; 
713                                 if (!foundFirstBase) {   foundFirstBase = true;  firstBaseSpot = i;  }
714                         }
715                         
716                         //eliminate opening .'s
717                         if (foundFirstBase) {   leftQuery += queryAligned[i];  }
718                         //if you have 1/3
719                         if (baseCount >= numBases) {  leftSpot = i; break; } //first 1/3
720                 }
721                 
722                 //right side - count through another 1/3, so you are at last third
723                 baseCount = 0;
724                 int rightSpot = 0;
725                 for (int i = leftSpot; i < queryAligned.length(); i++) {
726                         //if you are a base
727                         if (isalpha(queryAligned[i])) {         baseCount++;    }
728                         //if you have 1/3
729                         if (baseCount >= numBases) { rightSpot = i;  break; } //last 1/3
730                 }
731                 
732                 //trim end
733                 //find last position in query that is a non gap character
734                 int lastBaseSpot = queryAligned.length()-1;
735                 for (int j = queryAligned.length()-1; j >= 0; j--) {
736                         if (isalpha(queryAligned[j])) {
737                                 lastBaseSpot = j;
738                                 break;
739                         }
740                 }
741                 rightQuery = queryAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //sequence from pos spot to end
742                 
743                 Sequence queryLeft(querySeq->getName(), leftQuery);
744                 Sequence queryRight(querySeq->getName(), rightQuery);
745 //cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << '\t' << firstBaseSpot << '\t' << lastBaseSpot << endl;
746 //cout << queryUnAligned.length() << '\t' << queryLeft.getUnaligned().length() << '\t' << queryRight.getUnaligned().length() << endl;
747                 for(int j = 0; j < thisFilteredTemplate.size(); j++){
748                         
749                         string dbAligned = thisFilteredTemplate[j]->getAligned();
750                         string leftDB = dbAligned.substr(firstBaseSpot, (leftSpot-firstBaseSpot+1)); //first 1/3 of the sequence
751                         string rightDB = dbAligned.substr(rightSpot, (lastBaseSpot-rightSpot)); //last 1/3 of the sequence
752                         
753                         Sequence dbLeft(thisFilteredTemplate[j]->getName(), leftDB);
754                         Sequence dbRight(thisFilteredTemplate[j]->getName(), rightDB);
755
756                         distcalculator->calcDist(queryLeft, dbLeft);
757                         float distLeft = distcalculator->getDist();
758                         
759                         distcalculator->calcDist(queryRight, dbRight);
760                         float distRight = distcalculator->getDist();
761                         
762                         SeqDist subjectLeft;
763                         subjectLeft.seq = NULL;
764                         subjectLeft.dist = distLeft;
765                         subjectLeft.index = j;
766                         
767                         distsLeft.push_back(subjectLeft);
768                         
769                         SeqDist subjectRight;
770                         subjectRight.seq = NULL;
771                         subjectRight.dist = distRight;
772                         subjectRight.index = j;
773                         
774                         distsRight.push_back(subjectRight);
775
776                 }
777                 
778                 delete distcalculator;
779                 
780                 //sort by smallest distance
781                 sort(distsRight.begin(), distsRight.end(), compareSeqDist);
782                 sort(distsLeft.begin(), distsLeft.end(), compareSeqDist);
783                 
784                 //merge results         
785                 map<string, string> seen;
786                 map<string, string>::iterator it;
787                 
788                 vector<SeqDist> dists;
789                 float lastRight = distsRight[0].dist;
790                 float lastLeft = distsLeft[0].dist;
791                 //int lasti = 0;
792                 for (int i = 0; i < numWanted+1; i++) {
793                         
794                         if (m->control_pressed) { return seqsMatches; }
795                         
796                         //add left if you havent already
797                         it = seen.find(thisTemplate[distsLeft[i].index]->getName());
798                         if (it == seen.end()) {  
799                                 dists.push_back(distsLeft[i]);
800                                 seen[thisTemplate[distsLeft[i].index]->getName()] = thisTemplate[distsLeft[i].index]->getName();
801                                 lastLeft =  distsLeft[i].dist;
802 //                              cout << "loop-left\t" << db[distsLeft[i].index]->getName() << '\t' << distsLeft[i].dist << endl;
803                         }
804
805                         //add right if you havent already
806                         it = seen.find(thisTemplate[distsRight[i].index]->getName());
807                         if (it == seen.end()) {  
808                                 dists.push_back(distsRight[i]);
809                                 seen[thisTemplate[distsRight[i].index]->getName()] = thisTemplate[distsRight[i].index]->getName();
810                                 lastRight =  distsRight[i].dist;
811 //                              cout << "loop-right\t" << db[distsRight[i].index]->getName() << '\t' << distsRight[i].dist << endl;
812                         }
813                         
814                 }
815                 
816                 //are we still above the minimum similarity cutoff
817                 if ((lastLeft >= minSim) || (lastRight >= minSim)) {
818                         //add in ties from left
819                         int i = numWanted;
820                         while (i < distsLeft.size()) {  
821                                 if (distsLeft[i].dist == lastLeft) {  dists.push_back(distsLeft[i]);  }
822                                 else { break; }
823                                 i++;
824                         }
825                         
826                         //add in ties from right
827                         i = numWanted;
828                         while (i < distsRight.size()) {  
829                                 if (distsRight[i].dist == lastRight) {  dists.push_back(distsRight[i]);  }
830                                 else { break; }
831                                 i++;
832                         }
833                 }
834                 
835
836
837                 //cout << numWanted << endl;
838                 for (int i = 0; i < dists.size(); i++) {
839 //                      cout << db[dists[i].index]->getName() << '\t' << dists[i].dist << endl;
840
841                         if ((thisTemplate[dists[i].index]->getName() != querySeq->getName()) && (((1.0-dists[i].dist)*100) >= minSim)) {
842                                 Sequence* temp = new Sequence(thisTemplate[dists[i].index]->getName(), thisTemplate[dists[i].index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
843                                 //cout << querySeq->getName() << '\t' << thisTemplate[dists[i].index]->getName()  << '\t' << dists[i].dist << endl;
844                                 seqsMatches.push_back(temp);
845                         }
846
847                 }
848                 
849                 return seqsMatches;
850         }
851         catch(exception& e) {
852                 m->errorOut(e, "DeCalculator", "findClosest");
853                 exit(1);
854         }
855 }
856 //***************************************************************************************************************
857 Sequence* DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db) {
858         try {
859                 
860                 Sequence* seqsMatch;  
861                 
862                 Dist* distcalculator = new eachGapDist();
863                 int index = 0;
864                 int smallest = 1000000;
865                 
866                 for(int j = 0; j < db.size(); j++){
867                         
868                         distcalculator->calcDist(*querySeq, *db[j]);
869                         float dist = distcalculator->getDist();
870                         
871                         if (dist < smallest) { 
872                                 smallest = dist;
873                                 index = j;
874                         }
875                 }
876                 
877                 delete distcalculator;
878                 
879                 seqsMatch = new Sequence(db[index]->getName(), db[index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
880                         
881                 return seqsMatch;
882         }
883         catch(exception& e) {
884                 m->errorOut(e, "DeCalculator", "findClosest");
885                 exit(1);
886         }
887 }
888 /***************************************************************************************************************/
889 map<int, int> DeCalculator::trimSeqs(Sequence* query, vector<Sequence*> topMatches) {
890         try {
891                 
892                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
893                 int rearPos = query->getAligned().length();
894                 
895                 //********find first position in topMatches that is a non gap character***********//
896                 //find first position all query seqs that is a non gap character
897                 for (int i = 0; i < topMatches.size(); i++) {
898                         
899                         string aligned = topMatches[i]->getAligned();
900                         int pos = 0;
901                         
902                         //find first spot in this seq
903                         for (int j = 0; j < aligned.length(); j++) {
904                                 if (isalpha(aligned[j])) {
905                                         pos = j;
906                                         break;
907                                 }
908                         }
909                         
910                         //save this spot if it is the farthest
911                         if (pos > frontPos) { frontPos = pos; }
912                 }
913                 
914                 
915                 string aligned = query->getAligned();
916                 int pos = 0;
917                         
918                 //find first position in query that is a non gap character
919                 for (int j = 0; j < aligned.length(); j++) {
920                         if (isalpha(aligned[j])) {
921                                 pos = j;
922                                 break;
923                         }
924                 }
925                 
926                 //save this spot if it is the farthest
927                 if (pos > frontPos) { frontPos = pos; }
928                 
929                 
930                 //********find last position in topMatches that is a non gap character***********//
931                 for (int i = 0; i < topMatches.size(); i++) {
932                         
933                         string aligned = topMatches[i]->getAligned();
934                         int pos = aligned.length();
935                         
936                         //find first spot in this seq
937                         for (int j = aligned.length()-1; j >= 0; j--) {
938                                 if (isalpha(aligned[j])) {
939                                         pos = j;
940                                         break;
941                                 }
942                         }
943                         
944                         //save this spot if it is the farthest
945                         if (pos < rearPos) { rearPos = pos; }
946                 }
947                 
948                 
949                 aligned = query->getAligned();
950                 pos = aligned.length();
951                 
952                 //find last position in query that is a non gap character
953                 for (int j = aligned.length()-1; j >= 0; j--) {
954                         if (isalpha(aligned[j])) {
955                                 pos = j;
956                                 break;
957                         }
958                 }
959                 
960                 //save this spot if it is the farthest
961                 if (pos < rearPos) { rearPos = pos; }
962                 
963                 map<int, int> trimmedPos;
964                 //check to make sure that is not whole seq
965                 if ((rearPos - frontPos - 1) <= 0) {  
966                         query->setAligned("");
967                         //trim topMatches
968                         for (int i = 0; i < topMatches.size(); i++) {
969                                 topMatches[i]->setAligned("");
970                         }
971                         
972                 }else {
973
974                         //trim query
975                         string newAligned = query->getAligned();
976                         newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
977                         query->setAligned(newAligned);
978                         
979                         //trim topMatches
980                         for (int i = 0; i < topMatches.size(); i++) {
981                                 newAligned = topMatches[i]->getAligned();
982                                 newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
983                                 topMatches[i]->setAligned(newAligned);
984                         }
985                         
986                         for (int i = 0; i < newAligned.length(); i++) {
987                                 trimmedPos[i] = i+frontPos;
988                         }
989                 }
990                 return trimmedPos;
991         }
992         catch(exception& e) {
993                 m->errorOut(e, "DeCalculator", "trimSequences");
994                 exit(1);
995         }
996
997 }
998 //***************************************************************************************************************
999
1000
1001