]> git.donarmstrong.com Git - mothur.git/blob - decalc.cpp
sffinfo bug with flow grams right index when clipQualRight=0
[mothur.git] / decalc.cpp
1 /*
2  *  decalc.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 7/22/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "decalc.h"
11 #include "chimera.h"
12 #include "dist.h"
13 #include "eachgapdist.h"
14 #include "ignoregaps.h"
15 #include "eachgapdist.h"
16
17 //***************************************************************************************************************
18 void DeCalculator::setMask(string ms) { 
19         try {
20                 seqMask = ms; 
21                 int count = 0;
22                 maskMap.clear();
23                 
24                 if (seqMask.length() != 0) {
25                         //whereever there is a base in the mask, save that value is query and subject
26                         for (int i = 0; i < seqMask.length(); i++) {
27                                 if (isalpha(seqMask[i])) {
28                                         h.insert(i);
29                                         maskMap[count] = i;
30                                         count++;
31                                         
32                                 }
33                         }
34                 }else {
35                         for (int i = 0; i < alignLength; i++) {   
36                                 h.insert(i); 
37                                 maskMap[count] = i;
38                                 count++;
39                         }
40                 }
41         }
42         catch(exception& e) {
43                 m->errorOut(e, "DeCalculator", "setMask");
44                 exit(1);
45         } 
46 }
47 //***************************************************************************************************************
48 void DeCalculator::runMask(Sequence* seq) {
49         try{
50                 
51                 string q = seq->getAligned();
52                 string tempQuery = "";
53                 
54                 //whereever there is a base in the mask, save that value is query and subject
55                 set<int>::iterator setit;
56                 for ( setit=h.begin() ; setit != h.end(); setit++ )  {
57                         tempQuery += q[*setit];
58                 }
59                 
60                 //save masked values
61                 seq->setAligned(tempQuery);
62                 seq->setUnaligned(tempQuery);
63         }
64         catch(exception& e) {
65                 m->errorOut(e, "DeCalculator", "runMask");
66                 exit(1);
67         }
68 }
69 //***************************************************************************************************************
70 //num is query's spot in querySeqs
71 void DeCalculator::trimSeqs(Sequence* query, Sequence* subject, map<int, int>& trim) {
72         try {
73                 
74                 string q = query->getAligned();
75                 string s = subject->getAligned();
76                 
77                 int front = 0;
78                 for (int i = 0; i < q.length(); i++) {
79 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
80                         if (isalpha(q[i]) && isalpha(s[i])) { front = i; break;  }
81                 }
82 //cout << endl << endl;         
83                 int back = 0;           
84                 for (int i = q.length(); i >= 0; i--) {
85 //cout << "query = " << q[i] << " subject = " << s[i] << endl;
86                         if (isalpha(q[i]) && isalpha(s[i])) { back = i; break;  }
87                 }
88                 
89                 trim[front] = back;
90                 
91         }
92         catch(exception& e) {
93                 m->errorOut(e, "DeCalculator", "trimSeqs");
94                 exit(1);
95         }
96 }
97 //***************************************************************************************************************
98 vector<int>  DeCalculator::findWindows(Sequence* query, int front, int back, int& size, int increment) {
99         try {
100                 
101                 vector<int> win; 
102                 
103                 int cutoff = back - front;  //back - front 
104                         
105                 //if window is set to default
106                 if (size == 0) {  if (cutoff > 1200) {  size = 300; }
107                                                         else{  size = (cutoff / 4); }  } 
108                 else if (size > (cutoff / 4)) { 
109                                 m->mothurOut("You have selected too large a window size for sequence " + query->getName() + ".  I will choose an appropriate window size."); m->mothurOutEndLine();
110                                 size = (cutoff / 4); 
111                 }
112         
113         /*      string seq = query->getAligned().substr(front, cutoff);
114                         
115                 //count bases
116                 int numBases = 0;
117                 for (int l = 0; l < seq.length(); l++) {  if (isalpha(seq[l])) { numBases++; }  }
118 //cout << "num Bases = " << numBases << endl;                   
119                 //save start of seq
120                 win.push_back(front);
121 //cout << front << '\t';                
122                 //move ahead increment bases at a time until all bases are in a window
123                 int countBases = 0;
124                 int totalBases = 0;  //used to eliminate window of blanks at end of sequence
125                         
126                 seq = query->getAligned();
127                 for (int m = front; m < (back - size) ; m++) {
128                                 
129                         //count number of bases you see
130                         if (isalpha(seq[m])) { countBases++;  }
131                                 
132                         //if you have seen enough bases to make a new window
133                         if (countBases >= increment) {
134                                 //total bases is the number of bases in a window already.
135                                 totalBases += countBases;
136 //cout << "total bases = " << totalBases << endl;
137                                 win.push_back(m);  //save spot in alignment
138 //cout << m << '\t';
139                                 countBases = 0;                         //reset bases you've seen in this window
140                         }
141                                 
142                         //no need to continue if all your bases are in a window
143                         if (totalBases == numBases) {   break;  }
144                 }
145         
146
147                 //get last window if needed
148                 if (totalBases < numBases) {   win.push_back(back-size);  }
149 //cout << endl << endl;
150 */      
151                 
152                 //this follows wigeon, but we may want to consider that it chops off the end values if the sequence cannot be evenly divided into steps
153                 for (int i = front; i < (back - size) ; i+=increment) {  win.push_back(i);  }
154
155
156         
157                 return win;
158         
159         }
160         catch(exception& e) {
161                 m->errorOut(e, "DeCalculator", "findWindows");
162                 exit(1);
163         }
164 }
165
166 //***************************************************************************************************************
167 vector<float> DeCalculator::calcObserved(Sequence* query, Sequence* subject, vector<int> window, int size) {
168         try {
169                 
170                 vector<float> temp;     
171                 //int gaps = 0;         
172                 for (int i = 0; i < window.size(); i++) {
173                                                 
174                         string seqFrag = query->getAligned().substr(window[i], size);
175                         string seqFragsub = subject->getAligned().substr(window[i], size);
176                                 
177                         int diff = 0;
178                         for (int b = 0; b < seqFrag.length(); b++) {
179                                 //if at least one is a base and they are not equal
180                                 if( (isalpha(seqFrag[b]) || isalpha(seqFragsub[b])) && (seqFrag[b] != seqFragsub[b]) ) { diff++; }
181                                 
182                                 //ignore gaps
183                                 //if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
184                         }
185                
186                         //percentage of mismatched bases
187                         float dist;
188                         
189                         //if the whole fragment is 0 distance = 0
190                         //if ((seqFrag.length()-gaps) == 0)  { dist =  0.0; }
191                
192                         //percentage of mismatched bases
193                         //else {  dist = diff / (float) (seqFrag.length()-gaps) * 100;   } 
194                         
195                         dist = diff / (float) (seqFrag.length()) * 100; 
196                         
197                         temp.push_back(dist);
198                 }
199                         
200                 return temp;
201         }
202         catch(exception& e) {
203                 m->errorOut(e, "DeCalculator", "calcObserved");
204                 exit(1);
205         }
206 }
207 //***************************************************************************************************************
208 float DeCalculator::calcDist(Sequence* query, Sequence* subject, int front, int back) {
209         try {
210                 
211                 //so you only look at the trimmed part of the sequence
212                 int cutoff = back - front;  
213                 int gaps = 0;
214                         
215                 //from first startpoint with length back-front
216                 string seqFrag = query->getAligned().substr(front, cutoff);
217                 string seqFragsub = subject->getAligned().substr(front, cutoff);
218                                                                                                                 
219                 int diff = 0;
220                 for (int b = 0; b < seqFrag.length(); b++) {
221                         //ignore gaps
222                         if((!isalpha(seqFrag[b])) && (!isalpha(seqFragsub[b]))) { gaps++; }
223                         if (seqFrag[b] != seqFragsub[b]) { diff++; }
224                 }
225                 
226                 //if the whole fragment is 0 distance = 0
227                 if ((seqFrag.length()-gaps) == 0)  { return 0.0; }
228                
229                 //percentage of mismatched bases
230                 float dist = diff / (float) (seqFrag.length()-gaps) * 100;       
231                                 
232                 return dist;
233         }
234         catch(exception& e) {
235                 m->errorOut(e, "DeCalculator", "calcDist");
236                 exit(1);
237         }
238 }
239
240 //***************************************************************************************************************
241 vector<float> DeCalculator::calcExpected(vector<float> qav, float coef) {
242         try {
243                 
244                 //for each window
245                 vector<float> queryExpected;
246                         
247                 for (int j = 0; j < qav.size(); j++) {          
248                                 
249                         float expected = qav[j] * coef;
250                                 
251                         queryExpected.push_back(expected);      
252                 }
253                         
254                 return queryExpected;
255                                 
256         }
257         catch(exception& e) {
258                 m->errorOut(e, "DeCalculator", "calcExpected");
259                 exit(1);
260         }
261 }
262 //***************************************************************************************************************
263 float DeCalculator::calcDE(vector<float> obs, vector<float> exp) {
264         try {
265                 
266                 //for each window
267                 float sum = 0.0;  //sum = sum from 1 to i of (oi-ei)^2
268                 int numZeros = 0;
269                 for (int j = 0; j < obs.size(); j++) {          
270                         
271                         //if (obs[j] != 0.0) {
272                                 sum += ((obs[j] - exp[j]) * (obs[j] - exp[j])); 
273                         //}else {  numZeros++;   }
274                         
275                 }
276                         
277                 float de = sqrt((sum / (obs.size() - 1 - numZeros)));
278                         
279                 return de;
280         }
281         catch(exception& e) {
282                 m->errorOut(e, "DeCalculator", "calcDE");
283                 exit(1);
284         }
285 }
286
287 //***************************************************************************************************************
288
289 vector<float> DeCalculator::calcFreq(vector<Sequence*> seqs, string filename) {
290         try {
291
292                 vector<float> prob;
293                 string freqfile = m->getRootName(filename) + "freq";
294                 ofstream outFreq;
295                 
296                 m->openOutputFile(freqfile, outFreq);
297                 
298                 outFreq << "#" << m->getVersion() << endl;
299                 
300                 string length = toString(seqs.size());  //if there are 5000 seqs in the template then set precision to 3
301                 int precision = length.length() - 1;
302                 
303                 //format output
304                 outFreq.setf(ios::fixed, ios::floatfield); outFreq.setf(ios::showpoint);
305                 
306                 //at each position in the sequence
307                 for (int i = 0; i < seqs[0]->getAligned().length(); i++) {
308                         
309                         vector<int> freq;   freq.resize(4,0);
310                         int gaps = 0;
311                         
312                         //find the frequency of each nucleotide
313                         for (int j = 0; j < seqs.size(); j++) {
314                                 
315                                 char value = seqs[j]->getAligned()[i];
316                                 
317                                 if(toupper(value) == 'A')                                                                       {       freq[0]++;      }
318                                 else if(toupper(value) == 'T' || toupper(value) == 'U')         {       freq[1]++;      }
319                                 else if(toupper(value) == 'G')                                                          {       freq[2]++;      }
320                                 else if(toupper(value) == 'C')                                                          {       freq[3]++;      }
321                                 else { gaps++; }
322                         }
323                         
324                         //find base with highest frequency
325                         int highest = 0;
326                         for (int j = 0; j < freq.size(); j++) {   if (freq[j] > highest) {  highest = freq[j];  }               }
327                         
328                         float highFreq = highest / (float) (seqs.size());       
329                         
330                         float Pi;
331                         Pi =  (highFreq - 0.25) / 0.75; 
332                         
333                         //cannot have probability less than 0.
334                         if (Pi < 0) { Pi = 0.0; }
335                         
336                         //saves this for later
337                         outFreq << setprecision(precision) << i << '\t' << highFreq << endl;
338         
339                         if (h.count(i) > 0) {
340                                 prob.push_back(Pi); 
341                         }
342                 }
343                 
344                 outFreq.close();
345                 
346                 return prob;
347                                 
348         }
349         catch(exception& e) {
350                 m->errorOut(e, "DeCalculator", "calcFreq");
351                 exit(1);
352         }
353 }
354 //***************************************************************************************************************
355 vector<float>  DeCalculator::findQav(vector<int> window, int size, vector<float> probabilityProfile) {
356         try {
357                 vector<float>  averages; 
358                                 
359                 //for each window find average
360                 for (int i = 0; i < window.size(); i++) {
361                                 
362                         float average = 0.0;
363                                 
364                         //while you are in the window for this sequence
365                         int count = 0;
366                         for (int j = window[i]; j < (window[i]+size); j++) {   
367                                 average += probabilityProfile[j];
368                                 count++;
369                         }
370                                 
371                         average = average / count;
372         
373                         //save this windows average
374                         averages.push_back(average);
375                 }
376                                 
377                 return averages;
378         }
379         catch(exception& e) {
380                 m->errorOut(e, "DeCalculator", "findQav");
381                 exit(1);
382         }
383 }
384 //***************************************************************************************************************
385 //seqs have already been masked
386 vector< vector<float> > DeCalculator::getQuantiles(vector<Sequence*> seqs, vector<int> windowSizesTemplate, int window, vector<float> probProfile, int increment, int start, int end) {
387         try {
388                 vector< vector<float> > quan; 
389                 
390                 //percentage of mismatched pairs 1 to 100
391                 quan.resize(100);
392
393                 //for each sequence
394                 for(int i = start; i < end; i++){
395                 
396                         m->mothurOut("Processing sequence " + toString(i)); m->mothurOutEndLine();
397                         Sequence* query = new Sequence(seqs[i]->getName(), seqs[i]->getAligned());
398                         
399                         //compare to every other sequence in template
400                         for(int j = 0; j < i; j++){
401                                 
402                                 Sequence* subject = new Sequence(seqs[j]->getName(), seqs[j]->getAligned());
403                                 
404                                 if (m->control_pressed) { delete query; delete subject; return quan; }
405                                 
406                                 map<int, int> trim;
407                                 map<int, int>::iterator it;
408                                 
409                                 trimSeqs(query, subject, trim);
410                                 
411                                 it = trim.begin();
412                                 int front = it->first; int back = it->second;
413                                 
414                                 //reset window for each new comparison
415                                 windowSizesTemplate[i] = window;
416                                 
417                                 vector<int> win = findWindows(query, front, back, windowSizesTemplate[i], increment);
418                                 
419                                 vector<float> obsi = calcObserved(query, subject, win, windowSizesTemplate[i]);
420                                 
421                                 vector<float> q = findQav(win, windowSizesTemplate[i], probProfile);
422                                                                         
423                                 float alpha = getCoef(obsi, q);
424                                                 
425                                 vector<float> exp = calcExpected(q, alpha);
426                                 
427                                 float de = calcDE(obsi, exp);
428                                                                 
429                                 float dist = calcDist(query, subject, front, back); 
430         //cout << i << '\t' <<  j << '\t' << dist << '\t' << de << endl;                        
431                                 dist = ceil(dist);
432                                 
433                                 //quanMember newScore(de, i, j);
434                                 
435                                 quan[dist].push_back(de);
436
437                                 delete subject;
438                         }
439                         
440                         delete query;
441                 }
442         
443
444                 return quan;
445                                                 
446         }
447         catch(exception& e) {
448                 m->errorOut(e, "DeCalculator", "getQuantiles");
449                 exit(1);
450         }
451 }
452 //********************************************************************************************************************
453 //sorts lowest to highest
454 inline bool compareQuanMembers(quanMember left, quanMember right){
455         return (left.score < right.score);      
456
457 //***************************************************************************************************************
458 //this was going to be used by pintail to increase the sensitivity of the chimera detection, but it wasn't quite right.  may want to revisit in the future...
459 void DeCalculator::removeObviousOutliers(vector< vector<float> >& quantiles, int num) {
460         try {
461                                                 
462                 for (int i = 0; i < quantiles.size(); i++) {
463                         
464                         //find mean of this quantile score
465                         sort(quantiles[i].begin(), quantiles[i].end());
466                         
467                         vector<float> temp;
468                         if (quantiles[i].size() != 0) {
469                                 float high = quantiles[i][int(quantiles[i].size() * 0.99)];
470                                 float low =  quantiles[i][int(quantiles[i].size() * 0.01)];
471                         
472                                 //look at each value in quantiles to see if it is an outlier
473                                 for (int j = 0; j < quantiles[i].size(); j++) {
474                                         //is this score between 1 and 99%
475                                         if ((quantiles[i][j] > low) && (quantiles[i][j] < high)) {
476                                                 temp.push_back(quantiles[i][j]);
477                                         }
478                                 }
479                         }
480                         quantiles[i] = temp;
481                 }
482
483 /*
484                 //find contributer with most offending score related to it
485                 int largestContrib = findLargestContrib(seen);
486         
487                 //while you still have guys to eliminate
488                 while (contributions.size() > 0) {
489                 
490                         m->mothurOut("Removing scores contributed by sequence " + toString(largestContrib) + " in your template file."); m->mothurOutEndLine();
491                         
492                         //remove from quantiles all scores that were made using this largestContrib
493                         for (int i = 0; i < quantiles.size(); i++) {
494 //cout << "before remove " << quantiles[i].size() << '\t';
495                                 removeContrib(largestContrib, quantiles[i]);
496 //cout << "after remove " << quantiles[i].size() << endl;
497                         }
498 //cout << count << " largest contrib = " << largestContrib << endl;  count++;
499                         //remove from contributions all scores that were made using this largestContrib
500                         removeContrib(largestContrib, contributions);
501                         
502                         //"erase" largestContrib
503                         seen[largestContrib] = -1;
504                         
505                         //get next largestContrib
506                         largestContrib = findLargestContrib(seen);
507                 }
508 ABOVE IS ATTEMPT #1             
509 **************************************************************************************************
510 BELOW IS ATTEMPT #2             
511                 vector<int> marked = returnObviousOutliers(quantiles, num);
512                 vector<int> copyMarked = marked;
513                 
514                 //find the 99% of marked
515                 sort(copyMarked.begin(), copyMarked.end());
516                 int high = copyMarked[int(copyMarked.size() * 0.99)];
517 cout << "high = " << high << endl;              
518                 
519                 for(int i = 0; i < marked.size(); i++) {
520                         if (marked[i] > high) { 
521                                 m->mothurOut("Removing scores contributed by sequence " + toString(marked[i]) + " in your template file."); m->mothurOutEndLine();
522                                 for (int i = 0; i < quantiles.size(); i++) {
523                                         removeContrib(marked[i], quantiles[i]);
524                                 }
525                         }
526
527                 }
528                 
529                 
530                 //adjust quantiles
531                 for (int i = 0; i < quantiles.size(); i++) {
532                         vector<float> temp;
533                         
534                         if (quantiles[i].size() == 0) {
535                                 //in case this is not a distance found in your template files
536                                 for (int g = 0; g < 6; g++) {
537                                         temp.push_back(0.0);
538                                 }
539                         }else{
540                                 
541                                 sort(quantiles[i].begin(), quantiles[i].end(), compareQuanMembers);
542                                 
543                                 //save 10%
544                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.10)].score);
545                                 //save 25%
546                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.25)].score);
547                                 //save 50%
548                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.5)].score);
549                                 //save 75%
550                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.75)].score);
551                                 //save 95%
552                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.95)].score);
553                                 //save 99%
554                                 temp.push_back(quantiles[i][int(quantiles[i].size() * 0.99)].score);
555                                 
556                         }
557                         
558                         quan[i] = temp;
559                         
560                 }
561 */
562                 
563         }
564         catch(exception& e) {
565                 m->errorOut(e, "DeCalculator", "removeObviousOutliers");
566                 exit(1);
567         }
568 }
569 //***************************************************************************************************************
570 //put quanMember in the vector based on how far they are from the 99% or 1%.  Biggest offenders in front.
571 /*vector<quanMember> DeCalculator::sortContrib(map<quanMember*, float> quan) {
572         try{
573                 
574                 vector<quanMember> newQuan;
575                 map<quanMember*, float>::iterator it;
576                 
577                 while (quan.size() > 0) {
578                         
579                          map<quanMember*, float>::iterator largest = quan.begin(); 
580                           
581                         //find biggest member
582                         for (it = quan.begin(); it != quan.end(); it++) {
583                                 if (it->second > largest->second) {  largest = it;  }
584                         }
585 cout << largest->second << '\t' << largest->first->score << '\t' << largest->first->member1 << '\t' << largest->first->member2 << endl;
586                         //save it 
587                         newQuan.push_back(*(largest->first));
588                 
589                         //erase from quan
590                         quan.erase(largest);
591                 }
592                 
593                 return newQuan;
594                 
595         }
596         catch(exception& e) {
597                 m->errorOut(e, "DeCalculator", "sortContrib");
598                 exit(1);
599         }
600 }
601
602 ***************************************************************************************************************
603 //used by removeObviousOutliers which was attempt to increase sensitivity of chimera detection...not currently used...
604 int DeCalculator::findLargestContrib(vector<int> seen) {
605         try{
606                 
607                 int largest = 0;
608                 
609                 int largestContribs;
610                 
611                 for (int i = 0; i < seen.size(); i++)  {  
612                         
613                         if (seen[i] > largest) {
614                                 largestContribs = i;
615                                 largest = seen[i];
616                         }
617                 }
618                 
619                 return largestContribs;
620                 
621         }
622         catch(exception& e) {
623                 m->errorOut(e, "DeCalculator", "findLargestContribs");
624                 exit(1);
625         }
626 }
627 ***************************************************************************************************************
628 void DeCalculator::removeContrib(int bad, vector<quanMember>& quan) {
629         try{
630         
631                 vector<quanMember> newQuan;
632                 for (int i = 0; i < quan.size(); i++)  {  
633                         if ((quan[i].member1 != bad) && (quan[i].member2 != bad) ) {  
634                                 newQuan.push_back(quan[i]);  
635                         }
636                 }
637                 
638                 quan = newQuan;
639                 
640         }
641         catch(exception& e) {
642                 m->errorOut(e, "DeCalculator", "removeContrib");
643                 exit(1);
644         }
645 }
646 */
647 //***************************************************************************************************************
648 float DeCalculator::findAverage(vector<float> myVector) {
649         try{
650                 
651                 float total = 0.0;
652                 for (int i = 0; i < myVector.size(); i++)  {  total += myVector[i];  }
653                 
654                 float average = total / (float) myVector.size();
655                 
656                 return average;
657                 
658         }
659         catch(exception& e) {
660                 m->errorOut(e, "DeCalculator", "findAverage");
661                 exit(1);
662         }
663 }
664
665 //***************************************************************************************************************
666 float DeCalculator::getCoef(vector<float> obs, vector<float> qav) {
667         try {
668         
669                 //find average prob for this seqs windows
670                 float probAverage = findAverage(qav);
671                                 
672                 //find observed average 
673                 float obsAverage = findAverage(obs);
674                         
675                 float coef = obsAverage / probAverage;
676                                                 
677                 return coef;
678         }
679         catch(exception& e) {
680                 m->errorOut(e, "DeCalculator", "getCoef");
681                 exit(1);
682         }
683 }
684 //***************************************************************************************************************
685 //gets closest matches to each end, since chimeras will most likely have different parents on each end
686 vector<Sequence> DeCalculator::findClosest(Sequence querySeq, vector<Sequence*>& thisTemplate, vector<Sequence*>& thisFilteredTemplate, int numWanted, int minSim) {
687         try {
688                 //indexes.clear();
689                 
690                 vector<Sequence> seqsMatches;  
691                 
692                 vector<SeqDist> distsLeft;
693                 vector<SeqDist> distsRight;
694                 
695                 Dist* distcalculator = new eachGapDist();
696                 
697                 string queryUnAligned = querySeq.getUnaligned();
698                 int numBases = int(queryUnAligned.length() * 0.33);
699                 
700                 string leftQuery = ""; //first 1/3 of the sequence
701                 string rightQuery = ""; //last 1/3 of the sequence
702                 string queryAligned = querySeq.getAligned();
703                 
704                 //left side
705                 bool foundFirstBase = false;
706                 int baseCount = 0;
707                 int leftSpot = 0;
708                 int firstBaseSpot = 0;
709                 for (int i = 0; i < queryAligned.length(); i++) {
710                         //if you are a base
711                         if (isalpha(queryAligned[i])) {         
712                                 baseCount++; 
713                                 if (!foundFirstBase) {   foundFirstBase = true;  firstBaseSpot = i;  }
714                         }
715                         
716                         //eliminate opening .'s
717                         if (foundFirstBase) {   leftQuery += queryAligned[i];  }
718                         //if you have 1/3
719                         if (baseCount >= numBases) {  leftSpot = i; break; } //first 1/3
720                 }
721                 
722                 //right side - count through another 1/3, so you are at last third
723                 baseCount = 0;
724                 int rightSpot = 0;
725                 for (int i = leftSpot; i < queryAligned.length(); i++) {
726                         //if you are a base
727                         if (isalpha(queryAligned[i])) {         baseCount++;    }
728                         //if you have 1/3
729                         if (baseCount > numBases + 1) { rightSpot = i;  break; } //last 1/3
730                 }
731                 
732                 //trim end
733                 //find last position in query that is a non gap character
734                 int lastBaseSpot = queryAligned.length()-1;
735                 for (int j = queryAligned.length()-1; j >= 0; j--) {
736                         if (isalpha(queryAligned[j])) {
737                                 lastBaseSpot = j;
738                                 break;
739                         }
740                 }
741                 rightQuery = queryAligned.substr(rightSpot, (lastBaseSpot-rightSpot+1)); //sequence from pos spot to end
742                 
743                 Sequence queryLeft(querySeq.getName(), leftQuery);
744                 Sequence queryRight(querySeq.getName(), rightQuery);
745                 
746 //cout << querySeq->getName() << '\t' << leftSpot << '\t' << rightSpot << '\t' << firstBaseSpot << '\t' << lastBaseSpot << endl;
747 //cout << queryUnAligned.length() << '\t' << queryLeft.getUnaligned().length() << '\t' << queryRight.getUnaligned().length() << endl;
748                 for(int j = 0; j < thisFilteredTemplate.size(); j++){
749                         
750                         string dbAligned = thisFilteredTemplate[j]->getAligned();
751                         string leftDB = dbAligned.substr(firstBaseSpot, (leftSpot-firstBaseSpot+1)); //first 1/3 of the sequence
752                         string rightDB = dbAligned.substr(rightSpot, (lastBaseSpot-rightSpot+1)); //last 1/3 of the sequence
753
754                         Sequence dbLeft(thisFilteredTemplate[j]->getName(), leftDB);
755                         Sequence dbRight(thisFilteredTemplate[j]->getName(), rightDB);
756
757                         distcalculator->calcDist(queryLeft, dbLeft);
758                         float distLeft = distcalculator->getDist();
759                         
760                         distcalculator->calcDist(queryRight, dbRight);
761                         float distRight = distcalculator->getDist();
762
763                         SeqDist subjectLeft;
764                         subjectLeft.seq = NULL;
765                         subjectLeft.dist = distLeft;
766                         subjectLeft.index = j;
767                         
768                         distsLeft.push_back(subjectLeft);
769                         
770                         SeqDist subjectRight;
771                         subjectRight.seq = NULL;
772                         subjectRight.dist = distRight;
773                         subjectRight.index = j;
774                         
775                         distsRight.push_back(subjectRight);
776
777                 }
778                 
779                 delete distcalculator;
780                 
781                 //sort by smallest distance
782                 sort(distsRight.begin(), distsRight.end(), compareSeqDist);
783                 sort(distsLeft.begin(), distsLeft.end(), compareSeqDist);
784
785                 
786                 //merge results         
787                 map<string, string> seen;
788                 map<string, string>::iterator it;
789                 
790                 vector<SeqDist> dists;
791                 float lastRight = distsRight[0].dist;
792                 float lastLeft = distsLeft[0].dist;
793
794                 float maxDist = 1.0 - (minSim / 100.0);
795
796                 for (int i = 0; i < numWanted+1; i++) {
797                         if (m->control_pressed) { return seqsMatches; }
798                         
799                         //add left if you havent already
800                         it = seen.find(thisTemplate[distsLeft[i].index]->getName());
801                         if (it == seen.end() && distsLeft[i].dist <= maxDist) {  
802                                 dists.push_back(distsLeft[i]);
803                                 seen[thisTemplate[distsLeft[i].index]->getName()] = thisTemplate[distsLeft[i].index]->getName();
804                                 lastLeft =  distsLeft[i].dist;
805 //                              cout << "loop-left\t" << db[distsLeft[i].index]->getName() << '\t' << distsLeft[i].dist << endl;
806                         }
807
808                         //add right if you havent already
809                         it = seen.find(thisTemplate[distsRight[i].index]->getName());
810                         if (it == seen.end() && distsRight[i].dist <= maxDist) {  
811                                 dists.push_back(distsRight[i]);
812                                 seen[thisTemplate[distsRight[i].index]->getName()] = thisTemplate[distsRight[i].index]->getName();
813                                 lastRight =  distsRight[i].dist;
814 //                              cout << "loop-right\t" << db[distsRight[i].index]->getName() << '\t' << distsRight[i].dist << endl;
815                         }
816                         
817                         if (i == numWanted) { break; }
818                         
819                 }
820                 
821                 //are we still above the minimum similarity cutoff
822                 if ((lastLeft >= minSim) || (lastRight >= minSim)) {
823                         //add in ties from left
824                         int i = numWanted;
825                         while (i < distsLeft.size()) {  
826                                 if (distsLeft[i].dist == lastLeft) {  dists.push_back(distsLeft[i]);  }
827                                 else { break; }
828                                 i++;
829                         }
830                         
831                         //add in ties from right
832                         i = numWanted;
833                         while (i < distsRight.size()) {  
834                                 if (distsRight[i].dist == lastRight) {  dists.push_back(distsRight[i]);  }
835                                 else { break; }
836                                 i++;
837                         }
838                 }
839                 
840                 //cout << numWanted << endl;
841                 for (int i = 0; i < dists.size(); i++) {
842 //                      cout << db[dists[i].index]->getName() << '\t' << dists[i].dist << endl;
843
844                         if ((thisTemplate[dists[i].index]->getName() != querySeq.getName()) && (((1.0-dists[i].dist)*100) >= minSim)) {
845                                 Sequence temp(thisTemplate[dists[i].index]->getName(), thisTemplate[dists[i].index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
846                                 //cout << querySeq->getName() << '\t' << thisTemplate[dists[i].index]->getName()  << '\t' << dists[i].dist << endl;
847                                 seqsMatches.push_back(temp);
848                         }
849
850                 }
851                 
852                 return seqsMatches;
853         }
854         catch(exception& e) {
855                 m->errorOut(e, "DeCalculator", "findClosest");
856                 exit(1);
857         }
858 }
859 //***************************************************************************************************************
860 Sequence* DeCalculator::findClosest(Sequence* querySeq, vector<Sequence*> db) {
861         try {
862                 
863                 Sequence* seqsMatch;  
864                 
865                 Dist* distcalculator = new eachGapDist();
866                 int index = 0;
867                 int smallest = 1000000;
868                 
869                 for(int j = 0; j < db.size(); j++){
870                         
871                         distcalculator->calcDist(*querySeq, *db[j]);
872                         float dist = distcalculator->getDist();
873                         
874                         if (dist < smallest) { 
875                                 smallest = dist;
876                                 index = j;
877                         }
878                 }
879                 
880                 delete distcalculator;
881                 
882                 seqsMatch = new Sequence(db[index]->getName(), db[index]->getAligned()); //have to make a copy so you can trim and filter without stepping on eachother.
883                         
884                 return seqsMatch;
885         }
886         catch(exception& e) {
887                 m->errorOut(e, "DeCalculator", "findClosest");
888                 exit(1);
889         }
890 }
891 /***************************************************************************************************************/
892 map<int, int> DeCalculator::trimSeqs(Sequence& query, vector<Sequence>& topMatches) {
893         try {
894                 
895                 int frontPos = 0;  //should contain first position in all seqs that is not a gap character
896                 int rearPos = query.getAligned().length();
897                 
898                 //********find first position in topMatches that is a non gap character***********//
899                 //find first position all query seqs that is a non gap character
900                 for (int i = 0; i < topMatches.size(); i++) {
901                         
902                         string aligned = topMatches[i].getAligned();
903                         int pos = 0;
904                         
905                         //find first spot in this seq
906                         for (int j = 0; j < aligned.length(); j++) {
907                                 if (isalpha(aligned[j])) {
908                                         pos = j;
909                                         break;
910                                 }
911                         }
912                         
913                         //save this spot if it is the farthest
914                         if (pos > frontPos) { frontPos = pos; }
915                 }
916                 
917                 
918                 string aligned = query.getAligned();
919                 int pos = 0;
920                         
921                 //find first position in query that is a non gap character
922                 for (int j = 0; j < aligned.length(); j++) {
923                         if (isalpha(aligned[j])) {
924                                 pos = j;
925                                 break;
926                         }
927                 }
928                 
929                 //save this spot if it is the farthest
930                 if (pos > frontPos) { frontPos = pos; }
931                 
932                 
933                 //********find last position in topMatches that is a non gap character***********//
934                 for (int i = 0; i < topMatches.size(); i++) {
935                         
936                         string aligned = topMatches[i].getAligned();
937                         int pos = aligned.length();
938                         
939                         //find first spot in this seq
940                         for (int j = aligned.length()-1; j >= 0; j--) {
941                                 if (isalpha(aligned[j])) {
942                                         pos = j;
943                                         break;
944                                 }
945                         }
946                         
947                         //save this spot if it is the farthest
948                         if (pos < rearPos) { rearPos = pos; }
949                 }
950                 
951                 
952                 aligned = query.getAligned();
953                 pos = aligned.length();
954                 
955                 //find last position in query that is a non gap character
956                 for (int j = aligned.length()-1; j >= 0; j--) {
957                         if (isalpha(aligned[j])) {
958                                 pos = j;
959                                 break;
960                         }
961                 }
962                 
963                 //save this spot if it is the farthest
964                 if (pos < rearPos) { rearPos = pos; }
965                 
966                 map<int, int> trimmedPos;
967                 //check to make sure that is not whole seq
968                 if ((rearPos - frontPos - 1) <= 0) {  
969                         query.setAligned("");
970                         //trim topMatches
971                         for (int i = 0; i < topMatches.size(); i++) {
972                                 topMatches[i].setAligned("");
973                         }
974                         
975                 }else {
976
977                         //trim query
978                         string newAligned = query.getAligned();
979                         newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
980                         query.setAligned(newAligned);
981                         
982                         //trim topMatches
983                         for (int i = 0; i < topMatches.size(); i++) {
984                                 newAligned = topMatches[i].getAligned();
985                                 newAligned = newAligned.substr(frontPos, (rearPos-frontPos+1));
986                                 topMatches[i].setAligned(newAligned);
987                         }
988                         
989                         for (int i = 0; i < newAligned.length(); i++) {
990                                 trimmedPos[i] = i+frontPos;
991                         }
992                 }
993                 return trimmedPos;
994         }
995         catch(exception& e) {
996                 m->errorOut(e, "DeCalculator", "trimSequences");
997                 exit(1);
998         }
999
1000 }
1001 //***************************************************************************************************************
1002
1003
1004