]> git.donarmstrong.com Git - mothur.git/blob - maligner.cpp
18f740911b2ccf342c4893cb00f629da95e7307b
[mothur.git] / maligner.cpp
1 /*
2  *  maligner.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/23/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "maligner.h"
11 #include "kmerdb.hpp"
12 #include "blastdb.hpp"
13
14 /***********************************************************************/
15 Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode, Database* dataLeft, Database* dataRight) :
16                 db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode), databaseLeft(dataLeft), databaseRight(dataRight) { 
17                         
18                         
19                         m = MothurOut::getInstance(); 
20                         
21 //                      cout << matchScore << '\t' << misMatchPenalty << endl;
22 //                      
23 //                      matchScore = 1;
24 //                      misMatchPenalty = -1;
25                         
26                 }
27
28 /***********************************************************************/
29 string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
30         try {
31                 
32                 outputResults.clear();
33                 
34                 //make copy so trimming doesn't destroy query from calling class - remember to deallocate
35                 query = new Sequence(q->getName(), q->getAligned());
36                 
37                 string chimera;
38                 
39                 if (searchMethod == "distance") {
40                         //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
41                         refSeqs = decalc->findClosest(query, db, numWanted, indexes);
42                 }else if (searchMethod == "blast")  {
43                         refSeqs = getBlastSeqs(query, numWanted); //fills indexes
44                 }else if (searchMethod == "kmer") {
45                         refSeqs = getKmerSeqs(query, numWanted); //fills indexes
46                 }else { m->mothurOut("not valid search."); exit(1);  } //should never get here
47                 
48                 if (m->control_pressed) { return chimera;  }
49                 
50                 refSeqs = minCoverageFilter(refSeqs);
51
52                 if (refSeqs.size() < 2)  { 
53                         for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];  }
54                         percentIdenticalQueryChimera = 0.0;
55                         return "unknown"; 
56                 }
57                 
58                 int chimeraPenalty = computeChimeraPenalty();
59
60                 //fills outputResults
61                 chimera = chimeraMaligner(chimeraPenalty, decalc);
62                 
63                 if (m->control_pressed) { return chimera;  }
64                                 
65                 //free memory
66                 delete query;
67
68                 for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];  }
69                 
70                 return chimera;
71         }
72         catch(exception& e) {
73                 m->errorOut(e, "Maligner", "getResults");
74                 exit(1);
75         }
76 }
77 /***********************************************************************/
78 string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
79         try {
80                 
81                 string chimera;
82                 
83                 //trims seqs to first non gap char in all seqs and last non gap char in all seqs
84                 spotMap = decalc->trimSeqs(query, refSeqs);
85                 
86                 //you trimmed the whole sequence, skip
87                 if (query->getAligned() == "") { return "no"; }
88
89                 vector<Sequence*> temp = refSeqs;
90                 temp.push_back(query);
91                         
92                         
93                 verticalFilter(temp);
94
95                 vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
96                 
97                 if (m->control_pressed) { return chimera;  }
98                 
99                 fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
100         
101                 vector<trace_struct> trace = extractHighestPath(matrix);
102                                 
103 //              cout << "traces\n";
104 //              for(int i=0;i<trace.size();i++){
105 //                      cout << trace[i].col << '\t' << trace[i].oldCol << '\t' << refSeqs[trace[i].row]->getName() << endl;
106 //              }
107                 
108                 if (trace.size() > 1) {         chimera = "yes";        }
109                 else { chimera = "no";  }
110                 
111                 int traceStart = trace[0].col;
112                 int traceEnd = trace[trace.size()-1].oldCol;    
113                 string queryInRange = query->getAligned();
114                 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
115                 
116                 if (m->control_pressed) { return chimera;  }
117                 
118                 //save output results
119                 for (int i = 0; i < trace.size(); i++) {
120                         int regionStart = trace[i].col;
121                         int regionEnd = trace[i].oldCol;
122                         int seqIndex = trace[i].row;
123                         
124                         results temp;
125                         
126                         temp.parent = refSeqs[seqIndex]->getName();
127                         temp.parentAligned = db[indexes[seqIndex]]->getAligned();
128                         temp.nastRegionStart = spotMap[regionStart];
129                         temp.nastRegionEnd = spotMap[regionEnd];
130                         temp.regionStart = regionStart;
131                         temp.regionEnd = regionEnd;
132                         
133                         string parentInRange = refSeqs[seqIndex]->getAligned();
134                         parentInRange = parentInRange.substr(traceStart, (traceEnd-traceStart+1));
135                         
136                         temp.queryToParent = computePercentID(queryInRange, parentInRange);
137                         temp.divR = (percentIdenticalQueryChimera / temp.queryToParent);
138
139                         string queryInRegion = query->getAligned();
140                         queryInRegion = queryInRegion.substr(regionStart, (regionEnd-regionStart+1));
141                         
142                         string parentInRegion = refSeqs[seqIndex]->getAligned();
143                         parentInRegion = parentInRegion.substr(regionStart, (regionEnd-regionStart+1));
144                         
145                         temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
146                 
147                         outputResults.push_back(temp);
148                 }
149                 
150                 return chimera;
151         }
152         catch(exception& e) {
153                 m->errorOut(e, "Maligner", "chimeraMaligner");
154                 exit(1);
155         }
156 }
157 /***********************************************************************/
158 //removes top matches that do not have minimum coverage with query.
159 vector<Sequence*> Maligner::minCoverageFilter(vector<Sequence*> ref){  
160         try {
161                 vector<Sequence*> newRefs;
162                 
163                 string queryAligned = query->getAligned();
164                 
165                 for (int i = 0; i < ref.size(); i++) {
166                         
167                         string refAligned = ref[i]->getAligned();
168                         
169                         int numBases = 0;
170                         int numCovered = 0;
171                         
172                         //calculate coverage
173                         for (int j = 0; j < queryAligned.length(); j++) {
174                                 
175                                 if (isalpha(queryAligned[j])) {
176                                         numBases++;
177                                         
178                                         if (isalpha(refAligned[j])) {
179                                                 numCovered++;
180                                         }
181                                 }
182                         }
183                         
184                         int coverage = ((numCovered/(float)numBases)*100);
185                         
186                         //if coverage above minimum
187                         if (coverage > minCoverage) {
188                                 newRefs.push_back(ref[i]);
189                         }else {
190                                 delete ref[i];
191                         }
192                 }
193                 
194                 return newRefs;
195         }
196         catch(exception& e) {
197                 m->errorOut(e, "Maligner", "minCoverageFilter");
198                 exit(1);
199         }
200 }
201 /***********************************************************************/
202 // a breakpoint should yield fewer mismatches than this number with respect to the best parent sequence.
203 int Maligner::computeChimeraPenalty() {
204         try {
205                 
206                 int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases());
207
208 //              if(numAllowable < 1){   numAllowable = 1;       }
209                 
210                 int penalty = int(numAllowable + 1) * misMatchPenalty;
211
212                 return penalty;
213
214         }
215         catch(exception& e) {
216                 m->errorOut(e, "Maligner", "computeChimeraPenalty");
217                 exit(1);
218         }
219 }
220 /***********************************************************************/
221 //this is a vertical filter
222 void Maligner::verticalFilter(vector<Sequence*> seqs) {
223         try {
224                 vector<int> gaps;       gaps.resize(query->getAligned().length(), 0);
225                 
226                 string filterString = (string(query->getAligned().length(), '1'));
227                 
228                 //for each sequence
229                 for (int i = 0; i < seqs.size(); i++) {
230                 
231                         string seqAligned = seqs[i]->getAligned();
232                         
233                         for (int j = 0; j < seqAligned.length(); j++) {
234                                 //if this spot is a gap
235                                 if ((seqAligned[j] == '-') || (seqAligned[j] == '.'))   {       gaps[j]++;      }
236                         }
237                 }
238                 
239                 //zero out spot where all sequences have blanks
240                 int numColRemoved = 0;
241                 for(int i = 0; i < seqs[0]->getAligned().length(); i++){
242                         if(gaps[i] == seqs.size())      {       filterString[i] = '0';  numColRemoved++;  }
243                 }
244                 
245                 map<int, int> newMap;
246                 //for each sequence
247                 for (int i = 0; i < seqs.size(); i++) {
248                 
249                         string seqAligned = seqs[i]->getAligned();
250                         string newAligned = "";
251                         int count = 0;
252                         
253                         for (int j = 0; j < seqAligned.length(); j++) {
254                                 //if this spot is not a gap
255                                 if (filterString[j] == '1') { 
256                                         newAligned += seqAligned[j]; 
257                                         newMap[count] = spotMap[j];
258                                         count++;
259                                 }
260                         }
261                         
262                         seqs[i]->setAligned(newAligned);
263                 }
264
265                 spotMap = newMap;
266         }
267         catch(exception& e) {
268                 m->errorOut(e, "Maligner", "verticalFilter");
269                 exit(1);
270         }
271 }
272 //***************************************************************************************************************
273 vector< vector<score_struct> > Maligner::buildScoreMatrix(int cols, int rows) {
274         try{
275                 
276                 vector< vector<score_struct> > m(rows);
277                 
278                 for (int i = 0; i < rows; i++) {
279                         for (int j = 0; j < cols; j++) {
280                                 
281                                 //initialize each cell
282                                 score_struct temp;
283                                 temp.prev = -1;
284                                 temp.score = -9999999;
285                                 temp.col = j;
286                                 temp.row = i;
287                                 
288                                 m[i].push_back(temp);
289                         }
290                 }
291                 
292                 return m;
293         }
294         catch(exception& e) {
295                 m->errorOut(e, "Maligner", "buildScoreMatrix");
296                 exit(1);
297         }
298 }
299
300 //***************************************************************************************************************
301
302 void Maligner::fillScoreMatrix(vector<vector<score_struct> >& ms, vector<Sequence*> seqs, int penalty) {
303         try{
304                 
305                 //get matrix dimensions
306                 int numCols = query->getAligned().length();
307                 int numRows = seqs.size();
308                 
309                 //initialize first col
310                 string queryAligned = query->getAligned();
311                 for (int i = 0; i < numRows; i++) {
312                         string subjectAligned = seqs[i]->getAligned();
313                         
314                         //are you both gaps?
315                         if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
316                                 ms[i][0].score = 0;
317 //                              ms[i][0].mismatches = 0;
318                         }else if (queryAligned[0] == subjectAligned[0] || subjectAligned[0] == 'N') {
319                                 ms[i][0].score = matchScore;
320 //                              ms[i][0].mismatches = 0;
321                         }else{
322                                 ms[i][0].score = 0;
323 //                              ms[i][0].mismatches = 1;
324                         }
325                 }
326                 
327                 //fill rest of matrix
328                 for (int j = 1; j < numCols; j++) {  //iterate through matrix columns
329                 
330                         for (int i = 0; i < numRows; i++) {  //iterate through matrix rows
331                                 
332                                 string subjectAligned = seqs[i]->getAligned();
333                                 
334                                 int matchMisMatchScore = 0;
335                                 //are you both gaps?
336                                 if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
337                                         //leave the same
338                                 }else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
339                                         matchMisMatchScore = matchScore;
340                                         //leave the same
341                                 }else if (queryAligned[j] == subjectAligned[j]) {
342                                         matchMisMatchScore = matchScore;
343 //                                      ms[i][j].mismatches = ms[i][j-1].mismatches;
344                                 }else if (queryAligned[j] != subjectAligned[j]) {
345                                         matchMisMatchScore = misMatchPenalty;
346 //                                      ms[i][j].mismatches = ms[i][j-1].mismatches + 1;
347                                 }
348                                 
349                                 //compute score based on previous columns scores
350                                 for (int prevIndex = 0; prevIndex < numRows; prevIndex++) { //iterate through rows
351                                         
352                                         int sumScore = matchMisMatchScore + ms[prevIndex][j-1].score;
353                                         
354                                         //you are not at yourself
355                                         if (prevIndex != i) {   sumScore += penalty;    }
356                                         if (sumScore < 0)       {       sumScore = 0;                   }
357                                         
358                                         if (sumScore > ms[i][j].score) {
359                                                 ms[i][j].score = sumScore;
360                                                 ms[i][j].prev = prevIndex;
361                                         }
362                                 }
363                         }
364                 }
365                 
366 //              for(int i=0;i<numRows;i++){
367 //                      cout << seqs[i]->getName();
368 //                      for(int j=0;j<numCols;j++){
369 //                              cout << '\t' << ms[i][j].mismatches;
370 //                      }
371 //                      cout << endl;
372 //              }
373 //              cout << endl;
374 //              for(int i=0;i<numRows;i++){
375 //                      cout << seqs[i]->getName();
376 //                      for(int j=0;j<numCols;j++){
377 //                              cout << '\t' << ms[i][j].score;
378 //                      }
379 //                      cout << endl;
380 //              }
381 //              cout << endl;
382 //              for(int i=0;i<numRows;i++){
383 //                      cout << seqs[i]->getName();
384 //                      for(int j=0;j<numCols;j++){
385 //                              cout << '\t' << ms[i][j].prev;
386 //                      }
387 //                      cout << endl;
388 //              }
389                 
390                 
391         }
392         catch(exception& e) {
393                 m->errorOut(e, "Maligner", "fillScoreMatrix");
394                 exit(1);
395         }
396 }
397
398 //***************************************************************************************************************
399
400 vector<trace_struct> Maligner::extractHighestPath(vector<vector<score_struct> > ms) {
401         try {
402         
403                 
404                 //get matrix dimensions
405                 int numCols = query->getAligned().length();
406                 int numRows = ms.size();
407         
408         
409                 //find highest score scoring matrix
410                 vector<score_struct> highestStruct;
411                 int highestScore = 0;
412                 
413                 for (int i = 0; i < numRows; i++) {
414                         for (int j = 0; j < numCols; j++) {
415                                 if (ms[i][j].score > highestScore) {
416                                         highestScore = ms[i][j].score;
417                                         highestStruct.resize(0);
418                                         highestStruct.push_back(ms[i][j]);
419                                 }
420                                 else if(ms[i][j].score == highestScore){
421                                         highestStruct.push_back(ms[i][j]);
422                                 }
423                         }
424                 }
425                         
426 //              cout << endl << highestScore << '\t' << highestStruct.size() << '\t' << highestStruct[0].row << endl;   
427                 
428                 vector<trace_struct> maxTrace;
429                 double maxPercentIdenticalQueryAntiChimera = 0;
430                 
431                 for(int i=0;i<highestStruct.size();i++){
432                         
433                         vector<score_struct> path;
434
435                         int rowIndex = highestStruct[i].row;
436                         int pos = highestStruct[i].col;
437                         int score = highestStruct[i].score;
438                                         
439                         while (pos >= 0 && score > 0) {
440                                 score_struct temp = ms[rowIndex][pos];
441                                 score = temp.score;
442                                 
443                                 if (score > 0) {        path.push_back(temp);   }
444                                 
445                                 rowIndex = temp.prev;
446                                 pos--;
447                         }
448
449                         reverse(path.begin(), path.end());
450
451                         vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
452                 
453 //                      cout << "traces\n";
454 //                      for(int j=0;j<trace.size();j++){
455 //                              cout << trace[j].col << '\t' << trace[j].oldCol << '\t' << refSeqs[trace[j].row]->getName() << endl;
456 //                      }
457                                                 
458                         int traceStart = path[0].col;
459                         int traceEnd = path[path.size()-1].col; 
460 //                      cout << "traceStart/End\t" << traceStart << '\t' << traceEnd << endl;
461                         
462                         string queryInRange = query->getAligned();
463                         queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
464 //                      cout << "here" << endl;
465                         string chimeraSeq = constructChimericSeq(trace, refSeqs);
466                         string antiChimeraSeq = constructAntiChimericSeq(trace, refSeqs);
467                 
468                         percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);                      
469                         double percentIdenticalQueryAntiChimera = computePercentID(queryInRange, antiChimeraSeq);
470 //                      cout << i << '\t' << percentIdenticalQueryChimera << '\t' << percentIdenticalQueryAntiChimera << endl;
471                         
472                         if(percentIdenticalQueryAntiChimera > maxPercentIdenticalQueryAntiChimera){
473                                 maxPercentIdenticalQueryAntiChimera = percentIdenticalQueryAntiChimera;
474                                 maxTrace = trace;
475                         }
476                 }
477 //              cout << maxPercentIdenticalQueryAntiChimera << endl;
478                 return maxTrace;
479                 
480         }
481         catch(exception& e) {
482                 m->errorOut(e, "Maligner", "extractHighestPath");
483                 exit(1);
484         }
485 }
486
487 //***************************************************************************************************************
488
489 vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path, vector<Sequence*> seqs) {
490         try {
491                 vector<trace_struct> trace;
492                 
493                 int region_index = path[0].row;
494                 int region_start = path[0].col;
495         
496                 for (int i = 1; i < path.size(); i++) {
497                 
498                         int next_region_index = path[i].row;
499                         
500                         if (next_region_index != region_index) {
501                                 
502                                 // add trace region
503                                 int col_index = path[i].col;
504                                 trace_struct temp;
505                                 temp.col = region_start;
506                                 temp.oldCol = col_index-1;
507                                 temp.row = region_index;
508                                 
509                                 trace.push_back(temp);
510                                                         
511                                 region_index = path[i].row;
512                                 region_start = col_index;
513                         }
514                 }
515         
516                 // get last one
517                 trace_struct temp;
518                 temp.col = region_start;
519                 temp.oldCol = path[path.size()-1].col;
520                 temp.row = region_index;
521                 trace.push_back(temp);
522
523 //              cout << endl;
524 //              cout << trace.size() << endl;
525 //              for(int i=0;i<trace.size();i++){
526 //                      cout << seqs[trace[i].row]->getName() << endl;
527 //              }
528 //              cout << endl;
529                 
530                 return trace;
531                 
532         }
533         catch(exception& e) {
534                 m->errorOut(e, "Maligner", "mapTraceRegionsToAlignment");
535                 exit(1);
536         }
537 }
538
539 //***************************************************************************************************************
540
541 string Maligner::constructChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
542         try {
543                 string chimera = "";
544                 
545                 for (int i = 0; i < trace.size(); i++) {
546 //                      cout << i << '\t' << trace[i].row << '\t' << trace[i].col << '\t' << trace[i].oldCol << endl;
547                         
548                         string seqAlign = seqs[trace[i].row]->getAligned();
549                         seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
550                         chimera += seqAlign;
551                 }
552                         
553                 return chimera;
554         }
555         catch(exception& e) {
556                 m->errorOut(e, "Maligner", "constructChimericSeq");
557                 exit(1);
558         }
559 }
560
561 //***************************************************************************************************************
562
563 string Maligner::constructAntiChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
564         try {
565                 string antiChimera = "";
566                 
567                 for (int i = 0; i < trace.size(); i++) {
568 //                      cout << i << '\t' << (trace.size() - i - 1) << '\t' << trace[i].row << '\t' << trace[i].col << '\t' << trace[i].oldCol << endl;
569                         
570                         int oppositeIndex = trace.size() - i - 1;
571                         
572                         string seqAlign = seqs[trace[oppositeIndex].row]->getAligned();
573                         seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
574                         antiChimera += seqAlign;
575                 }
576                 
577                 return antiChimera;
578         }
579         catch(exception& e) {
580                 m->errorOut(e, "Maligner", "constructChimericSeq");
581                 exit(1);
582         }
583 }
584
585 //***************************************************************************************************************
586 float Maligner::computePercentID(string queryAlign, string chimera) {
587         try {
588         
589                 if (queryAlign.length() != chimera.length()) {
590                         m->mothurOut("Error, alignment strings are of different lengths: "); m->mothurOutEndLine();
591                         m->mothurOut(toString(queryAlign.length())); m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOutEndLine();
592                         m->mothurOut(toString(chimera.length())); m->mothurOutEndLine();
593                         return -1.0;
594                 }
595
596         
597                 int numBases = 0;
598                 int numIdentical = 0;
599         
600                 for (int i = 0; i < queryAlign.length(); i++) {
601                         if ((isalpha(queryAlign[i])) || (isalpha(chimera[i])))  {
602                                 numBases++;             
603                                 if (queryAlign[i] == chimera[i]) {
604                                         numIdentical++;
605                                 }
606                         }
607                 }
608         
609                 if (numBases == 0) { return 0; }
610         
611                 float percentIdentical = (numIdentical/(float)numBases) * 100;
612
613                 return percentIdentical;
614                 
615         }
616         catch(exception& e) {
617                 m->errorOut(e, "Maligner", "computePercentID");
618                 exit(1);
619         }
620 }
621 //***************************************************************************************************************
622 vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
623         try {   
624                 indexes.clear();
625                 vector<Sequence*> refResults;
626                                 
627                 //get parts of query
628                 string queryUnAligned = q->getUnaligned();
629                 string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
630                 string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
631                 
632                 Sequence* queryLeft = new Sequence(q->getName()+"left", leftQuery);
633                 Sequence* queryRight = new Sequence(q->getName()+"right", rightQuery);
634
635                 vector<int> tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1);
636                 vector<int> tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1);
637
638                 //if ((tempIndexesRight.size() == 0) && (tempIndexesLeft.size() == 0))  {  m->mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to process sequence " + q->getName()); m->mothurOutEndLine(); return refResults; }
639                 
640                 vector<int> smaller;
641                 vector<int> larger;
642                 
643                 if (tempIndexesRight.size() < tempIndexesLeft.size()) { smaller = tempIndexesRight;  larger = tempIndexesLeft; }
644                 else { smaller = tempIndexesLeft; larger = tempIndexesRight; } 
645                 
646                 //merge results         
647                 map<int, int> seen;
648                 map<int, int>::iterator it;
649                 
650                 vector<int> mergedResults;
651                 for (int i = 0; i < smaller.size(); i++) {
652                         //add left if you havent already
653                         it = seen.find(smaller[i]);
654                         if (it == seen.end()) {  
655                                 mergedResults.push_back(smaller[i]);
656                                 seen[smaller[i]] = smaller[i];
657                         }
658
659                         //add right if you havent already
660                         it = seen.find(larger[i]);
661                         if (it == seen.end()) {  
662                                 mergedResults.push_back(larger[i]);
663                                 seen[larger[i]] = larger[i];
664                         }
665                 }
666                 
667                 for (int i = smaller.size(); i < larger.size(); i++) {
668                         it = seen.find(larger[i]);
669                         if (it == seen.end()) {  
670                                 mergedResults.push_back(larger[i]);
671                                 seen[larger[i]] = larger[i];
672                         }
673                 }
674                 
675                 if (mergedResults.size() < numWanted) { numWanted = mergedResults.size(); }
676 //cout << q->getName() << " merged results size = " << mergedResults.size() << '\t' << "numwanted = " << numWanted <<  endl;            
677                 for (int i = 0; i < numWanted; i++) {
678 //cout << db[mergedResults[i]]->getName()  << '\t' << mergedResults[i] << endl; 
679                         if (db[mergedResults[i]]->getName() != q->getName()) { 
680                                 Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
681                                 refResults.push_back(temp);
682                                 indexes.push_back(mergedResults[i]);
683                         }
684 //cout << mergedResults[i] << endl;
685                 }
686 //cout << "done " << q->getName()  << endl;             
687                 delete queryRight;
688                 delete queryLeft;
689                         
690                 return refResults;
691         }
692         catch(exception& e) {
693                 m->errorOut(e, "Maligner", "getBlastSeqs");
694                 exit(1);
695         }
696 }
697 //***************************************************************************************************************
698 vector<Sequence*> Maligner::getKmerSeqs(Sequence* q, int num) {
699         try {   
700                 indexes.clear();
701                 
702                 //get parts of query
703                 string queryUnAligned = q->getUnaligned();
704                 string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
705                 string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
706
707                 Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
708                 Sequence* queryRight = new Sequence(q->getName(), rightQuery);
709                 
710                 vector<int> tempIndexesLeft = databaseLeft->findClosestSequences(queryLeft, numWanted);
711                 vector<int> tempIndexesRight = databaseRight->findClosestSequences(queryRight, numWanted);
712                 
713                 //merge results         
714                 map<int, int> seen;
715                 map<int, int>::iterator it;
716                 
717                 vector<int> mergedResults;
718                 for (int i = 0; i < tempIndexesLeft.size(); i++) {
719                         //add left if you havent already
720                         it = seen.find(tempIndexesLeft[i]);
721                         if (it == seen.end()) {  
722                                 mergedResults.push_back(tempIndexesLeft[i]);
723                                 seen[tempIndexesLeft[i]] = tempIndexesLeft[i];
724                         }
725
726                         //add right if you havent already
727                         it = seen.find(tempIndexesRight[i]);
728                         if (it == seen.end()) {  
729                                 mergedResults.push_back(tempIndexesRight[i]);
730                                 seen[tempIndexesRight[i]] = tempIndexesRight[i];
731                         }
732                 }
733                 
734 //cout << q->getName() << endl;         
735                 vector<Sequence*> refResults;
736                 for (int i = 0; i < numWanted; i++) {
737 //cout << db[mergedResults[i]]->getName() << endl;      
738                         Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
739                         refResults.push_back(temp);
740                         indexes.push_back(mergedResults[i]);
741                 }
742 //cout << endl;         
743                 delete queryRight;
744                 delete queryLeft;
745                 
746                 return refResults;
747         }
748         catch(exception& e) {
749                 m->errorOut(e, "Maligner", "getKmerSeqs");
750                 exit(1);
751         }
752 }
753 //***************************************************************************************************************
754