]> git.donarmstrong.com Git - mothur.git/blob - maligner.cpp
eaaaa8789961d4938a1aba793fb32e7ac331a105
[mothur.git] / maligner.cpp
1 /*
2  *  maligner.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 9/23/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "maligner.h"
11 #include "kmerdb.hpp"
12 #include "blastdb.hpp"
13
14 /***********************************************************************/
15 Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode, Database* dataLeft, Database* dataRight) :
16                 db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode), databaseLeft(dataLeft), databaseRight(dataRight) { 
17                         
18                         
19                         m = MothurOut::getInstance(); 
20                         
21 //                      cout << matchScore << '\t' << misMatchPenalty << endl;
22 //                      
23 //                      matchScore = 1;
24 //                      misMatchPenalty = -1;
25                         
26                 }
27
28 /***********************************************************************/
29 string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
30         try {
31                 
32                 outputResults.clear();
33                 
34                 //make copy so trimming doesn't destroy query from calling class - remember to deallocate
35                 query = new Sequence(q->getName(), q->getAligned());
36                 
37                 string chimera;
38                 
39                 if (searchMethod == "distance") {
40                         //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
41                         refSeqs = decalc->findClosest(query, db, numWanted, indexes);
42                 }else if (searchMethod == "blast")  {
43                         refSeqs = getBlastSeqs(query, numWanted); //fills indexes
44                 }else if (searchMethod == "kmer") {
45                         refSeqs = getKmerSeqs(query, numWanted); //fills indexes
46                 }else { m->mothurOut("not valid search."); exit(1);  } //should never get here
47                 
48                 if (m->control_pressed) { return chimera;  }
49                 
50                 refSeqs = minCoverageFilter(refSeqs);
51
52                 if (refSeqs.size() < 2)  { 
53                         for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];  }
54                         percentIdenticalQueryChimera = 0.0;
55                         return "unknown"; 
56                 }
57                 
58                 int chimeraPenalty = computeChimeraPenalty();
59
60                 //fills outputResults
61                 chimera = chimeraMaligner(chimeraPenalty, decalc);
62                 
63                 if (m->control_pressed) { return chimera;  }
64                                 
65                 //free memory
66                 delete query;
67
68                 for (int i = 0; i < refSeqs.size(); i++) {  delete refSeqs[i];  }
69                 
70                 return chimera;
71         }
72         catch(exception& e) {
73                 m->errorOut(e, "Maligner", "getResults");
74                 exit(1);
75         }
76 }
77 /***********************************************************************/
78 string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
79         try {
80                 
81                 string chimera;
82                 
83                 //trims seqs to first non gap char in all seqs and last non gap char in all seqs
84                 spotMap = decalc->trimSeqs(query, refSeqs);
85                 
86                 //you trimmed the whole sequence, skip
87                 if (query->getAligned() == "") { return "no"; }
88
89                 vector<Sequence*> temp = refSeqs;
90                 temp.push_back(query);
91                 
92                 verticalFilter(temp);
93
94                 vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
95                 
96                 if (m->control_pressed) { return chimera;  }
97                 
98                 fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
99         
100                 vector<trace_struct> trace = extractHighestPath(matrix);
101                                 
102 //              cout << "traces\n";
103 //              for(int i=0;i<trace.size();i++){
104 //                      cout << trace[i].col << '\t' << trace[i].oldCol << '\t' << refSeqs[trace[i].row]->getName() << endl;
105 //              }
106                 
107                 if (trace.size() > 1) {         chimera = "yes";        }
108                 else { chimera = "no";  }
109                 
110                 int traceStart = trace[0].col;
111                 int traceEnd = trace[trace.size()-1].oldCol;    
112                 string queryInRange = query->getAligned();
113                 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
114                 
115                 if (m->control_pressed) { return chimera;  }
116                 
117                 //save output results
118                 for (int i = 0; i < trace.size(); i++) {
119                         int regionStart = trace[i].col;
120                         int regionEnd = trace[i].oldCol;
121                         int seqIndex = trace[i].row;
122                         
123                         results temp;
124                         
125                         temp.parent = refSeqs[seqIndex]->getName();
126                         temp.parentAligned = db[indexes[seqIndex]]->getAligned();
127                         temp.nastRegionStart = spotMap[regionStart];
128                         temp.nastRegionEnd = spotMap[regionEnd];
129                         temp.regionStart = regionStart;
130                         temp.regionEnd = regionEnd;
131                         
132                         string parentInRange = refSeqs[seqIndex]->getAligned();
133                         parentInRange = parentInRange.substr(traceStart, (traceEnd-traceStart+1));
134                         
135                         temp.queryToParent = computePercentID(queryInRange, parentInRange);
136                         temp.divR = (percentIdenticalQueryChimera / temp.queryToParent);
137
138                         string queryInRegion = query->getAligned();
139                         queryInRegion = queryInRegion.substr(regionStart, (regionEnd-regionStart+1));
140                         
141                         string parentInRegion = refSeqs[seqIndex]->getAligned();
142                         parentInRegion = parentInRegion.substr(regionStart, (regionEnd-regionStart+1));
143                         
144                         temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
145                 
146                         outputResults.push_back(temp);
147                 }
148                 
149                 return chimera;
150         }
151         catch(exception& e) {
152                 m->errorOut(e, "Maligner", "chimeraMaligner");
153                 exit(1);
154         }
155 }
156 /***********************************************************************/
157 //removes top matches that do not have minimum coverage with query.
158 vector<Sequence*> Maligner::minCoverageFilter(vector<Sequence*> ref){  
159         try {
160                 vector<Sequence*> newRefs;
161                 
162                 string queryAligned = query->getAligned();
163                 
164                 for (int i = 0; i < ref.size(); i++) {
165                         
166                         string refAligned = ref[i]->getAligned();
167                         
168                         int numBases = 0;
169                         int numCovered = 0;
170                         
171                         //calculate coverage
172                         for (int j = 0; j < queryAligned.length(); j++) {
173                                 
174                                 if (isalpha(queryAligned[j])) {
175                                         numBases++;
176                                         
177                                         if (isalpha(refAligned[j])) {
178                                                 numCovered++;
179                                         }
180                                 }
181                         }
182                         
183                         int coverage = ((numCovered/(float)numBases)*100);
184                         
185                         //if coverage above minimum
186                         if (coverage > minCoverage) {
187                                 newRefs.push_back(ref[i]);
188                         }else {
189                                 delete ref[i];
190                         }
191                 }
192                 
193                 return newRefs;
194         }
195         catch(exception& e) {
196                 m->errorOut(e, "Maligner", "minCoverageFilter");
197                 exit(1);
198         }
199 }
200 /***********************************************************************/
201 // a breakpoint should yield fewer mismatches than this number with respect to the best parent sequence.
202 int Maligner::computeChimeraPenalty() {
203         try {
204                 
205                 int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases());
206
207 //              if(numAllowable < 2){   numAllowable = 2;       }
208                 
209                 int penalty = int(numAllowable + 1) * misMatchPenalty;
210
211                 return penalty;
212
213         }
214         catch(exception& e) {
215                 m->errorOut(e, "Maligner", "computeChimeraPenalty");
216                 exit(1);
217         }
218 }
219 /***********************************************************************/
220 //this is a vertical filter
221 void Maligner::verticalFilter(vector<Sequence*> seqs) {
222         try {
223                 vector<int> gaps;       gaps.resize(query->getAligned().length(), 0);
224                 
225                 string filterString = (string(query->getAligned().length(), '1'));
226                 
227                 //for each sequence
228                 for (int i = 0; i < seqs.size(); i++) {
229                 
230                         string seqAligned = seqs[i]->getAligned();
231                         
232                         for (int j = 0; j < seqAligned.length(); j++) {
233                                 //if this spot is a gap
234                                 if ((seqAligned[j] == '-') || (seqAligned[j] == '.'))   {       gaps[j]++;      }
235                         }
236                 }
237                 
238                 //zero out spot where all sequences have blanks
239                 int numColRemoved = 0;
240                 for(int i = 0; i < seqs[0]->getAligned().length(); i++){
241                         if(gaps[i] == seqs.size())      {       filterString[i] = '0';  numColRemoved++;  }
242                 }
243                 
244                 map<int, int> newMap;
245                 //for each sequence
246                 for (int i = 0; i < seqs.size(); i++) {
247                 
248                         string seqAligned = seqs[i]->getAligned();
249                         string newAligned = "";
250                         int count = 0;
251                         
252                         for (int j = 0; j < seqAligned.length(); j++) {
253                                 //if this spot is not a gap
254                                 if (filterString[j] == '1') { 
255                                         newAligned += seqAligned[j]; 
256                                         newMap[count] = spotMap[j];
257                                         count++;
258                                 }
259                         }
260                         
261                         seqs[i]->setAligned(newAligned);
262                 }
263
264                 spotMap = newMap;
265         }
266         catch(exception& e) {
267                 m->errorOut(e, "Maligner", "verticalFilter");
268                 exit(1);
269         }
270 }
271 //***************************************************************************************************************
272 vector< vector<score_struct> > Maligner::buildScoreMatrix(int cols, int rows) {
273         try{
274                 
275                 vector< vector<score_struct> > m(rows);
276                 
277                 for (int i = 0; i < rows; i++) {
278                         for (int j = 0; j < cols; j++) {
279                                 
280                                 //initialize each cell
281                                 score_struct temp;
282                                 temp.prev = -1;
283                                 temp.score = -9999999;
284                                 temp.col = j;
285                                 temp.row = i;
286                                 
287                                 m[i].push_back(temp);
288                         }
289                 }
290                 
291                 return m;
292         }
293         catch(exception& e) {
294                 m->errorOut(e, "Maligner", "buildScoreMatrix");
295                 exit(1);
296         }
297 }
298
299 //***************************************************************************************************************
300
301 void Maligner::fillScoreMatrix(vector<vector<score_struct> >& ms, vector<Sequence*> seqs, int penalty) {
302         try{
303                 
304                 //get matrix dimensions
305                 int numCols = query->getAligned().length();
306                 int numRows = seqs.size();
307                 
308                 //initialize first col
309                 string queryAligned = query->getAligned();
310                 for (int i = 0; i < numRows; i++) {
311                         string subjectAligned = seqs[i]->getAligned();
312                         
313                         //are you both gaps?
314                         if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
315                                 ms[i][0].score = 0;
316 //                              ms[i][0].mismatches = 0;
317                         }else if (queryAligned[0] == subjectAligned[0]) {
318                                 ms[i][0].score = matchScore;
319 //                              ms[i][0].mismatches = 0;
320                         }else{
321                                 ms[i][0].score = 0;
322 //                              ms[i][0].mismatches = 1;
323                         }
324                 }
325                 
326                 //fill rest of matrix
327                 for (int j = 1; j < numCols; j++) {  //iterate through matrix columns
328                 
329                         for (int i = 0; i < numRows; i++) {  //iterate through matrix rows
330                                 
331                                 string subjectAligned = seqs[i]->getAligned();
332                                 
333                                 int matchMisMatchScore = 0;
334                                 //are you both gaps?
335                                 if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
336                                         //leave the same
337                                 }else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
338                                         //leave the same
339                                 }else if (queryAligned[j] == subjectAligned[j]) {
340                                         matchMisMatchScore = matchScore;
341 //                                      ms[i][j].mismatches = ms[i][j-1].mismatches;
342                                 }else if (queryAligned[j] != subjectAligned[j]) {
343                                         matchMisMatchScore = misMatchPenalty;
344 //                                      ms[i][j].mismatches = ms[i][j-1].mismatches + 1;
345                                 }
346                                 
347                                 //compute score based on previous columns scores
348                                 for (int prevIndex = 0; prevIndex < numRows; prevIndex++) { //iterate through rows
349                                         
350                                         int sumScore = matchMisMatchScore + ms[prevIndex][j-1].score;
351                                         
352                                         //you are not at yourself
353                                         if (prevIndex != i) {   sumScore += penalty;    }
354                                         if (sumScore < 0)       {       sumScore = 0;                   }
355                                         
356                                         if (sumScore > ms[i][j].score) {
357                                                 ms[i][j].score = sumScore;
358                                                 ms[i][j].prev = prevIndex;
359                                         }
360                                 }
361                         }
362                 }
363                 
364 //              for(int i=0;i<numRows;i++){
365 //                      cout << seqs[i]->getName();
366 //                      for(int j=0;j<numCols;j++){
367 //                              cout << '\t' << ms[i][j].mismatches;
368 //                      }
369 //                      cout << endl;
370 //              }
371 //              cout << endl;
372 //              for(int i=0;i<numRows;i++){
373 //                      cout << seqs[i]->getName();
374 //                      for(int j=0;j<numCols;j++){
375 //                              cout << '\t' << ms[i][j].score;
376 //                      }
377 //                      cout << endl;
378 //              }
379 //              cout << endl;
380 //              for(int i=0;i<numRows;i++){
381 //                      cout << seqs[i]->getName();
382 //                      for(int j=0;j<numCols;j++){
383 //                              cout << '\t' << ms[i][j].prev;
384 //                      }
385 //                      cout << endl;
386 //              }
387                 
388                 
389         }
390         catch(exception& e) {
391                 m->errorOut(e, "Maligner", "fillScoreMatrix");
392                 exit(1);
393         }
394 }
395
396 //***************************************************************************************************************
397
398 vector<trace_struct> Maligner::extractHighestPath(vector<vector<score_struct> > ms) {
399         try {
400         
401                 
402                 //get matrix dimensions
403                 int numCols = query->getAligned().length();
404                 int numRows = ms.size();
405         
406         
407                 //find highest score scoring matrix
408                 vector<score_struct> highestStruct;
409                 int highestScore = 0;
410                 
411                 for (int i = 0; i < numRows; i++) {
412                         for (int j = 0; j < numCols; j++) {
413                                 if (ms[i][j].score > highestScore) {
414                                         highestScore = ms[i][j].score;
415                                         highestStruct.resize(0);
416                                         highestStruct.push_back(ms[i][j]);
417                                 }
418                                 else if(ms[i][j].score == highestScore){
419                                         highestStruct.push_back(ms[i][j]);
420                                 }
421                         }
422                 }
423                         
424 //              cout << highestScore << '\t' << highestStruct.size() << endl;   
425                 
426                 vector<trace_struct> maxTrace;
427                 double maxPercentIdenticalQueryAntiChimera = 0;
428                 int maxIndex = -1;
429                 
430                 for(int i=0;i<highestStruct.size();i++){
431                         
432                         vector<score_struct> path;
433
434                         int rowIndex = highestStruct[i].row;
435                         int pos = highestStruct[i].col;
436                         int score = highestStruct[i].score;
437                                         
438                         while (pos >= 0 && score > 0) {
439                                 score_struct temp = ms[rowIndex][pos];
440                                 score = temp.score;
441                                 
442                                 if (score > 0) {        path.push_back(temp);   }
443                                 
444                                 rowIndex = temp.prev;
445                                 pos--;
446                         }
447
448                         reverse(path.begin(), path.end());
449
450                         vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
451                 
452 //                      cout << "traces\n";
453 //                      for(int j=0;j<trace.size();j++){
454 //                              cout << trace[j].col << '\t' << trace[j].oldCol << '\t' << refSeqs[trace[j].row]->getName() << endl;
455 //                      }
456                         
457 //                      Need to do something with this in a bit...
458 //                      if (trace.size() > 1) {         chimera = "yes";        }
459 //                      else { chimera = "no";  }
460                         
461                         int traceStart = path[0].col;
462                         int traceEnd = path[path.size()-1].col; 
463 //                      cout << "traceStart/End\t" << traceStart << '\t' << traceEnd << endl;
464                         
465                         string queryInRange = query->getAligned();
466                         queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
467 //                      cout << "here" << endl;
468                         string chimeraSeq = constructChimericSeq(trace, refSeqs);
469                         string antiChimeraSeq = constructAntiChimericSeq(trace, refSeqs);
470                 
471                         percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);                      
472                         double percentIdenticalQueryAntiChimera = computePercentID(queryInRange, antiChimeraSeq);
473 //                      cout << i << '\t' << percentIdenticalQueryChimera << '\t' << percentIdenticalQueryAntiChimera << endl;
474                         
475                         if(percentIdenticalQueryAntiChimera > maxPercentIdenticalQueryAntiChimera){
476                                 maxPercentIdenticalQueryAntiChimera = percentIdenticalQueryAntiChimera;
477                                 maxTrace = trace;
478                                 maxIndex = i;
479                         }
480                 }
481 //              cout << maxPercentIdenticalQueryAntiChimera << '\t' << maxIndex << endl;
482                 return maxTrace;
483                 
484         }
485         catch(exception& e) {
486                 m->errorOut(e, "Maligner", "extractHighestPath");
487                 exit(1);
488         }
489 }
490
491 //***************************************************************************************************************
492
493 vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path, vector<Sequence*> seqs) {
494         try {
495                 vector<trace_struct> trace;
496                 
497                 int region_index = path[0].row;
498                 int region_start = path[0].col;
499         
500                 for (int i = 1; i < path.size(); i++) {
501                 
502                         int next_region_index = path[i].row;
503                         
504                         if (next_region_index != region_index) {
505                                 
506                                 // add trace region
507                                 int col_index = path[i].col;
508                                 trace_struct temp;
509                                 temp.col = region_start;
510                                 temp.oldCol = col_index-1;
511                                 temp.row = region_index;
512                                 
513                                 trace.push_back(temp);
514                                                         
515                                 region_index = path[i].row;
516                                 region_start = col_index;
517                         }
518                 }
519         
520                 // get last one
521                 trace_struct temp;
522                 temp.col = region_start;
523                 temp.oldCol = path[path.size()-1].col;
524                 temp.row = region_index;
525                 trace.push_back(temp);
526
527 //              cout << endl;
528 //              cout << trace.size() << endl;
529 //              for(int i=0;i<trace.size();i++){
530 //                      cout << seqs[trace[i].row]->getName() << endl;
531 //              }
532 //              cout << endl;
533                 
534                 return trace;
535                 
536         }
537         catch(exception& e) {
538                 m->errorOut(e, "Maligner", "mapTraceRegionsToAlignment");
539                 exit(1);
540         }
541 }
542
543 //***************************************************************************************************************
544
545 string Maligner::constructChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
546         try {
547                 string chimera = "";
548                 
549                 for (int i = 0; i < trace.size(); i++) {
550 //                      cout << i << '\t' << trace[i].row << '\t' << trace[i].col << '\t' << trace[i].oldCol << endl;
551                         
552                         string seqAlign = seqs[trace[i].row]->getAligned();
553                         seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
554                         chimera += seqAlign;
555                 }
556                         
557                 return chimera;
558         }
559         catch(exception& e) {
560                 m->errorOut(e, "Maligner", "constructChimericSeq");
561                 exit(1);
562         }
563 }
564
565 //***************************************************************************************************************
566
567 string Maligner::constructAntiChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
568         try {
569                 string antiChimera = "";
570                 
571                 for (int i = 0; i < trace.size(); i++) {
572 //                      cout << i << '\t' << (trace.size() - i - 1) << '\t' << trace[i].row << '\t' << trace[i].col << '\t' << trace[i].oldCol << endl;
573                         
574                         int oppositeIndex = trace.size() - i - 1;
575                         
576                         string seqAlign = seqs[trace[oppositeIndex].row]->getAligned();
577                         seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
578                         antiChimera += seqAlign;
579                 }
580                 
581                 return antiChimera;
582         }
583         catch(exception& e) {
584                 m->errorOut(e, "Maligner", "constructChimericSeq");
585                 exit(1);
586         }
587 }
588
589 //***************************************************************************************************************
590 float Maligner::computePercentID(string queryAlign, string chimera) {
591         try {
592         
593                 if (queryAlign.length() != chimera.length()) {
594                         m->mothurOut("Error, alignment strings are of different lengths: "); m->mothurOutEndLine();
595                         m->mothurOut(toString(queryAlign.length())); m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOutEndLine();
596                         m->mothurOut(toString(chimera.length())); m->mothurOutEndLine();
597                         return -1.0;
598                 }
599
600         
601                 int numBases = 0;
602                 int numIdentical = 0;
603         
604                 for (int i = 0; i < queryAlign.length(); i++) {
605                         if ((isalpha(queryAlign[i])) || (isalpha(chimera[i])))  {
606                                 numBases++;             
607                                 if (queryAlign[i] == chimera[i]) {
608                                         numIdentical++;
609                                 }
610                         }
611                 }
612         
613                 if (numBases == 0) { return 0; }
614         
615                 float percentIdentical = (numIdentical/(float)numBases) * 100;
616
617                 return percentIdentical;
618                 
619         }
620         catch(exception& e) {
621                 m->errorOut(e, "Maligner", "computePercentID");
622                 exit(1);
623         }
624 }
625 //***************************************************************************************************************
626 vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
627         try {   
628                 indexes.clear();
629                 vector<Sequence*> refResults;
630                                 
631                 //get parts of query
632                 string queryUnAligned = q->getUnaligned();
633                 string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
634                 string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
635
636                 Sequence* queryLeft = new Sequence(q->getName()+"left", leftQuery);
637                 Sequence* queryRight = new Sequence(q->getName()+"right", rightQuery);
638                 
639                 vector<int> tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1);
640                 vector<int> tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1);
641                         
642                 //if ((tempIndexesRight.size() == 0) && (tempIndexesLeft.size() == 0))  {  m->mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to process sequence " + q->getName()); m->mothurOutEndLine(); return refResults; }
643                 
644                 vector<int> smaller;
645                 vector<int> larger;
646                 
647                 if (tempIndexesRight.size() < tempIndexesLeft.size()) { smaller = tempIndexesRight;  larger = tempIndexesLeft; }
648                 else { smaller = tempIndexesLeft; larger = tempIndexesRight; } 
649                 
650                 //merge results         
651                 map<int, int> seen;
652                 map<int, int>::iterator it;
653                 
654                 vector<int> mergedResults;
655                 for (int i = 0; i < smaller.size(); i++) {
656                         //add left if you havent already
657                         it = seen.find(smaller[i]);
658                         if (it == seen.end()) {  
659                                 mergedResults.push_back(smaller[i]);
660                                 seen[smaller[i]] = smaller[i];
661                         }
662
663                         //add right if you havent already
664                         it = seen.find(larger[i]);
665                         if (it == seen.end()) {  
666                                 mergedResults.push_back(larger[i]);
667                                 seen[larger[i]] = larger[i];
668                         }
669                 }
670                 
671                 for (int i = smaller.size(); i < larger.size(); i++) {
672                         it = seen.find(larger[i]);
673                         if (it == seen.end()) {  
674                                 mergedResults.push_back(larger[i]);
675                                 seen[larger[i]] = larger[i];
676                         }
677                 }
678                 
679                 if (mergedResults.size() < numWanted) { numWanted = mergedResults.size(); }
680 //cout << q->getName() << " merged results size = " << mergedResults.size() << '\t' << "numwanted = " << numWanted <<  endl;            
681                 for (int i = 0; i < numWanted; i++) {
682 //cout << db[mergedResults[i]]->getName()  << '\t' << mergedResults[i] << endl; 
683                         if (db[mergedResults[i]]->getName() != q->getName()) { 
684                                 Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
685                                 refResults.push_back(temp);
686                                 indexes.push_back(mergedResults[i]);
687                         }
688 //cout << mergedResults[i] << endl;
689                 }
690 //cout << "done " << q->getName()  << endl;             
691                 delete queryRight;
692                 delete queryLeft;
693                         
694                 return refResults;
695         }
696         catch(exception& e) {
697                 m->errorOut(e, "Maligner", "getBlastSeqs");
698                 exit(1);
699         }
700 }
701 //***************************************************************************************************************
702 vector<Sequence*> Maligner::getKmerSeqs(Sequence* q, int num) {
703         try {   
704                 indexes.clear();
705                 
706                 //get parts of query
707                 string queryUnAligned = q->getUnaligned();
708                 string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
709                 string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
710
711                 Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
712                 Sequence* queryRight = new Sequence(q->getName(), rightQuery);
713                 
714                 vector<int> tempIndexesLeft = databaseLeft->findClosestSequences(queryLeft, numWanted);
715                 vector<int> tempIndexesRight = databaseRight->findClosestSequences(queryRight, numWanted);
716                 
717                 //merge results         
718                 map<int, int> seen;
719                 map<int, int>::iterator it;
720                 
721                 vector<int> mergedResults;
722                 for (int i = 0; i < tempIndexesLeft.size(); i++) {
723                         //add left if you havent already
724                         it = seen.find(tempIndexesLeft[i]);
725                         if (it == seen.end()) {  
726                                 mergedResults.push_back(tempIndexesLeft[i]);
727                                 seen[tempIndexesLeft[i]] = tempIndexesLeft[i];
728                         }
729
730                         //add right if you havent already
731                         it = seen.find(tempIndexesRight[i]);
732                         if (it == seen.end()) {  
733                                 mergedResults.push_back(tempIndexesRight[i]);
734                                 seen[tempIndexesRight[i]] = tempIndexesRight[i];
735                         }
736                 }
737                 
738 //cout << q->getName() << endl;         
739                 vector<Sequence*> refResults;
740                 for (int i = 0; i < numWanted; i++) {
741 //cout << db[mergedResults[i]]->getName() << endl;      
742                         Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
743                         refResults.push_back(temp);
744                         indexes.push_back(mergedResults[i]);
745                 }
746 //cout << endl;         
747                 delete queryRight;
748                 delete queryLeft;
749                 
750                 return refResults;
751         }
752         catch(exception& e) {
753                 m->errorOut(e, "Maligner", "getKmerSeqs");
754                 exit(1);
755         }
756 }
757 //***************************************************************************************************************
758