5 * Created by westcott on 9/23/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
12 #include "blastdb.hpp"
14 /***********************************************************************/
15 Maligner::Maligner(vector<Sequence*> temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode, Database* dataLeft, Database* dataRight) :
16 db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode), databaseLeft(dataLeft), databaseRight(dataRight) {
19 m = MothurOut::getInstance();
21 // cout << matchScore << '\t' << misMatchPenalty << endl;
24 // misMatchPenalty = -1;
28 /***********************************************************************/
29 string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
32 outputResults.clear();
34 //make copy so trimming doesn't destroy query from calling class - remember to deallocate
35 query = new Sequence(q->getName(), q->getAligned());
39 if (searchMethod == "distance") {
40 //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate
41 refSeqs = decalc->findClosest(query, db, numWanted, indexes);
42 }else if (searchMethod == "blast") {
43 refSeqs = getBlastSeqs(query, numWanted); //fills indexes
44 }else if (searchMethod == "kmer") {
45 refSeqs = getKmerSeqs(query, numWanted); //fills indexes
46 }else { m->mothurOut("not valid search."); exit(1); } //should never get here
48 if (m->control_pressed) { return chimera; }
50 refSeqs = minCoverageFilter(refSeqs);
52 if (refSeqs.size() < 2) {
53 for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
54 percentIdenticalQueryChimera = 0.0;
58 int chimeraPenalty = computeChimeraPenalty();
61 chimera = chimeraMaligner(chimeraPenalty, decalc);
63 if (m->control_pressed) { return chimera; }
68 for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
73 m->errorOut(e, "Maligner", "getResults");
77 /***********************************************************************/
78 string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
83 //trims seqs to first non gap char in all seqs and last non gap char in all seqs
84 spotMap = decalc->trimSeqs(query, refSeqs);
86 //you trimmed the whole sequence, skip
87 if (query->getAligned() == "") { return "no"; }
89 vector<Sequence*> temp = refSeqs;
90 temp.push_back(query);
95 vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
97 if (m->control_pressed) { return chimera; }
99 fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
101 vector<trace_struct> trace = extractHighestPath(matrix);
103 // cout << "traces\n";
104 // for(int i=0;i<trace.size();i++){
105 // cout << trace[i].col << '\t' << trace[i].oldCol << '\t' << refSeqs[trace[i].row]->getName() << endl;
108 if (trace.size() > 1) { chimera = "yes"; }
109 else { chimera = "no"; }
111 int traceStart = trace[0].col;
112 int traceEnd = trace[trace.size()-1].oldCol;
113 string queryInRange = query->getAligned();
114 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
116 if (m->control_pressed) { return chimera; }
118 //save output results
119 for (int i = 0; i < trace.size(); i++) {
120 int regionStart = trace[i].col;
121 int regionEnd = trace[i].oldCol;
122 int seqIndex = trace[i].row;
126 temp.parent = refSeqs[seqIndex]->getName();
127 temp.parentAligned = db[indexes[seqIndex]]->getAligned();
128 temp.nastRegionStart = spotMap[regionStart];
129 temp.nastRegionEnd = spotMap[regionEnd];
130 temp.regionStart = regionStart;
131 temp.regionEnd = regionEnd;
133 string parentInRange = refSeqs[seqIndex]->getAligned();
134 parentInRange = parentInRange.substr(traceStart, (traceEnd-traceStart+1));
136 temp.queryToParent = computePercentID(queryInRange, parentInRange);
137 temp.divR = (percentIdenticalQueryChimera / temp.queryToParent);
139 string queryInRegion = query->getAligned();
140 queryInRegion = queryInRegion.substr(regionStart, (regionEnd-regionStart+1));
142 string parentInRegion = refSeqs[seqIndex]->getAligned();
143 parentInRegion = parentInRegion.substr(regionStart, (regionEnd-regionStart+1));
145 temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
147 outputResults.push_back(temp);
152 catch(exception& e) {
153 m->errorOut(e, "Maligner", "chimeraMaligner");
157 /***********************************************************************/
158 //removes top matches that do not have minimum coverage with query.
159 vector<Sequence*> Maligner::minCoverageFilter(vector<Sequence*> ref){
161 vector<Sequence*> newRefs;
163 string queryAligned = query->getAligned();
165 for (int i = 0; i < ref.size(); i++) {
167 string refAligned = ref[i]->getAligned();
173 for (int j = 0; j < queryAligned.length(); j++) {
175 if (isalpha(queryAligned[j])) {
178 if (isalpha(refAligned[j])) {
184 int coverage = ((numCovered/(float)numBases)*100);
186 //if coverage above minimum
187 if (coverage > minCoverage) {
188 newRefs.push_back(ref[i]);
196 catch(exception& e) {
197 m->errorOut(e, "Maligner", "minCoverageFilter");
201 /***********************************************************************/
202 // a breakpoint should yield fewer mismatches than this number with respect to the best parent sequence.
203 int Maligner::computeChimeraPenalty() {
206 int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases());
208 // if(numAllowable < 1){ numAllowable = 1; }
210 int penalty = int(numAllowable + 1) * misMatchPenalty;
215 catch(exception& e) {
216 m->errorOut(e, "Maligner", "computeChimeraPenalty");
220 /***********************************************************************/
221 //this is a vertical filter
222 void Maligner::verticalFilter(vector<Sequence*> seqs) {
224 vector<int> gaps; gaps.resize(query->getAligned().length(), 0);
226 string filterString = (string(query->getAligned().length(), '1'));
229 for (int i = 0; i < seqs.size(); i++) {
231 string seqAligned = seqs[i]->getAligned();
233 for (int j = 0; j < seqAligned.length(); j++) {
234 //if this spot is a gap
235 if ((seqAligned[j] == '-') || (seqAligned[j] == '.')) { gaps[j]++; }
239 //zero out spot where all sequences have blanks
240 int numColRemoved = 0;
241 for(int i = 0; i < seqs[0]->getAligned().length(); i++){
242 if(gaps[i] == seqs.size()) { filterString[i] = '0'; numColRemoved++; }
245 map<int, int> newMap;
247 for (int i = 0; i < seqs.size(); i++) {
249 string seqAligned = seqs[i]->getAligned();
250 string newAligned = "";
253 for (int j = 0; j < seqAligned.length(); j++) {
254 //if this spot is not a gap
255 if (filterString[j] == '1') {
256 newAligned += seqAligned[j];
257 newMap[count] = spotMap[j];
262 seqs[i]->setAligned(newAligned);
267 catch(exception& e) {
268 m->errorOut(e, "Maligner", "verticalFilter");
272 //***************************************************************************************************************
273 vector< vector<score_struct> > Maligner::buildScoreMatrix(int cols, int rows) {
276 vector< vector<score_struct> > m(rows);
278 for (int i = 0; i < rows; i++) {
279 for (int j = 0; j < cols; j++) {
281 //initialize each cell
284 temp.score = -9999999;
288 m[i].push_back(temp);
294 catch(exception& e) {
295 m->errorOut(e, "Maligner", "buildScoreMatrix");
300 //***************************************************************************************************************
302 void Maligner::fillScoreMatrix(vector<vector<score_struct> >& ms, vector<Sequence*> seqs, int penalty) {
305 //get matrix dimensions
306 int numCols = query->getAligned().length();
307 int numRows = seqs.size();
309 //initialize first col
310 string queryAligned = query->getAligned();
311 for (int i = 0; i < numRows; i++) {
312 string subjectAligned = seqs[i]->getAligned();
315 if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
317 // ms[i][0].mismatches = 0;
318 }else if (queryAligned[0] == subjectAligned[0] || subjectAligned[0] == 'N') {
319 ms[i][0].score = matchScore;
320 // ms[i][0].mismatches = 0;
323 // ms[i][0].mismatches = 1;
327 //fill rest of matrix
328 for (int j = 1; j < numCols; j++) { //iterate through matrix columns
330 for (int i = 0; i < numRows; i++) { //iterate through matrix rows
332 string subjectAligned = seqs[i]->getAligned();
334 int matchMisMatchScore = 0;
336 if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
338 }else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
339 matchMisMatchScore = matchScore;
341 }else if (queryAligned[j] == subjectAligned[j]) {
342 matchMisMatchScore = matchScore;
343 // ms[i][j].mismatches = ms[i][j-1].mismatches;
344 }else if (queryAligned[j] != subjectAligned[j]) {
345 matchMisMatchScore = misMatchPenalty;
346 // ms[i][j].mismatches = ms[i][j-1].mismatches + 1;
349 //compute score based on previous columns scores
350 for (int prevIndex = 0; prevIndex < numRows; prevIndex++) { //iterate through rows
352 int sumScore = matchMisMatchScore + ms[prevIndex][j-1].score;
354 //you are not at yourself
355 if (prevIndex != i) { sumScore += penalty; }
356 if (sumScore < 0) { sumScore = 0; }
358 if (sumScore > ms[i][j].score) {
359 ms[i][j].score = sumScore;
360 ms[i][j].prev = prevIndex;
366 // for(int i=0;i<numRows;i++){
367 // cout << seqs[i]->getName();
368 // for(int j=0;j<numCols;j++){
369 // cout << '\t' << ms[i][j].mismatches;
374 // for(int i=0;i<numRows;i++){
375 // cout << seqs[i]->getName();
376 // for(int j=0;j<numCols;j++){
377 // cout << '\t' << ms[i][j].score;
382 // for(int i=0;i<numRows;i++){
383 // cout << seqs[i]->getName();
384 // for(int j=0;j<numCols;j++){
385 // cout << '\t' << ms[i][j].prev;
392 catch(exception& e) {
393 m->errorOut(e, "Maligner", "fillScoreMatrix");
398 //***************************************************************************************************************
400 vector<trace_struct> Maligner::extractHighestPath(vector<vector<score_struct> > ms) {
404 //get matrix dimensions
405 int numCols = query->getAligned().length();
406 int numRows = ms.size();
409 //find highest score scoring matrix
410 vector<score_struct> highestStruct;
411 int highestScore = 0;
413 for (int i = 0; i < numRows; i++) {
414 for (int j = 0; j < numCols; j++) {
415 if (ms[i][j].score > highestScore) {
416 highestScore = ms[i][j].score;
417 highestStruct.resize(0);
418 highestStruct.push_back(ms[i][j]);
420 else if(ms[i][j].score == highestScore){
421 highestStruct.push_back(ms[i][j]);
426 // cout << endl << highestScore << '\t' << highestStruct.size() << '\t' << highestStruct[0].row << endl;
428 vector<trace_struct> maxTrace;
429 double maxPercentIdenticalQueryAntiChimera = 0;
431 for(int i=0;i<highestStruct.size();i++){
433 vector<score_struct> path;
435 int rowIndex = highestStruct[i].row;
436 int pos = highestStruct[i].col;
437 int score = highestStruct[i].score;
439 while (pos >= 0 && score > 0) {
440 score_struct temp = ms[rowIndex][pos];
443 if (score > 0) { path.push_back(temp); }
445 rowIndex = temp.prev;
449 reverse(path.begin(), path.end());
451 vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
453 // cout << "traces\n";
454 // for(int j=0;j<trace.size();j++){
455 // cout << trace[j].col << '\t' << trace[j].oldCol << '\t' << refSeqs[trace[j].row]->getName() << endl;
458 int traceStart = path[0].col;
459 int traceEnd = path[path.size()-1].col;
460 // cout << "traceStart/End\t" << traceStart << '\t' << traceEnd << endl;
462 string queryInRange = query->getAligned();
463 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
464 // cout << "here" << endl;
465 string chimeraSeq = constructChimericSeq(trace, refSeqs);
466 string antiChimeraSeq = constructAntiChimericSeq(trace, refSeqs);
468 percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
469 double percentIdenticalQueryAntiChimera = computePercentID(queryInRange, antiChimeraSeq);
470 // cout << i << '\t' << percentIdenticalQueryChimera << '\t' << percentIdenticalQueryAntiChimera << endl;
472 if(percentIdenticalQueryAntiChimera > maxPercentIdenticalQueryAntiChimera){
473 maxPercentIdenticalQueryAntiChimera = percentIdenticalQueryAntiChimera;
477 // cout << maxPercentIdenticalQueryAntiChimera << endl;
481 catch(exception& e) {
482 m->errorOut(e, "Maligner", "extractHighestPath");
487 //***************************************************************************************************************
489 vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path, vector<Sequence*> seqs) {
491 vector<trace_struct> trace;
493 int region_index = path[0].row;
494 int region_start = path[0].col;
496 for (int i = 1; i < path.size(); i++) {
498 int next_region_index = path[i].row;
500 if (next_region_index != region_index) {
503 int col_index = path[i].col;
505 temp.col = region_start;
506 temp.oldCol = col_index-1;
507 temp.row = region_index;
509 trace.push_back(temp);
511 region_index = path[i].row;
512 region_start = col_index;
518 temp.col = region_start;
519 temp.oldCol = path[path.size()-1].col;
520 temp.row = region_index;
521 trace.push_back(temp);
524 // cout << trace.size() << endl;
525 // for(int i=0;i<trace.size();i++){
526 // cout << seqs[trace[i].row]->getName() << endl;
533 catch(exception& e) {
534 m->errorOut(e, "Maligner", "mapTraceRegionsToAlignment");
539 //***************************************************************************************************************
541 string Maligner::constructChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
545 for (int i = 0; i < trace.size(); i++) {
546 // cout << i << '\t' << trace[i].row << '\t' << trace[i].col << '\t' << trace[i].oldCol << endl;
548 string seqAlign = seqs[trace[i].row]->getAligned();
549 seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
555 catch(exception& e) {
556 m->errorOut(e, "Maligner", "constructChimericSeq");
561 //***************************************************************************************************************
563 string Maligner::constructAntiChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
565 string antiChimera = "";
567 for (int i = 0; i < trace.size(); i++) {
568 // cout << i << '\t' << (trace.size() - i - 1) << '\t' << trace[i].row << '\t' << trace[i].col << '\t' << trace[i].oldCol << endl;
570 int oppositeIndex = trace.size() - i - 1;
572 string seqAlign = seqs[trace[oppositeIndex].row]->getAligned();
573 seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
574 antiChimera += seqAlign;
579 catch(exception& e) {
580 m->errorOut(e, "Maligner", "constructChimericSeq");
585 //***************************************************************************************************************
586 float Maligner::computePercentID(string queryAlign, string chimera) {
589 if (queryAlign.length() != chimera.length()) {
590 m->mothurOut("Error, alignment strings are of different lengths: "); m->mothurOutEndLine();
591 m->mothurOut(toString(queryAlign.length())); m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOutEndLine();
592 m->mothurOut(toString(chimera.length())); m->mothurOutEndLine();
598 int numIdentical = 0;
600 for (int i = 0; i < queryAlign.length(); i++) {
601 if ((isalpha(queryAlign[i])) || (isalpha(chimera[i]))) {
603 if (queryAlign[i] == chimera[i]) {
609 if (numBases == 0) { return 0; }
611 float percentIdentical = (numIdentical/(float)numBases) * 100;
613 return percentIdentical;
616 catch(exception& e) {
617 m->errorOut(e, "Maligner", "computePercentID");
621 //***************************************************************************************************************
622 vector<Sequence*> Maligner::getBlastSeqs(Sequence* q, int num) {
625 vector<Sequence*> refResults;
628 string queryUnAligned = q->getUnaligned();
629 string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
630 string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
632 Sequence* queryLeft = new Sequence(q->getName()+"left", leftQuery);
633 Sequence* queryRight = new Sequence(q->getName()+"right", rightQuery);
635 vector<int> tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1);
636 vector<float> leftScores = databaseLeft->getMegaBlastSearchScores();
637 vector<int> tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1);
638 vector<float> rightScores = databaseLeft->getMegaBlastSearchScores();
640 //if ((tempIndexesRight.size() == 0) && (tempIndexesLeft.size() == 0)) { m->mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to process sequence " + q->getName()); m->mothurOutEndLine(); return refResults; }
643 vector<float> smallerScores;
645 vector<float> largerScores;
647 if (tempIndexesRight.size() < tempIndexesLeft.size()) { smaller = tempIndexesRight; smallerScores = rightScores; larger = tempIndexesLeft; largerScores = leftScores; }
648 else { smaller = tempIndexesLeft; smallerScores = leftScores; larger = tempIndexesRight; largerScores = rightScores; }
650 //for (int i = 0; i < smaller.size(); i++) { cout << "smaller = " << smaller[i] << '\t' << smallerScores[i] << endl; }
652 //for (int i = 0; i < larger.size(); i++) { cout << "larger = " << larger[i] << '\t' << largerScores[i] << endl; }
656 map<int, int>::iterator it;
657 float lastSmaller = smallerScores[0];
658 float lastLarger = largerScores[0];
660 vector<int> mergedResults;
661 for (int i = 0; i < smaller.size(); i++) {
662 //add left if you havent already
663 it = seen.find(smaller[i]);
664 if (it == seen.end()) {
665 mergedResults.push_back(smaller[i]);
666 seen[smaller[i]] = smaller[i];
667 lastSmaller = smallerScores[i];
670 //add right if you havent already
671 it = seen.find(larger[i]);
672 if (it == seen.end()) {
673 mergedResults.push_back(larger[i]);
674 seen[larger[i]] = larger[i];
675 lastLarger = largerScores[i];
679 if (mergedResults.size() > num) { break; }
682 //save lasti for smaller ties below
684 int iSmaller = lasti;
686 if (!(mergedResults.size() > num)) { //if we still need more results.
687 for (int i = smaller.size(); i < larger.size(); i++) {
688 it = seen.find(larger[i]);
689 if (it == seen.end()) {
690 mergedResults.push_back(larger[i]);
691 seen[larger[i]] = larger[i];
692 lastLarger = largerScores[i];
696 if (mergedResults.size() > num) { break; }
701 //add in any ties from smaller
702 while (iSmaller < smaller.size()) {
703 if (smallerScores[iSmaller] == lastSmaller) {
704 it = seen.find(smaller[iSmaller]);
706 if (it == seen.end()) {
707 mergedResults.push_back(smaller[iSmaller]);
708 seen[smaller[iSmaller]] = smaller[iSmaller];
716 //add in any ties from larger
717 while (lasti < larger.size()) {
718 if (largerScores[lasti] == lastLarger) { //is it a tie
719 it = seen.find(larger[lasti]);
721 if (it == seen.end()) { //we haven't already seen it
722 mergedResults.push_back(larger[lasti]);
723 seen[larger[lasti]] = larger[lasti];
730 numWanted = seen.size();
732 if (mergedResults.size() < numWanted) { numWanted = mergedResults.size(); }
733 //cout << q->getName() << " merged results size = " << mergedResults.size() << '\t' << "numwanted = " << numWanted << endl;
734 for (int i = 0; i < numWanted; i++) {
735 //cout << db[mergedResults[i]]->getName() << '\t' << mergedResults[i] << endl;
736 if (db[mergedResults[i]]->getName() != q->getName()) {
737 Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
738 refResults.push_back(temp);
739 indexes.push_back(mergedResults[i]);
741 //cout << mergedResults[i] << endl;
743 //cout << "done " << q->getName() << endl;
749 catch(exception& e) {
750 m->errorOut(e, "Maligner", "getBlastSeqs");
754 //***************************************************************************************************************
755 vector<Sequence*> Maligner::getKmerSeqs(Sequence* q, int num) {
760 string queryUnAligned = q->getUnaligned();
761 string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence
762 string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence
764 Sequence* queryLeft = new Sequence(q->getName(), leftQuery);
765 Sequence* queryRight = new Sequence(q->getName(), rightQuery);
767 vector<int> tempIndexesLeft = databaseLeft->findClosestSequences(queryLeft, numWanted);
768 vector<int> tempIndexesRight = databaseRight->findClosestSequences(queryRight, numWanted);
772 map<int, int>::iterator it;
774 vector<int> mergedResults;
775 for (int i = 0; i < tempIndexesLeft.size(); i++) {
776 //add left if you havent already
777 it = seen.find(tempIndexesLeft[i]);
778 if (it == seen.end()) {
779 mergedResults.push_back(tempIndexesLeft[i]);
780 seen[tempIndexesLeft[i]] = tempIndexesLeft[i];
783 //add right if you havent already
784 it = seen.find(tempIndexesRight[i]);
785 if (it == seen.end()) {
786 mergedResults.push_back(tempIndexesRight[i]);
787 seen[tempIndexesRight[i]] = tempIndexesRight[i];
791 //cout << q->getName() << endl;
792 vector<Sequence*> refResults;
793 for (int i = 0; i < numWanted; i++) {
794 //cout << db[mergedResults[i]]->getName() << endl;
795 Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned());
796 refResults.push_back(temp);
797 indexes.push_back(mergedResults[i]);
805 catch(exception& e) {
806 m->errorOut(e, "Maligner", "getKmerSeqs");
810 //***************************************************************************************************************