5 * Created by westcott on 9/23/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
12 /***********************************************************************/ //int num, int match, int misMatch, , string mode, Database* dataLeft, Database* dataRight
13 Maligner::Maligner(vector<Sequence*> temp, int match, int misMatch, float div, int ms, int minCov) : db(temp), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov) {
14 //numWanted(num), , searchMethod(mode), databaseLeft(dataLeft), databaseRight(dataRight)
16 m = MothurOut::getInstance();
19 /***********************************************************************/
20 string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
23 outputResults.clear();
25 //make copy so trimming doesn't destroy query from calling class - remember to deallocate
26 query = new Sequence(q->getName(), q->getAligned());
30 //copy refSeqs so that filter does not effect original
31 for(int i = 0; i < db.size(); i++) {
32 Sequence* newSeq = new Sequence(db[i]->getName(), db[i]->getAligned());
33 refSeqs.push_back(newSeq);
36 refSeqs = minCoverageFilter(refSeqs);
38 if (refSeqs.size() < 2) {
39 for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
40 percentIdenticalQueryChimera = 0.0;
44 int chimeraPenalty = computeChimeraPenalty();
47 chimera = chimeraMaligner(chimeraPenalty, decalc);
49 if (m->control_pressed) { return chimera; }
54 for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
59 m->errorOut(e, "Maligner", "getResults");
63 /***********************************************************************/
64 string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
69 //trims seqs to first non gap char in all seqs and last non gap char in all seqs
70 spotMap = decalc->trimSeqs(query, refSeqs);
72 //you trimmed the whole sequence, skip
73 if (query->getAligned() == "") { return "no"; }
75 vector<Sequence*> temp = refSeqs;
77 // for(int i=0;i<refSeqs.size();i++){
78 // cout << refSeqs[i]->getName() << endl;
81 temp.push_back(query);
85 //for (int i = 0; i < refSeqs.size(); i++) { cout << refSeqs[i]->getName() << endl ; }//<< refSeqs[i]->getAligned() << endl
87 vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
89 if (m->control_pressed) { return chimera; }
91 fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
93 vector<score_struct> path = extractHighestPath(matrix);
95 if (m->control_pressed) { return chimera; }
97 vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
99 if (trace.size() > 1) { chimera = "yes"; }
100 else { chimera = "no"; return chimera; }
102 int traceStart = path[0].col;
103 int traceEnd = path[path.size()-1].col;
104 string queryInRange = query->getAligned();
105 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
106 // cout << queryInRange << endl;
107 string chimeraSeq = constructChimericSeq(trace, refSeqs);
108 // cout << chimeraSeq << endl;
110 // cout << queryInRange.length() << endl;
111 // cout << chimeraSeq.length() << endl;
113 percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
115 // cout << percentIdenticalQueryChimera << endl;
117 vector<trace_struct> trace = extractHighestPath(matrix);
119 //cout << "traces\n";
120 //for(int i=0;i<trace.size();i++){
121 // cout << trace[i].col << '\t' << trace[i].oldCol << '\t' << refSeqs[trace[i].row]->getName() << endl;
124 if (trace.size() > 1) { chimera = "yes"; }
125 else { chimera = "no"; return chimera; }
127 int traceStart = trace[0].col;
128 int traceEnd = trace[trace.size()-1].oldCol;
129 string queryInRange = query->getAligned();
130 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart));*/
132 if (m->control_pressed) { return chimera; }
134 //save output results
135 for (int i = 0; i < trace.size(); i++) {
136 int regionStart = trace[i].col;
137 int regionEnd = trace[i].oldCol;
138 int seqIndex = trace[i].row;
140 // cout << regionStart << '\t' << regionEnd << '\t' << seqIndex << endl;
143 temp.parent = refSeqs[seqIndex]->getName();
144 temp.parentAligned = db[seqIndex]->getAligned();
145 temp.nastRegionStart = spotMap[regionStart];
146 temp.nastRegionEnd = spotMap[regionEnd];
147 temp.regionStart = unalignedMap[regionStart];
148 temp.regionEnd = unalignedMap[regionEnd];
150 string parentInRange = refSeqs[seqIndex]->getAligned();
151 parentInRange = parentInRange.substr(traceStart, (traceEnd-traceStart+1));
153 temp.queryToParent = computePercentID(queryInRange, parentInRange);
154 temp.divR = (percentIdenticalQueryChimera / temp.queryToParent);
156 string queryInRegion = query->getAligned();
157 queryInRegion = queryInRegion.substr(regionStart, (regionEnd-regionStart+1));
159 string parentInRegion = refSeqs[seqIndex]->getAligned();
160 parentInRegion = parentInRegion.substr(regionStart, (regionEnd-regionStart+1));
162 temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
164 //cout << query->getName() << '\t' << temp.parent << '\t' << "NAST:" << temp.nastRegionStart << '-' << temp.nastRegionEnd << " G:" << temp.queryToParent << " L:" << temp.queryToParentLocal << ", " << temp.divR << endl;
166 outputResults.push_back(temp);
171 catch(exception& e) {
172 m->errorOut(e, "Maligner", "chimeraMaligner");
176 /***********************************************************************/
177 //removes top matches that do not have minimum coverage with query.
178 vector<Sequence*> Maligner::minCoverageFilter(vector<Sequence*> ref){
180 vector<Sequence*> newRefs;
182 string queryAligned = query->getAligned();
184 for (int i = 0; i < ref.size(); i++) {
186 string refAligned = ref[i]->getAligned();
192 for (int j = 0; j < queryAligned.length(); j++) {
194 if (isalpha(queryAligned[j])) {
197 if (isalpha(refAligned[j])) {
203 int coverage = ((numCovered/(float)numBases)*100);
205 //if coverage above minimum
206 if (coverage > minCoverage) {
207 newRefs.push_back(ref[i]);
215 catch(exception& e) {
216 m->errorOut(e, "Maligner", "minCoverageFilter");
220 /***********************************************************************/
221 // a breakpoint should yield fewer mismatches than this number with respect to the best parent sequence.
222 int Maligner::computeChimeraPenalty() {
225 int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases());
227 // if(numAllowable < 1){ numAllowable = 1; }
229 int penalty = int(numAllowable + 1) * misMatchPenalty;
234 catch(exception& e) {
235 m->errorOut(e, "Maligner", "computeChimeraPenalty");
239 /***********************************************************************/
240 //this is a vertical filter
241 void Maligner::verticalFilter(vector<Sequence*> seqs) {
243 vector<int> gaps; gaps.resize(query->getAligned().length(), 0);
245 string filterString = (string(query->getAligned().length(), '1'));
248 for (int i = 0; i < seqs.size(); i++) {
250 string seqAligned = seqs[i]->getAligned();
252 for (int j = 0; j < seqAligned.length(); j++) {
253 //if this spot is a gap
254 if ((seqAligned[j] == '-') || (seqAligned[j] == '.')) { gaps[j]++; }
258 //zero out spot where all sequences have blanks
259 int numColRemoved = 0;
260 for(int i = 0; i < seqs[0]->getAligned().length(); i++){
261 if(gaps[i] == seqs.size()) { filterString[i] = '0'; numColRemoved++; }
264 map<int, int> newMap;
266 for (int i = 0; i < seqs.size(); i++) {
268 string seqAligned = seqs[i]->getAligned();
269 string newAligned = "";
272 for (int j = 0; j < seqAligned.length(); j++) {
273 //if this spot is not a gap
274 if (filterString[j] == '1') {
275 newAligned += seqAligned[j];
276 newMap[count] = spotMap[j];
281 seqs[i]->setAligned(newAligned);
284 string query = seqs[seqs.size()-1]->getAligned();
285 int queryLength = query.length();
287 unalignedMap.resize(queryLength, 0);
290 for(int i=1;i<queryLength;i++){
291 if(query[i] != '.' && query[i] != '-'){
292 unalignedMap[i] = unalignedMap[i-1] + 1;
295 unalignedMap[i] = unalignedMap[i-1];
301 catch(exception& e) {
302 m->errorOut(e, "Maligner", "verticalFilter");
306 //***************************************************************************************************************
307 vector< vector<score_struct> > Maligner::buildScoreMatrix(int cols, int rows) {
310 vector< vector<score_struct> > m(rows);
312 for (int i = 0; i < rows; i++) {
313 for (int j = 0; j < cols; j++) {
315 //initialize each cell
318 temp.score = -9999999;
322 m[i].push_back(temp);
328 catch(exception& e) {
329 m->errorOut(e, "Maligner", "buildScoreMatrix");
334 //***************************************************************************************************************
336 void Maligner::fillScoreMatrix(vector<vector<score_struct> >& ms, vector<Sequence*> seqs, int penalty) {
339 //get matrix dimensions
340 int numCols = query->getAligned().length();
341 int numRows = seqs.size();
343 // cout << numRows << endl;
345 //initialize first col
346 string queryAligned = query->getAligned();
347 for (int i = 0; i < numRows; i++) {
348 string subjectAligned = seqs[i]->getAligned();
351 if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
353 // ms[i][0].mismatches = 0;
354 }else if (queryAligned[0] == subjectAligned[0]) { //|| subjectAligned[0] == 'N')
355 ms[i][0].score = matchScore;
356 // ms[i][0].mismatches = 0;
359 // ms[i][0].mismatches = 1;
363 //fill rest of matrix
364 for (int j = 1; j < numCols; j++) { //iterate through matrix columns
366 // for (int i = 0; i < 1; i++) { //iterate through matrix rows
367 for (int i = 0; i < numRows; i++) { //iterate through matrix rows
369 string subjectAligned = seqs[i]->getAligned();
371 int matchMisMatchScore = 0;
373 if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
375 }else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
377 }else if (queryAligned[j] == subjectAligned[j]) {
378 matchMisMatchScore = matchScore;
379 }else if (queryAligned[j] != subjectAligned[j]) {
380 matchMisMatchScore = misMatchPenalty;
383 //compute score based on previous columns scores
384 for (int prevIndex = 0; prevIndex < numRows; prevIndex++) { //iterate through rows
386 int sumScore = matchMisMatchScore + ms[prevIndex][j-1].score;
388 //you are not at yourself
389 if (prevIndex != i) { sumScore += penalty; }
390 if (sumScore < 0) { sumScore = 0; }
392 if (sumScore > ms[i][j].score) {
393 ms[i][j].score = sumScore;
394 ms[i][j].prev = prevIndex;
397 // cout << i << '\t' << j << '\t' << queryAligned[j] << '\t' << subjectAligned[j] << '\t' << matchMisMatchScore << '\t' << ms[i][j].score << endl;
407 // cout << numRows << '\t' << numCols << endl;
408 // for(int i=0;i<numRows;i++){
409 // cout << seqs[i]->getName();
410 // for(int j=0;j<numCols;j++){
411 // cout << '\t' << ms[i][j].score;
417 // for(int i=0;i<numRows;i++){
418 // cout << seqs[i]->getName();
419 // for(int j=0;j<numCols;j++){
420 // cout << '\t' << ms[i][j].prev;
427 catch(exception& e) {
428 m->errorOut(e, "Maligner", "fillScoreMatrix");
432 //***************************************************************************************************************
433 vector<score_struct> Maligner::extractHighestPath(vector<vector<score_struct> > ms) {
436 //get matrix dimensions
437 int numCols = query->getAligned().length();
438 int numRows = ms.size();
441 //find highest score scoring matrix
442 score_struct highestStruct;
443 int highestScore = 0;
445 for (int i = 0; i < numRows; i++) {
446 for (int j = 0; j < numCols; j++) {
447 if (ms[i][j].score > highestScore) {
448 highestScore = ms[i][j].score;
449 highestStruct = ms[i][j];
454 vector<score_struct> path;
456 int rowIndex = highestStruct.row;
457 int pos = highestStruct.col;
458 int score = highestStruct.score;
460 // cout << rowIndex << '\t' << pos << '\t' << score << endl;
462 while (pos >= 0 && score > 0) {
463 score_struct temp = ms[rowIndex][pos];
466 if (score > 0) { path.push_back(temp); }
468 rowIndex = temp.prev;
472 reverse(path.begin(), path.end());
477 catch(exception& e) {
478 m->errorOut(e, "Maligner", "extractHighestPath");
482 //***************************************************************************************************************
483 vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path, vector<Sequence*> seqs) {
485 vector<trace_struct> trace;
487 int region_index = path[0].row;
488 int region_start = path[0].col;
490 for (int i = 1; i < path.size(); i++) {
492 int next_region_index = path[i].row;
493 //cout << i << '\t' << next_region_index << endl;
495 if (next_region_index != region_index) {
498 int col_index = path[i].col;
500 temp.col = region_start;
501 temp.oldCol = col_index-1;
502 temp.row = region_index;
504 trace.push_back(temp);
506 region_index = path[i].row;
507 region_start = col_index;
513 temp.col = region_start;
514 temp.oldCol = path[path.size()-1].col;
515 temp.row = region_index;
516 trace.push_back(temp);
521 catch(exception& e) {
522 m->errorOut(e, "Maligner", "mapTraceRegionsToAlignment");
527 /***************************************************************************************************************
529 vector<trace_struct> Maligner::extractHighestPath(vector<vector<score_struct> > ms) {
533 //get matrix dimensions
534 int numCols = query->getAligned().length();
535 int numRows = ms.size();
538 //find highest score scoring matrix
539 vector<score_struct> highestStruct;
540 int highestScore = 0;
542 for (int i = 0; i < numRows; i++) {
543 for (int j = 0; j < numCols; j++) {
544 if (ms[i][j].score > highestScore) {
545 highestScore = ms[i][j].score;
546 highestStruct.resize(0);
547 highestStruct.push_back(ms[i][j]);
549 else if(ms[i][j].score == highestScore){
550 highestStruct.push_back(ms[i][j]);
555 //cout << endl << highestScore << '\t' << highestStruct.size() << '\t' << highestStruct[0].row << endl;
557 vector<trace_struct> maxTrace;
558 double maxPercentIdenticalQueryAntiChimera = 0;
560 for(int i=0;i<highestStruct.size();i++){
562 vector<score_struct> path;
564 int rowIndex = highestStruct[i].row;
565 int pos = highestStruct[i].col;
566 int score = highestStruct[i].score;
568 while (pos >= 0 && score > 0) {
569 score_struct temp = ms[rowIndex][pos];
572 if (score > 0) { path.push_back(temp); }
574 rowIndex = temp.prev;
578 reverse(path.begin(), path.end());
580 vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
582 //cout << "traces\n";
583 //for(int j=0;j<trace.size();j++){
584 // cout << trace[j].col << '\t' << trace[j].oldCol << '\t' << refSeqs[trace[j].row]->getName() << endl;
587 int traceStart = path[0].col;
588 int traceEnd = path[path.size()-1].col;
589 // cout << "traceStart/End\t" << traceStart << '\t' << traceEnd << endl;
591 string queryInRange = query->getAligned();
592 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
593 // cout << "here" << endl;
594 string chimeraSeq = constructChimericSeq(trace, refSeqs);
595 string antiChimeraSeq = constructAntiChimericSeq(trace, refSeqs);
597 percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
598 double percentIdenticalQueryAntiChimera = computePercentID(queryInRange, antiChimeraSeq);
599 // cout << i << '\t' << percentIdenticalQueryChimera << '\t' << percentIdenticalQueryAntiChimera << endl;
601 if(percentIdenticalQueryAntiChimera > maxPercentIdenticalQueryAntiChimera){
602 maxPercentIdenticalQueryAntiChimera = percentIdenticalQueryAntiChimera;
606 // cout << maxPercentIdenticalQueryAntiChimera << endl;
611 catch(exception& e) {
612 m->errorOut(e, "Maligner", "extractHighestPath");
617 //***************************************************************************************************************
619 vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path, vector<Sequence*> seqs) {
621 vector<trace_struct> trace;
623 int region_index = path[0].row;
624 int region_start = path[0].col;
626 for (int i = 1; i < path.size(); i++) {
628 int next_region_index = path[i].row;
630 if (next_region_index != region_index) {
633 int col_index = path[i].col;
635 temp.col = region_start;
636 temp.oldCol = col_index-1;
637 temp.row = region_index;
639 trace.push_back(temp);
641 region_index = path[i].row;
642 region_start = col_index;
648 temp.col = region_start;
649 temp.oldCol = path[path.size()-1].col;
650 temp.row = region_index;
651 trace.push_back(temp);
654 // cout << trace.size() << endl;
655 // for(int i=0;i<trace.size();i++){
656 // cout << seqs[trace[i].row]->getName() << endl;
663 catch(exception& e) {
664 m->errorOut(e, "Maligner", "mapTraceRegionsToAlignment");
669 //***************************************************************************************************************
671 string Maligner::constructChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
675 for (int i = 0; i < trace.size(); i++) {
676 // cout << i << '\t' << trace[i].row << '\t' << trace[i].col << '\t' << trace[i].oldCol << endl;
678 string seqAlign = seqs[trace[i].row]->getAligned();
679 seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
682 // cout << chimera << endl;
683 // if (chimera != "") { chimera = chimera.substr(0, (chimera.length()-1)); } //this was introducing a fence post error
684 // cout << chimera << endl;
687 catch(exception& e) {
688 m->errorOut(e, "Maligner", "constructChimericSeq");
693 //***************************************************************************************************************
695 string Maligner::constructAntiChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
697 string antiChimera = "";
699 for (int i = 0; i < trace.size(); i++) {
700 // cout << i << '\t' << (trace.size() - i - 1) << '\t' << trace[i].row << '\t' << trace[i].col << '\t' << trace[i].oldCol << endl;
702 int oppositeIndex = trace.size() - i - 1;
704 string seqAlign = seqs[trace[oppositeIndex].row]->getAligned();
705 seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
706 antiChimera += seqAlign;
711 catch(exception& e) {
712 m->errorOut(e, "Maligner", "constructChimericSeq");
717 //***************************************************************************************************************
718 float Maligner::computePercentID(string queryAlign, string chimera) {
721 if (queryAlign.length() != chimera.length()) {
722 m->mothurOut("Error, alignment strings are of different lengths: "); m->mothurOutEndLine();
723 m->mothurOut(toString(queryAlign.length())); m->mothurOutEndLine();
724 m->mothurOut(toString(chimera.length())); m->mothurOutEndLine();
728 // cout << queryAlign.length() << endl;
729 int numIdentical = 0;
732 for (int i = 0; i < queryAlign.length(); i++) {
733 if (((queryAlign[i] != 'G') && (queryAlign[i] != 'T') && (queryAlign[i] != 'A') && (queryAlign[i] != 'C')&& (queryAlign[i] != '.') && (queryAlign[i] != '-')) ||
734 ((chimera[i] != 'G') && (chimera[i] != 'T') && (chimera[i] != 'A') && (chimera[i] != 'C')&& (chimera[i] != '.') && (chimera[i] != '-'))) {}
737 bool charA = false; bool charB = false;
738 if ((queryAlign[i] == 'G') || (queryAlign[i] == 'T') || (queryAlign[i] == 'A') || (queryAlign[i] == 'C')) { charA = true; }
739 if ((chimera[i] == 'G') || (chimera[i] == 'T') || (chimera[i] == 'A') || (chimera[i] == 'C')) { charB = true; }
742 if (charA || charB) {
744 if (charA) { countA++; }
745 if (charB) { countB++; }
747 if (queryAlign[i] == chimera[i]) {
751 // cout << queryAlign[i] << '\t' << chimera[i] << '\t' << countA << '\t' << countB << endl;
756 // cout << "pat\t" << countA << '\t' << countB << '\t' << numIdentical << endl;
759 float numBases = (countA + countB) /(float) 2;
761 if (numBases == 0) { return 0; }
763 // cout << numIdentical << '\t' << numBases << endl;
765 float percentIdentical = (numIdentical/(float)numBases) * 100;
767 // cout << percentIdentical << endl;
769 return percentIdentical;
772 catch(exception& e) {
773 m->errorOut(e, "Maligner", "computePercentID");
777 //***************************************************************************************************************