5 * Created by westcott on 9/23/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
12 /***********************************************************************/ //int num, int match, int misMatch, , string mode, Database* dataLeft, Database* dataRight
13 Maligner::Maligner(vector<Sequence*> temp, int match, int misMatch, float div, int ms, int minCov) : db(temp), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov) {
14 //numWanted(num), , searchMethod(mode), databaseLeft(dataLeft), databaseRight(dataRight)
16 m = MothurOut::getInstance();
19 /***********************************************************************/
20 string Maligner::getResults(Sequence* q, DeCalculator* decalc) {
23 outputResults.clear();
25 //make copy so trimming doesn't destroy query from calling class - remember to deallocate
26 query = new Sequence(q->getName(), q->getAligned());
30 //copy refSeqs so that filter does not effect original
31 for(int i = 0; i < db.size(); i++) {
32 Sequence* newSeq = new Sequence(db[i]->getName(), db[i]->getAligned());
33 refSeqs.push_back(newSeq);
36 refSeqs = minCoverageFilter(refSeqs);
38 if (refSeqs.size() < 2) {
39 for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
40 percentIdenticalQueryChimera = 0.0;
44 int chimeraPenalty = computeChimeraPenalty();
47 chimera = chimeraMaligner(chimeraPenalty, decalc);
49 if (m->control_pressed) { return chimera; }
54 for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; }
59 m->errorOut(e, "Maligner", "getResults");
63 /***********************************************************************/
64 string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) {
69 //trims seqs to first non gap char in all seqs and last non gap char in all seqs
70 spotMap = decalc->trimSeqs(query, refSeqs);
72 //you trimmed the whole sequence, skip
73 if (query->getAligned() == "") { return "no"; }
75 vector<Sequence*> temp = refSeqs;
76 temp.push_back(query);
80 //for (int i = 0; i < refSeqs.size(); i++) { cout << refSeqs[i]->getName() << endl ; }//<< refSeqs[i]->getAligned() << endl
82 vector< vector<score_struct> > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes
84 if (m->control_pressed) { return chimera; }
86 fillScoreMatrix(matrix, refSeqs, chimeraPenalty);
88 vector<score_struct> path = extractHighestPath(matrix);
90 if (m->control_pressed) { return chimera; }
92 vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
94 if (trace.size() > 1) { chimera = "yes"; }
95 else { chimera = "no"; return chimera; }
97 int traceStart = path[0].col;
98 int traceEnd = path[path.size()-1].col;
99 string queryInRange = query->getAligned();
100 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
102 string chimeraSeq = constructChimericSeq(trace, refSeqs);
103 // cout << queryInRange.length() << endl;
104 // cout << chimeraSeq.length() << endl;
106 percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
109 vector<trace_struct> trace = extractHighestPath(matrix);
111 //cout << "traces\n";
112 //for(int i=0;i<trace.size();i++){
113 // cout << trace[i].col << '\t' << trace[i].oldCol << '\t' << refSeqs[trace[i].row]->getName() << endl;
116 if (trace.size() > 1) { chimera = "yes"; }
117 else { chimera = "no"; return chimera; }
119 int traceStart = trace[0].col;
120 int traceEnd = trace[trace.size()-1].oldCol;
121 string queryInRange = query->getAligned();
122 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart));*/
124 if (m->control_pressed) { return chimera; }
126 //save output results
127 for (int i = 0; i < trace.size(); i++) {
128 int regionStart = trace[i].col;
129 int regionEnd = trace[i].oldCol;
130 int seqIndex = trace[i].row;
134 temp.parent = refSeqs[seqIndex]->getName();
135 temp.parentAligned = db[seqIndex]->getAligned();
136 temp.nastRegionStart = spotMap[regionStart];
137 temp.nastRegionEnd = spotMap[regionEnd];
138 temp.regionStart = regionStart;
139 temp.regionEnd = regionEnd;
141 string parentInRange = refSeqs[seqIndex]->getAligned();
142 parentInRange = parentInRange.substr(traceStart, (traceEnd-traceStart+1));
144 temp.queryToParent = computePercentID(queryInRange, parentInRange);
145 temp.divR = (percentIdenticalQueryChimera / temp.queryToParent);
147 string queryInRegion = query->getAligned();
148 queryInRegion = queryInRegion.substr(regionStart, (regionEnd-regionStart+1));
150 string parentInRegion = refSeqs[seqIndex]->getAligned();
151 parentInRegion = parentInRegion.substr(regionStart, (regionEnd-regionStart+1));
153 temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion);
155 // cout << query->getName() << '\t' << temp.parent << '\t' << "NAST:" << temp.nastRegionStart << '-' << temp.nastRegionEnd << " G:" << temp.queryToParent << " L:" << temp.queryToParentLocal << ", " << temp.divR << endl;
157 outputResults.push_back(temp);
162 catch(exception& e) {
163 m->errorOut(e, "Maligner", "chimeraMaligner");
167 /***********************************************************************/
168 //removes top matches that do not have minimum coverage with query.
169 vector<Sequence*> Maligner::minCoverageFilter(vector<Sequence*> ref){
171 vector<Sequence*> newRefs;
173 string queryAligned = query->getAligned();
175 for (int i = 0; i < ref.size(); i++) {
177 string refAligned = ref[i]->getAligned();
183 for (int j = 0; j < queryAligned.length(); j++) {
185 if (isalpha(queryAligned[j])) {
188 if (isalpha(refAligned[j])) {
194 int coverage = ((numCovered/(float)numBases)*100);
196 //if coverage above minimum
197 if (coverage > minCoverage) {
198 newRefs.push_back(ref[i]);
206 catch(exception& e) {
207 m->errorOut(e, "Maligner", "minCoverageFilter");
211 /***********************************************************************/
212 // a breakpoint should yield fewer mismatches than this number with respect to the best parent sequence.
213 int Maligner::computeChimeraPenalty() {
216 int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases());
218 // if(numAllowable < 1){ numAllowable = 1; }
220 int penalty = int(numAllowable + 1) * misMatchPenalty;
225 catch(exception& e) {
226 m->errorOut(e, "Maligner", "computeChimeraPenalty");
230 /***********************************************************************/
231 //this is a vertical filter
232 void Maligner::verticalFilter(vector<Sequence*> seqs) {
234 vector<int> gaps; gaps.resize(query->getAligned().length(), 0);
236 string filterString = (string(query->getAligned().length(), '1'));
239 for (int i = 0; i < seqs.size(); i++) {
241 string seqAligned = seqs[i]->getAligned();
243 for (int j = 0; j < seqAligned.length(); j++) {
244 //if this spot is a gap
245 if ((seqAligned[j] == '-') || (seqAligned[j] == '.')) { gaps[j]++; }
249 //zero out spot where all sequences have blanks
250 int numColRemoved = 0;
251 for(int i = 0; i < seqs[0]->getAligned().length(); i++){
252 if(gaps[i] == seqs.size()) { filterString[i] = '0'; numColRemoved++; }
255 map<int, int> newMap;
257 for (int i = 0; i < seqs.size(); i++) {
259 string seqAligned = seqs[i]->getAligned();
260 string newAligned = "";
263 for (int j = 0; j < seqAligned.length(); j++) {
264 //if this spot is not a gap
265 if (filterString[j] == '1') {
266 newAligned += seqAligned[j];
267 newMap[count] = spotMap[j];
272 seqs[i]->setAligned(newAligned);
277 catch(exception& e) {
278 m->errorOut(e, "Maligner", "verticalFilter");
282 //***************************************************************************************************************
283 vector< vector<score_struct> > Maligner::buildScoreMatrix(int cols, int rows) {
286 vector< vector<score_struct> > m(rows);
288 for (int i = 0; i < rows; i++) {
289 for (int j = 0; j < cols; j++) {
291 //initialize each cell
294 temp.score = -9999999;
298 m[i].push_back(temp);
304 catch(exception& e) {
305 m->errorOut(e, "Maligner", "buildScoreMatrix");
310 //***************************************************************************************************************
312 void Maligner::fillScoreMatrix(vector<vector<score_struct> >& ms, vector<Sequence*> seqs, int penalty) {
315 //get matrix dimensions
316 int numCols = query->getAligned().length();
317 int numRows = seqs.size();
319 //initialize first col
320 string queryAligned = query->getAligned();
321 for (int i = 0; i < numRows; i++) {
322 string subjectAligned = seqs[i]->getAligned();
325 if ((!isalpha(queryAligned[0])) && (!isalpha(subjectAligned[0]))) {
327 // ms[i][0].mismatches = 0;
328 }else if (queryAligned[0] == subjectAligned[0]) { //|| subjectAligned[0] == 'N')
329 ms[i][0].score = matchScore;
330 // ms[i][0].mismatches = 0;
333 // ms[i][0].mismatches = 1;
337 //fill rest of matrix
338 for (int j = 1; j < numCols; j++) { //iterate through matrix columns
340 for (int i = 0; i < numRows; i++) { //iterate through matrix rows
342 string subjectAligned = seqs[i]->getAligned();
344 int matchMisMatchScore = 0;
346 if ((!isalpha(queryAligned[j])) && (!isalpha(subjectAligned[j]))) {
348 }else if ((toupper(queryAligned[j]) == 'N') || (toupper(subjectAligned[j]) == 'N')) {
349 //matchMisMatchScore = matchScore;
351 }else if (queryAligned[j] == subjectAligned[j]) {
352 matchMisMatchScore = matchScore;
353 // ms[i][j].mismatches = ms[i][j-1].mismatches;
354 }else if (queryAligned[j] != subjectAligned[j]) {
355 matchMisMatchScore = misMatchPenalty;
356 // ms[i][j].mismatches = ms[i][j-1].mismatches + 1;
359 //compute score based on previous columns scores
360 for (int prevIndex = 0; prevIndex < numRows; prevIndex++) { //iterate through rows
362 int sumScore = matchMisMatchScore + ms[prevIndex][j-1].score;
364 //you are not at yourself
365 if (prevIndex != i) { sumScore += penalty; }
366 if (sumScore < 0) { sumScore = 0; }
368 if (sumScore > ms[i][j].score) {
369 ms[i][j].score = sumScore;
370 ms[i][j].prev = prevIndex;
376 /* for(int i=0;i<numRows;i++){
377 cout << seqs[i]->getName();
378 for(int j=0;j<numCols;j++){
379 cout << '\t' << ms[i][j].mismatches;
384 /*cout << numRows << '\t' << numCols << endl;
385 for(int i=0;i<numRows;i++){
386 cout << seqs[i]->getName() << endl << seqs[i]->getAligned() << endl << endl;
387 if ((seqs[i]->getName() == "S000003470") || (seqs[i]->getName() == "S000383265") || (seqs[i]->getName() == "7000004128191054")) {
388 for(int j=0;j<numCols;j++){
389 cout << '\t' << ms[i][j].score;
395 /*for(int i=0;i<numRows;i++){
396 cout << seqs[i]->getName();
397 for(int j=0;j<numCols;j++){
398 cout << '\t' << ms[i][j].prev;
405 catch(exception& e) {
406 m->errorOut(e, "Maligner", "fillScoreMatrix");
410 //***************************************************************************************************************
411 vector<score_struct> Maligner::extractHighestPath(vector<vector<score_struct> > ms) {
414 //get matrix dimensions
415 int numCols = query->getAligned().length();
416 int numRows = ms.size();
419 //find highest score scoring matrix
420 score_struct highestStruct;
421 int highestScore = 0;
423 for (int i = 0; i < numRows; i++) {
424 for (int j = 0; j < numCols; j++) {
425 if (ms[i][j].score > highestScore) {
426 highestScore = ms[i][j].score;
427 highestStruct = ms[i][j];
432 vector<score_struct> path;
434 int rowIndex = highestStruct.row;
435 int pos = highestStruct.col;
436 int score = highestStruct.score;
438 while (pos >= 0 && score > 0) {
439 score_struct temp = ms[rowIndex][pos];
442 if (score > 0) { path.push_back(temp); }
444 rowIndex = temp.prev;
448 reverse(path.begin(), path.end());
453 catch(exception& e) {
454 m->errorOut(e, "Maligner", "extractHighestPath");
458 //***************************************************************************************************************
459 vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path, vector<Sequence*> seqs) {
461 vector<trace_struct> trace;
463 int region_index = path[0].row;
464 int region_start = path[0].col;
466 for (int i = 1; i < path.size(); i++) {
468 int next_region_index = path[i].row;
470 if (next_region_index != region_index) {
473 int col_index = path[i].col;
475 temp.col = region_start;
476 temp.oldCol = col_index-1;
477 temp.row = region_index;
479 trace.push_back(temp);
481 region_index = path[i].row;
482 region_start = col_index;
488 temp.col = region_start;
489 temp.oldCol = path[path.size()-1].col;
490 temp.row = region_index;
491 trace.push_back(temp);
496 catch(exception& e) {
497 m->errorOut(e, "Maligner", "mapTraceRegionsToAlignment");
502 /***************************************************************************************************************
504 vector<trace_struct> Maligner::extractHighestPath(vector<vector<score_struct> > ms) {
508 //get matrix dimensions
509 int numCols = query->getAligned().length();
510 int numRows = ms.size();
513 //find highest score scoring matrix
514 vector<score_struct> highestStruct;
515 int highestScore = 0;
517 for (int i = 0; i < numRows; i++) {
518 for (int j = 0; j < numCols; j++) {
519 if (ms[i][j].score > highestScore) {
520 highestScore = ms[i][j].score;
521 highestStruct.resize(0);
522 highestStruct.push_back(ms[i][j]);
524 else if(ms[i][j].score == highestScore){
525 highestStruct.push_back(ms[i][j]);
530 //cout << endl << highestScore << '\t' << highestStruct.size() << '\t' << highestStruct[0].row << endl;
532 vector<trace_struct> maxTrace;
533 double maxPercentIdenticalQueryAntiChimera = 0;
535 for(int i=0;i<highestStruct.size();i++){
537 vector<score_struct> path;
539 int rowIndex = highestStruct[i].row;
540 int pos = highestStruct[i].col;
541 int score = highestStruct[i].score;
543 while (pos >= 0 && score > 0) {
544 score_struct temp = ms[rowIndex][pos];
547 if (score > 0) { path.push_back(temp); }
549 rowIndex = temp.prev;
553 reverse(path.begin(), path.end());
555 vector<trace_struct> trace = mapTraceRegionsToAlignment(path, refSeqs);
557 //cout << "traces\n";
558 //for(int j=0;j<trace.size();j++){
559 // cout << trace[j].col << '\t' << trace[j].oldCol << '\t' << refSeqs[trace[j].row]->getName() << endl;
562 int traceStart = path[0].col;
563 int traceEnd = path[path.size()-1].col;
564 // cout << "traceStart/End\t" << traceStart << '\t' << traceEnd << endl;
566 string queryInRange = query->getAligned();
567 queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1));
568 // cout << "here" << endl;
569 string chimeraSeq = constructChimericSeq(trace, refSeqs);
570 string antiChimeraSeq = constructAntiChimericSeq(trace, refSeqs);
572 percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq);
573 double percentIdenticalQueryAntiChimera = computePercentID(queryInRange, antiChimeraSeq);
574 // cout << i << '\t' << percentIdenticalQueryChimera << '\t' << percentIdenticalQueryAntiChimera << endl;
576 if(percentIdenticalQueryAntiChimera > maxPercentIdenticalQueryAntiChimera){
577 maxPercentIdenticalQueryAntiChimera = percentIdenticalQueryAntiChimera;
581 // cout << maxPercentIdenticalQueryAntiChimera << endl;
586 catch(exception& e) {
587 m->errorOut(e, "Maligner", "extractHighestPath");
592 //***************************************************************************************************************
594 vector<trace_struct> Maligner::mapTraceRegionsToAlignment(vector<score_struct> path, vector<Sequence*> seqs) {
596 vector<trace_struct> trace;
598 int region_index = path[0].row;
599 int region_start = path[0].col;
601 for (int i = 1; i < path.size(); i++) {
603 int next_region_index = path[i].row;
605 if (next_region_index != region_index) {
608 int col_index = path[i].col;
610 temp.col = region_start;
611 temp.oldCol = col_index-1;
612 temp.row = region_index;
614 trace.push_back(temp);
616 region_index = path[i].row;
617 region_start = col_index;
623 temp.col = region_start;
624 temp.oldCol = path[path.size()-1].col;
625 temp.row = region_index;
626 trace.push_back(temp);
629 // cout << trace.size() << endl;
630 // for(int i=0;i<trace.size();i++){
631 // cout << seqs[trace[i].row]->getName() << endl;
638 catch(exception& e) {
639 m->errorOut(e, "Maligner", "mapTraceRegionsToAlignment");
644 //***************************************************************************************************************
646 string Maligner::constructChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
650 for (int i = 0; i < trace.size(); i++) {
651 // cout << i << '\t' << trace[i].row << '\t' << trace[i].col << '\t' << trace[i].oldCol << endl;
653 string seqAlign = seqs[trace[i].row]->getAligned();
654 seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
657 // cout << chimera << endl;
658 // if (chimera != "") { chimera = chimera.substr(0, (chimera.length()-1)); } //this was introducing a fence post error
659 // cout << chimera << endl;
662 catch(exception& e) {
663 m->errorOut(e, "Maligner", "constructChimericSeq");
668 //***************************************************************************************************************
670 string Maligner::constructAntiChimericSeq(vector<trace_struct> trace, vector<Sequence*> seqs) {
672 string antiChimera = "";
674 for (int i = 0; i < trace.size(); i++) {
675 // cout << i << '\t' << (trace.size() - i - 1) << '\t' << trace[i].row << '\t' << trace[i].col << '\t' << trace[i].oldCol << endl;
677 int oppositeIndex = trace.size() - i - 1;
679 string seqAlign = seqs[trace[oppositeIndex].row]->getAligned();
680 seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1));
681 antiChimera += seqAlign;
686 catch(exception& e) {
687 m->errorOut(e, "Maligner", "constructChimericSeq");
692 //***************************************************************************************************************
693 float Maligner::computePercentID(string queryAlign, string chimera) {
696 if (queryAlign.length() != chimera.length()) {
697 m->mothurOut("Error, alignment strings are of different lengths: "); m->mothurOutEndLine();
698 m->mothurOut(toString(queryAlign.length())); m->mothurOutEndLine();
699 m->mothurOut(toString(chimera.length())); m->mothurOutEndLine();
703 // cout << queryAlign.length() << endl;
704 int numIdentical = 0;
707 for (int i = 0; i < queryAlign.length(); i++) {
708 if (((queryAlign[i] != 'G') && (queryAlign[i] != 'T') && (queryAlign[i] != 'A') && (queryAlign[i] != 'C')&& (queryAlign[i] != '.') && (queryAlign[i] != '-')) ||
709 ((chimera[i] != 'G') && (chimera[i] != 'T') && (chimera[i] != 'A') && (chimera[i] != 'C')&& (chimera[i] != '.') && (chimera[i] != '-'))) {}
712 bool charA = false; bool charB = false;
713 if ((queryAlign[i] == 'G') || (queryAlign[i] == 'T') || (queryAlign[i] == 'A') || (queryAlign[i] == 'C')) { charA = true; }
714 if ((chimera[i] == 'G') || (chimera[i] == 'T') || (chimera[i] == 'A') || (chimera[i] == 'C')) { charB = true; }
717 if (charA || charB) {
719 if (charA) { countA++; }
720 if (charB) { countB++; }
722 if (queryAlign[i] == chimera[i]) {
726 // cout << queryAlign[i] << '\t' << chimera[i] << '\t' << countA << '\t' << countB << endl;
731 // cout << "pat\t" << countA << '\t' << countB << '\t' << numIdentical << endl;
734 float numBases = (countA + countB) /(float) 2;
736 if (numBases == 0) { return 0; }
738 float percentIdentical = (numIdentical/(float)numBases) * 100;
740 // cout << percentIdentical << endl;
742 return percentIdentical;
745 catch(exception& e) {
746 m->errorOut(e, "Maligner", "computePercentID");
750 //***************************************************************************************************************