X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=maligner.cpp;h=d7731facce712bb98fd239793cda78dfa7cc69cc;hb=2405cc589aaaf0c44809a48fe98d3b96863dac0b;hp=1fd27f95c850a2f211c5e2034ff988cfee173479;hpb=1e39c7af79c75317b61e457bf72a14e2138902fc;p=mothur.git diff --git a/maligner.cpp b/maligner.cpp index 1fd27f9..d7731fa 100644 --- a/maligner.cpp +++ b/maligner.cpp @@ -8,23 +8,14 @@ */ #include "maligner.h" -#include "kmerdb.hpp" -#include "blastdb.hpp" -/***********************************************************************/ -Maligner::Maligner(vector temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode, Database* dataLeft, Database* dataRight) : - db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode), databaseLeft(dataLeft), databaseRight(dataRight) { - +/***********************************************************************/ //int num, int match, int misMatch, , string mode, Database* dataLeft, Database* dataRight +Maligner::Maligner(vector temp, int match, int misMatch, float div, int ms, int minCov) : db(temp), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov) { + //numWanted(num), , searchMethod(mode), databaseLeft(dataLeft), databaseRight(dataRight) m = MothurOut::getInstance(); -// cout << matchScore << '\t' << misMatchPenalty << endl; -// -// matchScore = 1; -// misMatchPenalty = -1; - - } - +} /***********************************************************************/ string Maligner::getResults(Sequence* q, DeCalculator* decalc) { try { @@ -36,19 +27,14 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) { string chimera; - if (searchMethod == "distance") { - //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate - refSeqs = decalc->findClosest(query, db, numWanted, indexes); - }else if (searchMethod == "blast") { - refSeqs = getBlastSeqs(query, numWanted); //fills indexes - }else if (searchMethod == "kmer") { - refSeqs = getKmerSeqs(query, numWanted); //fills indexes - }else { m->mothurOut("not valid search."); exit(1); } //should never get here - - if (m->control_pressed) { return chimera; } + //copy refSeqs so that filter does not effect original + for(int i = 0; i < db.size(); i++) { + Sequence* newSeq = new Sequence(db[i]->getName(), db[i]->getAligned()); + refSeqs.push_back(newSeq); + } refSeqs = minCoverageFilter(refSeqs); - + if (refSeqs.size() < 2) { for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; } percentIdenticalQueryChimera = 0.0; @@ -56,7 +42,7 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) { } int chimeraPenalty = computeChimeraPenalty(); - + //fills outputResults chimera = chimeraMaligner(chimeraPenalty, decalc); @@ -89,21 +75,43 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) { vector temp = refSeqs; temp.push_back(query); - verticalFilter(temp); + + //for (int i = 0; i < refSeqs.size(); i++) { cout << refSeqs[i]->getName() << endl ; }//<< refSeqs[i]->getAligned() << endl vector< vector > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes if (m->control_pressed) { return chimera; } fillScoreMatrix(matrix, refSeqs, chimeraPenalty); - + + vector path = extractHighestPath(matrix); + + if (m->control_pressed) { return chimera; } + + vector trace = mapTraceRegionsToAlignment(path, refSeqs); + + if (trace.size() > 1) { chimera = "yes"; } + else { chimera = "no"; return chimera; } + + int traceStart = path[0].col; + int traceEnd = path[path.size()-1].col; + string queryInRange = query->getAligned(); + queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1)); + + string chimeraSeq = constructChimericSeq(trace, refSeqs); +// cout << queryInRange.length() << endl; +// cout << chimeraSeq.length() << endl; + + percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq); + + /* vector trace = extractHighestPath(matrix); -// cout << "traces\n"; -// for(int i=0;igetName() << endl; -// } + //cout << "traces\n"; + //for(int i=0;igetName() << endl; + //} if (trace.size() > 1) { chimera = "yes"; } else { chimera = "no"; return chimera; } @@ -111,7 +119,7 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) { int traceStart = trace[0].col; int traceEnd = trace[trace.size()-1].oldCol; string queryInRange = query->getAligned(); - queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1)); + queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart));*/ if (m->control_pressed) { return chimera; } @@ -124,7 +132,7 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) { results temp; temp.parent = refSeqs[seqIndex]->getName(); - temp.parentAligned = db[indexes[seqIndex]]->getAligned(); + temp.parentAligned = db[seqIndex]->getAligned(); temp.nastRegionStart = spotMap[regionStart]; temp.nastRegionEnd = spotMap[regionEnd]; temp.regionStart = regionStart; @@ -143,7 +151,9 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) { parentInRegion = parentInRegion.substr(regionStart, (regionEnd-regionStart+1)); temp.queryToParentLocal = computePercentID(queryInRegion, parentInRegion); - + +// cout << query->getName() << '\t' << temp.parent << '\t' << "NAST:" << temp.nastRegionStart << '-' << temp.nastRegionEnd << " G:" << temp.queryToParent << " L:" << temp.queryToParentLocal << ", " << temp.divR << endl; + outputResults.push_back(temp); } @@ -315,7 +325,7 @@ void Maligner::fillScoreMatrix(vector >& ms, vector >& ms, vector >& ms, vectorgetName(); -// for(int j=0;jgetName(); -// for(int j=0;jgetName(); -// for(int j=0;jgetName(); + for(int j=0;jgetName() << endl << seqs[i]->getAligned() << endl << endl; + if ((seqs[i]->getName() == "S000003470") || (seqs[i]->getName() == "S000383265") || (seqs[i]->getName() == "7000004128191054")) { + for(int j=0;jgetName(); + for(int j=0;j >& ms, vector Maligner::extractHighestPath(vector > ms) { + try { + + //get matrix dimensions + int numCols = query->getAligned().length(); + int numRows = ms.size(); + + + //find highest score scoring matrix + score_struct highestStruct; + int highestScore = 0; + + for (int i = 0; i < numRows; i++) { + for (int j = 0; j < numCols; j++) { + if (ms[i][j].score > highestScore) { + highestScore = ms[i][j].score; + highestStruct = ms[i][j]; + } + } + } + + vector path; + + int rowIndex = highestStruct.row; + int pos = highestStruct.col; + int score = highestStruct.score; + + while (pos >= 0 && score > 0) { + score_struct temp = ms[rowIndex][pos]; + score = temp.score; + + if (score > 0) { path.push_back(temp); } + + rowIndex = temp.prev; + pos--; + } + + reverse(path.begin(), path.end()); + + return path; + + } + catch(exception& e) { + m->errorOut(e, "Maligner", "extractHighestPath"); + exit(1); + } +} +//*************************************************************************************************************** +vector Maligner::mapTraceRegionsToAlignment(vector path, vector seqs) { + try { + vector trace; + + int region_index = path[0].row; + int region_start = path[0].col; + + for (int i = 1; i < path.size(); i++) { + + int next_region_index = path[i].row; + + if (next_region_index != region_index) { + + // add trace region + int col_index = path[i].col; + trace_struct temp; + temp.col = region_start; + temp.oldCol = col_index-1; + temp.row = region_index; + + trace.push_back(temp); + + region_index = path[i].row; + region_start = col_index; + } + } + + // get last one + trace_struct temp; + temp.col = region_start; + temp.oldCol = path[path.size()-1].col; + temp.row = region_index; + trace.push_back(temp); + + return trace; + + } + catch(exception& e) { + m->errorOut(e, "Maligner", "mapTraceRegionsToAlignment"); + exit(1); + } +} + +/*************************************************************************************************************** vector Maligner::extractHighestPath(vector > ms) { try { @@ -423,7 +527,7 @@ vector Maligner::extractHighestPath(vector > } } -// cout << endl << highestScore << '\t' << highestStruct.size() << '\t' << highestStruct[0].row << endl; + //cout << endl << highestScore << '\t' << highestStruct.size() << '\t' << highestStruct[0].row << endl; vector maxTrace; double maxPercentIdenticalQueryAntiChimera = 0; @@ -450,10 +554,10 @@ vector Maligner::extractHighestPath(vector > vector trace = mapTraceRegionsToAlignment(path, refSeqs); -// cout << "traces\n"; -// for(int j=0;jgetName() << endl; -// } + //cout << "traces\n"; + //for(int j=0;jgetName() << endl; + //} int traceStart = path[0].col; int traceEnd = path[path.size()-1].col; @@ -476,6 +580,7 @@ vector Maligner::extractHighestPath(vector > } // cout << maxPercentIdenticalQueryAntiChimera << endl; return maxTrace; + } catch(exception& e) { @@ -535,7 +640,7 @@ vector Maligner::mapTraceRegionsToAlignment(vector p exit(1); } } - +*/ //*************************************************************************************************************** string Maligner::constructChimericSeq(vector trace, vector seqs) { @@ -549,7 +654,9 @@ string Maligner::constructChimericSeq(vector trace, vectormothurOut("Error, alignment strings are of different lengths: "); m->mothurOutEndLine(); - m->mothurOut(toString(queryAlign.length())); m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOutEndLine(); + m->mothurOut(toString(queryAlign.length())); m->mothurOutEndLine(); m->mothurOut(toString(chimera.length())); m->mothurOutEndLine(); return -1.0; } - - int numBases = 0; +// cout << queryAlign.length() << endl; int numIdentical = 0; - + int countA = 0; + int countB = 0; for (int i = 0; i < queryAlign.length(); i++) { - if ((isalpha(queryAlign[i])) || (isalpha(chimera[i]))) { - numBases++; - if (queryAlign[i] == chimera[i]) { - numIdentical++; - } - } - } - - if (numBases == 0) { return 0; } - - float percentIdentical = (numIdentical/(float)numBases) * 100; + if (((queryAlign[i] != 'G') && (queryAlign[i] != 'T') && (queryAlign[i] != 'A') && (queryAlign[i] != 'C')&& (queryAlign[i] != '.') && (queryAlign[i] != '-')) || + ((chimera[i] != 'G') && (chimera[i] != 'T') && (chimera[i] != 'A') && (chimera[i] != 'C')&& (chimera[i] != '.') && (chimera[i] != '-'))) {} + else { - return percentIdentical; - - } - catch(exception& e) { - m->errorOut(e, "Maligner", "computePercentID"); - exit(1); - } -} -//*************************************************************************************************************** -vector Maligner::getBlastSeqs(Sequence* q, int num) { - try { - indexes.clear(); - vector refResults; + bool charA = false; bool charB = false; + if ((queryAlign[i] == 'G') || (queryAlign[i] == 'T') || (queryAlign[i] == 'A') || (queryAlign[i] == 'C')) { charA = true; } + if ((chimera[i] == 'G') || (chimera[i] == 'T') || (chimera[i] == 'A') || (chimera[i] == 'C')) { charB = true; } + - //get parts of query - string queryUnAligned = q->getUnaligned(); - string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence - string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence - - Sequence* queryLeft = new Sequence(q->getName()+"left", leftQuery); - Sequence* queryRight = new Sequence(q->getName()+"right", rightQuery); - - vector tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1); - vector leftScores = databaseLeft->getMegaBlastSearchScores(); - vector tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1); - vector rightScores = databaseLeft->getMegaBlastSearchScores(); - - //if ((tempIndexesRight.size() == 0) && (tempIndexesLeft.size() == 0)) { m->mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to process sequence " + q->getName()); m->mothurOutEndLine(); return refResults; } - - vector smaller; - vector smallerScores; - vector larger; - vector largerScores; - - if (tempIndexesRight.size() < tempIndexesLeft.size()) { smaller = tempIndexesRight; smallerScores = rightScores; larger = tempIndexesLeft; largerScores = leftScores; } - else { smaller = tempIndexesLeft; smallerScores = leftScores; larger = tempIndexesRight; largerScores = rightScores; } - - //for (int i = 0; i < smaller.size(); i++) { cout << "smaller = " << smaller[i] << '\t' << smallerScores[i] << endl; } - //cout << endl; - //for (int i = 0; i < larger.size(); i++) { cout << "larger = " << larger[i] << '\t' << largerScores[i] << endl; } - - //merge results - map seen; - map::iterator it; - float lastSmaller = smallerScores[0]; - float lastLarger = largerScores[0]; - int lasti = 0; - vector mergedResults; - for (int i = 0; i < smaller.size(); i++) { - //add left if you havent already - it = seen.find(smaller[i]); - if (it == seen.end()) { - mergedResults.push_back(smaller[i]); - seen[smaller[i]] = smaller[i]; - lastSmaller = smallerScores[i]; - } - - //add right if you havent already - it = seen.find(larger[i]); - if (it == seen.end()) { - mergedResults.push_back(larger[i]); - seen[larger[i]] = larger[i]; - lastLarger = largerScores[i]; - } - - lasti = i; - if (mergedResults.size() > num) { break; } - } - - //save lasti for smaller ties below - lasti++; - int iSmaller = lasti; - - if (!(mergedResults.size() > num)) { //if we still need more results. - for (int i = smaller.size(); i < larger.size(); i++) { - it = seen.find(larger[i]); - if (it == seen.end()) { - mergedResults.push_back(larger[i]); - seen[larger[i]] = larger[i]; - lastLarger = largerScores[i]; + if (charA || charB) { + + if (charA) { countA++; } + if (charB) { countB++; } + + if (queryAlign[i] == chimera[i]) { + numIdentical++; + } } - - lasti = i; - if (mergedResults.size() > num) { break; } +// cout << queryAlign[i] << '\t' << chimera[i] << '\t' << countA << '\t' << countB << endl; + } } +// cout << "pat\t" << countA << '\t' << countB << '\t' << numIdentical << endl; + - //add in any ties from smaller - while (iSmaller < smaller.size()) { - if (smallerScores[iSmaller] == lastSmaller) { - it = seen.find(smaller[iSmaller]); - - if (it == seen.end()) { - mergedResults.push_back(smaller[iSmaller]); - seen[smaller[iSmaller]] = smaller[iSmaller]; - } - } - else { break; } - iSmaller++; - } - - lasti++; - //add in any ties from larger - while (lasti < larger.size()) { - if (largerScores[lasti] == lastLarger) { //is it a tie - it = seen.find(larger[lasti]); - - if (it == seen.end()) { //we haven't already seen it - mergedResults.push_back(larger[lasti]); - seen[larger[lasti]] = larger[lasti]; - } - } - else { break; } - lasti++; - } + float numBases = (countA + countB) /(float) 2; - numWanted = seen.size(); + if (numBases == 0) { return 0; } + + float percentIdentical = (numIdentical/(float)numBases) * 100; - if (mergedResults.size() < numWanted) { numWanted = mergedResults.size(); } -//cout << q->getName() << " merged results size = " << mergedResults.size() << '\t' << "numwanted = " << numWanted << endl; - for (int i = 0; i < numWanted; i++) { -//cout << db[mergedResults[i]]->getName() << '\t' << mergedResults[i] << endl; - //if (db[mergedResults[i]]->getName() != q->getName()) { - Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned()); - refResults.push_back(temp); - indexes.push_back(mergedResults[i]); - //} -//cout << mergedResults[i] << endl; - } -//cout << "done " << q->getName() << endl; - delete queryRight; - delete queryLeft; - - return refResults; - } - catch(exception& e) { - m->errorOut(e, "Maligner", "getBlastSeqs"); - exit(1); - } -} -//*************************************************************************************************************** -vector Maligner::getKmerSeqs(Sequence* q, int num) { - try { - indexes.clear(); - - //get parts of query - string queryUnAligned = q->getUnaligned(); - string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence - string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence - - Sequence* queryLeft = new Sequence(q->getName(), leftQuery); - Sequence* queryRight = new Sequence(q->getName(), rightQuery); - - vector tempIndexesLeft = databaseLeft->findClosestSequences(queryLeft, numWanted); - vector tempIndexesRight = databaseRight->findClosestSequences(queryRight, numWanted); - - //merge results - map seen; - map::iterator it; - - vector mergedResults; - for (int i = 0; i < tempIndexesLeft.size(); i++) { - //add left if you havent already - it = seen.find(tempIndexesLeft[i]); - if (it == seen.end()) { - mergedResults.push_back(tempIndexesLeft[i]); - seen[tempIndexesLeft[i]] = tempIndexesLeft[i]; - } - - //add right if you havent already - it = seen.find(tempIndexesRight[i]); - if (it == seen.end()) { - mergedResults.push_back(tempIndexesRight[i]); - seen[tempIndexesRight[i]] = tempIndexesRight[i]; - } - } +// cout << percentIdentical << endl; -//cout << q->getName() << endl; - vector refResults; - for (int i = 0; i < numWanted; i++) { -//cout << db[mergedResults[i]]->getName() << endl; - Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned()); - refResults.push_back(temp); - indexes.push_back(mergedResults[i]); - } -//cout << endl; - delete queryRight; - delete queryLeft; + return percentIdentical; - return refResults; } catch(exception& e) { - m->errorOut(e, "Maligner", "getKmerSeqs"); + m->errorOut(e, "Maligner", "computePercentID"); exit(1); } } //*************************************************************************************************************** -