X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=maligner.cpp;h=c5cc83a4d216b9c5a89eb6f60de8a15e5906d06c;hb=0571957d68cbbc0e425af1db8e808f826010b9e2;hp=b393194bbc3a4c89fd2ae8ea3f7f7ba67f84b927;hpb=9ada98592a54c82d08f3d46c9b1d8c3e472a922d;p=mothur.git diff --git a/maligner.cpp b/maligner.cpp index b393194..c5cc83a 100644 --- a/maligner.cpp +++ b/maligner.cpp @@ -8,12 +8,23 @@ */ #include "maligner.h" -#include "database.hpp" +#include "kmerdb.hpp" #include "blastdb.hpp" /***********************************************************************/ -Maligner::Maligner(vector temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode) : - db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode) {} +Maligner::Maligner(vector temp, int num, int match, int misMatch, float div, int ms, int minCov, string mode, Database* dataLeft, Database* dataRight) : + db(temp), numWanted(num), matchScore(match), misMatchPenalty(misMatch), minDivR(div), minSimilarity(ms), minCoverage(minCov), searchMethod(mode), databaseLeft(dataLeft), databaseRight(dataRight) { + + + m = MothurOut::getInstance(); + +// cout << matchScore << '\t' << misMatchPenalty << endl; +// +// matchScore = 1; +// misMatchPenalty = -1; + + } + /***********************************************************************/ string Maligner::getResults(Sequence* q, DeCalculator* decalc) { try { @@ -25,15 +36,21 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) { string chimera; - if (searchMethod != "blast") { + if (searchMethod == "distance") { //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate - refSeqs = decalc->findClosest(query, db, numWanted); - }else{ - refSeqs = getBlastSeqs(query, numWanted); - } + refSeqs = decalc->findClosest(query, db, numWanted, indexes); + }else if (searchMethod == "blast") { + refSeqs = getBlastSeqs(query, numWanted); //fills indexes + }else if (searchMethod == "kmer") { + refSeqs = getKmerSeqs(query, numWanted); //fills indexes + }else { m->mothurOut("not valid search."); exit(1); } //should never get here + + if (m->control_pressed) { return chimera; } refSeqs = minCoverageFilter(refSeqs); + + if (refSeqs.size() < 2) { for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; } percentIdenticalQueryChimera = 0.0; @@ -41,19 +58,21 @@ string Maligner::getResults(Sequence* q, DeCalculator* decalc) { } int chimeraPenalty = computeChimeraPenalty(); - + //cout << "chimeraPenalty = " << chimeraPenalty << endl; //fills outputResults chimera = chimeraMaligner(chimeraPenalty, decalc); + if (m->control_pressed) { return chimera; } //free memory delete query; + for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; } return chimera; } catch(exception& e) { - errorOut(e, "Maligner", "getResults"); + m->errorOut(e, "Maligner", "getResults"); exit(1); } } @@ -66,31 +85,38 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) { //trims seqs to first non gap char in all seqs and last non gap char in all seqs spotMap = decalc->trimSeqs(query, refSeqs); + //you trimmed the whole sequence, skip + if (query->getAligned() == "") { return "no"; } + vector temp = refSeqs; temp.push_back(query); - + verticalFilter(temp); + + //for (int i = 0; i < refSeqs.size(); i++) { cout << refSeqs[i]->getName() << endl << refSeqs[i]->getAligned() << endl; } vector< vector > matrix = buildScoreMatrix(query->getAligned().length(), refSeqs.size()); //builds and initializes - fillScoreMatrix(matrix, refSeqs, chimeraPenalty); - - vector path = extractHighestPath(matrix); + if (m->control_pressed) { return chimera; } - vector trace = mapTraceRegionsToAlignment(path, refSeqs); + fillScoreMatrix(matrix, refSeqs, chimeraPenalty); + + vector trace = extractHighestPath(matrix); + + //cout << "traces\n"; + //for(int i=0;igetName() << endl; + //} if (trace.size() > 1) { chimera = "yes"; } - else { chimera = "no"; } - - int traceStart = path[0].col; - int traceEnd = path[path.size()-1].col; + else { chimera = "no"; return chimera; } + int traceStart = trace[0].col; + int traceEnd = trace[trace.size()-1].oldCol; string queryInRange = query->getAligned(); queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1)); - - string chimeraSeq = constructChimericSeq(trace, refSeqs); - percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq); + if (m->control_pressed) { return chimera; } //save output results for (int i = 0; i < trace.size(); i++) { @@ -101,6 +127,7 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) { results temp; temp.parent = refSeqs[seqIndex]->getName(); + temp.parentAligned = db[indexes[seqIndex]]->getAligned(); temp.nastRegionStart = spotMap[regionStart]; temp.nastRegionEnd = spotMap[regionEnd]; temp.regionStart = regionStart; @@ -111,7 +138,7 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) { temp.queryToParent = computePercentID(queryInRange, parentInRange); temp.divR = (percentIdenticalQueryChimera / temp.queryToParent); - + string queryInRegion = query->getAligned(); queryInRegion = queryInRegion.substr(regionStart, (regionEnd-regionStart+1)); @@ -122,11 +149,11 @@ string Maligner::chimeraMaligner(int chimeraPenalty, DeCalculator* decalc) { outputResults.push_back(temp); } - + return chimera; } catch(exception& e) { - errorOut(e, "Maligner", "chimeraMaligner"); + m->errorOut(e, "Maligner", "chimeraMaligner"); exit(1); } } @@ -162,13 +189,15 @@ vector Maligner::minCoverageFilter(vector ref){ //if coverage above minimum if (coverage > minCoverage) { newRefs.push_back(ref[i]); + }else { + delete ref[i]; } } return newRefs; } catch(exception& e) { - errorOut(e, "Maligner", "minCoverageFilter"); + m->errorOut(e, "Maligner", "minCoverageFilter"); exit(1); } } @@ -178,14 +207,16 @@ int Maligner::computeChimeraPenalty() { try { int numAllowable = ((1.0 - (1.0/minDivR)) * query->getNumBases()); - + +// if(numAllowable < 1){ numAllowable = 1; } + int penalty = int(numAllowable + 1) * misMatchPenalty; - + return penalty; } catch(exception& e) { - errorOut(e, "Maligner", "computeChimeraPenalty"); + m->errorOut(e, "Maligner", "computeChimeraPenalty"); exit(1); } } @@ -237,7 +268,7 @@ void Maligner::verticalFilter(vector seqs) { spotMap = newMap; } catch(exception& e) { - errorOut(e, "Maligner", "verticalFilter"); + m->errorOut(e, "Maligner", "verticalFilter"); exit(1); } } @@ -245,9 +276,9 @@ void Maligner::verticalFilter(vector seqs) { vector< vector > Maligner::buildScoreMatrix(int cols, int rows) { try{ - vector< vector > m; m.resize(rows); + vector< vector > m(rows); - for (int i = 0; i < m.size(); i++) { + for (int i = 0; i < rows; i++) { for (int j = 0; j < cols; j++) { //initialize each cell @@ -264,12 +295,14 @@ vector< vector > Maligner::buildScoreMatrix(int cols, int rows) { return m; } catch(exception& e) { - errorOut(e, "Maligner", "buildScoreMatrix"); + m->errorOut(e, "Maligner", "buildScoreMatrix"); exit(1); } } + //*************************************************************************************************************** -void Maligner::fillScoreMatrix(vector >& m, vector seqs, int penalty) { + +void Maligner::fillScoreMatrix(vector >& ms, vector seqs, int penalty) { try{ //get matrix dimensions @@ -283,11 +316,14 @@ void Maligner::fillScoreMatrix(vector >& m, vector >& m, vector m[i][j].score) { - m[i][j].score = sumScore; - m[i][j].prev = prevIndex; + if (sumScore > ms[i][j].score) { + ms[i][j].score = sumScore; + ms[i][j].prev = prevIndex; } } } } + /* for(int i=0;igetName(); + for(int j=0;jgetName() << endl << seqs[i]->getAligned() << endl << endl; + if ((seqs[i]->getName() == "S000003470") || (seqs[i]->getName() == "S000383265") || (seqs[i]->getName() == "7000004128191054")) { + for(int j=0;jgetName(); + for(int j=0;jerrorOut(e, "Maligner", "fillScoreMatrix"); exit(1); } } + //*************************************************************************************************************** -vector Maligner::extractHighestPath(vector > m) { + +vector Maligner::extractHighestPath(vector > ms) { try { + //get matrix dimensions int numCols = query->getAligned().length(); - int numRows = m.size(); + int numRows = ms.size(); //find highest score scoring matrix - score_struct highestStruct; + vector highestStruct; int highestScore = 0; for (int i = 0; i < numRows; i++) { for (int j = 0; j < numCols; j++) { - if (m[i][j].score > highestScore) { - highestScore = m[i][j].score; - highestStruct = m[i][j]; + if (ms[i][j].score > highestScore) { + highestScore = ms[i][j].score; + highestStruct.resize(0); + highestStruct.push_back(ms[i][j]); + } + else if(ms[i][j].score == highestScore){ + highestStruct.push_back(ms[i][j]); } } } - - vector path; + + //cout << endl << highestScore << '\t' << highestStruct.size() << '\t' << highestStruct[0].row << endl; - int rowIndex = highestStruct.row; - int pos = highestStruct.col; - int score = highestStruct.score; + vector maxTrace; + double maxPercentIdenticalQueryAntiChimera = 0; - while (pos >= 0 && score > 0) { - score_struct temp = m[rowIndex][pos]; - score = temp.score; + for(int i=0;i path; + + int rowIndex = highestStruct[i].row; + int pos = highestStruct[i].col; + int score = highestStruct[i].score; + + while (pos >= 0 && score > 0) { + score_struct temp = ms[rowIndex][pos]; + score = temp.score; + + if (score > 0) { path.push_back(temp); } + + rowIndex = temp.prev; + pos--; + } + + reverse(path.begin(), path.end()); + + vector trace = mapTraceRegionsToAlignment(path, refSeqs); + + //cout << "traces\n"; + //for(int j=0;jgetName() << endl; + //} + + int traceStart = path[0].col; + int traceEnd = path[path.size()-1].col; +// cout << "traceStart/End\t" << traceStart << '\t' << traceEnd << endl; - if (score > 0) { path.push_back(temp); } + string queryInRange = query->getAligned(); + queryInRange = queryInRange.substr(traceStart, (traceEnd-traceStart+1)); +// cout << "here" << endl; + string chimeraSeq = constructChimericSeq(trace, refSeqs); + string antiChimeraSeq = constructAntiChimericSeq(trace, refSeqs); + + percentIdenticalQueryChimera = computePercentID(queryInRange, chimeraSeq); + double percentIdenticalQueryAntiChimera = computePercentID(queryInRange, antiChimeraSeq); +// cout << i << '\t' << percentIdenticalQueryChimera << '\t' << percentIdenticalQueryAntiChimera << endl; - rowIndex = temp.prev; - pos--; + if(percentIdenticalQueryAntiChimera > maxPercentIdenticalQueryAntiChimera){ + maxPercentIdenticalQueryAntiChimera = percentIdenticalQueryAntiChimera; + maxTrace = trace; + } } - - reverse(path.begin(), path.end()); +// cout << maxPercentIdenticalQueryAntiChimera << endl; + return maxTrace; - return path; } catch(exception& e) { - errorOut(e, "Maligner", "extractHighestPath"); + m->errorOut(e, "Maligner", "extractHighestPath"); exit(1); } } + //*************************************************************************************************************** + vector Maligner::mapTraceRegionsToAlignment(vector path, vector seqs) { try { vector trace; @@ -416,20 +527,31 @@ vector Maligner::mapTraceRegionsToAlignment(vector p temp.row = region_index; trace.push_back(temp); +// cout << endl; +// cout << trace.size() << endl; +// for(int i=0;igetName() << endl; +// } +// cout << endl; + return trace; } catch(exception& e) { - errorOut(e, "Maligner", "mapTraceRegionsToAlignment"); + m->errorOut(e, "Maligner", "mapTraceRegionsToAlignment"); exit(1); } } + //*************************************************************************************************************** + string Maligner::constructChimericSeq(vector trace, vector seqs) { try { string chimera = ""; for (int i = 0; i < trace.size(); i++) { +// cout << i << '\t' << trace[i].row << '\t' << trace[i].col << '\t' << trace[i].oldCol << endl; + string seqAlign = seqs[trace[i].row]->getAligned(); seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1)); chimera += seqAlign; @@ -438,18 +560,43 @@ string Maligner::constructChimericSeq(vector trace, vectorerrorOut(e, "Maligner", "constructChimericSeq"); + exit(1); + } +} + +//*************************************************************************************************************** + +string Maligner::constructAntiChimericSeq(vector trace, vector seqs) { + try { + string antiChimera = ""; + + for (int i = 0; i < trace.size(); i++) { +// cout << i << '\t' << (trace.size() - i - 1) << '\t' << trace[i].row << '\t' << trace[i].col << '\t' << trace[i].oldCol << endl; + + int oppositeIndex = trace.size() - i - 1; + + string seqAlign = seqs[trace[oppositeIndex].row]->getAligned(); + seqAlign = seqAlign.substr(trace[i].col, (trace[i].oldCol-trace[i].col+1)); + antiChimera += seqAlign; + } + + return antiChimera; + } + catch(exception& e) { + m->errorOut(e, "Maligner", "constructChimericSeq"); exit(1); } } + //*************************************************************************************************************** float Maligner::computePercentID(string queryAlign, string chimera) { try { if (queryAlign.length() != chimera.length()) { - mothurOut("Error, alignment strings are of different lengths: "); mothurOutEndLine(); - mothurOut(toString(queryAlign.length())); mothurOutEndLine(); mothurOutEndLine(); mothurOutEndLine(); mothurOutEndLine(); - mothurOut(toString(chimera.length())); mothurOutEndLine(); + m->mothurOut("Error, alignment strings are of different lengths: "); m->mothurOutEndLine(); + m->mothurOut(toString(queryAlign.length())); m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOutEndLine(); m->mothurOutEndLine(); + m->mothurOut(toString(chimera.length())); m->mothurOutEndLine(); return -1.0; } @@ -474,18 +621,160 @@ float Maligner::computePercentID(string queryAlign, string chimera) { } catch(exception& e) { - errorOut(e, "Maligner", "computePercentID"); + m->errorOut(e, "Maligner", "computePercentID"); exit(1); } } //*************************************************************************************************************** vector Maligner::getBlastSeqs(Sequence* q, int num) { try { - //generate blastdb - Database* database = new BlastDB(-2.0, -1.0, matchScore, misMatchPenalty); - for (int i = 0; i < db.size(); i++) { database->addSequence(*db[i]); } - database->generateDB(); - database->setNumSeqs(db.size()); + indexes.clear(); + vector refResults; + + //get parts of query + string queryUnAligned = q->getUnaligned(); + string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence + string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence + + Sequence* queryLeft = new Sequence(q->getName()+"left", leftQuery); + Sequence* queryRight = new Sequence(q->getName()+"right", rightQuery); + + vector tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1); + vector leftScores = databaseLeft->getSearchScores(); + vector tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1); + vector rightScores = databaseLeft->getSearchScores(); + + //if ((tempIndexesRight.size() == 0) && (tempIndexesLeft.size() == 0)) { m->mothurOut("megablast returned " + toString(tempIndexesRight.size()) + " results for the right end, and " + toString(tempIndexesLeft.size()) + " for the left end. Needed " + toString(num+1) + ". Unable to process sequence " + q->getName()); m->mothurOutEndLine(); return refResults; } + + vector smaller; + vector smallerScores; + vector larger; + vector largerScores; + + if (tempIndexesRight.size() < tempIndexesLeft.size()) { smaller = tempIndexesRight; smallerScores = rightScores; larger = tempIndexesLeft; largerScores = leftScores; } + else { smaller = tempIndexesLeft; smallerScores = leftScores; larger = tempIndexesRight; largerScores = rightScores; } + + //for (int i = 0; i < smaller.size(); i++) { cout << "smaller = " << smaller[i] << '\t' << smallerScores[i] << endl; } + //cout << endl; + //for (int i = 0; i < larger.size(); i++) { cout << "larger = " << larger[i] << '\t' << largerScores[i] << endl; } + + //merge results + map seen; + map::iterator it; + float lastSmaller = smallerScores[0]; + float lastLarger = largerScores[0]; + int lasti = 0; + vector mergedResults; + for (int i = 0; i < smaller.size(); i++) { + //add left if you havent already + it = seen.find(smaller[i]); + if (it == seen.end()) { + mergedResults.push_back(smaller[i]); + seen[smaller[i]] = smaller[i]; + lastSmaller = smallerScores[i]; + } + + //add right if you havent already + it = seen.find(larger[i]); + if (it == seen.end()) { + mergedResults.push_back(larger[i]); + seen[larger[i]] = larger[i]; + lastLarger = largerScores[i]; + } + + lasti = i; + //if (mergedResults.size() > num) { break; } + } + + //save lasti for smaller ties below + /*lasti++; + int iSmaller = lasti; + + if (!(mergedResults.size() > num)) { //if we still need more results. + for (int i = smaller.size(); i < larger.size(); i++) { + it = seen.find(larger[i]); + if (it == seen.end()) { + mergedResults.push_back(larger[i]); + seen[larger[i]] = larger[i]; + lastLarger = largerScores[i]; + } + + lasti = i; + if (mergedResults.size() > num) { break; } + } + } + + + //add in any ties from smaller + while (iSmaller < smaller.size()) { + if (smallerScores[iSmaller] == lastSmaller) { + it = seen.find(smaller[iSmaller]); + + if (it == seen.end()) { + mergedResults.push_back(smaller[iSmaller]); + seen[smaller[iSmaller]] = smaller[iSmaller]; + } + } + else { break; } + iSmaller++; + } + + lasti++; + //add in any ties from larger + while (lasti < larger.size()) { + if (largerScores[lasti] == lastLarger) { //is it a tie + it = seen.find(larger[lasti]); + + if (it == seen.end()) { //we haven't already seen it + mergedResults.push_back(larger[lasti]); + seen[larger[lasti]] = larger[lasti]; + } + } + else { break; } + lasti++; + } + */ + + for (int i = smaller.size(); i < larger.size(); i++) { + //add right if you havent already + it = seen.find(larger[i]); + if (it == seen.end()) { + mergedResults.push_back(larger[i]); + seen[larger[i]] = larger[i]; + lastLarger = largerScores[i]; + } + } + numWanted = mergedResults.size(); + + if (mergedResults.size() < numWanted) { numWanted = mergedResults.size(); } +//cout << q->getName() << " merged results size = " << mergedResults.size() << '\t' << "numwanted = " << numWanted << endl; + for (int i = 0; i < numWanted; i++) { +//cout << db[mergedResults[i]]->getName() << '\t' << mergedResults[i] << endl; + + if (db[mergedResults[i]]->getName() != q->getName()) { + Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned()); + refResults.push_back(temp); + indexes.push_back(mergedResults[i]); + //cout << db[mergedResults[i]]->getName() << endl; + } + +//cout << mergedResults[i] << endl; + } +//cout << "done " << q->getName() << endl; + delete queryRight; + delete queryLeft; + + return refResults; + } + catch(exception& e) { + m->errorOut(e, "Maligner", "getBlastSeqs"); + exit(1); + } +} +//*************************************************************************************************************** +vector Maligner::getKmerSeqs(Sequence* q, int num) { + try { + indexes.clear(); //get parts of query string queryUnAligned = q->getUnaligned(); @@ -495,13 +784,17 @@ vector Maligner::getBlastSeqs(Sequence* q, int num) { Sequence* queryLeft = new Sequence(q->getName(), leftQuery); Sequence* queryRight = new Sequence(q->getName(), rightQuery); - vector tempIndexesRight = database->findClosestMegaBlast(queryRight, num+1); - vector tempIndexesLeft = database->findClosestMegaBlast(queryLeft, num+1); + vector tempIndexesLeft = databaseLeft->findClosestSequences(queryLeft, numWanted); + vector tempIndexesRight = databaseRight->findClosestSequences(queryRight, numWanted); + vector scoresLeft = databaseLeft->getSearchScores(); + vector scoresRight = databaseRight->getSearchScores(); //merge results map seen; map::iterator it; - + float lastRight = scoresRight[0]; + float lastLeft = scoresLeft[0]; + //int lasti = 0; vector mergedResults; for (int i = 0; i < tempIndexesLeft.size(); i++) { //add left if you havent already @@ -509,6 +802,7 @@ vector Maligner::getBlastSeqs(Sequence* q, int num) { if (it == seen.end()) { mergedResults.push_back(tempIndexesLeft[i]); seen[tempIndexesLeft[i]] = tempIndexesLeft[i]; + lastLeft = scoresLeft[i]; } //add right if you havent already @@ -516,27 +810,72 @@ vector Maligner::getBlastSeqs(Sequence* q, int num) { if (it == seen.end()) { mergedResults.push_back(tempIndexesRight[i]); seen[tempIndexesRight[i]] = tempIndexesRight[i]; + lastRight = scoresRight[i]; + } + + //if (mergedResults.size() > numWanted) { lasti = i; break; } //you have enough results + } + + //add in sequences with same distance as last sequence added + /*lasti++; + int i = lasti; + while (i < tempIndexesLeft.size()) { + if (scoresLeft[i] == lastLeft) { + it = seen.find(tempIndexesLeft[i]); + + if (it == seen.end()) { + mergedResults.push_back(tempIndexesLeft[i]); + seen[tempIndexesLeft[i]] = tempIndexesLeft[i]; + } } + else { break; } + i++; + } + + // cout << "lastRight\t" << lastRight << endl; + //add in sequences with same distance as last sequence added + i = lasti; + while (i < tempIndexesRight.size()) { + if (scoresRight[i] == lastRight) { + it = seen.find(tempIndexesRight[i]); + + if (it == seen.end()) { + mergedResults.push_back(tempIndexesRight[i]); + seen[tempIndexesRight[i]] = tempIndexesRight[i]; + } + } + else { break; } + i++; + }*/ + + numWanted = mergedResults.size(); + + if (numWanted > mergedResults.size()) { + //m->mothurOut("numwanted is larger than the number of template sequences, adjusting numwanted."); m->mothurOutEndLine(); + numWanted = mergedResults.size(); } +//cout << q->getName() << endl; vector refResults; for (int i = 0; i < numWanted; i++) { - Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned()); - refResults.push_back(temp); +//cout << db[mergedResults[i]]->getName() << endl; + if (db[mergedResults[i]]->getName() != q->getName()) { + Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned()); + refResults.push_back(temp); + indexes.push_back(mergedResults[i]); + } } - +//cout << endl; delete queryRight; delete queryLeft; - delete database; return refResults; } catch(exception& e) { - errorOut(e, "Maligner", "getBlastSeqs"); + m->errorOut(e, "Maligner", "getKmerSeqs"); exit(1); } } - //***************************************************************************************************************