X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=chimeraslayer.cpp;h=826f38ba962681e883809c5dcb2711939d205c99;hb=2405cc589aaaf0c44809a48fe98d3b96863dac0b;hp=78c9f9e9e24d0138b158791d7d3bce2f39eca026;hpb=3b137af694d322b7162e97275c070c41b42597a3;p=mothur.git diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 78c9f9e..826f38b 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -13,7 +13,7 @@ #include "blastdb.hpp" //*************************************************************************************************************** -ChimeraSlayer::ChimeraSlayer(string file, string temp, string mode, int k, int ms, int mms, int win, float div, +ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string mode, int k, int ms, int mms, int win, float div, int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() { try { fastafile = file; @@ -33,6 +33,7 @@ int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int num increment = inc; numWanted = numw; realign = r; + trimChimera = trim; decalc = new DeCalculator(); @@ -44,7 +45,7 @@ int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int num } } //*************************************************************************************************************** -ChimeraSlayer::ChimeraSlayer(string file, string temp, string name, string mode, string abunds, int k, int ms, int mms, int win, float div, +ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, map& prior, string mode, int k, int ms, int mms, int win, float div, int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() { try { fastafile = file; templateSeqs = readSeqs(fastafile); @@ -64,96 +65,56 @@ ChimeraSlayer::ChimeraSlayer(string file, string temp, string name, string mode, increment = inc; numWanted = numw; realign = r; - includeAbunds = abunds; - - //read name file and create nameMapRank - readNameFile(name); + trimChimera = trim; + priority = prior; decalc = new DeCalculator(); createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap - //run filter on template - for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); } - - } - catch(exception& e) { - m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer"); - exit(1); - } -} -//*************************************************************************************************************** -int ChimeraSlayer::readNameFile(string name) { - try { - ifstream in; - m->openInputFile(name, in); - - int maxRank = 0; - int minRank = 10000000; - - while(!in.eof()){ - - if (m->control_pressed) { in.close(); return 0; } - - string thisname, repnames; + if (searchMethod == "distance") { + createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap - in >> thisname; m->gobble(in); //read from first column - in >> repnames; //read from second column - - map >::iterator it = nameMapRank.find(thisname); - if (it == nameMapRank.end()) { - - vector splitRepNames; - m->splitAtComma(repnames, splitRepNames); + //run filter on template copying templateSeqs into filteredTemplateSeqs + for (int i = 0; i < templateSeqs.size(); i++) { + if (m->control_pressed) { break; } - nameMapRank[thisname] = splitRepNames; - - if (splitRepNames.size() > maxRank) { maxRank = splitRepNames.size(); } - if (splitRepNames.size() < minRank) { minRank = splitRepNames.size(); } - - }else{ m->mothurOut(thisname + " is already in namesfile. I will use first definition."); m->mothurOutEndLine(); } - - m->gobble(in); - } - in.close(); - - //sanity check to make sure files match - for (int i = 0; i < templateSeqs.size(); i++) { - map >::iterator it = nameMapRank.find(templateSeqs[i]->getName()); - - if (it == nameMapRank.end()) { m->mothurOut("[ERROR]: " + templateSeqs[i]->getName() + " is not in namesfile, but is in fastafile. Every name in fasta file must be in first column of names file."); m->mothurOutEndLine(); m->control_pressed = true; } + Sequence* newSeq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned()); + runFilter(newSeq); + filteredTemplateSeqs.push_back(newSeq); + } } - - if (maxRank == minRank) { m->mothurOut("[ERROR]: all sequences in namesfile have the same abundance, aborting."); m->mothurOutEndLine(); m->control_pressed = true; } - - return 0; - } catch(exception& e) { - m->errorOut(e, "ChimeraSlayer", "readNameFile"); + m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer"); exit(1); } } - //*************************************************************************************************************** int ChimeraSlayer::doPrep() { try { + if (searchMethod == "distance") { + //read in all query seqs + vector tempQuerySeqs = readSeqs(fastafile); - //read in all query seqs - vector tempQuerySeqs = readSeqs(fastafile); - - vector temp = templateSeqs; - for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); } + vector temp = templateSeqs; + for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); } - createFilter(temp, 0.0); //just removed columns where all seqs have a gap + createFilter(temp, 0.0); //just removed columns where all seqs have a gap - for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } + for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } - if (m->control_pressed) { return 0; } - - //run filter on template - for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { return 0; } runFilter(templateSeqs[i]); } + if (m->control_pressed) { return 0; } + //run filter on template copying templateSeqs into filteredTemplateSeqs + for (int i = 0; i < templateSeqs.size(); i++) { + if (m->control_pressed) { return 0; } + + Sequence* newSeq = new Sequence(templateSeqs[i]->getName(), templateSeqs[i]->getAligned()); + runFilter(newSeq); + filteredTemplateSeqs.push_back(newSeq); + } + } string kmerDBNameLeft; string kmerDBNameRight; @@ -256,7 +217,8 @@ int ChimeraSlayer::doPrep() { }else if (searchMethod == "blast") { //generate blastdb - databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch); + databaseLeft = new BlastDB(m->getRootName(m->getSimpleName(fastafile)), -1.0, -1.0, 1, -3); + for (int i = 0; i < templateSeqs.size(); i++) { databaseLeft->addSequence(*templateSeqs[i]); } databaseLeft->generateDB(); databaseLeft->setNumSeqs(templateSeqs.size()); @@ -271,46 +233,26 @@ int ChimeraSlayer::doPrep() { } } //*************************************************************************************************************** -vector ChimeraSlayer::getTemplate(Sequence* q) { +vector ChimeraSlayer::getTemplate(Sequence* q, vector& userTemplateFiltered) { try { - vector thisTemplate; + //when template=self, the query file is sorted from most abundance to least abundant + //userTemplate grows as the query file is processed by adding sequences that are not chimeric and more abundant + vector userTemplate; - int thisRank; - string thisName = q->getName(); - map >::iterator itRank = nameMapRank.find(thisName); // you will find it because we already sanity checked - thisRank = (itRank->second).size(); - - //create list of names we want to put into the template - set namesToAdd; - for (itRank = nameMapRank.begin(); itRank != nameMapRank.end(); itRank++) { - if (itRank->first != thisName) { - if (includeAbunds == "greaterequal") { - if ((itRank->second).size() >= thisRank) { - //you are more abundant than me or equal to my abundance - for (int i = 0; i < (itRank->second).size(); i++) { - namesToAdd.insert((itRank->second)[i]); - } - } - }else if (includeAbunds == "greater") { - if ((itRank->second).size() > thisRank) { - //you are more abundant than me - for (int i = 0; i < (itRank->second).size(); i++) { - namesToAdd.insert((itRank->second)[i]); - } - } - }else if (includeAbunds == "all") { - //add everyone - for (int i = 0; i < (itRank->second).size(); i++) { - namesToAdd.insert((itRank->second)[i]); - } - } - } - } + int myAbund = priority[q->getName()]; - for (int i = 0; i < templateSeqs.size(); i++) { - if (namesToAdd.count(templateSeqs[i]->getName()) != 0) { - thisTemplate.push_back(templateSeqs[i]); + for (int i = 0; i < templateSeqs.size(); i++) { + + if (m->control_pressed) { return userTemplate; } + + //have I reached a sequence with the same abundance as myself? + if (!(priority[templateSeqs[i]->getName()] > myAbund)) { break; } + + //if its am not chimeric add it + if (chimericSeqs.count(templateSeqs[i]->getName()) == 0) { + userTemplate.push_back(templateSeqs[i]); + if (searchMethod == "distance") { userTemplateFiltered.push_back(filteredTemplateSeqs[i]); } } } @@ -326,69 +268,70 @@ vector ChimeraSlayer::getTemplate(Sequence* q) { string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName)); databaseLeft = new KmerDB(leftTemplateFileName, kmerSize); #ifdef USE_MPI - for (int i = 0; i < thisTemplate.size(); i++) { + for (int i = 0; i < userTemplate.size(); i++) { - if (m->control_pressed) { return thisTemplate; } + if (m->control_pressed) { return userTemplate; } - string leftFrag = thisTemplate[i]->getUnaligned(); + string leftFrag = userTemplate[i]->getUnaligned(); leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); - Sequence leftTemp(thisTemplate[i]->getName(), leftFrag); + Sequence leftTemp(userTemplate[i]->getName(), leftFrag); databaseLeft->addSequence(leftTemp); } databaseLeft->generateDB(); - databaseLeft->setNumSeqs(thisTemplate.size()); + databaseLeft->setNumSeqs(userTemplate.size()); - for (int i = 0; i < thisTemplate.size(); i++) { - if (m->control_pressed) { return thisTemplate; } + for (int i = 0; i < userTemplate.size(); i++) { + if (m->control_pressed) { return userTemplate; } - string rightFrag = thisTemplate[i]->getUnaligned(); + string rightFrag = userTemplate[i]->getUnaligned(); rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); - Sequence rightTemp(thisTemplate[i]->getName(), rightFrag); + Sequence rightTemp(userTemplate[i]->getName(), rightFrag); databaseRight->addSequence(rightTemp); } databaseRight->generateDB(); - databaseRight->setNumSeqs(thisTemplate.size()); + databaseRight->setNumSeqs(userTemplate.size()); #else - for (int i = 0; i < thisTemplate.size(); i++) { + for (int i = 0; i < userTemplate.size(); i++) { - if (m->control_pressed) { return thisTemplate; } + if (m->control_pressed) { return userTemplate; } - string leftFrag = thisTemplate[i]->getUnaligned(); + string leftFrag = userTemplate[i]->getUnaligned(); leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); - Sequence leftTemp(thisTemplate[i]->getName(), leftFrag); + Sequence leftTemp(userTemplate[i]->getName(), leftFrag); databaseLeft->addSequence(leftTemp); } databaseLeft->generateDB(); - databaseLeft->setNumSeqs(thisTemplate.size()); + databaseLeft->setNumSeqs(userTemplate.size()); - for (int i = 0; i < thisTemplate.size(); i++) { - if (m->control_pressed) { return thisTemplate; } + for (int i = 0; i < userTemplate.size(); i++) { + if (m->control_pressed) { return userTemplate; } - string rightFrag = thisTemplate[i]->getUnaligned(); + string rightFrag = userTemplate[i]->getUnaligned(); rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); - Sequence rightTemp(thisTemplate[i]->getName(), rightFrag); + Sequence rightTemp(userTemplate[i]->getName(), rightFrag); databaseRight->addSequence(rightTemp); } databaseRight->generateDB(); - databaseRight->setNumSeqs(thisTemplate.size()); + databaseRight->setNumSeqs(userTemplate.size()); #endif }else if (searchMethod == "blast") { //generate blastdb - databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch); - for (int i = 0; i < thisTemplate.size(); i++) { if (m->control_pressed) { return thisTemplate; } databaseLeft->addSequence(*thisTemplate[i]); } + databaseLeft = new BlastDB(m->getRootName(m->getSimpleName(templateFileName)), -1.0, -1.0, 1, -3); + + for (int i = 0; i < userTemplate.size(); i++) { if (m->control_pressed) { return userTemplate; } databaseLeft->addSequence(*userTemplate[i]); } databaseLeft->generateDB(); - databaseLeft->setNumSeqs(thisTemplate.size()); + databaseLeft->setNumSeqs(userTemplate.size()); } - return thisTemplate; + return userTemplate; } catch(exception& e) { @@ -414,8 +357,11 @@ void ChimeraSlayer::printHeader(ostream& out) { out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n"; } //*************************************************************************************************************** -int ChimeraSlayer::print(ostream& out, ostream& outAcc) { +Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) { try { + Sequence* trim = NULL; + if (trimChimera) { trim = new Sequence(trimQuery.getName(), trimQuery.getAligned()); } + if (chimeraFlags == "yes") { string chimeraFlag = "no"; if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR) @@ -427,14 +373,135 @@ int ChimeraSlayer::print(ostream& out, ostream& outAcc) { if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) { m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine(); outAcc << querySeq->getName() << endl; + + if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); } + + if (trimChimera) { + int lengthLeft = chimeraResults[0].winLEnd - chimeraResults[0].winLStart; + int lengthRight = chimeraResults[0].winREnd - chimeraResults[0].winRStart; + + string newAligned = trim->getAligned(); + + if (lengthLeft > lengthRight) { //trim right + for (int i = (chimeraResults[0].winRStart-1); i < newAligned.length(); i++) { newAligned[i] = '.'; } + }else { //trim left + for (int i = 0; i < chimeraResults[0].winLEnd; i++) { newAligned[i] = '.'; } + } + trim->setAligned(newAligned); + } } } printBlock(chimeraResults[0], chimeraFlag, out); out << endl; - }else { out << querySeq->getName() << "\tno" << endl; } + }else { + out << querySeq->getName() << "\tno" << endl; + } - return 0; + return trim; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "print"); + exit(1); + } +} +//*************************************************************************************************************** +Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc, data_results leftPiece, data_results rightPiece) { + try { + Sequence* trim = NULL; + + if (trimChimera) { + string aligned = leftPiece.trimQuery.getAligned() + rightPiece.trimQuery.getAligned(); + trim = new Sequence(leftPiece.trimQuery.getName(), aligned); + } + + if ((leftPiece.flag == "yes") || (rightPiece.flag == "yes")) { + + string chimeraFlag = "no"; + if (leftPiece.flag == "yes") { + + if( (leftPiece.results[0].bsa >= minBS && leftPiece.results[0].divr_qla_qrb >= divR) + || + (leftPiece.results[0].bsb >= minBS && leftPiece.results[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; } + } + + if (rightPiece.flag == "yes") { + if ( (rightPiece.results[0].bsa >= minBS && rightPiece.results[0].divr_qla_qrb >= divR) + || + (rightPiece.results[0].bsb >= minBS && rightPiece.results[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; } + } + + bool rightChimeric = false; + bool leftChimeric = false; + + if (chimeraFlag == "yes") { + //which peice is chimeric or are both + if (rightPiece.flag == "yes") { if ((rightPiece.results[0].bsa >= minBS) || (rightPiece.results[0].bsb >= minBS)) { rightChimeric = true; } } + if (leftPiece.flag == "yes") { if ((leftPiece.results[0].bsa >= minBS) || (leftPiece.results[0].bsb >= minBS)) { leftChimeric = true; } } + + if (rightChimeric || leftChimeric) { + m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine(); + outAcc << querySeq->getName() << endl; + + if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); } + + if (trimChimera) { + string newAligned = trim->getAligned(); + + //right side is fine so keep that + if ((leftChimeric) && (!rightChimeric)) { + for (int i = 0; i < leftPiece.results[0].winREnd; i++) { newAligned[i] = '.'; } + }else if ((!leftChimeric) && (rightChimeric)) { //leftside is fine so keep that + for (int i = (rightPiece.results[0].winLStart-1); i < newAligned.length(); i++) { newAligned[i] = '.'; } + }else { //both sides are chimeric, keep longest piece + + int lengthLeftLeft = leftPiece.results[0].winLEnd - leftPiece.results[0].winLStart; + int lengthLeftRight = leftPiece.results[0].winREnd - leftPiece.results[0].winRStart; + + int longest = 1; // leftleft = 1, leftright = 2, rightleft = 3 rightright = 4 + int length = lengthLeftLeft; + if (lengthLeftLeft < lengthLeftRight) { longest = 2; length = lengthLeftRight; } + + int lengthRightLeft = rightPiece.results[0].winLEnd - rightPiece.results[0].winLStart; + int lengthRightRight = rightPiece.results[0].winREnd - rightPiece.results[0].winRStart; + + if (lengthRightLeft > length) { longest = 3; length = lengthRightLeft; } + if (lengthRightRight > length) { longest = 4; } + + if (longest == 1) { //leftleft + for (int i = (leftPiece.results[0].winRStart-1); i < newAligned.length(); i++) { newAligned[i] = '.'; } + }else if (longest == 2) { //leftright + //get rid of leftleft + for (int i = (leftPiece.results[0].winLStart-1); i < (leftPiece.results[0].winLEnd-1); i++) { newAligned[i] = '.'; } + //get rid of right + for (int i = (rightPiece.results[0].winLStart-1); i < newAligned.length(); i++) { newAligned[i] = '.'; } + }else if (longest == 3) { //rightleft + //get rid of left + for (int i = 0; i < leftPiece.results[0].winREnd; i++) { newAligned[i] = '.'; } + //get rid of rightright + for (int i = (rightPiece.results[0].winRStart-1); i < newAligned.length(); i++) { newAligned[i] = '.'; } + }else { //rightright + //get rid of left + for (int i = 0; i < leftPiece.results[0].winREnd; i++) { newAligned[i] = '.'; } + //get rid of rightleft + for (int i = (rightPiece.results[0].winLStart-1); i < (rightPiece.results[0].winLEnd-1); i++) { newAligned[i] = '.'; } + } + } + + trim->setAligned(newAligned); + } + + } + } + + printBlock(leftPiece, rightPiece, leftChimeric, rightChimeric, chimeraFlag, out); + out << endl; + }else { + out << querySeq->getName() << "\tno" << endl; + } + + return trim; } catch(exception& e) { @@ -442,15 +509,154 @@ int ChimeraSlayer::print(ostream& out, ostream& outAcc) { exit(1); } } + #ifdef USE_MPI //*************************************************************************************************************** -int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) { +Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc, data_results leftPiece, data_results rightPiece) { + try { + MPI_Status status; + bool results = false; + string outAccString = ""; + string outputString = ""; + + Sequence* trim = NULL; + + if (trimChimera) { + string aligned = leftPiece.trimQuery.getAligned() + rightPiece.trimQuery.getAligned(); + trim = new Sequence(leftPiece.trimQuery.getName(), aligned); + } + + + if ((leftPiece.flag == "yes") || (rightPiece.flag == "yes")) { + + string chimeraFlag = "no"; + if (leftPiece.flag == "yes") { + + if( (leftPiece.results[0].bsa >= minBS && leftPiece.results[0].divr_qla_qrb >= divR) + || + (leftPiece.results[0].bsb >= minBS && leftPiece.results[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; } + } + + if (rightPiece.flag == "yes") { + if ( (rightPiece.results[0].bsa >= minBS && rightPiece.results[0].divr_qla_qrb >= divR) + || + (rightPiece.results[0].bsb >= minBS && rightPiece.results[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; } + } + + bool rightChimeric = false; + bool leftChimeric = false; + + if (chimeraFlag == "yes") { + //which peice is chimeric or are both + if (rightPiece.flag == "yes") { if ((rightPiece.results[0].bsa >= minBS) || (rightPiece.results[0].bsb >= minBS)) { rightChimeric = true; } } + if (leftPiece.flag == "yes") { if ((leftPiece.results[0].bsa >= minBS) || (leftPiece.results[0].bsb >= minBS)) { leftChimeric = true; } } + + if (rightChimeric || leftChimeric) { +// cout << querySeq->getName() << "\tyes" << endl; + outAccString += querySeq->getName() + "\n"; + results = true; + + if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); } + + //write to accnos file + int length = outAccString.length(); + char* buf2 = new char[length]; + memcpy(buf2, outAccString.c_str(), length); + + MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status); + delete buf2; + + if (trimChimera) { + string newAligned = trim->getAligned(); + + //right side is fine so keep that + if ((leftChimeric) && (!rightChimeric)) { + for (int i = 0; i < leftPiece.results[0].winREnd; i++) { newAligned[i] = '.'; } + }else if ((!leftChimeric) && (rightChimeric)) { //leftside is fine so keep that + for (int i = (rightPiece.results[0].winLStart-1); i < newAligned.length(); i++) { newAligned[i] = '.'; } + }else { //both sides are chimeric, keep longest piece + + int lengthLeftLeft = leftPiece.results[0].winLEnd - leftPiece.results[0].winLStart; + int lengthLeftRight = leftPiece.results[0].winREnd - leftPiece.results[0].winRStart; + + int longest = 1; // leftleft = 1, leftright = 2, rightleft = 3 rightright = 4 + int length = lengthLeftLeft; + if (lengthLeftLeft < lengthLeftRight) { longest = 2; length = lengthLeftRight; } + + int lengthRightLeft = rightPiece.results[0].winLEnd - rightPiece.results[0].winLStart; + int lengthRightRight = rightPiece.results[0].winREnd - rightPiece.results[0].winRStart; + + if (lengthRightLeft > length) { longest = 3; length = lengthRightLeft; } + if (lengthRightRight > length) { longest = 4; } + + if (longest == 1) { //leftleft + for (int i = (leftPiece.results[0].winRStart-1); i < newAligned.length(); i++) { newAligned[i] = '.'; } + }else if (longest == 2) { //leftright + //get rid of leftleft + for (int i = (leftPiece.results[0].winLStart-1); i < (leftPiece.results[0].winLEnd-1); i++) { newAligned[i] = '.'; } + //get rid of right + for (int i = (rightPiece.results[0].winLStart-1); i < newAligned.length(); i++) { newAligned[i] = '.'; } + }else if (longest == 3) { //rightleft + //get rid of left + for (int i = 0; i < leftPiece.results[0].winREnd; i++) { newAligned[i] = '.'; } + //get rid of rightright + for (int i = (rightPiece.results[0].winRStart-1); i < newAligned.length(); i++) { newAligned[i] = '.'; } + }else { //rightright + //get rid of left + for (int i = 0; i < leftPiece.results[0].winREnd; i++) { newAligned[i] = '.'; } + //get rid of rightleft + for (int i = (rightPiece.results[0].winLStart-1); i < (rightPiece.results[0].winLEnd-1); i++) { newAligned[i] = '.'; } + } + } + + trim->setAligned(newAligned); + } + + } + } + + outputString = getBlock(leftPiece, rightPiece, leftChimeric, rightChimeric, chimeraFlag); + outputString += "\n"; + + //write to output file + int length = outputString.length(); + char* buf = new char[length]; + memcpy(buf, outputString.c_str(), length); + + MPI_File_write_shared(out, buf, length, MPI_CHAR, &status); + delete buf; + + }else { + outputString += querySeq->getName() + "\tno\n"; + + //write to output file + int length = outputString.length(); + char* buf = new char[length]; + memcpy(buf, outputString.c_str(), length); + + MPI_File_write_shared(out, buf, length, MPI_CHAR, &status); + delete buf; + } + + + return trim; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "print"); + exit(1); + } +} +//*************************************************************************************************************** +Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) { try { MPI_Status status; bool results = false; string outAccString = ""; string outputString = ""; + Sequence* trim = NULL; + if (trimChimera) { trim = new Sequence(trimQuery.getName(), trimQuery.getAligned()); } + if (chimeraFlags == "yes") { string chimeraFlag = "no"; if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR) @@ -464,41 +670,55 @@ int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) { outAccString += querySeq->getName() + "\n"; results = true; + if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); } + //write to accnos file int length = outAccString.length(); char* buf2 = new char[length]; memcpy(buf2, outAccString.c_str(), length); - + MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status); delete buf2; + + if (trimChimera) { + int lengthLeft = chimeraResults[0].winLEnd - chimeraResults[0].winLStart; + int lengthRight = chimeraResults[0].winREnd - chimeraResults[0].winRStart; + + string newAligned = trim->getAligned(); + if (lengthLeft > lengthRight) { //trim right + for (int i = (chimeraResults[0].winRStart-1); i < newAligned.length(); i++) { newAligned[i] = '.'; } + }else { //trim left + for (int i = 0; i < (chimeraResults[0].winLEnd-1); i++) { newAligned[i] = '.'; } + } + trim->setAligned(newAligned); + } } } outputString = getBlock(chimeraResults[0], chimeraFlag); outputString += "\n"; - //cout << outputString << endl; + //write to output file int length = outputString.length(); char* buf = new char[length]; memcpy(buf, outputString.c_str(), length); - + MPI_File_write_shared(out, buf, length, MPI_CHAR, &status); delete buf; - + }else { outputString += querySeq->getName() + "\tno\n"; - //cout << outputString << endl; + //write to output file int length = outputString.length(); char* buf = new char[length]; memcpy(buf, outputString.c_str(), length); - + MPI_File_write_shared(out, buf, length, MPI_CHAR, &status); delete buf; } - - return results; + return trim; } catch(exception& e) { m->errorOut(e, "ChimeraSlayer", "print"); @@ -510,25 +730,30 @@ int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) { //*************************************************************************************************************** int ChimeraSlayer::getChimeras(Sequence* query) { try { + + trimQuery.setName(query->getName()); trimQuery.setAligned(query->getAligned()); + printResults.trimQuery = trimQuery; + chimeraFlags = "no"; - - //filter query - spotMap = runFilter(query); + printResults.flag = "no"; querySeq = query; //you must create a template vector thisTemplate; - if (templateFileName != "self") { thisTemplate = templateSeqs; } - else { thisTemplate = getTemplate(query); } //fills thistemplate and creates the databases + vector thisFilteredTemplate; + if (templateFileName != "self") { thisTemplate = templateSeqs; thisFilteredTemplate = filteredTemplateSeqs; } + else { thisTemplate = getTemplate(query, thisFilteredTemplate); } //fills this template and creates the databases if (m->control_pressed) { return 0; } if (thisTemplate.size() == 0) { return 0; } //not chimeric - //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity - Maligner maligner(thisTemplate, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight); - Slayer slayer(window, increment, minSim, divR, iters, minSNP); + //moved this out of maligner - 4/29/11 + vector refSeqs = getRefSeqs(query, thisTemplate, thisFilteredTemplate); + + Maligner maligner(refSeqs, match, misMatch, divR, minSim, minCov); + Slayer slayer(window, increment, minSim, divR, iters, minSNP, minBS); if (templateFileName == "self") { if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; } @@ -536,19 +761,28 @@ int ChimeraSlayer::getChimeras(Sequence* query) { } if (m->control_pressed) { return 0; } - + string chimeraFlag = maligner.getResults(query, decalc); + if (m->control_pressed) { return 0; } + vector Results = maligner.getOutput(); - - //found in testing realigning only made things worse - if (realign) { - ChimeraReAligner realigner(thisTemplate, match, misMatch); - realigner.reAlign(query, Results); - } - + + for (int i = 0; i < refSeqs.size(); i++) { delete refSeqs[i]; } + if (chimeraFlag == "yes") { - + + if (realign) { + vector parents; + for (int i = 0; i < Results.size(); i++) { + parents.push_back(Results[i].parentAligned); + } + + ChimeraReAligner realigner; + realigner.reAlign(query, parents); + + } + //get sequence that were given from maligner results vector seqs; map removeDups; @@ -569,9 +803,7 @@ int ChimeraSlayer::getChimeras(Sequence* query) { } for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) { - //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template itSeq = parentNameSeq.find(itDup->first); -//cout << itDup->first << itSeq->second << endl; Sequence* seq = new Sequence(itDup->first, itSeq->second); SeqDist member; @@ -593,40 +825,31 @@ int ChimeraSlayer::getChimeras(Sequence* query) { seqs.pop_back(); } } - + //put seqs into vector to send to slayer + +// cout << query->getAligned() << endl; vector seqsForSlayer; - for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); } + for (int k = 0; k < seqs.size(); k++) { +// cout << seqs[k].seq->getAligned() << endl; + seqsForSlayer.push_back(seqs[k].seq); - //mask then send to slayer... - if (seqMask != "") { - decalc->setMask(seqMask); - - //mask querys - decalc->runMask(query); - - //mask parents - for (int k = 0; k < seqsForSlayer.size(); k++) { - decalc->runMask(seqsForSlayer[k]); - } - - spotMap = decalc->getMaskMap(); } if (m->control_pressed) { for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } return 0; } - + //send to slayer chimeraFlags = slayer.getResults(query, seqsForSlayer); if (m->control_pressed) { return 0; } chimeraResults = slayer.getOutput(); + printResults.flag = chimeraFlags; + printResults.results = chimeraResults; + //free memory for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } } - - //delete maligner; - //delete slayer; - + //cout << endl << endl; return 0; } catch(exception& e) { @@ -637,32 +860,66 @@ int ChimeraSlayer::getChimeras(Sequence* query) { //*************************************************************************************************************** void ChimeraSlayer::printBlock(data_struct data, string flag, ostream& out){ try { - //out << ":)\n"; - out << querySeq->getName() << '\t'; out << data.parentA.getName() << "\t" << data.parentB.getName() << '\t'; - //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl; - //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl; - + out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t'; out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t'; - out << flag << '\t' << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t'; + out << flag << '\t' << data.winLStart << "-" << data.winLEnd << '\t' << data.winRStart << "-" << data.winREnd << '\t'; - //out << "Similarity of parents: " << data.ab << endl; - //out << "Similarity of query to parentA: " << data.qa << endl; - //out << "Similarity of query to parentB: " << data.qb << endl; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "printBlock"); + exit(1); + } +} +//*************************************************************************************************************** +void ChimeraSlayer::printBlock(data_results leftdata, data_results rightdata, bool leftChimeric, bool rightChimeric, string flag, ostream& out){ + try { + if ((leftChimeric) && (!rightChimeric)) { //print left + out << querySeq->getName() << '\t'; + out << leftdata.results[0].parentA.getName() << "\t" << leftdata.results[0].parentB.getName() << '\t'; + + out << leftdata.results[0].divr_qla_qrb << '\t' << leftdata.results[0].qla_qrb << '\t' << leftdata.results[0].bsa << '\t'; + out << leftdata.results[0].divr_qlb_qra << '\t' << leftdata.results[0].qlb_qra << '\t' << leftdata.results[0].bsb << '\t'; - //out << "Per_id(QL,A): " << data.qla << endl; - //out << "Per_id(QL,B): " << data.qlb << endl; - //out << "Per_id(QR,A): " << data.qra << endl; - //out << "Per_id(QR,B): " << data.qrb << endl; - + out << flag << '\t' << leftdata.results[0].winLStart << "-" << leftdata.results[0].winLEnd << '\t' << leftdata.results[0].winRStart << "-" << leftdata.results[0].winREnd << '\t'; - //out << "DeltaL: " << (data.qla - data.qlb) << endl; - //out << "DeltaR: " << (data.qra - data.qrb) << endl; - + }else if ((!leftChimeric) && (rightChimeric)) { //print right + out << querySeq->getName() << '\t'; + out << rightdata.results[0].parentA.getName() << "\t" << rightdata.results[0].parentB.getName() << '\t'; + + out << rightdata.results[0].divr_qla_qrb << '\t' << rightdata.results[0].qla_qrb << '\t' << rightdata.results[0].bsa << '\t'; + out << rightdata.results[0].divr_qlb_qra << '\t' << rightdata.results[0].qlb_qra << '\t' << rightdata.results[0].bsb << '\t'; + + out << flag << '\t' << rightdata.results[0].winLStart << "-" << rightdata.results[0].winLEnd << '\t' << rightdata.results[0].winRStart << "-" << rightdata.results[0].winREnd << '\t'; + + }else { //print both results + if (leftdata.flag == "yes") { + out << querySeq->getName() + "_LEFT" << '\t'; + out << leftdata.results[0].parentA.getName() << "\t" << leftdata.results[0].parentB.getName() << '\t'; + + out << leftdata.results[0].divr_qla_qrb << '\t' << leftdata.results[0].qla_qrb << '\t' << leftdata.results[0].bsa << '\t'; + out << leftdata.results[0].divr_qlb_qra << '\t' << leftdata.results[0].qlb_qra << '\t' << leftdata.results[0].bsb << '\t'; + + out << flag << '\t' << leftdata.results[0].winLStart << "-" << leftdata.results[0].winLEnd << '\t' << leftdata.results[0].winRStart << "-" << leftdata.results[0].winREnd << '\t'; + } + + if (rightdata.flag == "yes") { + if (leftdata.flag == "yes") { out << endl; } + + out << querySeq->getName() + "_RIGHT"<< '\t'; + out << rightdata.results[0].parentA.getName() << "\t" << rightdata.results[0].parentB.getName() << '\t'; + + out << rightdata.results[0].divr_qla_qrb << '\t' << rightdata.results[0].qla_qrb << '\t' << rightdata.results[0].bsa << '\t'; + out << rightdata.results[0].divr_qlb_qra << '\t' << rightdata.results[0].qlb_qra << '\t' << rightdata.results[0].bsb << '\t'; + + out << flag << '\t' << rightdata.results[0].winLStart << "-" << rightdata.results[0].winLEnd << '\t' << rightdata.results[0].winRStart << "-" << rightdata.results[0].winREnd << '\t'; + + } + } } catch(exception& e) { m->errorOut(e, "ChimeraSlayer", "printBlock"); @@ -670,6 +927,62 @@ void ChimeraSlayer::printBlock(data_struct data, string flag, ostream& out){ } } //*************************************************************************************************************** +string ChimeraSlayer::getBlock(data_results leftdata, data_results rightdata, bool leftChimeric, bool rightChimeric, string flag){ + try { + + string out = ""; + + if ((leftChimeric) && (!rightChimeric)) { //get left + out += querySeq->getName() + "\t"; + out += leftdata.results[0].parentA.getName() + "\t" + leftdata.results[0].parentB.getName() + "\t"; + + out += toString(leftdata.results[0].divr_qla_qrb) + "\t" + toString(leftdata.results[0].qla_qrb) + "\t" + toString(leftdata.results[0].bsa) + "\t"; + out += toString(leftdata.results[0].divr_qlb_qra) + "\t" + toString(leftdata.results[0].qlb_qra) + "\t" + toString(leftdata.results[0].bsb) + "\t"; + + out += flag + "\t" + toString(leftdata.results[0].winLStart) + "-" + toString(leftdata.results[0].winLEnd) + "\t" + toString(leftdata.results[0].winRStart) + "-" + toString(leftdata.results[0].winREnd) + "\t"; + + }else if ((!leftChimeric) && (rightChimeric)) { //print right + out += querySeq->getName() + "\t"; + out += rightdata.results[0].parentA.getName() + "\t" + rightdata.results[0].parentB.getName() + "\t"; + + out += toString(rightdata.results[0].divr_qla_qrb) + "\t" + toString(rightdata.results[0].qla_qrb) + "\t" + toString(rightdata.results[0].bsa) + "\t"; + out += toString(rightdata.results[0].divr_qlb_qra) + "\t" + toString(rightdata.results[0].qlb_qra) + "\t" + toString(rightdata.results[0].bsb) + "\t"; + + out += flag + "\t" + toString(rightdata.results[0].winLStart) + "-" + toString(rightdata.results[0].winLEnd) + "\t" + toString(rightdata.results[0].winRStart) + "-" + toString(rightdata.results[0].winREnd) + "\t"; + + }else { //print both results + + if (leftdata.flag == "yes") { + out += querySeq->getName() + "_LEFT\t"; + out += leftdata.results[0].parentA.getName() + "\t" + leftdata.results[0].parentB.getName() + "\t"; + + out += toString(leftdata.results[0].divr_qla_qrb) + "\t" + toString(leftdata.results[0].qla_qrb) + "\t" + toString(leftdata.results[0].bsa) + "\t"; + out += toString(leftdata.results[0].divr_qlb_qra) + "\t" + toString(leftdata.results[0].qlb_qra) + "\t" + toString(leftdata.results[0].bsb) + "\t"; + + out += flag + "\t" + toString(leftdata.results[0].winLStart) + "-" + toString(leftdata.results[0].winLEnd) + "\t" + toString(leftdata.results[0].winRStart) + "-" + toString(leftdata.results[0].winREnd) + "\t"; + } + + if (rightdata.flag == "yes") { + if (leftdata.flag == "yes") { out += "\n"; } + out += querySeq->getName() + "_RIGHT\t"; + out += rightdata.results[0].parentA.getName() + "\t" + rightdata.results[0].parentB.getName() + "\t"; + + out += toString(rightdata.results[0].divr_qla_qrb) + "\t" + toString(rightdata.results[0].qla_qrb) + "\t" + toString(rightdata.results[0].bsa) + "\t"; + out += toString(rightdata.results[0].divr_qlb_qra) + "\t" + toString(rightdata.results[0].qlb_qra) + "\t" + toString(rightdata.results[0].bsb) + "\t"; + + out += flag + "\t" + toString(rightdata.results[0].winLStart) + "-" + toString(rightdata.results[0].winLEnd) + "\t" + toString(rightdata.results[0].winRStart) + "-" + toString(rightdata.results[0].winREnd) + "\t"; + } + } + + return out; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "getBlock"); + exit(1); + } +} +//*************************************************************************************************************** string ChimeraSlayer::getBlock(data_struct data, string flag){ try { @@ -681,7 +994,7 @@ string ChimeraSlayer::getBlock(data_struct data, string flag){ outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t"; outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t"; - outputString += flag + "\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t"; + outputString += flag + "\t" + toString(data.winLStart) + "-" + toString(data.winLEnd) + "\t" + toString(data.winRStart) + "-" + toString(data.winREnd) + "\t"; return outputString; } @@ -690,5 +1003,172 @@ string ChimeraSlayer::getBlock(data_struct data, string flag){ exit(1); } } +//*************************************************************************************************************** +vector ChimeraSlayer::getRefSeqs(Sequence* q, vector& thisTemplate, vector& thisFilteredTemplate){ + try { + + vector refSeqs; + + if (searchMethod == "distance") { + //find closest seqs to query in template - returns copies of seqs so trim does not destroy - remember to deallocate + Sequence* newSeq = new Sequence(q->getName(), q->getAligned()); + runFilter(newSeq); + refSeqs = decalc->findClosest(newSeq, thisTemplate, thisFilteredTemplate, numWanted, minSim); + delete newSeq; + }else if (searchMethod == "blast") { + refSeqs = getBlastSeqs(q, thisTemplate, numWanted); //fills indexes + }else if (searchMethod == "kmer") { + refSeqs = getKmerSeqs(q, thisTemplate, numWanted); //fills indexes + }else { m->mothurOut("not valid search."); exit(1); } //should never get here + + return refSeqs; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "getRefSeqs"); + exit(1); + } +} //***************************************************************************************************************/ +vector ChimeraSlayer::getBlastSeqs(Sequence* q, vector& db, int num) { + try { + + vector refResults; + + //get parts of query + string queryUnAligned = q->getUnaligned(); + string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence + string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence +//cout << "whole length = " << queryUnAligned.length() << '\t' << "left length = " << leftQuery.length() << '\t' << "right length = "<< rightQuery.length() << endl; + Sequence* queryLeft = new Sequence(q->getName(), leftQuery); + Sequence* queryRight = new Sequence(q->getName(), rightQuery); + + vector tempIndexesLeft = databaseLeft->findClosestMegaBlast(queryLeft, num+1, minSim); + vector tempIndexesRight = databaseLeft->findClosestMegaBlast(queryRight, num+1, minSim); + //cout << q->getName() << '\t' << leftQuery << '\t' << "leftMatches = " << tempIndexesLeft.size() << '\t' << rightQuery << " rightMatches = " << tempIndexesRight.size() << endl; + vector smaller; + vector larger; + + if (tempIndexesRight.size() < tempIndexesLeft.size()) { smaller = tempIndexesRight; larger = tempIndexesLeft; } + else { smaller = tempIndexesLeft; larger = tempIndexesRight; } + + //merge results + map seen; + map::iterator it; + vector mergedResults; + for (int i = 0; i < smaller.size(); i++) { + if (m->control_pressed) { delete queryRight; delete queryLeft; return refResults; } + + //add left if you havent already + it = seen.find(smaller[i]); + if (it == seen.end()) { + mergedResults.push_back(smaller[i]); + seen[smaller[i]] = smaller[i]; + } + + //add right if you havent already + it = seen.find(larger[i]); + if (it == seen.end()) { + mergedResults.push_back(larger[i]); + seen[larger[i]] = larger[i]; + } + } + + for (int i = smaller.size(); i < larger.size(); i++) { + if (m->control_pressed) { delete queryRight; delete queryLeft; return refResults; } + + //add right if you havent already + it = seen.find(larger[i]); + if (it == seen.end()) { + mergedResults.push_back(larger[i]); + seen[larger[i]] = larger[i]; + } + } + + for (int i = 0; i < mergedResults.size(); i++) { + //cout << mergedResults[i] << '\t' << db[mergedResults[i]]->getName() << endl; + if (db[mergedResults[i]]->getName() != q->getName()) { + Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned()); + refResults.push_back(temp); + + } + } + + +// for(int i=0;igetName() << endl; +// } + + delete queryRight; + delete queryLeft; + + return refResults; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "getBlastSeqs"); + exit(1); + } +} +//*************************************************************************************************************** +vector ChimeraSlayer::getKmerSeqs(Sequence* q, vector& db, int num) { + try { + vector refResults; + + //get parts of query + string queryUnAligned = q->getUnaligned(); + string leftQuery = queryUnAligned.substr(0, int(queryUnAligned.length() * 0.33)); //first 1/3 of the sequence + string rightQuery = queryUnAligned.substr(int(queryUnAligned.length() * 0.66)); //last 1/3 of the sequence + + Sequence* queryLeft = new Sequence(q->getName(), leftQuery); + Sequence* queryRight = new Sequence(q->getName(), rightQuery); + + vector tempIndexesLeft = databaseLeft->findClosestSequences(queryLeft, num); + vector tempIndexesRight = databaseRight->findClosestSequences(queryRight, num); + + //merge results + map seen; + map::iterator it; + vector mergedResults; + for (int i = 0; i < tempIndexesLeft.size(); i++) { + + if (m->control_pressed) { delete queryRight; delete queryLeft; return refResults; } + + //add left if you havent already + it = seen.find(tempIndexesLeft[i]); + if (it == seen.end()) { + mergedResults.push_back(tempIndexesLeft[i]); + seen[tempIndexesLeft[i]] = tempIndexesLeft[i]; + } + + //add right if you havent already + it = seen.find(tempIndexesRight[i]); + if (it == seen.end()) { + mergedResults.push_back(tempIndexesRight[i]); + seen[tempIndexesRight[i]] = tempIndexesRight[i]; + } + } + + //numWanted = mergedResults.size(); + + //cout << q->getName() << endl; + + for (int i = 0; i < mergedResults.size(); i++) { + //cout << db[mergedResults[i]]->getName() << endl; + if (db[mergedResults[i]]->getName() != q->getName()) { + Sequence* temp = new Sequence(db[mergedResults[i]]->getName(), db[mergedResults[i]]->getAligned()); + refResults.push_back(temp); + } + } + //cout << endl; + delete queryRight; + delete queryLeft; + + return refResults; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "getKmerSeqs"); + exit(1); + } +} +//*************************************************************************************************************** +