X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=chimeraslayer.cpp;h=73037c783be84270a28b0dacf1795c51a3668d0e;hb=836150c0a3666899ad58426388f4999d6cf8829a;hp=eb00b59a324e5ef93850fa2a0aa16e1a0216a9dc;hpb=41f3f520282eb972d450ac6d23a0e80c546aa76e;p=mothur.git diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index eb00b59..73037c7 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -45,7 +45,7 @@ int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int num } } //*************************************************************************************************************** -ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, string mode, string abunds, int k, int ms, int mms, int win, float div, +ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, map& prior, string mode, int k, int ms, int mms, int win, float div, int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() { try { fastafile = file; templateSeqs = readSeqs(fastafile); @@ -65,18 +65,16 @@ ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, s increment = inc; numWanted = numw; realign = r; - includeAbunds = abunds; trimChimera = trim; - - //read name file and create nameMapRank - readNameFile(name); + priority = prior; decalc = new DeCalculator(); createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap //run filter on template - for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); } + for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { break; } runFilter(templateSeqs[i]); } + } catch(exception& e) { @@ -84,59 +82,6 @@ ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, s exit(1); } } -//*************************************************************************************************************** -int ChimeraSlayer::readNameFile(string name) { - try { - ifstream in; - m->openInputFile(name, in); - - int maxRank = 0; - int minRank = 10000000; - - while(!in.eof()){ - - if (m->control_pressed) { in.close(); return 0; } - - string thisname, repnames; - - in >> thisname; m->gobble(in); //read from first column - in >> repnames; //read from second column - - map >::iterator it = nameMapRank.find(thisname); - if (it == nameMapRank.end()) { - - vector splitRepNames; - m->splitAtComma(repnames, splitRepNames); - - nameMapRank[thisname] = splitRepNames; - - if (splitRepNames.size() > maxRank) { maxRank = splitRepNames.size(); } - if (splitRepNames.size() < minRank) { minRank = splitRepNames.size(); } - - }else{ m->mothurOut(thisname + " is already in namesfile. I will use first definition."); m->mothurOutEndLine(); } - - m->gobble(in); - } - in.close(); - - //sanity check to make sure files match - for (int i = 0; i < templateSeqs.size(); i++) { - map >::iterator it = nameMapRank.find(templateSeqs[i]->getName()); - - if (it == nameMapRank.end()) { m->mothurOut("[ERROR]: " + templateSeqs[i]->getName() + " is not in namesfile, but is in fastafile. Every name in fasta file must be in first column of names file."); m->mothurOutEndLine(); m->control_pressed = true; } - } - - if (maxRank == minRank) { m->mothurOut("[ERROR]: all sequences in namesfile have the same abundance, aborting."); m->mothurOutEndLine(); m->control_pressed = true; } - - return 0; - - } - catch(exception& e) { - m->errorOut(e, "ChimeraSlayer", "readNameFile"); - exit(1); - } -} - //*************************************************************************************************************** int ChimeraSlayer::doPrep() { try { @@ -258,7 +203,8 @@ int ChimeraSlayer::doPrep() { }else if (searchMethod == "blast") { //generate blastdb - databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch); + databaseLeft = new BlastDB(-1.0, -1.0, 1, -3); + for (int i = 0; i < templateSeqs.size(); i++) { databaseLeft->addSequence(*templateSeqs[i]); } databaseLeft->generateDB(); databaseLeft->setNumSeqs(templateSeqs.size()); @@ -276,44 +222,21 @@ int ChimeraSlayer::doPrep() { vector ChimeraSlayer::getTemplate(Sequence* q) { try { - vector thisTemplate; + //when template=self, the query file is sorted from most abundance to least abundant + //userTemplate grows as the query file is processed by adding sequences that are not chimeric and more abundant + vector userTemplate; - int thisRank; - string thisName = q->getName(); - map >::iterator itRank = nameMapRank.find(thisName); // you will find it because we already sanity checked - thisRank = (itRank->second).size(); + int myAbund = priority[q->getName()]; - //create list of names we want to put into the template - set namesToAdd; - for (itRank = nameMapRank.begin(); itRank != nameMapRank.end(); itRank++) { - if (itRank->first != thisName) { - if (includeAbunds == "greaterequal") { - if ((itRank->second).size() >= thisRank) { - //you are more abundant than me or equal to my abundance - for (int i = 0; i < (itRank->second).size(); i++) { - namesToAdd.insert((itRank->second)[i]); - } - } - }else if (includeAbunds == "greater") { - if ((itRank->second).size() > thisRank) { - //you are more abundant than me - for (int i = 0; i < (itRank->second).size(); i++) { - namesToAdd.insert((itRank->second)[i]); - } - } - }else if (includeAbunds == "all") { - //add everyone - for (int i = 0; i < (itRank->second).size(); i++) { - namesToAdd.insert((itRank->second)[i]); - } - } - } - } - - for (int i = 0; i < templateSeqs.size(); i++) { - if (namesToAdd.count(templateSeqs[i]->getName()) != 0) { - thisTemplate.push_back(templateSeqs[i]); - } + for (int i = 0; i < templateSeqs.size(); i++) { + + if (m->control_pressed) { return userTemplate; } + + //have I reached a sequence with the same abundance as myself? + if (!(priority[templateSeqs[i]->getName()] > myAbund)) { break; } + + //if its am not chimeric add it + if (chimericSeqs.count(templateSeqs[i]->getName()) == 0) { userTemplate.push_back(templateSeqs[i]); } } string kmerDBNameLeft; @@ -328,69 +251,70 @@ vector ChimeraSlayer::getTemplate(Sequence* q) { string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName)); databaseLeft = new KmerDB(leftTemplateFileName, kmerSize); #ifdef USE_MPI - for (int i = 0; i < thisTemplate.size(); i++) { + for (int i = 0; i < userTemplate.size(); i++) { - if (m->control_pressed) { return thisTemplate; } + if (m->control_pressed) { return userTemplate; } - string leftFrag = thisTemplate[i]->getUnaligned(); + string leftFrag = userTemplate[i]->getUnaligned(); leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); - Sequence leftTemp(thisTemplate[i]->getName(), leftFrag); + Sequence leftTemp(userTemplate[i]->getName(), leftFrag); databaseLeft->addSequence(leftTemp); } databaseLeft->generateDB(); - databaseLeft->setNumSeqs(thisTemplate.size()); + databaseLeft->setNumSeqs(userTemplate.size()); - for (int i = 0; i < thisTemplate.size(); i++) { - if (m->control_pressed) { return thisTemplate; } + for (int i = 0; i < userTemplate.size(); i++) { + if (m->control_pressed) { return userTemplate; } - string rightFrag = thisTemplate[i]->getUnaligned(); + string rightFrag = userTemplate[i]->getUnaligned(); rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); - Sequence rightTemp(thisTemplate[i]->getName(), rightFrag); + Sequence rightTemp(userTemplate[i]->getName(), rightFrag); databaseRight->addSequence(rightTemp); } databaseRight->generateDB(); - databaseRight->setNumSeqs(thisTemplate.size()); + databaseRight->setNumSeqs(userTemplate.size()); #else - for (int i = 0; i < thisTemplate.size(); i++) { + for (int i = 0; i < userTemplate.size(); i++) { - if (m->control_pressed) { return thisTemplate; } + if (m->control_pressed) { return userTemplate; } - string leftFrag = thisTemplate[i]->getUnaligned(); + string leftFrag = userTemplate[i]->getUnaligned(); leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); - Sequence leftTemp(thisTemplate[i]->getName(), leftFrag); + Sequence leftTemp(userTemplate[i]->getName(), leftFrag); databaseLeft->addSequence(leftTemp); } databaseLeft->generateDB(); - databaseLeft->setNumSeqs(thisTemplate.size()); + databaseLeft->setNumSeqs(userTemplate.size()); - for (int i = 0; i < thisTemplate.size(); i++) { - if (m->control_pressed) { return thisTemplate; } + for (int i = 0; i < userTemplate.size(); i++) { + if (m->control_pressed) { return userTemplate; } - string rightFrag = thisTemplate[i]->getUnaligned(); + string rightFrag = userTemplate[i]->getUnaligned(); rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); - Sequence rightTemp(thisTemplate[i]->getName(), rightFrag); + Sequence rightTemp(userTemplate[i]->getName(), rightFrag); databaseRight->addSequence(rightTemp); } databaseRight->generateDB(); - databaseRight->setNumSeqs(thisTemplate.size()); + databaseRight->setNumSeqs(userTemplate.size()); #endif }else if (searchMethod == "blast") { //generate blastdb - databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch); - for (int i = 0; i < thisTemplate.size(); i++) { if (m->control_pressed) { return thisTemplate; } databaseLeft->addSequence(*thisTemplate[i]); } + databaseLeft = new BlastDB(-1.0, -1.0, 1, -3); + + for (int i = 0; i < userTemplate.size(); i++) { if (m->control_pressed) { return userTemplate; } databaseLeft->addSequence(*userTemplate[i]); } databaseLeft->generateDB(); - databaseLeft->setNumSeqs(thisTemplate.size()); + databaseLeft->setNumSeqs(userTemplate.size()); } - return thisTemplate; + return userTemplate; } catch(exception& e) { @@ -419,7 +343,7 @@ void ChimeraSlayer::printHeader(ostream& out) { Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) { try { Sequence* trim = NULL; - if (trimChimera) { trim = trimQuery; } + if (trimChimera) { trim = new Sequence(trimQuery.getName(), trimQuery.getAligned()); } if (chimeraFlags == "yes") { string chimeraFlag = "no"; @@ -433,6 +357,8 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) { m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine(); outAcc << querySeq->getName() << endl; + if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); } + if (trimChimera) { int lengthLeft = spotMap[chimeraResults[0].winLEnd] - spotMap[chimeraResults[0].winLStart]; int lengthRight = spotMap[chimeraResults[0].winREnd] - spotMap[chimeraResults[0].winRStart]; @@ -446,13 +372,14 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) { } trim->setAligned(newAligned); } - } } printBlock(chimeraResults[0], chimeraFlag, out); out << endl; - }else { out << querySeq->getName() << "\tno" << endl; } + }else { + out << querySeq->getName() << "\tno" << endl; + } return trim; @@ -500,6 +427,8 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc, data_results leftP m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine(); outAcc << querySeq->getName() << endl; + if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); } + if (trimChimera) { string newAligned = trim->getAligned(); @@ -551,7 +480,9 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc, data_results leftP printBlock(leftPiece, rightPiece, leftChimeric, rightChimeric, chimeraFlag, out); out << endl; - }else { out << querySeq->getName() << "\tno" << endl; } + }else { + out << querySeq->getName() << "\tno" << endl; + } return trim; @@ -608,6 +539,8 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc, data_results lef outAccString += querySeq->getName() + "\n"; results = true; + if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); } + //write to accnos file int length = outAccString.length(); char* buf2 = new char[length]; @@ -705,7 +638,7 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) { string outputString = ""; Sequence* trim = NULL; - if (trimChimera) { trim = trimQuery; } + if (trimChimera) { trim = new Sequence(trimQuery.getName(), trimQuery.getAligned()); } if (chimeraFlags == "yes") { string chimeraFlag = "no"; @@ -720,6 +653,8 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) { outAccString += querySeq->getName() + "\n"; results = true; + if (templateFileName == "self") { chimericSeqs.insert(querySeq->getName()); } + //write to accnos file int length = outAccString.length(); char* buf2 = new char[length]; @@ -778,7 +713,9 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) { //*************************************************************************************************************** int ChimeraSlayer::getChimeras(Sequence* query) { try { - if (trimChimera) { trimQuery = new Sequence(query->getName(), query->getAligned()); printResults.trimQuery = *trimQuery; } + + trimQuery.setName(query->getName()); trimQuery.setAligned(query->getAligned()); + printResults.trimQuery = trimQuery; chimeraFlags = "no"; printResults.flag = "no"; @@ -792,7 +729,7 @@ int ChimeraSlayer::getChimeras(Sequence* query) { //you must create a template vector thisTemplate; if (templateFileName != "self") { thisTemplate = templateSeqs; } - else { thisTemplate = getTemplate(query); } //fills thistemplate and creates the databases + else { thisTemplate = getTemplate(query); } //fills this template and creates the databases if (m->control_pressed) { return 0; } @@ -810,17 +747,18 @@ int ChimeraSlayer::getChimeras(Sequence* query) { if (m->control_pressed) { return 0; } string chimeraFlag = maligner.getResults(query, decalc); + if (m->control_pressed) { return 0; } + vector Results = maligner.getOutput(); - - //found in testing realigning only made things worse - if (realign) { - ChimeraReAligner realigner(thisTemplate, match, misMatch); - realigner.reAlign(query, Results); - } if (chimeraFlag == "yes") { + if (realign) { + ChimeraReAligner realigner(thisTemplate, match, misMatch); + realigner.reAlign(query, Results); + } + //get sequence that were given from maligner results vector seqs; map removeDups;