X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=chimeraslayer.cpp;h=5295eace3546e14db6488ceb41e144915531c830;hb=6de5adaae66b28aa60a75f123005cede410c156c;hp=9b85b5cd4da6dc6e4bc28fcc9d279bbd768da7c5;hpb=a5afca18544555fba2d9c3670ad1f8574916b0a0;p=mothur.git diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 9b85b5c..5295eac 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -8,259 +8,403 @@ */ #include "chimeraslayer.h" +#include "chimerarealigner.h" +#include "kmerdb.hpp" +#include "blastdb.hpp" //*************************************************************************************************************** -ChimeraSlayer::ChimeraSlayer(string filename, string temp) { fastafile = filename; templateFile = temp; } -//*************************************************************************************************************** - -ChimeraSlayer::~ChimeraSlayer() { +ChimeraSlayer::ChimeraSlayer(string file, string temp, string mode, int k, int ms, int mms, int win, float div, +int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() { try { - for (int i = 0; i < querySeqs.size(); i++) { delete querySeqs[i]; } - for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i]; } - } - catch(exception& e) { - errorOut(e, "ChimeraSlayer", "~ChimeraSlayer"); - exit(1); - } -} -//*************************************************************************************************************** -void ChimeraSlayer::print(ostream& out) { - try { - mothurOutEndLine(); - mothurOut("Only reporting sequence supported by 90% of bootstrapped results."); - mothurOutEndLine(); - - for (int i = 0; i < querySeqs.size(); i++) { + fastafile = file; + templateFileName = temp; templateSeqs = readSeqs(temp); + searchMethod = mode; + kmerSize = k; + match = ms; + misMatch = mms; + window = win; + divR = div; + minSim = minsim; + minCov = mincov; + minBS = minbs; + minSNP = minsnp; + parents = par; + iters = it; + increment = inc; + numWanted = numw; + realign = r; + + decalc = new DeCalculator(); - if (chimeraFlags[i] == "yes") { - if ((chimeraResults[i][0].bsa >= 90.0) || (chimeraResults[i][0].bsb >= 90.0)) { - mothurOut(querySeqs[i]->getName() + "\tyes"); mothurOutEndLine(); - out << querySeqs[i]->getName() << "\tyes" << endl; - }else { - out << querySeqs[i]->getName() << "\tno" << endl; - //mothurOut(querySeqs[i]->getName() + "\tno"); mothurOutEndLine(); - } - - printBlock(chimeraResults[i][0], out, i); - - out << endl; - }else{ - out << querySeqs[i]->getName() << "\tno" << endl; - //mothurOut(querySeqs[i]->getName() + "\tno"); mothurOutEndLine(); - } - } - + doPrep(); } catch(exception& e) { - errorOut(e, "ChimeraSlayer", "print"); + m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer"); exit(1); } } - //*************************************************************************************************************** -int ChimeraSlayer::getChimeras() { +int ChimeraSlayer::doPrep() { try { - //read in query sequences and subject sequences - mothurOut("Reading sequences and template file... "); cout.flush(); - querySeqs = readSeqs(fastafile); - templateSeqs = readSeqs(templateFile); - mothurOut("Done."); mothurOutEndLine(); + //read in all query seqs + vector tempQuerySeqs = readSeqs(fastafile); - int numSeqs = querySeqs.size(); + vector temp = templateSeqs; + for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); } - if (unaligned) { mothurOut("Your sequences need to be aligned when you use the chimeraslayer method."); mothurOutEndLine(); return 1; } + createFilter(temp, 0.0); //just removed columns where all seqs have a gap - chimeraResults.resize(numSeqs); - chimeraFlags.resize(numSeqs, "no"); - spotMap.resize(numSeqs); + for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } - //break up file if needed - int linesPerProcess = numSeqs / processors ; + if (m->control_pressed) { return 0; } - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - //find breakup of sequences for all times we will Parallelize - if (processors == 1) { lines.push_back(new linePair(0, numSeqs)); } - else { - //fill line pairs - for (int i = 0; i < (processors-1); i++) { - lines.push_back(new linePair((i*linesPerProcess), ((i*linesPerProcess) + linesPerProcess))); - } - //this is necessary to get remainder of processors / numSeqs so you don't miss any lines at the end - int i = processors - 1; - lines.push_back(new linePair((i*linesPerProcess), numSeqs)); - } - #else - lines.push_back(new linePair(0, numSeqs)); - #endif - - if (seqMask != "") { decalc = new DeCalculator(); } //to use below + //run filter on template + for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { return 0; } runFilter(templateSeqs[i]); } - //initialize spotMap - for (int j = 0; j < numSeqs; j++) { - for (int i = 0; i < querySeqs[0]->getAligned().length(); i++) { - spotMap[j][i] = i; + string kmerDBNameLeft; + string kmerDBNameRight; + + //generate the kmerdb to pass to maligner + if (searchMethod == "kmer") { + string rightTemplateFileName = "right." + templateFileName; + databaseRight = new KmerDB(rightTemplateFileName, kmerSize); + + string leftTemplateFileName = "left." + templateFileName; + databaseLeft = new KmerDB(leftTemplateFileName, kmerSize); + #ifdef USE_MPI + for (int i = 0; i < templateSeqs.size(); i++) { + + if (m->control_pressed) { return 0; } + + string leftFrag = templateSeqs[i]->getUnaligned(); + leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); + + Sequence leftTemp(templateSeqs[i]->getName(), leftFrag); + databaseLeft->addSequence(leftTemp); } - } - - //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity - maligner = new Maligner(templateSeqs, numWanted, match, misMatch, 1.01, minSim); - slayer = new Slayer(window, increment, minSim, divR, iters); - - for (int i = 0; i < querySeqs.size(); i++) { - - string chimeraFlag = maligner->getResults(querySeqs[i]); - //float percentIdentical = maligner->getPercentID(); - vector Results = maligner->getOutput(); - - cout << "Processing sequence: " << i+1 << endl; + databaseLeft->generateDB(); + databaseLeft->setNumSeqs(templateSeqs.size()); - //for (int j = 0; j < Results.size(); j++) { - //cout << "regionStart = " << Results[j].regionStart << "\tRegionEnd = " << Results[j].regionEnd << "\tName = " << Results[j].parent << "\tPerQP = " << Results[j].queryToParent << "\tLocalPerQP = " << Results[j].queryToParentLocal << "\tdivR = " << Results[j].divR << endl; - //} + for (int i = 0; i < templateSeqs.size(); i++) { + if (m->control_pressed) { return 0; } + + string rightFrag = templateSeqs[i]->getUnaligned(); + rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); + + Sequence rightTemp(templateSeqs[i]->getName(), rightFrag); + databaseRight->addSequence(rightTemp); + } + databaseRight->generateDB(); + databaseRight->setNumSeqs(templateSeqs.size()); + + #else + //leftside + kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + ifstream kmerFileTestLeft(kmerDBNameLeft.c_str()); - //if (chimeraFlag == "yes") { + if(!kmerFileTestLeft){ - //get sequence that were given from maligner results - vector seqs; - map removeDups; - map::iterator itDup; - for (int j = 0; j < Results.size(); j++) { - float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal; - //only add if you are not a duplicate - itDup = removeDups.find(Results[j].parent); - if (itDup == removeDups.end()) { //this is not duplicate - removeDups[Results[j].parent] = dist; - }else if (dist > itDup->second) { //is this a stronger number for this parent - removeDups[Results[j].parent] = dist; - } - } - - for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) { - Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template + for (int i = 0; i < templateSeqs.size(); i++) { - SeqDist member; - member.seq = seq; - member.dist = itDup->second; + if (m->control_pressed) { return 0; } - seqs.push_back(member); - } - - //limit number of parents to explore - default 5 - if (Results.size() > parents) { - //sort by distance - sort(seqs.begin(), seqs.end(), compareSeqDist); - //prioritize larger more similiar sequence fragments - reverse(seqs.begin(), seqs.end()); + string leftFrag = templateSeqs[i]->getUnaligned(); + leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); - for (int k = seqs.size()-1; k > (parents-1); k--) { - delete seqs[k].seq; - seqs.pop_back(); - } + Sequence leftTemp(templateSeqs[i]->getName(), leftFrag); + databaseLeft->addSequence(leftTemp); } - - //put seqs into vector to send to slayer - vector seqsForSlayer; - for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); } -//ofstream out; -//string name = querySeqs[i]->getName(); -//cout << name << endl; -//name = name.substr(name.find_first_of("|")+1); -//cout << name << endl; -//name = name.substr(name.find_first_of("|")+1); -//cout << name << endl; -//name = name.substr(0, name.find_last_of("|")); -//cout << name << endl; -//string filename = name + ".seqsforslayer"; -//openOutputFile(filename, out); -//for (int u = 0; u < seqsForSlayer.size(); u++) { seqsForSlayer[u]->printSequence(out); } -//out.close(); -//filename = name + ".fasta"; -//openOutputFile(filename, out); -//q->printSequence(out); -//out.close(); - + databaseLeft->generateDB(); + + }else { + databaseLeft->readKmerDB(kmerFileTestLeft); + } + kmerFileTestLeft.close(); - //mask then send to slayer... - if (seqMask != "") { - decalc->setMask(seqMask); - - //mask querys - decalc->runMask(querySeqs[i]); + databaseLeft->setNumSeqs(templateSeqs.size()); + + //rightside + kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + ifstream kmerFileTestRight(kmerDBNameRight.c_str()); + + if(!kmerFileTestRight){ + + for (int i = 0; i < templateSeqs.size(); i++) { + if (m->control_pressed) { return 0; } - //mask parents - for (int k = 0; k < seqsForSlayer.size(); k++) { - decalc->runMask(seqsForSlayer[k]); - } + string rightFrag = templateSeqs[i]->getUnaligned(); + rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); - for (int i = 0; i < numSeqs; i++) { - spotMap[i] = decalc->getMaskMap(); - } - + Sequence rightTemp(templateSeqs[i]->getName(), rightFrag); + databaseRight->addSequence(rightTemp); } + databaseRight->generateDB(); - //send to slayer - chimeraFlags[i] = slayer->getResults(querySeqs[i], seqsForSlayer); - chimeraResults[i] = slayer->getOutput(); - - //free memory - for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } - //} + }else { + databaseRight->readKmerDB(kmerFileTestRight); + } + kmerFileTestRight.close(); - } - //free memory - for (int i = 0; i < lines.size(); i++) { delete lines[i]; } + databaseRight->setNumSeqs(templateSeqs.size()); + #endif + }else if (searchMethod == "blast") { - if (seqMask != "") { - delete decalc; + //generate blastdb + databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch); + for (int i = 0; i < templateSeqs.size(); i++) { databaseLeft->addSequence(*templateSeqs[i]); } + databaseLeft->generateDB(); + databaseLeft->setNumSeqs(templateSeqs.size()); } return 0; + } catch(exception& e) { - errorOut(e, "ChimeraSlayer", "getChimeras"); + m->errorOut(e, "ChimeraSlayer", "doprep"); exit(1); } } //*************************************************************************************************************** -Sequence* ChimeraSlayer::getSequence(string name) { - try{ - Sequence* temp; +ChimeraSlayer::~ChimeraSlayer() { + delete decalc; + if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; } + else if (searchMethod == "blast") { delete databaseLeft; } +} +//*************************************************************************************************************** +void ChimeraSlayer::printHeader(ostream& out) { + m->mothurOutEndLine(); + m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results."); + m->mothurOutEndLine(); + + out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n"; +} +//*************************************************************************************************************** +int ChimeraSlayer::print(ostream& out, ostream& outAcc) { + try { + if (chimeraFlags == "yes") { + string chimeraFlag = "no"; + if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR) + || + (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; } + + + if (chimeraFlag == "yes") { + if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) { + m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine(); + outAcc << querySeq->getName() << endl; + } + } + + printBlock(chimeraResults[0], out); + out << endl; + }else { out << querySeq->getName() << "\tno" << endl; } + + return 0; - //look through templateSeqs til you find it - int spot = -1; - for (int i = 0; i < templateSeqs.size(); i++) { - if (name == templateSeqs[i]->getName()) { - spot = i; - break; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "print"); + exit(1); + } +} +#ifdef USE_MPI +//*************************************************************************************************************** +int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) { + try { + MPI_Status status; + bool results = false; + string outAccString = ""; + string outputString = ""; + + if (chimeraFlags == "yes") { + string chimeraFlag = "no"; + if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR) + || + (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; } + + + if (chimeraFlag == "yes") { + if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) { + cout << querySeq->getName() << "\tyes" << endl; + outAccString += querySeq->getName() + "\n"; + results = true; + + //write to accnos file + int length = outAccString.length(); + char* buf2 = new char[length]; + memcpy(buf2, outAccString.c_str(), length); + + MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status); + delete buf2; + } } + + outputString = getBlock(chimeraResults[0]); + outputString += "\n"; + //cout << outputString << endl; + //write to output file + int length = outputString.length(); + char* buf = new char[length]; + memcpy(buf, outputString.c_str(), length); + + MPI_File_write_shared(out, buf, length, MPI_CHAR, &status); + delete buf; + + }else { + outputString += querySeq->getName() + "\tno\n"; + //cout << outputString << endl; + //write to output file + int length = outputString.length(); + char* buf = new char[length]; + memcpy(buf, outputString.c_str(), length); + + MPI_File_write_shared(out, buf, length, MPI_CHAR, &status); + delete buf; } - if(spot == -1) { mothurOut("Error: Could not find sequence in chimeraSlayer."); mothurOutEndLine(); return NULL; } - temp = new Sequence(templateSeqs[spot]->getName(), templateSeqs[spot]->getAligned()); + return results; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "print"); + exit(1); + } +} +#endif + +//*************************************************************************************************************** +int ChimeraSlayer::getChimeras(Sequence* query) { + try { + chimeraFlags = "no"; + + //filter query + spotMap = runFilter(query); - return temp; + querySeq = query; + + //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity + maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight); + slayer = new Slayer(window, increment, minSim, divR, iters, minSNP); + + if (m->control_pressed) { return 0; } + + string chimeraFlag = maligner->getResults(query, decalc); + if (m->control_pressed) { return 0; } + vector Results = maligner->getOutput(); + + //found in testing realigning only made things worse + if (realign) { + ChimeraReAligner realigner(templateSeqs, match, misMatch); + realigner.reAlign(query, Results); + } + + if (chimeraFlag == "yes") { + + //get sequence that were given from maligner results + vector seqs; + map removeDups; + map::iterator itDup; + map parentNameSeq; + map::iterator itSeq; + for (int j = 0; j < Results.size(); j++) { + float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal; + //only add if you are not a duplicate + itDup = removeDups.find(Results[j].parent); + if (itDup == removeDups.end()) { //this is not duplicate + removeDups[Results[j].parent] = dist; + parentNameSeq[Results[j].parent] = Results[j].parentAligned; + }else if (dist > itDup->second) { //is this a stronger number for this parent + removeDups[Results[j].parent] = dist; + parentNameSeq[Results[j].parent] = Results[j].parentAligned; + } + } + + for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) { + //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template + itSeq = parentNameSeq.find(itDup->first); +//cout << itDup->first << itSeq->second << endl; + Sequence* seq = new Sequence(itDup->first, itSeq->second); + + SeqDist member; + member.seq = seq; + member.dist = itDup->second; + + seqs.push_back(member); + } + + //limit number of parents to explore - default 3 + if (Results.size() > parents) { + //sort by distance + sort(seqs.begin(), seqs.end(), compareSeqDist); + //prioritize larger more similiar sequence fragments + reverse(seqs.begin(), seqs.end()); + + for (int k = seqs.size()-1; k > (parents-1); k--) { + delete seqs[k].seq; + seqs.pop_back(); + } + } + + //put seqs into vector to send to slayer + vector seqsForSlayer; + for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); } + + //mask then send to slayer... + if (seqMask != "") { + decalc->setMask(seqMask); + + //mask querys + decalc->runMask(query); + + //mask parents + for (int k = 0; k < seqsForSlayer.size(); k++) { + decalc->runMask(seqsForSlayer[k]); + } + + spotMap = decalc->getMaskMap(); + } + + if (m->control_pressed) { for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } return 0; } + + //send to slayer + chimeraFlags = slayer->getResults(query, seqsForSlayer); + if (m->control_pressed) { return 0; } + chimeraResults = slayer->getOutput(); + + //free memory + for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } + } + + delete maligner; + delete slayer; + + return 0; } catch(exception& e) { - errorOut(e, "ChimeraSlayer", "getSequence"); + m->errorOut(e, "ChimeraSlayer", "getChimeras"); exit(1); } } //*************************************************************************************************************** -void ChimeraSlayer::printBlock(data_struct data, ostream& out, int i){ +void ChimeraSlayer::printBlock(data_struct data, ostream& out){ try { + //out << ":)\n"; + + out << querySeq->getName() << '\t'; + out << data.parentA.getName() << "\t" << data.parentB.getName() << '\t'; + //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl; + //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl; - out << "parentA: " << data.parentA.getName() << " parentB: " << data.parentB.getName() << endl; - out << "Left Window: " << spotMap[i][data.winLStart] << " " << spotMap[i][data.winLEnd] << endl; - out << "Right Window: " << spotMap[i][data.winRStart] << " " << spotMap[i][data.winREnd] << endl; + out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t'; + out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t'; - out << "Divergence of Query to Leftside ParentA and Rightside ParentB: " << data.divr_qla_qrb << '\t' << "Bootstrap: " << data.bsa << endl; - out << "Divergence of Query to Rightside ParentA and Leftside ParentB: " << data.divr_qlb_qra << '\t' << "Bootstrap: " << data.bsb << endl; + out << "yes\t" << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t'; + + //out << "Similarity of parents: " << data.ab << endl; + //out << "Similarity of query to parentA: " << data.qa << endl; + //out << "Similarity of query to parentB: " << data.qb << endl; - out << "Similarity of parents: " << data.ab << endl; - out << "Similarity of query to parentA: " << data.qa << endl; - out << "Similarity of query to parentB: " << data.qb << endl; //out << "Per_id(QL,A): " << data.qla << endl; //out << "Per_id(QL,B): " << data.qlb << endl; @@ -268,14 +412,35 @@ void ChimeraSlayer::printBlock(data_struct data, ostream& out, int i){ //out << "Per_id(QR,B): " << data.qrb << endl; - out << "DeltaL: " << (data.qla - data.qlb) << endl; - out << "DeltaR: " << (data.qra - data.qrb) << endl; + //out << "DeltaL: " << (data.qla - data.qlb) << endl; + //out << "DeltaR: " << (data.qra - data.qrb) << endl; } catch(exception& e) { - errorOut(e, "ChimeraSlayer", "printBlock"); + m->errorOut(e, "ChimeraSlayer", "printBlock"); exit(1); } } //*************************************************************************************************************** +string ChimeraSlayer::getBlock(data_struct data){ + try { + + string outputString = ""; + + outputString += querySeq->getName() + "\t"; + outputString += data.parentA.getName() + "\t" + data.parentB.getName() + "\t"; + + outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t"; + outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t"; + + outputString += "yes\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t"; + + return outputString; + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "getBlock"); + exit(1); + } +} +//***************************************************************************************************************/