X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;ds=sidebyside;f=chimeraslayer.cpp;h=57ed7130916b0e1d810b470fa0552dc5556a1143;hb=30c1fd8c45b6f0d66c17f2714dbb58b8ddccdce2;hp=8a0cbd067cc6cbfb8ced9fbb3a3cb89ea6fdc62b;hpb=e72551c9cc5542e6a354f0f3e415fea261421d72;p=mothur.git diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp index 8a0cbd0..57ed713 100644 --- a/chimeraslayer.cpp +++ b/chimeraslayer.cpp @@ -9,19 +9,128 @@ #include "chimeraslayer.h" #include "chimerarealigner.h" +#include "kmerdb.hpp" //*************************************************************************************************************** -ChimeraSlayer::ChimeraSlayer(string mode, bool r) : searchMethod(mode), realign(r) { decalc = new DeCalculator(); } +ChimeraSlayer::ChimeraSlayer(string mode, bool r, string f) : searchMethod(mode), realign(r), fastafile(f) { + decalc = new DeCalculator(); +} +//*************************************************************************************************************** +int ChimeraSlayer::doPrep() { + try { + + string kmerDBNameLeft; + string kmerDBNameRight; + + //generate the kmerdb to pass to maligner + if (searchMethod == "kmer") { + //leftside + string leftTemplateFileName = "left." + templateFileName; + databaseLeft = new KmerDB(leftTemplateFileName, kmerSize); + kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + ifstream kmerFileTestLeft(kmerDBNameLeft.c_str()); + + if(!kmerFileTestLeft){ + + for (int i = 0; i < templateSeqs.size(); i++) { + + if (m->control_pressed) { return 0; } + + string leftFrag = templateSeqs[i]->getUnaligned(); + leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); + + Sequence leftTemp(templateSeqs[i]->getName(), leftFrag); + databaseLeft->addSequence(leftTemp); + } + databaseLeft->generateDB(); + + }else { + databaseLeft->readKmerDB(kmerFileTestLeft); + } + kmerFileTestLeft.close(); + + databaseLeft->setNumSeqs(templateSeqs.size()); + + //rightside + string rightTemplateFileName = "right." + templateFileName; + databaseRight = new KmerDB(rightTemplateFileName, kmerSize); + kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + ifstream kmerFileTestRight(kmerDBNameRight.c_str()); + + if(!kmerFileTestRight){ + + for (int i = 0; i < templateSeqs.size(); i++) { + if (m->control_pressed) { return 0; } + + string rightFrag = templateSeqs[i]->getUnaligned(); + rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); + + Sequence rightTemp(templateSeqs[i]->getName(), rightFrag); + databaseRight->addSequence(rightTemp); + } + databaseRight->generateDB(); + + }else { + databaseRight->readKmerDB(kmerFileTestRight); + } + kmerFileTestRight.close(); + + databaseRight->setNumSeqs(templateSeqs.size()); + + } + + int start = time(NULL); + //filter the sequences + //read in all query seqs + ifstream in; + openInputFile(fastafile, in); + + vector tempQuerySeqs; + while(!in.eof()){ + if (m->control_pressed) { for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } return 0; } + + Sequence* s = new Sequence(in); + gobble(in); + + if (s->getName() != "") { tempQuerySeqs.push_back(s); } + } + in.close(); + + vector temp = templateSeqs; + for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); } + + createFilter(temp, 0.0); //just removed columns where all seqs have a gap + + for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; } + + if (m->control_pressed) { return 0; } + + + //run filter on template + for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { return 0; } runFilter(templateSeqs[i]); } + + m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to filter."); m->mothurOutEndLine(); + + return 0; + + } + catch(exception& e) { + m->errorOut(e, "ChimeraSlayer", "doprep"); + exit(1); + } +} //*************************************************************************************************************** -ChimeraSlayer::~ChimeraSlayer() { delete decalc; } +ChimeraSlayer::~ChimeraSlayer() { delete decalc; if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; } } //*************************************************************************************************************** void ChimeraSlayer::printHeader(ostream& out) { - mothurOutEndLine(); - mothurOut("Only reporting sequence supported by 90% of bootstrapped results."); - mothurOutEndLine(); + m->mothurOutEndLine(); + m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results."); + m->mothurOutEndLine(); + + out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n"; } //*************************************************************************************************************** -void ChimeraSlayer::print(ostream& out) { +int ChimeraSlayer::print(ostream& out, ostream& outAcc) { try { if (chimeraFlags == "yes") { string chimeraFlag = "no"; @@ -32,17 +141,20 @@ void ChimeraSlayer::print(ostream& out) { if (chimeraFlag == "yes") { if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) { - mothurOut(querySeq->getName() + "\tyes"); mothurOutEndLine(); + m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine(); + outAcc << querySeq->getName() << endl; } } - out << querySeq->getName() << "\tyes" << endl; + printBlock(chimeraResults[0], out); out << endl; }else { out << querySeq->getName() << "\tno" << endl; } + return 0; + } catch(exception& e) { - errorOut(e, "ChimeraSlayer", "print"); + m->errorOut(e, "ChimeraSlayer", "print"); exit(1); } } @@ -50,150 +162,147 @@ void ChimeraSlayer::print(ostream& out) { int ChimeraSlayer::getChimeras(Sequence* query) { try { chimeraFlags = "no"; - querySeq = query; - for (int i = 0; i < query->getAligned().length(); i++) { - spotMap[i] = i; - } + //filter query + spotMap = runFilter(query); + + querySeq = query; //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity - maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod); + maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight); slayer = new Slayer(window, increment, minSim, divR, iters, minSNP); + if (m->control_pressed) { return 0; } + string chimeraFlag = maligner->getResults(query, decalc); + if (m->control_pressed) { return 0; } vector Results = maligner->getOutput(); - - //realign query to parents to improve slayers detection rate??? + + //found in testing realigning only made things worse if (realign) { ChimeraReAligner realigner(templateSeqs, match, misMatch); realigner.reAlign(query, Results); } - //if (chimeraFlag == "yes") { - - //get sequence that were given from maligner results - vector seqs; - map removeDups; - map::iterator itDup; - for (int j = 0; j < Results.size(); j++) { - float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal; - //only add if you are not a duplicate - itDup = removeDups.find(Results[j].parent); - if (itDup == removeDups.end()) { //this is not duplicate - removeDups[Results[j].parent] = dist; - }else if (dist > itDup->second) { //is this a stronger number for this parent - removeDups[Results[j].parent] = dist; - } - } - - for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) { - Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template + if (chimeraFlag == "yes") { - SeqDist member; - member.seq = seq; - member.dist = itDup->second; - - seqs.push_back(member); - } - - //limit number of parents to explore - default 3 - if (Results.size() > parents) { - //sort by distance - sort(seqs.begin(), seqs.end(), compareSeqDist); - //prioritize larger more similiar sequence fragments - reverse(seqs.begin(), seqs.end()); + //get sequence that were given from maligner results + vector seqs; + map removeDups; + map::iterator itDup; + map parentNameSeq; + map::iterator itSeq; + for (int j = 0; j < Results.size(); j++) { + float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal; + //only add if you are not a duplicate + itDup = removeDups.find(Results[j].parent); + if (itDup == removeDups.end()) { //this is not duplicate + removeDups[Results[j].parent] = dist; + parentNameSeq[Results[j].parent] = Results[j].parentAligned; + }else if (dist > itDup->second) { //is this a stronger number for this parent + removeDups[Results[j].parent] = dist; + parentNameSeq[Results[j].parent] = Results[j].parentAligned; + } + } - for (int k = seqs.size()-1; k > (parents-1); k--) { - delete seqs[k].seq; - seqs.pop_back(); + for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) { + //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template + itSeq = parentNameSeq.find(itDup->first); +//cout << itDup->first << itSeq->second << endl; + Sequence* seq = new Sequence(itDup->first, itSeq->second); + + SeqDist member; + member.seq = seq; + member.dist = itDup->second; + + seqs.push_back(member); } - } - - //put seqs into vector to send to slayer - vector seqsForSlayer; - for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); } - //cout << i+1 << "num parents = " << seqsForSlayer.size() << '\t' << chimeraFlag << endl; -//ofstream out; -//string name = querySeqs[i]->getName(); -//cout << name << endl; -//name = name.substr(name.find_first_of("|")+1); -//cout << name << endl; -//name = name.substr(name.find_first_of("|")+1); -//cout << name << endl; -//name = name.substr(0, name.find_last_of("|")); -//cout << name << endl; -//string filename = toString(i+1) + ".seqsforslayer"; -//openOutputFile(filename, out); -//cout << querySeqs[i]->getName() << endl; -//for (int u = 0; u < seqsForSlayer.size(); u++) { cout << seqsForSlayer[u]->getName() << '\t'; seqsForSlayer[u]->printSequence(out); } -//cout << endl; -//out.close(); -//filename = toString(i+1) + ".fasta"; -//openOutputFile(filename, out); -//querySeqs[i]->printSequence(out); -//out.close(); - - - //mask then send to slayer... - if (seqMask != "") { - decalc->setMask(seqMask); + //limit number of parents to explore - default 3 + if (Results.size() > parents) { + //sort by distance + sort(seqs.begin(), seqs.end(), compareSeqDist); + //prioritize larger more similiar sequence fragments + reverse(seqs.begin(), seqs.end()); + + for (int k = seqs.size()-1; k > (parents-1); k--) { + delete seqs[k].seq; + seqs.pop_back(); + } + } - //mask querys - decalc->runMask(query); + //put seqs into vector to send to slayer + vector seqsForSlayer; + for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); } - //mask parents - for (int k = 0; k < seqsForSlayer.size(); k++) { - decalc->runMask(seqsForSlayer[k]); + //mask then send to slayer... + if (seqMask != "") { + decalc->setMask(seqMask); + + //mask querys + decalc->runMask(query); + + //mask parents + for (int k = 0; k < seqsForSlayer.size(); k++) { + decalc->runMask(seqsForSlayer[k]); + } + + spotMap = decalc->getMaskMap(); } - spotMap = decalc->getMaskMap(); + if (m->control_pressed) { for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } return 0; } + + //send to slayer + chimeraFlags = slayer->getResults(query, seqsForSlayer); + if (m->control_pressed) { return 0; } + chimeraResults = slayer->getOutput(); + + //free memory + for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } } - //send to slayer - chimeraFlags = slayer->getResults(query, seqsForSlayer); - chimeraResults = slayer->getOutput(); - - //free memory - for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } - //} - + delete maligner; + delete slayer; + return 0; } catch(exception& e) { - errorOut(e, "ChimeraSlayer", "getChimeras"); + m->errorOut(e, "ChimeraSlayer", "getChimeras"); exit(1); } } //*************************************************************************************************************** void ChimeraSlayer::printBlock(data_struct data, ostream& out){ try { + //out << "Name\tParentA\tParentB\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n"; + + out << querySeq->getName() << '\t'; + out << data.parentA.getName() << "\t" << data.parentB.getName() << '\t'; + //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl; + //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl; + + out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t'; + out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t'; - out << "parentA: " << data.parentA.getName() << " parentB: " << data.parentB.getName() << endl; - out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl; - out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl; + out << "yes\t" << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t'; - out << "Divergence of Query to Leftside ParentA and Rightside ParentB: " << data.divr_qla_qrb << '\t' << "Bootstrap: " << data.bsa << endl; - out << "Divergence of Query to Rightside ParentA and Leftside ParentB: " << data.divr_qlb_qra << '\t' << "Bootstrap: " << data.bsb << endl; + //out << "Similarity of parents: " << data.ab << endl; + //out << "Similarity of query to parentA: " << data.qa << endl; + //out << "Similarity of query to parentB: " << data.qb << endl; - out << "Similarity of parents: " << data.ab << endl; - out << "Similarity of query to parentA: " << data.qa << endl; - out << "Similarity of query to parentB: " << data.qb << endl; - out << "Percent_ID QLA_QRB: " << data.qla_qrb << endl; - out << "Percent_ID QLB_QRA: " << data.qlb_qra << endl; //out << "Per_id(QL,A): " << data.qla << endl; //out << "Per_id(QL,B): " << data.qlb << endl; //out << "Per_id(QR,A): " << data.qra << endl; //out << "Per_id(QR,B): " << data.qrb << endl; - out << "DeltaL: " << (data.qla - data.qlb) << endl; - out << "DeltaR: " << (data.qra - data.qrb) << endl; + //out << "DeltaL: " << (data.qla - data.qlb) << endl; + //out << "DeltaR: " << (data.qra - data.qrb) << endl; } catch(exception& e) { - errorOut(e, "ChimeraSlayer", "printBlock"); + m->errorOut(e, "ChimeraSlayer", "printBlock"); exit(1); } }