5 * Created by westcott on 9/25/09.
6 * Copyright 2009 Pschloss Lab. All rights reserved.
10 #include "chimeraslayer.h"
11 #include "chimerarealigner.h"
13 #include "blastdb.hpp"
15 //***************************************************************************************************************
16 ChimeraSlayer::ChimeraSlayer(string file, string temp, string mode, int k, int ms, int mms, int win, float div,
17 int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() {
20 templateFileName = temp; templateSeqs = readSeqs(temp);
37 decalc = new DeCalculator();
42 m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
46 //***************************************************************************************************************
47 ChimeraSlayer::ChimeraSlayer(string file, string temp, string name, string mode, int k, int ms, int mms, int win, float div,
48 int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() {
50 fastafile = file; templateSeqs = readSeqs(fastafile);
51 templateFileName = temp;
68 //read name file and create nameMapRank
71 decalc = new DeCalculator();
73 createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap
75 //run filter on template
76 for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); }
80 m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
84 //***************************************************************************************************************
85 int ChimeraSlayer::readNameFile(string name) {
88 m->openInputFile(name, in);
91 int minRank = 10000000;
95 if (m->control_pressed) { in.close(); return 0; }
97 string thisname, repnames;
99 in >> thisname; m->gobble(in); //read from first column
100 in >> repnames; //read from second column
102 map<string, vector<string> >::iterator it = nameMapRank.find(thisname);
103 if (it == nameMapRank.end()) {
105 vector<string> splitRepNames;
106 m->splitAtComma(repnames, splitRepNames);
108 nameMapRank[thisname] = splitRepNames;
110 if (splitRepNames.size() > maxRank) { maxRank = splitRepNames.size(); }
111 if (splitRepNames.size() < minRank) { minRank = splitRepNames.size(); }
113 }else{ m->mothurOut(thisname + " is already in namesfile. I will use first definition."); m->mothurOutEndLine(); }
119 //sanity check to make sure files match
120 for (int i = 0; i < templateSeqs.size(); i++) {
121 map<string, vector<string> >::iterator it = nameMapRank.find(templateSeqs[i]->getName());
123 if (it == nameMapRank.end()) { m->mothurOut("[ERROR]: " + templateSeqs[i]->getName() + " is not in namesfile, but is in fastafile. Every name in fasta file must be in first column of names file."); m->mothurOutEndLine(); m->control_pressed = true; }
126 if (maxRank == minRank) { m->mothurOut("[ERROR]: all sequences in namesfile have the same abundance, aborting."); m->mothurOutEndLine(); m->control_pressed = true; }
131 catch(exception& e) {
132 m->errorOut(e, "ChimeraSlayer", "readNameFile");
137 //***************************************************************************************************************
138 int ChimeraSlayer::doPrep() {
141 //read in all query seqs
142 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
144 vector<Sequence*> temp = templateSeqs;
145 for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
147 createFilter(temp, 0.0); //just removed columns where all seqs have a gap
149 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
151 if (m->control_pressed) { return 0; }
153 //run filter on template
154 for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { return 0; } runFilter(templateSeqs[i]); }
156 string kmerDBNameLeft;
157 string kmerDBNameRight;
159 //generate the kmerdb to pass to maligner
160 if (searchMethod == "kmer") {
161 string templatePath = m->hasPath(templateFileName);
162 string rightTemplateFileName = templatePath + "right." + m->getRootName(m->getSimpleName(templateFileName));
163 databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
165 string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName));
166 databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);
168 for (int i = 0; i < templateSeqs.size(); i++) {
170 if (m->control_pressed) { return 0; }
172 string leftFrag = templateSeqs[i]->getUnaligned();
173 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
175 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
176 databaseLeft->addSequence(leftTemp);
178 databaseLeft->generateDB();
179 databaseLeft->setNumSeqs(templateSeqs.size());
181 for (int i = 0; i < templateSeqs.size(); i++) {
182 if (m->control_pressed) { return 0; }
184 string rightFrag = templateSeqs[i]->getUnaligned();
185 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
187 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
188 databaseRight->addSequence(rightTemp);
190 databaseRight->generateDB();
191 databaseRight->setNumSeqs(templateSeqs.size());
195 kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
196 ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
197 bool needToGenerateLeft = true;
199 if(kmerFileTestLeft){
200 bool GoodFile = m->checkReleaseVersion(kmerFileTestLeft, m->getVersion());
201 if (GoodFile) { needToGenerateLeft = false; }
204 if(needToGenerateLeft){
206 for (int i = 0; i < templateSeqs.size(); i++) {
208 if (m->control_pressed) { return 0; }
210 string leftFrag = templateSeqs[i]->getUnaligned();
211 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
213 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
214 databaseLeft->addSequence(leftTemp);
216 databaseLeft->generateDB();
219 databaseLeft->readKmerDB(kmerFileTestLeft);
221 kmerFileTestLeft.close();
223 databaseLeft->setNumSeqs(templateSeqs.size());
226 kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
227 ifstream kmerFileTestRight(kmerDBNameRight.c_str());
228 bool needToGenerateRight = true;
230 if(kmerFileTestRight){
231 bool GoodFile = m->checkReleaseVersion(kmerFileTestRight, m->getVersion());
232 if (GoodFile) { needToGenerateRight = false; }
235 if(needToGenerateRight){
237 for (int i = 0; i < templateSeqs.size(); i++) {
238 if (m->control_pressed) { return 0; }
240 string rightFrag = templateSeqs[i]->getUnaligned();
241 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
243 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
244 databaseRight->addSequence(rightTemp);
246 databaseRight->generateDB();
249 databaseRight->readKmerDB(kmerFileTestRight);
251 kmerFileTestRight.close();
253 databaseRight->setNumSeqs(templateSeqs.size());
255 }else if (searchMethod == "blast") {
258 databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
259 for (int i = 0; i < templateSeqs.size(); i++) { databaseLeft->addSequence(*templateSeqs[i]); }
260 databaseLeft->generateDB();
261 databaseLeft->setNumSeqs(templateSeqs.size());
267 catch(exception& e) {
268 m->errorOut(e, "ChimeraSlayer", "doprep");
272 //***************************************************************************************************************
273 vector<Sequence*> ChimeraSlayer::getTemplate(Sequence* q) {
276 vector<Sequence*> thisTemplate;
279 string thisName = q->getName();
280 map<string, vector<string> >::iterator itRank = nameMapRank.find(thisName); // you will find it because we already sanity checked
281 thisRank = (itRank->second).size();
283 //create list of names we want to put into the template
284 set<string> namesToAdd;
285 for (itRank = nameMapRank.begin(); itRank != nameMapRank.end(); itRank++) {
286 if ((itRank->second).size() > thisRank) {
287 //you are more abundant than me
288 for (int i = 0; i < (itRank->second).size(); i++) {
289 namesToAdd.insert((itRank->second)[i]);
294 for (int i = 0; i < templateSeqs.size(); i++) {
295 if (namesToAdd.count(templateSeqs[i]->getName()) != 0) {
296 thisTemplate.push_back(templateSeqs[i]);
300 string kmerDBNameLeft;
301 string kmerDBNameRight;
303 //generate the kmerdb to pass to maligner
304 if (searchMethod == "kmer") {
305 string templatePath = m->hasPath(templateFileName);
306 string rightTemplateFileName = templatePath + "right." + m->getRootName(m->getSimpleName(templateFileName));
307 databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
309 string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName));
310 databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);
312 for (int i = 0; i < thisTemplate.size(); i++) {
314 if (m->control_pressed) { return thisTemplate; }
316 string leftFrag = thisTemplate[i]->getUnaligned();
317 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
319 Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
320 databaseLeft->addSequence(leftTemp);
322 databaseLeft->generateDB();
323 databaseLeft->setNumSeqs(thisTemplate.size());
325 for (int i = 0; i < thisTemplate.size(); i++) {
326 if (m->control_pressed) { return thisTemplate; }
328 string rightFrag = thisTemplate[i]->getUnaligned();
329 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
331 Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
332 databaseRight->addSequence(rightTemp);
334 databaseRight->generateDB();
335 databaseRight->setNumSeqs(thisTemplate.size());
340 for (int i = 0; i < thisTemplate.size(); i++) {
342 if (m->control_pressed) { return thisTemplate; }
344 string leftFrag = thisTemplate[i]->getUnaligned();
345 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
347 Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
348 databaseLeft->addSequence(leftTemp);
350 databaseLeft->generateDB();
351 databaseLeft->setNumSeqs(thisTemplate.size());
353 for (int i = 0; i < thisTemplate.size(); i++) {
354 if (m->control_pressed) { return thisTemplate; }
356 string rightFrag = thisTemplate[i]->getUnaligned();
357 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
359 Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
360 databaseRight->addSequence(rightTemp);
362 databaseRight->generateDB();
363 databaseRight->setNumSeqs(thisTemplate.size());
365 }else if (searchMethod == "blast") {
368 databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
369 for (int i = 0; i < thisTemplate.size(); i++) { if (m->control_pressed) { return thisTemplate; } databaseLeft->addSequence(*thisTemplate[i]); }
370 databaseLeft->generateDB();
371 databaseLeft->setNumSeqs(thisTemplate.size());
377 catch(exception& e) {
378 m->errorOut(e, "ChimeraSlayer", "getTemplate");
383 //***************************************************************************************************************
384 ChimeraSlayer::~ChimeraSlayer() {
386 if (templateFileName != "self") {
387 if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; }
388 else if (searchMethod == "blast") { delete databaseLeft; }
391 //***************************************************************************************************************
392 void ChimeraSlayer::printHeader(ostream& out) {
393 m->mothurOutEndLine();
394 m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.");
395 m->mothurOutEndLine();
397 out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
399 //***************************************************************************************************************
400 int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
402 if (chimeraFlags == "yes") {
403 string chimeraFlag = "no";
404 if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
406 (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
409 if (chimeraFlag == "yes") {
410 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
411 m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
412 outAcc << querySeq->getName() << endl;
416 printBlock(chimeraResults[0], chimeraFlag, out);
418 }else { out << querySeq->getName() << "\tno" << endl; }
423 catch(exception& e) {
424 m->errorOut(e, "ChimeraSlayer", "print");
429 //***************************************************************************************************************
430 int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
433 bool results = false;
434 string outAccString = "";
435 string outputString = "";
437 if (chimeraFlags == "yes") {
438 string chimeraFlag = "no";
439 if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
441 (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
444 if (chimeraFlag == "yes") {
445 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
446 cout << querySeq->getName() << "\tyes" << endl;
447 outAccString += querySeq->getName() + "\n";
450 //write to accnos file
451 int length = outAccString.length();
452 char* buf2 = new char[length];
453 memcpy(buf2, outAccString.c_str(), length);
455 MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status);
460 outputString = getBlock(chimeraResults[0], chimeraFlag);
461 outputString += "\n";
462 //cout << outputString << endl;
463 //write to output file
464 int length = outputString.length();
465 char* buf = new char[length];
466 memcpy(buf, outputString.c_str(), length);
468 MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
472 outputString += querySeq->getName() + "\tno\n";
473 //cout << outputString << endl;
474 //write to output file
475 int length = outputString.length();
476 char* buf = new char[length];
477 memcpy(buf, outputString.c_str(), length);
479 MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
486 catch(exception& e) {
487 m->errorOut(e, "ChimeraSlayer", "print");
493 //***************************************************************************************************************
494 int ChimeraSlayer::getChimeras(Sequence* query) {
499 spotMap = runFilter(query);
503 //you must create a template
504 vector<Sequence*> thisTemplate;
505 if (templateFileName != "self") { thisTemplate = templateSeqs; }
506 else { thisTemplate = getTemplate(query); } //fills thistemplate and creates the databases
508 if (m->control_pressed) { return 0; }
510 if (thisTemplate.size() == 0) { return 0; } //not chimeric
512 //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
513 Maligner maligner(thisTemplate, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
514 Slayer slayer(window, increment, minSim, divR, iters, minSNP);
516 if (templateFileName == "self") {
517 if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; }
518 else if (searchMethod == "blast") { delete databaseLeft; }
521 if (m->control_pressed) { return 0; }
523 string chimeraFlag = maligner.getResults(query, decalc);
524 if (m->control_pressed) { return 0; }
525 vector<results> Results = maligner.getOutput();
527 //found in testing realigning only made things worse
529 ChimeraReAligner realigner(thisTemplate, match, misMatch);
530 realigner.reAlign(query, Results);
533 if (chimeraFlag == "yes") {
535 //get sequence that were given from maligner results
536 vector<SeqDist> seqs;
537 map<string, float> removeDups;
538 map<string, float>::iterator itDup;
539 map<string, string> parentNameSeq;
540 map<string, string>::iterator itSeq;
541 for (int j = 0; j < Results.size(); j++) {
542 float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
543 //only add if you are not a duplicate
544 itDup = removeDups.find(Results[j].parent);
545 if (itDup == removeDups.end()) { //this is not duplicate
546 removeDups[Results[j].parent] = dist;
547 parentNameSeq[Results[j].parent] = Results[j].parentAligned;
548 }else if (dist > itDup->second) { //is this a stronger number for this parent
549 removeDups[Results[j].parent] = dist;
550 parentNameSeq[Results[j].parent] = Results[j].parentAligned;
554 for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
555 //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
556 itSeq = parentNameSeq.find(itDup->first);
557 //cout << itDup->first << itSeq->second << endl;
558 Sequence* seq = new Sequence(itDup->first, itSeq->second);
562 member.dist = itDup->second;
564 seqs.push_back(member);
567 //limit number of parents to explore - default 3
568 if (Results.size() > parents) {
570 sort(seqs.begin(), seqs.end(), compareSeqDist);
571 //prioritize larger more similiar sequence fragments
572 reverse(seqs.begin(), seqs.end());
574 for (int k = seqs.size()-1; k > (parents-1); k--) {
580 //put seqs into vector to send to slayer
581 vector<Sequence*> seqsForSlayer;
582 for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); }
584 //mask then send to slayer...
586 decalc->setMask(seqMask);
589 decalc->runMask(query);
592 for (int k = 0; k < seqsForSlayer.size(); k++) {
593 decalc->runMask(seqsForSlayer[k]);
596 spotMap = decalc->getMaskMap();
599 if (m->control_pressed) { for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } return 0; }
602 chimeraFlags = slayer.getResults(query, seqsForSlayer);
603 if (m->control_pressed) { return 0; }
604 chimeraResults = slayer.getOutput();
607 for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; }
615 catch(exception& e) {
616 m->errorOut(e, "ChimeraSlayer", "getChimeras");
620 //***************************************************************************************************************
621 void ChimeraSlayer::printBlock(data_struct data, string flag, ostream& out){
625 out << querySeq->getName() << '\t';
626 out << data.parentA.getName() << "\t" << data.parentB.getName() << '\t';
627 //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
628 //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
630 out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
631 out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
633 out << flag << '\t' << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
635 //out << "Similarity of parents: " << data.ab << endl;
636 //out << "Similarity of query to parentA: " << data.qa << endl;
637 //out << "Similarity of query to parentB: " << data.qb << endl;
640 //out << "Per_id(QL,A): " << data.qla << endl;
641 //out << "Per_id(QL,B): " << data.qlb << endl;
642 //out << "Per_id(QR,A): " << data.qra << endl;
643 //out << "Per_id(QR,B): " << data.qrb << endl;
646 //out << "DeltaL: " << (data.qla - data.qlb) << endl;
647 //out << "DeltaR: " << (data.qra - data.qrb) << endl;
650 catch(exception& e) {
651 m->errorOut(e, "ChimeraSlayer", "printBlock");
655 //***************************************************************************************************************
656 string ChimeraSlayer::getBlock(data_struct data, string flag){
659 string outputString = "";
661 outputString += querySeq->getName() + "\t";
662 outputString += data.parentA.getName() + "\t" + data.parentB.getName() + "\t";
664 outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t";
665 outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t";
667 outputString += flag + "\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t";
671 catch(exception& e) {
672 m->errorOut(e, "ChimeraSlayer", "getBlock");
676 //***************************************************************************************************************/