5 * Created by westcott on 9/25/09.
6 * Copyright 2009 Pschloss Lab. All rights reserved.
10 #include "chimeraslayer.h"
11 #include "chimerarealigner.h"
13 #include "blastdb.hpp"
15 //***************************************************************************************************************
16 ChimeraSlayer::ChimeraSlayer(string file, string temp, string mode, int k, int ms, int mms, int win, float div,
17 int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() {
20 templateFileName = temp; templateSeqs = readSeqs(temp);
37 decalc = new DeCalculator();
42 m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
46 //***************************************************************************************************************
47 ChimeraSlayer::ChimeraSlayer(string file, string temp, string name, string mode, string abunds, int k, int ms, int mms, int win, float div,
48 int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() {
50 fastafile = file; templateSeqs = readSeqs(fastafile);
51 templateFileName = temp;
67 includeAbunds = abunds;
69 //read name file and create nameMapRank
72 decalc = new DeCalculator();
74 createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap
76 //run filter on template
77 for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); }
81 m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
85 //***************************************************************************************************************
86 int ChimeraSlayer::readNameFile(string name) {
89 m->openInputFile(name, in);
92 int minRank = 10000000;
96 if (m->control_pressed) { in.close(); return 0; }
98 string thisname, repnames;
100 in >> thisname; m->gobble(in); //read from first column
101 in >> repnames; //read from second column
103 map<string, vector<string> >::iterator it = nameMapRank.find(thisname);
104 if (it == nameMapRank.end()) {
106 vector<string> splitRepNames;
107 m->splitAtComma(repnames, splitRepNames);
109 nameMapRank[thisname] = splitRepNames;
111 if (splitRepNames.size() > maxRank) { maxRank = splitRepNames.size(); }
112 if (splitRepNames.size() < minRank) { minRank = splitRepNames.size(); }
114 }else{ m->mothurOut(thisname + " is already in namesfile. I will use first definition."); m->mothurOutEndLine(); }
120 //sanity check to make sure files match
121 for (int i = 0; i < templateSeqs.size(); i++) {
122 map<string, vector<string> >::iterator it = nameMapRank.find(templateSeqs[i]->getName());
124 if (it == nameMapRank.end()) { m->mothurOut("[ERROR]: " + templateSeqs[i]->getName() + " is not in namesfile, but is in fastafile. Every name in fasta file must be in first column of names file."); m->mothurOutEndLine(); m->control_pressed = true; }
127 if (maxRank == minRank) { m->mothurOut("[ERROR]: all sequences in namesfile have the same abundance, aborting."); m->mothurOutEndLine(); m->control_pressed = true; }
132 catch(exception& e) {
133 m->errorOut(e, "ChimeraSlayer", "readNameFile");
138 //***************************************************************************************************************
139 int ChimeraSlayer::doPrep() {
142 //read in all query seqs
143 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
145 vector<Sequence*> temp = templateSeqs;
146 for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
148 createFilter(temp, 0.0); //just removed columns where all seqs have a gap
150 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
152 if (m->control_pressed) { return 0; }
154 //run filter on template
155 for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { return 0; } runFilter(templateSeqs[i]); }
157 string kmerDBNameLeft;
158 string kmerDBNameRight;
160 //generate the kmerdb to pass to maligner
161 if (searchMethod == "kmer") {
162 string templatePath = m->hasPath(templateFileName);
163 string rightTemplateFileName = templatePath + "right." + m->getRootName(m->getSimpleName(templateFileName));
164 databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
166 string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName));
167 databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);
169 for (int i = 0; i < templateSeqs.size(); i++) {
171 if (m->control_pressed) { return 0; }
173 string leftFrag = templateSeqs[i]->getUnaligned();
174 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
176 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
177 databaseLeft->addSequence(leftTemp);
179 databaseLeft->generateDB();
180 databaseLeft->setNumSeqs(templateSeqs.size());
182 for (int i = 0; i < templateSeqs.size(); i++) {
183 if (m->control_pressed) { return 0; }
185 string rightFrag = templateSeqs[i]->getUnaligned();
186 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
188 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
189 databaseRight->addSequence(rightTemp);
191 databaseRight->generateDB();
192 databaseRight->setNumSeqs(templateSeqs.size());
196 kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
197 ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
198 bool needToGenerateLeft = true;
200 if(kmerFileTestLeft){
201 bool GoodFile = m->checkReleaseVersion(kmerFileTestLeft, m->getVersion());
202 if (GoodFile) { needToGenerateLeft = false; }
205 if(needToGenerateLeft){
207 for (int i = 0; i < templateSeqs.size(); i++) {
209 if (m->control_pressed) { return 0; }
211 string leftFrag = templateSeqs[i]->getUnaligned();
212 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
214 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
215 databaseLeft->addSequence(leftTemp);
217 databaseLeft->generateDB();
220 databaseLeft->readKmerDB(kmerFileTestLeft);
222 kmerFileTestLeft.close();
224 databaseLeft->setNumSeqs(templateSeqs.size());
227 kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
228 ifstream kmerFileTestRight(kmerDBNameRight.c_str());
229 bool needToGenerateRight = true;
231 if(kmerFileTestRight){
232 bool GoodFile = m->checkReleaseVersion(kmerFileTestRight, m->getVersion());
233 if (GoodFile) { needToGenerateRight = false; }
236 if(needToGenerateRight){
238 for (int i = 0; i < templateSeqs.size(); i++) {
239 if (m->control_pressed) { return 0; }
241 string rightFrag = templateSeqs[i]->getUnaligned();
242 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
244 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
245 databaseRight->addSequence(rightTemp);
247 databaseRight->generateDB();
250 databaseRight->readKmerDB(kmerFileTestRight);
252 kmerFileTestRight.close();
254 databaseRight->setNumSeqs(templateSeqs.size());
256 }else if (searchMethod == "blast") {
259 databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
260 for (int i = 0; i < templateSeqs.size(); i++) { databaseLeft->addSequence(*templateSeqs[i]); }
261 databaseLeft->generateDB();
262 databaseLeft->setNumSeqs(templateSeqs.size());
268 catch(exception& e) {
269 m->errorOut(e, "ChimeraSlayer", "doprep");
273 //***************************************************************************************************************
274 vector<Sequence*> ChimeraSlayer::getTemplate(Sequence* q) {
277 vector<Sequence*> thisTemplate;
280 string thisName = q->getName();
281 map<string, vector<string> >::iterator itRank = nameMapRank.find(thisName); // you will find it because we already sanity checked
282 thisRank = (itRank->second).size();
284 //create list of names we want to put into the template
285 set<string> namesToAdd;
286 for (itRank = nameMapRank.begin(); itRank != nameMapRank.end(); itRank++) {
287 if (itRank->first != thisName) {
288 if (includeAbunds == "greaterequal") {
289 if ((itRank->second).size() >= thisRank) {
290 //you are more abundant than me or equal to my abundance
291 for (int i = 0; i < (itRank->second).size(); i++) {
292 namesToAdd.insert((itRank->second)[i]);
295 }else if (includeAbunds == "greater") {
296 if ((itRank->second).size() > thisRank) {
297 //you are more abundant than me
298 for (int i = 0; i < (itRank->second).size(); i++) {
299 namesToAdd.insert((itRank->second)[i]);
302 }else if (includeAbunds == "all") {
304 for (int i = 0; i < (itRank->second).size(); i++) {
305 namesToAdd.insert((itRank->second)[i]);
311 for (int i = 0; i < templateSeqs.size(); i++) {
312 if (namesToAdd.count(templateSeqs[i]->getName()) != 0) {
313 thisTemplate.push_back(templateSeqs[i]);
317 string kmerDBNameLeft;
318 string kmerDBNameRight;
320 //generate the kmerdb to pass to maligner
321 if (searchMethod == "kmer") {
322 string templatePath = m->hasPath(templateFileName);
323 string rightTemplateFileName = templatePath + "right." + m->getRootName(m->getSimpleName(templateFileName));
324 databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
326 string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName));
327 databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);
329 for (int i = 0; i < thisTemplate.size(); i++) {
331 if (m->control_pressed) { return thisTemplate; }
333 string leftFrag = thisTemplate[i]->getUnaligned();
334 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
336 Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
337 databaseLeft->addSequence(leftTemp);
339 databaseLeft->generateDB();
340 databaseLeft->setNumSeqs(thisTemplate.size());
342 for (int i = 0; i < thisTemplate.size(); i++) {
343 if (m->control_pressed) { return thisTemplate; }
345 string rightFrag = thisTemplate[i]->getUnaligned();
346 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
348 Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
349 databaseRight->addSequence(rightTemp);
351 databaseRight->generateDB();
352 databaseRight->setNumSeqs(thisTemplate.size());
357 for (int i = 0; i < thisTemplate.size(); i++) {
359 if (m->control_pressed) { return thisTemplate; }
361 string leftFrag = thisTemplate[i]->getUnaligned();
362 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
364 Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
365 databaseLeft->addSequence(leftTemp);
367 databaseLeft->generateDB();
368 databaseLeft->setNumSeqs(thisTemplate.size());
370 for (int i = 0; i < thisTemplate.size(); i++) {
371 if (m->control_pressed) { return thisTemplate; }
373 string rightFrag = thisTemplate[i]->getUnaligned();
374 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
376 Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
377 databaseRight->addSequence(rightTemp);
379 databaseRight->generateDB();
380 databaseRight->setNumSeqs(thisTemplate.size());
382 }else if (searchMethod == "blast") {
385 databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
386 for (int i = 0; i < thisTemplate.size(); i++) { if (m->control_pressed) { return thisTemplate; } databaseLeft->addSequence(*thisTemplate[i]); }
387 databaseLeft->generateDB();
388 databaseLeft->setNumSeqs(thisTemplate.size());
394 catch(exception& e) {
395 m->errorOut(e, "ChimeraSlayer", "getTemplate");
400 //***************************************************************************************************************
401 ChimeraSlayer::~ChimeraSlayer() {
403 if (templateFileName != "self") {
404 if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; }
405 else if (searchMethod == "blast") { delete databaseLeft; }
408 //***************************************************************************************************************
409 void ChimeraSlayer::printHeader(ostream& out) {
410 m->mothurOutEndLine();
411 m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.");
412 m->mothurOutEndLine();
414 out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
416 //***************************************************************************************************************
417 int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
419 if (chimeraFlags == "yes") {
420 string chimeraFlag = "no";
421 if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
423 (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
426 if (chimeraFlag == "yes") {
427 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
428 m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
429 outAcc << querySeq->getName() << endl;
433 printBlock(chimeraResults[0], chimeraFlag, out);
435 }else { out << querySeq->getName() << "\tno" << endl; }
440 catch(exception& e) {
441 m->errorOut(e, "ChimeraSlayer", "print");
446 //***************************************************************************************************************
447 int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
450 bool results = false;
451 string outAccString = "";
452 string outputString = "";
454 if (chimeraFlags == "yes") {
455 string chimeraFlag = "no";
456 if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
458 (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
461 if (chimeraFlag == "yes") {
462 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
463 cout << querySeq->getName() << "\tyes" << endl;
464 outAccString += querySeq->getName() + "\n";
467 //write to accnos file
468 int length = outAccString.length();
469 char* buf2 = new char[length];
470 memcpy(buf2, outAccString.c_str(), length);
472 MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status);
477 outputString = getBlock(chimeraResults[0], chimeraFlag);
478 outputString += "\n";
479 //cout << outputString << endl;
480 //write to output file
481 int length = outputString.length();
482 char* buf = new char[length];
483 memcpy(buf, outputString.c_str(), length);
485 MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
489 outputString += querySeq->getName() + "\tno\n";
490 //cout << outputString << endl;
491 //write to output file
492 int length = outputString.length();
493 char* buf = new char[length];
494 memcpy(buf, outputString.c_str(), length);
496 MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
503 catch(exception& e) {
504 m->errorOut(e, "ChimeraSlayer", "print");
510 //***************************************************************************************************************
511 int ChimeraSlayer::getChimeras(Sequence* query) {
516 spotMap = runFilter(query);
520 //you must create a template
521 vector<Sequence*> thisTemplate;
522 if (templateFileName != "self") { thisTemplate = templateSeqs; }
523 else { thisTemplate = getTemplate(query); } //fills thistemplate and creates the databases
525 if (m->control_pressed) { return 0; }
527 if (thisTemplate.size() == 0) { return 0; } //not chimeric
529 //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
530 Maligner maligner(thisTemplate, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
531 Slayer slayer(window, increment, minSim, divR, iters, minSNP);
533 if (templateFileName == "self") {
534 if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; }
535 else if (searchMethod == "blast") { delete databaseLeft; }
538 if (m->control_pressed) { return 0; }
540 string chimeraFlag = maligner.getResults(query, decalc);
541 if (m->control_pressed) { return 0; }
542 vector<results> Results = maligner.getOutput();
544 //found in testing realigning only made things worse
546 ChimeraReAligner realigner(thisTemplate, match, misMatch);
547 realigner.reAlign(query, Results);
550 if (chimeraFlag == "yes") {
552 //get sequence that were given from maligner results
553 vector<SeqDist> seqs;
554 map<string, float> removeDups;
555 map<string, float>::iterator itDup;
556 map<string, string> parentNameSeq;
557 map<string, string>::iterator itSeq;
558 for (int j = 0; j < Results.size(); j++) {
559 float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
560 //only add if you are not a duplicate
561 itDup = removeDups.find(Results[j].parent);
562 if (itDup == removeDups.end()) { //this is not duplicate
563 removeDups[Results[j].parent] = dist;
564 parentNameSeq[Results[j].parent] = Results[j].parentAligned;
565 }else if (dist > itDup->second) { //is this a stronger number for this parent
566 removeDups[Results[j].parent] = dist;
567 parentNameSeq[Results[j].parent] = Results[j].parentAligned;
571 for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
572 //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
573 itSeq = parentNameSeq.find(itDup->first);
574 //cout << itDup->first << itSeq->second << endl;
575 Sequence* seq = new Sequence(itDup->first, itSeq->second);
579 member.dist = itDup->second;
581 seqs.push_back(member);
584 //limit number of parents to explore - default 3
585 if (Results.size() > parents) {
587 sort(seqs.begin(), seqs.end(), compareSeqDist);
588 //prioritize larger more similiar sequence fragments
589 reverse(seqs.begin(), seqs.end());
591 for (int k = seqs.size()-1; k > (parents-1); k--) {
597 //put seqs into vector to send to slayer
598 vector<Sequence*> seqsForSlayer;
599 for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); }
601 //mask then send to slayer...
603 decalc->setMask(seqMask);
606 decalc->runMask(query);
609 for (int k = 0; k < seqsForSlayer.size(); k++) {
610 decalc->runMask(seqsForSlayer[k]);
613 spotMap = decalc->getMaskMap();
616 if (m->control_pressed) { for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } return 0; }
619 chimeraFlags = slayer.getResults(query, seqsForSlayer);
620 if (m->control_pressed) { return 0; }
621 chimeraResults = slayer.getOutput();
624 for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; }
632 catch(exception& e) {
633 m->errorOut(e, "ChimeraSlayer", "getChimeras");
637 //***************************************************************************************************************
638 void ChimeraSlayer::printBlock(data_struct data, string flag, ostream& out){
642 out << querySeq->getName() << '\t';
643 out << data.parentA.getName() << "\t" << data.parentB.getName() << '\t';
644 //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
645 //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
647 out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
648 out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
650 out << flag << '\t' << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
652 //out << "Similarity of parents: " << data.ab << endl;
653 //out << "Similarity of query to parentA: " << data.qa << endl;
654 //out << "Similarity of query to parentB: " << data.qb << endl;
657 //out << "Per_id(QL,A): " << data.qla << endl;
658 //out << "Per_id(QL,B): " << data.qlb << endl;
659 //out << "Per_id(QR,A): " << data.qra << endl;
660 //out << "Per_id(QR,B): " << data.qrb << endl;
663 //out << "DeltaL: " << (data.qla - data.qlb) << endl;
664 //out << "DeltaR: " << (data.qra - data.qrb) << endl;
667 catch(exception& e) {
668 m->errorOut(e, "ChimeraSlayer", "printBlock");
672 //***************************************************************************************************************
673 string ChimeraSlayer::getBlock(data_struct data, string flag){
676 string outputString = "";
678 outputString += querySeq->getName() + "\t";
679 outputString += data.parentA.getName() + "\t" + data.parentB.getName() + "\t";
681 outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t";
682 outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t";
684 outputString += flag + "\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t";
688 catch(exception& e) {
689 m->errorOut(e, "ChimeraSlayer", "getBlock");
693 //***************************************************************************************************************/