5 * Created by westcott on 9/25/09.
6 * Copyright 2009 Pschloss Lab. All rights reserved.
10 #include "chimeraslayer.h"
11 #include "chimerarealigner.h"
13 #include "blastdb.hpp"
15 //***************************************************************************************************************
16 ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string mode, int k, int ms, int mms, int win, float div,
17 int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() {
20 templateFileName = temp; templateSeqs = readSeqs(temp);
38 decalc = new DeCalculator();
43 m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
47 //***************************************************************************************************************
48 ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, string mode, string abunds, int k, int ms, int mms, int win, float div,
49 int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera() {
51 fastafile = file; templateSeqs = readSeqs(fastafile);
52 templateFileName = temp;
68 includeAbunds = abunds;
71 //read name file and create nameMapRank
74 decalc = new DeCalculator();
76 createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap
78 //run filter on template
79 for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]); }
83 m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
87 //***************************************************************************************************************
88 int ChimeraSlayer::readNameFile(string name) {
91 m->openInputFile(name, in);
94 int minRank = 10000000;
98 if (m->control_pressed) { in.close(); return 0; }
100 string thisname, repnames;
102 in >> thisname; m->gobble(in); //read from first column
103 in >> repnames; //read from second column
105 map<string, vector<string> >::iterator it = nameMapRank.find(thisname);
106 if (it == nameMapRank.end()) {
108 vector<string> splitRepNames;
109 m->splitAtComma(repnames, splitRepNames);
111 nameMapRank[thisname] = splitRepNames;
113 if (splitRepNames.size() > maxRank) { maxRank = splitRepNames.size(); }
114 if (splitRepNames.size() < minRank) { minRank = splitRepNames.size(); }
116 }else{ m->mothurOut(thisname + " is already in namesfile. I will use first definition."); m->mothurOutEndLine(); }
122 //sanity check to make sure files match
123 for (int i = 0; i < templateSeqs.size(); i++) {
124 map<string, vector<string> >::iterator it = nameMapRank.find(templateSeqs[i]->getName());
126 if (it == nameMapRank.end()) { m->mothurOut("[ERROR]: " + templateSeqs[i]->getName() + " is not in namesfile, but is in fastafile. Every name in fasta file must be in first column of names file."); m->mothurOutEndLine(); m->control_pressed = true; }
129 if (maxRank == minRank) { m->mothurOut("[ERROR]: all sequences in namesfile have the same abundance, aborting."); m->mothurOutEndLine(); m->control_pressed = true; }
134 catch(exception& e) {
135 m->errorOut(e, "ChimeraSlayer", "readNameFile");
140 //***************************************************************************************************************
141 int ChimeraSlayer::doPrep() {
144 //read in all query seqs
145 vector<Sequence*> tempQuerySeqs = readSeqs(fastafile);
147 vector<Sequence*> temp = templateSeqs;
148 for (int i = 0; i < tempQuerySeqs.size(); i++) { temp.push_back(tempQuerySeqs[i]); }
150 createFilter(temp, 0.0); //just removed columns where all seqs have a gap
152 for (int i = 0; i < tempQuerySeqs.size(); i++) { delete tempQuerySeqs[i]; }
154 if (m->control_pressed) { return 0; }
156 //run filter on template
157 for (int i = 0; i < templateSeqs.size(); i++) { if (m->control_pressed) { return 0; } runFilter(templateSeqs[i]); }
159 string kmerDBNameLeft;
160 string kmerDBNameRight;
162 //generate the kmerdb to pass to maligner
163 if (searchMethod == "kmer") {
164 string templatePath = m->hasPath(templateFileName);
165 string rightTemplateFileName = templatePath + "right." + m->getRootName(m->getSimpleName(templateFileName));
166 databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
168 string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName));
169 databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);
171 for (int i = 0; i < templateSeqs.size(); i++) {
173 if (m->control_pressed) { return 0; }
175 string leftFrag = templateSeqs[i]->getUnaligned();
176 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
178 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
179 databaseLeft->addSequence(leftTemp);
181 databaseLeft->generateDB();
182 databaseLeft->setNumSeqs(templateSeqs.size());
184 for (int i = 0; i < templateSeqs.size(); i++) {
185 if (m->control_pressed) { return 0; }
187 string rightFrag = templateSeqs[i]->getUnaligned();
188 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
190 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
191 databaseRight->addSequence(rightTemp);
193 databaseRight->generateDB();
194 databaseRight->setNumSeqs(templateSeqs.size());
198 kmerDBNameLeft = leftTemplateFileName.substr(0,leftTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
199 ifstream kmerFileTestLeft(kmerDBNameLeft.c_str());
200 bool needToGenerateLeft = true;
202 if(kmerFileTestLeft){
203 bool GoodFile = m->checkReleaseVersion(kmerFileTestLeft, m->getVersion());
204 if (GoodFile) { needToGenerateLeft = false; }
207 if(needToGenerateLeft){
209 for (int i = 0; i < templateSeqs.size(); i++) {
211 if (m->control_pressed) { return 0; }
213 string leftFrag = templateSeqs[i]->getUnaligned();
214 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
216 Sequence leftTemp(templateSeqs[i]->getName(), leftFrag);
217 databaseLeft->addSequence(leftTemp);
219 databaseLeft->generateDB();
222 databaseLeft->readKmerDB(kmerFileTestLeft);
224 kmerFileTestLeft.close();
226 databaseLeft->setNumSeqs(templateSeqs.size());
229 kmerDBNameRight = rightTemplateFileName.substr(0,rightTemplateFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
230 ifstream kmerFileTestRight(kmerDBNameRight.c_str());
231 bool needToGenerateRight = true;
233 if(kmerFileTestRight){
234 bool GoodFile = m->checkReleaseVersion(kmerFileTestRight, m->getVersion());
235 if (GoodFile) { needToGenerateRight = false; }
238 if(needToGenerateRight){
240 for (int i = 0; i < templateSeqs.size(); i++) {
241 if (m->control_pressed) { return 0; }
243 string rightFrag = templateSeqs[i]->getUnaligned();
244 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
246 Sequence rightTemp(templateSeqs[i]->getName(), rightFrag);
247 databaseRight->addSequence(rightTemp);
249 databaseRight->generateDB();
252 databaseRight->readKmerDB(kmerFileTestRight);
254 kmerFileTestRight.close();
256 databaseRight->setNumSeqs(templateSeqs.size());
258 }else if (searchMethod == "blast") {
261 databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
262 for (int i = 0; i < templateSeqs.size(); i++) { databaseLeft->addSequence(*templateSeqs[i]); }
263 databaseLeft->generateDB();
264 databaseLeft->setNumSeqs(templateSeqs.size());
270 catch(exception& e) {
271 m->errorOut(e, "ChimeraSlayer", "doprep");
275 //***************************************************************************************************************
276 vector<Sequence*> ChimeraSlayer::getTemplate(Sequence* q) {
279 vector<Sequence*> thisTemplate;
282 string thisName = q->getName();
283 map<string, vector<string> >::iterator itRank = nameMapRank.find(thisName); // you will find it because we already sanity checked
284 thisRank = (itRank->second).size();
286 //create list of names we want to put into the template
287 set<string> namesToAdd;
288 for (itRank = nameMapRank.begin(); itRank != nameMapRank.end(); itRank++) {
289 if (itRank->first != thisName) {
290 if (includeAbunds == "greaterequal") {
291 if ((itRank->second).size() >= thisRank) {
292 //you are more abundant than me or equal to my abundance
293 for (int i = 0; i < (itRank->second).size(); i++) {
294 namesToAdd.insert((itRank->second)[i]);
297 }else if (includeAbunds == "greater") {
298 if ((itRank->second).size() > thisRank) {
299 //you are more abundant than me
300 for (int i = 0; i < (itRank->second).size(); i++) {
301 namesToAdd.insert((itRank->second)[i]);
304 }else if (includeAbunds == "all") {
306 for (int i = 0; i < (itRank->second).size(); i++) {
307 namesToAdd.insert((itRank->second)[i]);
313 for (int i = 0; i < templateSeqs.size(); i++) {
314 if (namesToAdd.count(templateSeqs[i]->getName()) != 0) {
315 thisTemplate.push_back(templateSeqs[i]);
319 string kmerDBNameLeft;
320 string kmerDBNameRight;
322 //generate the kmerdb to pass to maligner
323 if (searchMethod == "kmer") {
324 string templatePath = m->hasPath(templateFileName);
325 string rightTemplateFileName = templatePath + "right." + m->getRootName(m->getSimpleName(templateFileName));
326 databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
328 string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName));
329 databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);
331 for (int i = 0; i < thisTemplate.size(); i++) {
333 if (m->control_pressed) { return thisTemplate; }
335 string leftFrag = thisTemplate[i]->getUnaligned();
336 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
338 Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
339 databaseLeft->addSequence(leftTemp);
341 databaseLeft->generateDB();
342 databaseLeft->setNumSeqs(thisTemplate.size());
344 for (int i = 0; i < thisTemplate.size(); i++) {
345 if (m->control_pressed) { return thisTemplate; }
347 string rightFrag = thisTemplate[i]->getUnaligned();
348 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
350 Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
351 databaseRight->addSequence(rightTemp);
353 databaseRight->generateDB();
354 databaseRight->setNumSeqs(thisTemplate.size());
359 for (int i = 0; i < thisTemplate.size(); i++) {
361 if (m->control_pressed) { return thisTemplate; }
363 string leftFrag = thisTemplate[i]->getUnaligned();
364 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
366 Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
367 databaseLeft->addSequence(leftTemp);
369 databaseLeft->generateDB();
370 databaseLeft->setNumSeqs(thisTemplate.size());
372 for (int i = 0; i < thisTemplate.size(); i++) {
373 if (m->control_pressed) { return thisTemplate; }
375 string rightFrag = thisTemplate[i]->getUnaligned();
376 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
378 Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
379 databaseRight->addSequence(rightTemp);
381 databaseRight->generateDB();
382 databaseRight->setNumSeqs(thisTemplate.size());
384 }else if (searchMethod == "blast") {
387 databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
388 for (int i = 0; i < thisTemplate.size(); i++) { if (m->control_pressed) { return thisTemplate; } databaseLeft->addSequence(*thisTemplate[i]); }
389 databaseLeft->generateDB();
390 databaseLeft->setNumSeqs(thisTemplate.size());
396 catch(exception& e) {
397 m->errorOut(e, "ChimeraSlayer", "getTemplate");
402 //***************************************************************************************************************
403 ChimeraSlayer::~ChimeraSlayer() {
405 if (templateFileName != "self") {
406 if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; }
407 else if (searchMethod == "blast") { delete databaseLeft; }
410 //***************************************************************************************************************
411 void ChimeraSlayer::printHeader(ostream& out) {
412 m->mothurOutEndLine();
413 m->mothurOut("Only reporting sequence supported by " + toString(minBS) + "% of bootstrapped results.");
414 m->mothurOutEndLine();
416 out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
418 //***************************************************************************************************************
419 Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) {
421 Sequence* trim = NULL;
422 if (trimChimera) { trim = trimQuery; }
424 if (chimeraFlags == "yes") {
425 string chimeraFlag = "no";
426 if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
428 (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
431 if (chimeraFlag == "yes") {
432 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
433 m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
434 outAcc << querySeq->getName() << endl;
437 int lengthLeft = spotMap[chimeraResults[0].winLEnd] - spotMap[chimeraResults[0].winLStart];
438 int lengthRight = spotMap[chimeraResults[0].winREnd] - spotMap[chimeraResults[0].winRStart];
440 string newAligned = trim->getAligned();
442 if (lengthLeft > lengthRight) { //trim right
443 for (int i = (spotMap[chimeraResults[0].winRStart]-1); i < newAligned.length(); i++) { newAligned[i] = '.'; }
445 for (int i = 0; i < spotMap[chimeraResults[0].winLEnd]; i++) { newAligned[i] = '.'; }
447 trim->setAligned(newAligned);
453 printBlock(chimeraResults[0], chimeraFlag, out);
455 }else { out << querySeq->getName() << "\tno" << endl; }
460 catch(exception& e) {
461 m->errorOut(e, "ChimeraSlayer", "print");
466 //***************************************************************************************************************
467 Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
470 bool results = false;
471 string outAccString = "";
472 string outputString = "";
474 Sequence* trim = NULL;
475 if (trimChimera) { trim = trimQuery; }
477 if (chimeraFlags == "yes") {
478 string chimeraFlag = "no";
479 if( (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
481 (chimeraResults[0].bsb >= minBS && chimeraResults[0].divr_qlb_qra >= divR) ) { chimeraFlag = "yes"; }
484 if (chimeraFlag == "yes") {
485 if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
486 cout << querySeq->getName() << "\tyes" << endl;
487 outAccString += querySeq->getName() + "\n";
490 //write to accnos file
491 int length = outAccString.length();
492 char* buf2 = new char[length];
493 memcpy(buf2, outAccString.c_str(), length);
495 MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status);
499 int lengthLeft = spotMap[chimeraResults[0].winLEnd] - spotMap[chimeraResults[0].winLStart];
500 int lengthRight = spotMap[chimeraResults[0].winREnd] - spotMap[chimeraResults[0].winRStart];
502 string newAligned = trim->getAligned();
503 if (lengthLeft > lengthRight) { //trim right
504 for (int i = (spotMap[chimeraResults[0].winRStart]-1); i < newAligned.length(); i++) { newAligned[i] = '.'; }
506 for (int i = 0; i < (spotMap[chimeraResults[0].winLEnd]-1); i++) { newAligned[i] = '.'; }
508 trim->setAligned(newAligned);
513 outputString = getBlock(chimeraResults[0], chimeraFlag);
514 outputString += "\n";
515 //cout << outputString << endl;
516 //write to output file
517 int length = outputString.length();
518 char* buf = new char[length];
519 memcpy(buf, outputString.c_str(), length);
521 MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
525 outputString += querySeq->getName() + "\tno\n";
526 //cout << outputString << endl;
527 //write to output file
528 int length = outputString.length();
529 char* buf = new char[length];
530 memcpy(buf, outputString.c_str(), length);
532 MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
539 catch(exception& e) {
540 m->errorOut(e, "ChimeraSlayer", "print");
546 //***************************************************************************************************************
547 int ChimeraSlayer::getChimeras(Sequence* query) {
549 if (trimChimera) { trimQuery = new Sequence(query->getName(), query->getAligned()); }
554 spotMap = runFilter(query);
558 //you must create a template
559 vector<Sequence*> thisTemplate;
560 if (templateFileName != "self") { thisTemplate = templateSeqs; }
561 else { thisTemplate = getTemplate(query); } //fills thistemplate and creates the databases
563 if (m->control_pressed) { return 0; }
565 if (thisTemplate.size() == 0) { return 0; } //not chimeric
567 //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
568 Maligner maligner(thisTemplate, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
569 Slayer slayer(window, increment, minSim, divR, iters, minSNP);
571 if (templateFileName == "self") {
572 if (searchMethod == "kmer") { delete databaseRight; delete databaseLeft; }
573 else if (searchMethod == "blast") { delete databaseLeft; }
576 if (m->control_pressed) { return 0; }
578 string chimeraFlag = maligner.getResults(query, decalc);
579 if (m->control_pressed) { return 0; }
580 vector<results> Results = maligner.getOutput();
582 //found in testing realigning only made things worse
584 ChimeraReAligner realigner(thisTemplate, match, misMatch);
585 realigner.reAlign(query, Results);
588 if (chimeraFlag == "yes") {
590 //get sequence that were given from maligner results
591 vector<SeqDist> seqs;
592 map<string, float> removeDups;
593 map<string, float>::iterator itDup;
594 map<string, string> parentNameSeq;
595 map<string, string>::iterator itSeq;
596 for (int j = 0; j < Results.size(); j++) {
597 float dist = (Results[j].regionEnd - Results[j].regionStart + 1) * Results[j].queryToParentLocal;
598 //only add if you are not a duplicate
599 itDup = removeDups.find(Results[j].parent);
600 if (itDup == removeDups.end()) { //this is not duplicate
601 removeDups[Results[j].parent] = dist;
602 parentNameSeq[Results[j].parent] = Results[j].parentAligned;
603 }else if (dist > itDup->second) { //is this a stronger number for this parent
604 removeDups[Results[j].parent] = dist;
605 parentNameSeq[Results[j].parent] = Results[j].parentAligned;
609 for (itDup = removeDups.begin(); itDup != removeDups.end(); itDup++) {
610 //Sequence* seq = getSequence(itDup->first); //makes copy so you can filter and mask and not effect template
611 itSeq = parentNameSeq.find(itDup->first);
612 //cout << itDup->first << itSeq->second << endl;
613 Sequence* seq = new Sequence(itDup->first, itSeq->second);
617 member.dist = itDup->second;
619 seqs.push_back(member);
622 //limit number of parents to explore - default 3
623 if (Results.size() > parents) {
625 sort(seqs.begin(), seqs.end(), compareSeqDist);
626 //prioritize larger more similiar sequence fragments
627 reverse(seqs.begin(), seqs.end());
629 for (int k = seqs.size()-1; k > (parents-1); k--) {
635 //put seqs into vector to send to slayer
636 vector<Sequence*> seqsForSlayer;
637 for (int k = 0; k < seqs.size(); k++) { seqsForSlayer.push_back(seqs[k].seq); }
639 //mask then send to slayer...
641 decalc->setMask(seqMask);
644 decalc->runMask(query);
647 for (int k = 0; k < seqsForSlayer.size(); k++) {
648 decalc->runMask(seqsForSlayer[k]);
651 spotMap = decalc->getMaskMap();
654 if (m->control_pressed) { for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; } return 0; }
657 chimeraFlags = slayer.getResults(query, seqsForSlayer);
658 if (m->control_pressed) { return 0; }
659 chimeraResults = slayer.getOutput();
662 for (int k = 0; k < seqs.size(); k++) { delete seqs[k].seq; }
670 catch(exception& e) {
671 m->errorOut(e, "ChimeraSlayer", "getChimeras");
675 //***************************************************************************************************************
676 void ChimeraSlayer::printBlock(data_struct data, string flag, ostream& out){
680 out << querySeq->getName() << '\t';
681 out << data.parentA.getName() << "\t" << data.parentB.getName() << '\t';
682 //out << "Left Window: " << spotMap[data.winLStart] << " " << spotMap[data.winLEnd] << endl;
683 //out << "Right Window: " << spotMap[data.winRStart] << " " << spotMap[data.winREnd] << endl;
685 out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
686 out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
688 out << flag << '\t' << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
690 //out << "Similarity of parents: " << data.ab << endl;
691 //out << "Similarity of query to parentA: " << data.qa << endl;
692 //out << "Similarity of query to parentB: " << data.qb << endl;
695 //out << "Per_id(QL,A): " << data.qla << endl;
696 //out << "Per_id(QL,B): " << data.qlb << endl;
697 //out << "Per_id(QR,A): " << data.qra << endl;
698 //out << "Per_id(QR,B): " << data.qrb << endl;
701 //out << "DeltaL: " << (data.qla - data.qlb) << endl;
702 //out << "DeltaR: " << (data.qra - data.qrb) << endl;
705 catch(exception& e) {
706 m->errorOut(e, "ChimeraSlayer", "printBlock");
710 //***************************************************************************************************************
711 string ChimeraSlayer::getBlock(data_struct data, string flag){
714 string outputString = "";
716 outputString += querySeq->getName() + "\t";
717 outputString += data.parentA.getName() + "\t" + data.parentB.getName() + "\t";
719 outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t";
720 outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t";
722 outputString += flag + "\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t";
726 catch(exception& e) {
727 m->errorOut(e, "ChimeraSlayer", "getBlock");
731 //***************************************************************************************************************/