]> git.donarmstrong.com Git - mothur.git/blobdiff - chimeraslayer.cpp
added mantel command
[mothur.git] / chimeraslayer.cpp
index 1bfdafeaa5f6540e4e8254572b0202c81c1a0325..38f7abe49bfc7266e6eb4acd123ff057b8780c42 100644 (file)
@@ -13,7 +13,7 @@
 #include "blastdb.hpp"
 
 //***************************************************************************************************************
-ChimeraSlayer::ChimeraSlayer(string file, string temp, string mode, int k, int ms, int mms, int win, float div, 
+ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string mode, int k, int ms, int mms, int win, float div, 
 int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera()  {      
        try {
                fastafile = file;
@@ -33,6 +33,7 @@ int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int num
                increment = inc;
                numWanted = numw;
                realign = r; 
+               trimChimera = trim;
        
                decalc = new DeCalculator();    
                
@@ -43,6 +44,99 @@ int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int num
                exit(1);
        }
 }
+//***************************************************************************************************************
+ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, string mode, string abunds, int k, int ms, int mms, int win, float div, 
+                                                        int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera()  {      // overload that also takes a names file (name) and an abundance selector (abunds)
+       try {
+               fastafile = file; templateSeqs = readSeqs(fastafile); // templateSeqs is loaded from file (the fasta file), not from temp
+               templateFileName = temp; // getChimeras compares this against "self" to decide whether to call getTemplate
+               searchMethod = mode;
+               kmerSize = k;
+               match = ms;
+               misMatch = mms;
+               window = win;
+               divR = div;
+               minSim = minsim;
+               minCov = mincov;
+               minBS = minbs;
+               minSNP = minsnp;
+               parents = par;
+               iters = it;
+               increment = inc;
+               numWanted = numw;
+               realign = r; 
+               includeAbunds = abunds; // consulted by getTemplate ("greaterequal", "greater" or "all")
+               trimChimera = trim;
+               
+               //read name file and create nameMapRank
+               readNameFile(name); // return value is ignored; readNameFile signals problems by setting m->control_pressed
+               
+               decalc = new DeCalculator();    
+               
+               createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap
+               
+               //run filter on template
+               for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]);  }
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
+               exit(1); // a caught exception is reported and then terminates the process
+       }
+}
+//***************************************************************************************************************
+int ChimeraSlayer::readNameFile(string name) { // fills nameMapRank (first column -> comma-split second column) from the names file and cross-checks it against templateSeqs; always returns 0
+       try {
+               ifstream in;
+               m->openInputFile(name, in);
+               
+               int maxRank = 0; // largest second-column list size stored so far
+               int minRank = 10000000; // smallest list size stored so far; starts at a large sentinel
+               
+               while(!in.eof()){
+                       
+                       if (m->control_pressed) { in.close(); return 0; } // stop reading as soon as the run has been flagged as aborted
+                       
+                       string thisname, repnames;
+                       
+                       in >> thisname;         m->gobble(in);          //read from first column
+                       in >> repnames;                 //read from second column
+                       
+                       map<string, vector<string> >::iterator it = nameMapRank.find(thisname);
+                       if (it == nameMapRank.end()) {
+                               //first occurrence of this name: store its comma-separated list and update the size range
+                               vector<string> splitRepNames;
+                               m->splitAtComma(repnames, splitRepNames);
+                               
+                               nameMapRank[thisname] = splitRepNames;  
+                               
+                               if (splitRepNames.size() > maxRank) { maxRank = splitRepNames.size(); } // NOTE(review): compares unsigned size() with int
+                               if (splitRepNames.size() < minRank) { minRank = splitRepNames.size(); }
+                               
+                       }else{  m->mothurOut(thisname + " is already in namesfile. I will use first definition."); m->mothurOutEndLine();  }
+                       
+                       m->gobble(in);
+               }
+               in.close();     
+               
+               //sanity check to make sure files match
+               for (int i = 0; i < templateSeqs.size(); i++) {
+                       map<string, vector<string> >::iterator it = nameMapRank.find(templateSeqs[i]->getName());
+                       //a missing name is reported and flagged through control_pressed; the loop keeps going so every missing name is listed
+                       if (it == nameMapRank.end()) { m->mothurOut("[ERROR]: " + templateSeqs[i]->getName() + " is not in namesfile, but is in fastafile. Every name in fasta file must be in first column of names file."); m->mothurOutEndLine(); m->control_pressed = true;  }
+               }
+               //NOTE(review): if no entry was stored, maxRank stays 0 and minRank stays at the sentinel, so the check below does not fire
+               if (maxRank == minRank) { m->mothurOut("[ERROR]: all sequences in namesfile have the same abundance, aborting."); m->mothurOutEndLine(); m->control_pressed = true;  }
+               
+               return 0;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraSlayer", "readNameFile");
+               exit(1); // a caught exception is reported and then terminates the process
+       }
+}
+
 //***************************************************************************************************************
 int ChimeraSlayer::doPrep() {
        try {
@@ -67,10 +161,11 @@ int ChimeraSlayer::doPrep() {
        
                //generate the kmerdb to pass to maligner
                if (searchMethod == "kmer") { 
-                       string rightTemplateFileName = "right." + templateFileName;
+                       string templatePath = m->hasPath(templateFileName);
+                       string rightTemplateFileName = templatePath + "right." + m->getRootName(m->getSimpleName(templateFileName));
                        databaseRight = new KmerDB(rightTemplateFileName, kmerSize);
                                
-                       string leftTemplateFileName = "left." + templateFileName;
+                       string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName));
                        databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);      
                #ifdef USE_MPI
                        for (int i = 0; i < templateSeqs.size(); i++) {
@@ -177,11 +272,140 @@ int ChimeraSlayer::doPrep() {
                exit(1);
        }
 }
+//***************************************************************************************************************
+vector<Sequence*> ChimeraSlayer::getTemplate(Sequence* q) { // selects the reference sequences for query q according to includeAbunds and rebuilds databaseLeft/databaseRight for searchMethod
+       try {
+               //the returned vector holds pointers copied from templateSeqs; no Sequence objects are created for it
+               vector<Sequence*> thisTemplate;
+               
+               int thisRank;
+               string thisName = q->getName();
+               map<string, vector<string> >::iterator itRank = nameMapRank.find(thisName); // you will find it because we already sanity checked
+               thisRank = (itRank->second).size(); // NOTE(review): itRank is dereferenced without an end() check; relies on q's name being a key of nameMapRank
+               
+               //create list of names we want to put into the template
+               set<string> namesToAdd;
+               for (itRank = nameMapRank.begin(); itRank != nameMapRank.end(); itRank++) {
+                       if (itRank->first != thisName) { // the query's own names-file entry is never used as a source of names
+                               if (includeAbunds == "greaterequal") {
+                                       if ((itRank->second).size() >= thisRank) {
+                                               //you are more abundant than me or equal to my abundance
+                                               for (int i = 0; i < (itRank->second).size(); i++) {
+                                                       namesToAdd.insert((itRank->second)[i]);
+                                               }
+                                       }
+                               }else if (includeAbunds == "greater") {
+                                       if ((itRank->second).size() > thisRank) {
+                                               //you are more abundant than me
+                                               for (int i = 0; i < (itRank->second).size(); i++) {
+                                                       namesToAdd.insert((itRank->second)[i]);
+                                               }
+                                       }
+                               }else if (includeAbunds == "all") {
+                                       //add everyone
+                                       for (int i = 0; i < (itRank->second).size(); i++) {
+                                               namesToAdd.insert((itRank->second)[i]);
+                                       }
+                               }
+                       }
+               }
+               //keep only the template sequences whose name was collected above (any other includeAbunds value leaves the set empty)
+               for (int i = 0; i < templateSeqs.size(); i++) {  
+                       if (namesToAdd.count(templateSeqs[i]->getName()) != 0) { 
+                               thisTemplate.push_back(templateSeqs[i]);
+                       }
+               }
+               
+               string  kmerDBNameLeft;
+               string  kmerDBNameRight; // NOTE(review): kmerDBNameLeft and kmerDBNameRight are not referenced again in this function
+               
+               //generate the kmerdb to pass to maligner
+               if (searchMethod == "kmer") { 
+                       string templatePath = m->hasPath(templateFileName);
+                       string rightTemplateFileName = templatePath + "right." + m->getRootName(m->getSimpleName(templateFileName));
+                       databaseRight = new KmerDB(rightTemplateFileName, kmerSize); // allocated on every call; getChimeras deletes it when templateFileName == "self"
+                       
+                       string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName));
+                       databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);      
+#ifdef USE_MPI
+                       for (int i = 0; i < thisTemplate.size(); i++) {
+                               //NOTE(review): on this cancel path the databases allocated above stay allocated; getChimeras returns before its delete
+                               if (m->control_pressed) { return thisTemplate; } 
+                               
+                               string leftFrag = thisTemplate[i]->getUnaligned();
+                               leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); // leading 33% of the unaligned sequence
+                               
+                               Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
+                               databaseLeft->addSequence(leftTemp);    
+                       }
+                       databaseLeft->generateDB();
+                       databaseLeft->setNumSeqs(thisTemplate.size());
+                       
+                       for (int i = 0; i < thisTemplate.size(); i++) {
+                               if (m->control_pressed) { return thisTemplate;  } 
+                               
+                               string rightFrag = thisTemplate[i]->getUnaligned();
+                               rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); // from the 66% position to the end of the unaligned sequence
+                               
+                               Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
+                               databaseRight->addSequence(rightTemp);  
+                       }
+                       databaseRight->generateDB();
+                       databaseRight->setNumSeqs(thisTemplate.size());
+                       
+#else  
+                       //NOTE(review): this branch repeats the USE_MPI branch above statement for statement
+                       
+                       for (int i = 0; i < thisTemplate.size(); i++) {
+                               
+                               if (m->control_pressed) { return thisTemplate; } 
+                               
+                               string leftFrag = thisTemplate[i]->getUnaligned();
+                               leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33)); // leading 33% of the unaligned sequence
+                               
+                               Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
+                               databaseLeft->addSequence(leftTemp);    
+                       }
+                       databaseLeft->generateDB();
+                       databaseLeft->setNumSeqs(thisTemplate.size());
+                               
+                       for (int i = 0; i < thisTemplate.size(); i++) {
+                               if (m->control_pressed) { return thisTemplate; } 
+                                       
+                               string rightFrag = thisTemplate[i]->getUnaligned();
+                               rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66)); // from the 66% position to the end of the unaligned sequence
+                                       
+                               Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
+                               databaseRight->addSequence(rightTemp);  
+                       }
+                       databaseRight->generateDB();
+                       databaseRight->setNumSeqs(thisTemplate.size());
+#endif 
+               }else if (searchMethod == "blast") {
+                       //only databaseLeft is created for blast; whole sequences are added rather than fragments
+                       //generate blastdb
+                       databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch); // presumably -2.0/-1.0 are gap penalties -- TODO confirm against BlastDB
+                       for (int i = 0; i < thisTemplate.size(); i++) { if (m->control_pressed) { return thisTemplate; }  databaseLeft->addSequence(*thisTemplate[i]);  }
+                       databaseLeft->generateDB();
+                       databaseLeft->setNumSeqs(thisTemplate.size());
+               }
+               
+               return thisTemplate;
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "ChimeraSlayer", "getTemplate");
+               exit(1); // a caught exception is reported and then terminates the process
+       }
+}
+
 //***************************************************************************************************************
 ChimeraSlayer::~ChimeraSlayer() {      
        delete decalc;  
-       if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }   
-       else if (searchMethod == "blast") {  delete databaseLeft; }
+       if (templateFileName != "self") { // when templateFileName is "self", getChimeras deletes the per-query databases itself
+               if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }   
+               else if (searchMethod == "blast") {  delete databaseLeft; }
+       }
 }
 //***************************************************************************************************************
 void ChimeraSlayer::printHeader(ostream& out) {
@@ -192,8 +416,11 @@ void ChimeraSlayer::printHeader(ostream& out) {
        out << "Name\tLeftParent\tRightParent\tDivQLAQRB\tPerIDQLAQRB\tBootStrapA\tDivQLBQRA\tPerIDQLBQRA\tBootStrapB\tFlag\tLeftWindow\tRightWindow\n";
 }
 //***************************************************************************************************************
-int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
+Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) {
        try {
+               Sequence* trim = NULL;
+               if (trimChimera) { trim = trimQuery; }
+               
                if (chimeraFlags == "yes") {
                        string chimeraFlag = "no";
                        if(  (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
@@ -205,6 +432,21 @@ int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
                                if ((chimeraResults[0].bsa >= minBS) || (chimeraResults[0].bsb >= minBS)) {
                                        m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
                                        outAcc << querySeq->getName() << endl;
+                                       
+                                       if (trimChimera) {  
+                                               int lengthLeft = spotMap[chimeraResults[0].winLEnd] - spotMap[chimeraResults[0].winLStart];
+                                               int lengthRight = spotMap[chimeraResults[0].winREnd] - spotMap[chimeraResults[0].winRStart];
+                                               
+                                               string newAligned = trim->getAligned();
+
+                                               if (lengthLeft > lengthRight) { //trim right
+                                                       for (int i = (spotMap[chimeraResults[0].winRStart]-1); i < newAligned.length(); i++) { newAligned[i] = '.'; }
+                                               }else { //trim left
+                                                       for (int i = 0; i < spotMap[chimeraResults[0].winLEnd]; i++) { newAligned[i] = '.'; }
+                                               }
+                                               trim->setAligned(newAligned);
+                                       }
+                                               
                                }
                        }
                        
@@ -212,7 +454,7 @@ int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
                        out << endl;
                }else {  out << querySeq->getName() << "\tno" << endl;  }
                
-               return 0;
+               return trim;
                
        }
        catch(exception& e) {
@@ -222,13 +464,16 @@ int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
 }
 #ifdef USE_MPI
 //***************************************************************************************************************
-int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
+Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
        try {
                MPI_Status status;
                bool results = false;
                string outAccString = "";
                string outputString = "";
                
+               Sequence* trim = NULL;
+               if (trimChimera) { trim = trimQuery; }
+               
                if (chimeraFlags == "yes") {
                        string chimeraFlag = "no";
                        if(  (chimeraResults[0].bsa >= minBS && chimeraResults[0].divr_qla_qrb >= divR)
@@ -249,6 +494,19 @@ int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
                                
                                        MPI_File_write_shared(outAcc, buf2, length, MPI_CHAR, &status);
                                        delete buf2;
+                                       
+                                       if (trimChimera) {  
+                                               int lengthLeft = spotMap[chimeraResults[0].winLEnd] - spotMap[chimeraResults[0].winLStart];
+                                               int lengthRight = spotMap[chimeraResults[0].winREnd] - spotMap[chimeraResults[0].winRStart];
+                                               
+                                               string newAligned = trim->getAligned();
+                                               if (lengthLeft > lengthRight) { //trim right
+                                                       for (int i = (spotMap[chimeraResults[0].winRStart]-1); i < newAligned.length(); i++) { newAligned[i] = '.'; }
+                                               }else { //trim left
+                                                       for (int i = 0; i < (spotMap[chimeraResults[0].winLEnd]-1); i++) { newAligned[i] = '.'; }
+                                               }
+                                               trim->setAligned(newAligned);   
+                                       }
                                }
                        }
                        
@@ -276,7 +534,7 @@ int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
                }
                
                
-               return results;
+               return trim;
        }
        catch(exception& e) {
                m->errorOut(e, "ChimeraSlayer", "print");
@@ -288,6 +546,8 @@ int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
 //***************************************************************************************************************
 int ChimeraSlayer::getChimeras(Sequence* query) {
        try {
+               if (trimChimera) { trimQuery = new Sequence(query->getName(), query->getAligned());  }
+               
                chimeraFlags = "no";
 
                //filter query
@@ -295,19 +555,33 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
                
                querySeq = query;
                
+               //you must create a template
+               vector<Sequence*> thisTemplate;
+               if (templateFileName != "self") { thisTemplate = templateSeqs; }
+               else { thisTemplate = getTemplate(query); } //fills thistemplate and creates the databases
+               
+               if (m->control_pressed) {  return 0;  }
+               
+               if (thisTemplate.size() == 0) {  return 0; } //not chimeric
+               
                //referenceSeqs, numWanted, matchScore, misMatchPenalty, divR, minSimilarity
-               maligner = new Maligner(templateSeqs, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
-               slayer = new Slayer(window, increment, minSim, divR, iters, minSNP);
+               Maligner maligner(thisTemplate, numWanted, match, misMatch, divR, minSim, minCov, searchMethod, databaseLeft, databaseRight);
+               Slayer slayer(window, increment, minSim, divR, iters, minSNP);
+               
+               if (templateFileName == "self") {
+                       if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }   
+                       else if (searchMethod == "blast") {  delete databaseLeft; }
+               }
        
                if (m->control_pressed) {  return 0;  }
                
-               string chimeraFlag = maligner->getResults(query, decalc);
+               string chimeraFlag = maligner.getResults(query, decalc);
                if (m->control_pressed) {  return 0;  }
-               vector<results> Results = maligner->getOutput();
+               vector<results> Results = maligner.getOutput();
                        
                //found in testing realigning only made things worse
                if (realign) {
-                       ChimeraReAligner realigner(templateSeqs, match, misMatch);
+                       ChimeraReAligner realigner(thisTemplate, match, misMatch);
                        realigner.reAlign(query, Results);
                }
 
@@ -380,16 +654,16 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
                        if (m->control_pressed) {  for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }  return 0;  }
                        
                        //send to slayer
-                       chimeraFlags = slayer->getResults(query, seqsForSlayer);
+                       chimeraFlags = slayer.getResults(query, seqsForSlayer);
                        if (m->control_pressed) {  return 0;  }
-                       chimeraResults = slayer->getOutput();
+                       chimeraResults = slayer.getOutput();
                        
                        //free memory
                        for (int k = 0; k < seqs.size(); k++) {  delete seqs[k].seq;   }
                }
                
-               delete maligner;
-               delete slayer;
+               //delete maligner;
+               //delete slayer;
                
                return 0;
        }