]> git.donarmstrong.com Git - mothur.git/blobdiff - chimeraslayer.cpp
finished chimera.slayer template=self change
[mothur.git] / chimeraslayer.cpp
index eae6acf4e20b717d5327722ec22085fa96ecc0af..c6613bfd507c3dd74f00edeaaa452c8e9d39e950 100644 (file)
@@ -45,7 +45,7 @@ int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int num
        }
 }
 //***************************************************************************************************************
-ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, string mode, string abunds, int k, int ms, int mms, int win, float div, 
+ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, map<string, int>& prior, string mode, int k, int ms, int mms, int win, float div, 
                                                         int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera()  {      
        try {
                fastafile = file; templateSeqs = readSeqs(fastafile);
@@ -65,18 +65,16 @@ ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, s
                increment = inc;
                numWanted = numw;
                realign = r; 
-               includeAbunds = abunds;
                trimChimera = trim;
-               
-               //read name file and create nameMapRank
-               readNameFile(name);
+               priority = prior;
                
                decalc = new DeCalculator();    
                
                createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap
                
                //run filter on template
-               for (int i = 0; i < templateSeqs.size(); i++) { runFilter(templateSeqs[i]);  }
+               for (int i = 0; i < templateSeqs.size(); i++) {  if (m->control_pressed) {  break; }  runFilter(templateSeqs[i]);  }
+
                
        }
        catch(exception& e) {
@@ -84,59 +82,6 @@ ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, s
                exit(1);
        }
 }
-//***************************************************************************************************************
-int ChimeraSlayer::readNameFile(string name) {
-       try {
-               ifstream in;
-               m->openInputFile(name, in);
-               
-               int maxRank = 0;
-               int minRank = 10000000;
-               
-               while(!in.eof()){
-                       
-                       if (m->control_pressed) { in.close(); return 0; }
-                       
-                       string thisname, repnames;
-                       
-                       in >> thisname;         m->gobble(in);          //read from first column
-                       in >> repnames;                 //read from second column
-                       
-                       map<string, vector<string> >::iterator it = nameMapRank.find(thisname);
-                       if (it == nameMapRank.end()) {
-                               
-                               vector<string> splitRepNames;
-                               m->splitAtComma(repnames, splitRepNames);
-                               
-                               nameMapRank[thisname] = splitRepNames;  
-                               
-                               if (splitRepNames.size() > maxRank) { maxRank = splitRepNames.size(); }
-                               if (splitRepNames.size() < minRank) { minRank = splitRepNames.size(); }
-                               
-                       }else{  m->mothurOut(thisname + " is already in namesfile. I will use first definition."); m->mothurOutEndLine();  }
-                       
-                       m->gobble(in);
-               }
-               in.close();     
-               
-               //sanity check to make sure files match
-               for (int i = 0; i < templateSeqs.size(); i++) {
-                       map<string, vector<string> >::iterator it = nameMapRank.find(templateSeqs[i]->getName());
-                       
-                       if (it == nameMapRank.end()) { m->mothurOut("[ERROR]: " + templateSeqs[i]->getName() + " is not in namesfile, but is in fastafile. Every name in fasta file must be in first column of names file."); m->mothurOutEndLine(); m->control_pressed = true;  }
-               }
-               
-               if (maxRank == minRank) { m->mothurOut("[ERROR]: all sequences in namesfile have the same abundance, aborting."); m->mothurOutEndLine(); m->control_pressed = true;  }
-               
-               return 0;
-               
-       }
-       catch(exception& e) {
-               m->errorOut(e, "ChimeraSlayer", "readNameFile");
-               exit(1);
-       }
-}
-
 //***************************************************************************************************************
 int ChimeraSlayer::doPrep() {
        try {
@@ -258,7 +203,8 @@ int ChimeraSlayer::doPrep() {
                }else if (searchMethod == "blast") {
                
                        //generate blastdb
-                       databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
+                       databaseLeft = new BlastDB(-1.0, -1.0, 1, -3);
+
                        for (int i = 0; i < templateSeqs.size(); i++) {         databaseLeft->addSequence(*templateSeqs[i]);    }
                        databaseLeft->generateDB();
                        databaseLeft->setNumSeqs(templateSeqs.size());
@@ -276,44 +222,21 @@ int ChimeraSlayer::doPrep() {
 vector<Sequence*> ChimeraSlayer::getTemplate(Sequence* q) {
        try {
                
-               vector<Sequence*> thisTemplate;
+               //when template=self, the query file is sorted from most abundance to least abundant
+               //userTemplate grows as the query file is processed by adding sequences that are not chimeric and more abundant
+               vector<Sequence*> userTemplate;
                
-               int thisRank;
-               string thisName = q->getName();
-               map<string, vector<string> >::iterator itRank = nameMapRank.find(thisName); // you will find it because we already sanity checked
-               thisRank = (itRank->second).size();
+               int myAbund = priority[q->getName()];
                
-               //create list of names we want to put into the template
-               set<string> namesToAdd;
-               for (itRank = nameMapRank.begin(); itRank != nameMapRank.end(); itRank++) {
-                       if (itRank->first != thisName) {
-                               if (includeAbunds == "greaterequal") {
-                                       if ((itRank->second).size() >= thisRank) {
-                                               //you are more abundant than me or equal to my abundance
-                                               for (int i = 0; i < (itRank->second).size(); i++) {
-                                                       namesToAdd.insert((itRank->second)[i]);
-                                               }
-                                       }
-                               }else if (includeAbunds == "greater") {
-                                       if ((itRank->second).size() > thisRank) {
-                                               //you are more abundant than me
-                                               for (int i = 0; i < (itRank->second).size(); i++) {
-                                                       namesToAdd.insert((itRank->second)[i]);
-                                               }
-                                       }
-                               }else if (includeAbunds == "all") {
-                                       //add everyone
-                                       for (int i = 0; i < (itRank->second).size(); i++) {
-                                               namesToAdd.insert((itRank->second)[i]);
-                                       }
-                               }
-                       }
-               }
-               
-               for (int i = 0; i < templateSeqs.size(); i++) {  
-                       if (namesToAdd.count(templateSeqs[i]->getName()) != 0) { 
-                               thisTemplate.push_back(templateSeqs[i]);
-                       }
+               for (int i = 0; i < templateSeqs.size(); i++) {
+                       
+                       if (m->control_pressed) { return userTemplate; } 
+                       
+                       //have I reached a sequence with the same abundance as myself?
+                       if (!(priority[templateSeqs[i]->getName()] > myAbund)) { break; }
+                       
+                       //if its am not chimeric add it
+                       if (chimericSeqs.count(templateSeqs[i]->getName()) == 0) { userTemplate.push_back(templateSeqs[i]); }
                }
                
                string  kmerDBNameLeft;
@@ -328,69 +251,70 @@ vector<Sequence*> ChimeraSlayer::getTemplate(Sequence* q) {
                        string leftTemplateFileName = templatePath + "left." + m->getRootName(m->getSimpleName(templateFileName));
                        databaseLeft = new KmerDB(leftTemplateFileName, kmerSize);      
 #ifdef USE_MPI
-                       for (int i = 0; i < thisTemplate.size(); i++) {
+                       for (int i = 0; i < userTemplate.size(); i++) {
                                
-                               if (m->control_pressed) { return thisTemplate; } 
+                               if (m->control_pressed) { return userTemplate; } 
                                
-                               string leftFrag = thisTemplate[i]->getUnaligned();
+                               string leftFrag = userTemplate[i]->getUnaligned();
                                leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
                                
-                               Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
+                               Sequence leftTemp(userTemplate[i]->getName(), leftFrag);
                                databaseLeft->addSequence(leftTemp);    
                        }
                        databaseLeft->generateDB();
-                       databaseLeft->setNumSeqs(thisTemplate.size());
+                       databaseLeft->setNumSeqs(userTemplate.size());
                        
-                       for (int i = 0; i < thisTemplate.size(); i++) {
-                               if (m->control_pressed) { return thisTemplate;  } 
+                       for (int i = 0; i < userTemplate.size(); i++) {
+                               if (m->control_pressed) { return userTemplate; } 
                                
-                               string rightFrag = thisTemplate[i]->getUnaligned();
+                               string rightFrag = userTemplate[i]->getUnaligned();
                                rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
                                
-                               Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
+                               Sequence rightTemp(userTemplate[i]->getName(), rightFrag);
                                databaseRight->addSequence(rightTemp);  
                        }
                        databaseRight->generateDB();
-                       databaseRight->setNumSeqs(thisTemplate.size());
+                       databaseRight->setNumSeqs(userTemplate.size());
                        
 #else  
                        
                        
-                       for (int i = 0; i < thisTemplate.size(); i++) {
+                       for (int i = 0; i < userTemplate.size(); i++) {
                                
-                               if (m->control_pressed) { return thisTemplate; } 
+                               if (m->control_pressed) { return userTemplate; } 
                                
-                               string leftFrag = thisTemplate[i]->getUnaligned();
+                               string leftFrag = userTemplate[i]->getUnaligned();
                                leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
                                
-                               Sequence leftTemp(thisTemplate[i]->getName(), leftFrag);
+                               Sequence leftTemp(userTemplate[i]->getName(), leftFrag);
                                databaseLeft->addSequence(leftTemp);    
                        }
                        databaseLeft->generateDB();
-                       databaseLeft->setNumSeqs(thisTemplate.size());
+                       databaseLeft->setNumSeqs(userTemplate.size());
                                
-                       for (int i = 0; i < thisTemplate.size(); i++) {
-                               if (m->control_pressed) { return thisTemplate; } 
+                       for (int i = 0; i < userTemplate.size(); i++) {
+                               if (m->control_pressed) { return userTemplate; }  
                                        
-                               string rightFrag = thisTemplate[i]->getUnaligned();
+                               string rightFrag = userTemplate[i]->getUnaligned();
                                rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
                                        
-                               Sequence rightTemp(thisTemplate[i]->getName(), rightFrag);
+                               Sequence rightTemp(userTemplate[i]->getName(), rightFrag);
                                databaseRight->addSequence(rightTemp);  
                        }
                        databaseRight->generateDB();
-                       databaseRight->setNumSeqs(thisTemplate.size());
+                       databaseRight->setNumSeqs(userTemplate.size());
 #endif 
                }else if (searchMethod == "blast") {
                        
                        //generate blastdb
-                       databaseLeft = new BlastDB(-2.0, -1.0, match, misMatch);
-                       for (int i = 0; i < thisTemplate.size(); i++) { if (m->control_pressed) { return thisTemplate; }  databaseLeft->addSequence(*thisTemplate[i]);  }
+                       databaseLeft = new BlastDB(-1.0, -1.0, 1, -3);
+
+                       for (int i = 0; i < userTemplate.size(); i++) { if (m->control_pressed) { return userTemplate; }   databaseLeft->addSequence(*userTemplate[i]); }
                        databaseLeft->generateDB();
-                       databaseLeft->setNumSeqs(thisTemplate.size());
+                       databaseLeft->setNumSeqs(userTemplate.size());
                }
                
-               return thisTemplate;
+               return userTemplate;
                
        }
        catch(exception& e) {
@@ -419,7 +343,7 @@ void ChimeraSlayer::printHeader(ostream& out) {
 Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) {
        try {
                Sequence* trim = NULL;
-               if (trimChimera) { trim = trimQuery; }
+               if (trimChimera) { trim = new Sequence(trimQuery.getName(), trimQuery.getAligned()); }
                
                if (chimeraFlags == "yes") {
                        string chimeraFlag = "no";
@@ -433,6 +357,8 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) {
                                        m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
                                        outAcc << querySeq->getName() << endl;
                                        
+                                       if (templateFileName == "self") {  chimericSeqs.insert(querySeq->getName()); }
+                                       
                                        if (trimChimera) {  
                                                int lengthLeft = spotMap[chimeraResults[0].winLEnd] - spotMap[chimeraResults[0].winLStart];
                                                int lengthRight = spotMap[chimeraResults[0].winREnd] - spotMap[chimeraResults[0].winRStart];
@@ -446,13 +372,14 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) {
                                                }
                                                trim->setAligned(newAligned);
                                        }
-                                               
                                }
                        }
                        
                        printBlock(chimeraResults[0], chimeraFlag, out);
                        out << endl;
-               }else {  out << querySeq->getName() << "\tno" << endl;  }
+               }else {  
+                       out << querySeq->getName() << "\tno" << endl; 
+               }
                
                return trim;
                
@@ -500,6 +427,8 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc, data_results leftP
                                        m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
                                        outAcc << querySeq->getName() << endl;
                                        
+                                       if (templateFileName == "self") {  chimericSeqs.insert(querySeq->getName()); }
+                                       
                                        if (trimChimera) {  
                                                string newAligned = trim->getAligned();
                                                                                                
@@ -510,15 +439,15 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc, data_results leftP
                                                        for (int i = (rightPiece.spotMap[rightPiece.results[0].winLStart]-1); i < newAligned.length(); i++) { newAligned[i] = '.'; }
                                                }else { //both sides are chimeric, keep longest piece
                                                        
-                                                       int lengthLeftLeft = leftPiece.spotMap[leftPiece.results[0].winLStart] - leftPiece.spotMap[leftPiece.results[0].winLEnd];
-                                                       int lengthLeftRight = leftPiece.spotMap[leftPiece.results[0].winRStart] - leftPiece.spotMap[leftPiece.results[0].winREnd];
+                                                       int lengthLeftLeft = leftPiece.spotMap[leftPiece.results[0].winLEnd] - leftPiece.spotMap[leftPiece.results[0].winLStart];
+                                                       int lengthLeftRight = leftPiece.spotMap[leftPiece.results[0].winREnd] - leftPiece.spotMap[leftPiece.results[0].winRStart];
                                                        
                                                        int longest = 1; // leftleft = 1, leftright = 2, rightleft = 3 rightright = 4
                                                        int length = lengthLeftLeft;
                                                        if (lengthLeftLeft < lengthLeftRight) { longest = 2;  length = lengthLeftRight; }
                                                        
-                                                       int lengthRightLeft = rightPiece.spotMap[rightPiece.results[0].winLStart] - rightPiece.spotMap[rightPiece.results[0].winLEnd];
-                                                       int lengthRightRight = rightPiece.spotMap[rightPiece.results[0].winRStart] - rightPiece.spotMap[rightPiece.results[0].winREnd];
+                                                       int lengthRightLeft = rightPiece.spotMap[rightPiece.results[0].winLEnd] - rightPiece.spotMap[rightPiece.results[0].winLStart];
+                                                       int lengthRightRight = rightPiece.spotMap[rightPiece.results[0].winREnd] - rightPiece.spotMap[rightPiece.results[0].winRStart];
                                                        
                                                        if (lengthRightLeft > length) { longest = 3; length = lengthRightLeft;  }
                                                        if (lengthRightRight > length) { longest = 4; }
@@ -551,7 +480,9 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc, data_results leftP
                        
                        printBlock(leftPiece, rightPiece, leftChimeric, rightChimeric, chimeraFlag, out);
                        out << endl;
-               }else {  out << querySeq->getName() << "\tno" << endl;  }
+               }else {  
+                       out << querySeq->getName() << "\tno" << endl;  
+               }
                
                return trim;
                
@@ -608,6 +539,8 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc, data_results lef
                                        outAccString += querySeq->getName() + "\n";
                                        results = true;
                                        
+                                       if (templateFileName == "self") {  chimericSeqs.insert(querySeq->getName()); }
+                                       
                                        //write to accnos file
                                        int length = outAccString.length();
                                        char* buf2 = new char[length];
@@ -626,15 +559,15 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc, data_results lef
                                                        for (int i = (rightPiece.spotMap[rightPiece.results[0].winLStart]-1); i < newAligned.length(); i++) { newAligned[i] = '.'; }
                                                }else { //both sides are chimeric, keep longest piece
                                                        
-                                                       int lengthLeftLeft = leftPiece.spotMap[leftPiece.results[0].winLStart] - leftPiece.spotMap[leftPiece.results[0].winLEnd];
-                                                       int lengthLeftRight = leftPiece.spotMap[leftPiece.results[0].winRStart] - leftPiece.spotMap[leftPiece.results[0].winREnd];
+                                                       int lengthLeftLeft = leftPiece.spotMap[leftPiece.results[0].winLEnd] - leftPiece.spotMap[leftPiece.results[0].winLStart];
+                                                       int lengthLeftRight = leftPiece.spotMap[leftPiece.results[0].winREnd] - leftPiece.spotMap[leftPiece.results[0].winRStart];
                                                        
                                                        int longest = 1; // leftleft = 1, leftright = 2, rightleft = 3 rightright = 4
                                                        int length = lengthLeftLeft;
                                                        if (lengthLeftLeft < lengthLeftRight) { longest = 2;  length = lengthLeftRight; }
                                                        
-                                                       int lengthRightLeft = rightPiece.spotMap[rightPiece.results[0].winLStart] - rightPiece.spotMap[rightPiece.results[0].winLEnd];
-                                                       int lengthRightRight = rightPiece.spotMap[rightPiece.results[0].winRStart] - rightPiece.spotMap[rightPiece.results[0].winREnd];
+                                                       int lengthRightLeft = rightPiece.spotMap[rightPiece.results[0].winLEnd] - rightPiece.spotMap[rightPiece.results[0].winLStart];
+                                                       int lengthRightRight = rightPiece.spotMap[rightPiece.results[0].winREnd] - rightPiece.spotMap[rightPiece.results[0].winRStart];
                                                        
                                                        if (lengthRightLeft > length) { longest = 3; length = lengthRightLeft;  }
                                                        if (lengthRightRight > length) { longest = 4; }
@@ -705,7 +638,7 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
                string outputString = "";
                
                Sequence* trim = NULL;
-               if (trimChimera) { trim = trimQuery; }
+               if (trimChimera) { trim = new Sequence(trimQuery.getName(), trimQuery.getAligned()); }
                
                if (chimeraFlags == "yes") {
                        string chimeraFlag = "no";
@@ -720,6 +653,8 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
                                        outAccString += querySeq->getName() + "\n";
                                        results = true;
                                        
+                                       if (templateFileName == "self") {  chimericSeqs.insert(querySeq->getName()); }
+                                       
                                        //write to accnos file
                                        int length = outAccString.length();
                                        char* buf2 = new char[length];
@@ -778,7 +713,9 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
 //***************************************************************************************************************
 int ChimeraSlayer::getChimeras(Sequence* query) {
        try {
-               if (trimChimera) { trimQuery = new Sequence(query->getName(), query->getAligned()); printResults.trimQuery = *trimQuery; }
+               
+               trimQuery.setName(query->getName()); trimQuery.setAligned(query->getAligned());
+               printResults.trimQuery = trimQuery; 
                
                chimeraFlags = "no";
                printResults.flag = "no";
@@ -792,7 +729,7 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
                //you must create a template
                vector<Sequence*> thisTemplate;
                if (templateFileName != "self") { thisTemplate = templateSeqs; }
-               else { thisTemplate = getTemplate(query); } //fills thistemplate and creates the databases
+               else {  thisTemplate = getTemplate(query);  } //fills this template and creates the databases
                
                if (m->control_pressed) {  return 0;  }
                
@@ -810,10 +747,11 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
                if (m->control_pressed) {  return 0;  }
 
                string chimeraFlag = maligner.getResults(query, decalc);
+               
                if (m->control_pressed) {  return 0;  }
+               
                vector<results> Results = maligner.getOutput();
        
-               //found in testing realigning only made things worse
                if (realign) {
                        ChimeraReAligner realigner(thisTemplate, match, misMatch);
                        realigner.reAlign(query, Results);