]> git.donarmstrong.com Git - mothur.git/commitdiff
finished chimera.slayer template=self change
authorwestcott <westcott>
Tue, 19 Apr 2011 10:08:17 +0000 (10:08 +0000)
committerwestcott <westcott>
Tue, 19 Apr 2011 10:08:17 +0000 (10:08 +0000)
chimeraslayer.cpp
chimeraslayer.h
chimeraslayercommand.cpp
chimeraslayercommand.h

index 8c9417ad82cd0341a88aa216a84add2a701791e3..c6613bfd507c3dd74f00edeaaa452c8e9d39e950 100644 (file)
@@ -45,7 +45,7 @@ int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int num
        }
 }
 //***************************************************************************************************************
-ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, string mode, int k, int ms, int mms, int win, float div, 
+ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, map<string, int>& prior, string mode, int k, int ms, int mms, int win, float div, 
                                                         int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera()  {      
        try {
                fastafile = file; templateSeqs = readSeqs(fastafile);
@@ -66,14 +66,16 @@ ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, s
                numWanted = numw;
                realign = r; 
                trimChimera = trim;
+               priority = prior;
                
                decalc = new DeCalculator();    
                
                createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap
                
                //run filter on template
-               for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i];  } templateSeqs.clear();
-                
+               for (int i = 0; i < templateSeqs.size(); i++) {  if (m->control_pressed) {  break; }  runFilter(templateSeqs[i]);  }
+
+               
        }
        catch(exception& e) {
                m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
@@ -217,9 +219,26 @@ int ChimeraSlayer::doPrep() {
        }
 }
 //***************************************************************************************************************
-int ChimeraSlayer::getTemplate(Sequence* q) {
+vector<Sequence*> ChimeraSlayer::getTemplate(Sequence* q) {
        try {
                
+               //when template=self, the query file is sorted from most abundance to least abundant
+               //userTemplate grows as the query file is processed by adding sequences that are not chimeric and more abundant
+               vector<Sequence*> userTemplate;
+               
+               int myAbund = priority[q->getName()];
+               
+               for (int i = 0; i < templateSeqs.size(); i++) {
+                       
+                       if (m->control_pressed) { return userTemplate; } 
+                       
+                       //have I reached a sequence with the same abundance as myself?
+                       if (!(priority[templateSeqs[i]->getName()] > myAbund)) { break; }
+                       
+                       //if its am not chimeric add it
+                       if (chimericSeqs.count(templateSeqs[i]->getName()) == 0) { userTemplate.push_back(templateSeqs[i]); }
+               }
+               
                string  kmerDBNameLeft;
                string  kmerDBNameRight;
                
@@ -234,7 +253,7 @@ int ChimeraSlayer::getTemplate(Sequence* q) {
 #ifdef USE_MPI
                        for (int i = 0; i < userTemplate.size(); i++) {
                                
-                               if (m->control_pressed) { return 0; } 
+                               if (m->control_pressed) { return userTemplate; } 
                                
                                string leftFrag = userTemplate[i]->getUnaligned();
                                leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
@@ -246,7 +265,7 @@ int ChimeraSlayer::getTemplate(Sequence* q) {
                        databaseLeft->setNumSeqs(userTemplate.size());
                        
                        for (int i = 0; i < userTemplate.size(); i++) {
-                               if (m->control_pressed) { return 0;  } 
+                               if (m->control_pressed) { return userTemplate; } 
                                
                                string rightFrag = userTemplate[i]->getUnaligned();
                                rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
@@ -262,7 +281,7 @@ int ChimeraSlayer::getTemplate(Sequence* q) {
                        
                        for (int i = 0; i < userTemplate.size(); i++) {
                                
-                               if (m->control_pressed) { return 0; } 
+                               if (m->control_pressed) { return userTemplate; } 
                                
                                string leftFrag = userTemplate[i]->getUnaligned();
                                leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
@@ -274,7 +293,7 @@ int ChimeraSlayer::getTemplate(Sequence* q) {
                        databaseLeft->setNumSeqs(userTemplate.size());
                                
                        for (int i = 0; i < userTemplate.size(); i++) {
-                               if (m->control_pressed) { return 0; } 
+                               if (m->control_pressed) { return userTemplate; }  
                                        
                                string rightFrag = userTemplate[i]->getUnaligned();
                                rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
@@ -290,12 +309,12 @@ int ChimeraSlayer::getTemplate(Sequence* q) {
                        //generate blastdb
                        databaseLeft = new BlastDB(-1.0, -1.0, 1, -3);
 
-                       for (int i = 0; i < userTemplate.size(); i++) { if (m->control_pressed) { return 0; }  databaseLeft->addSequence(*userTemplate[i]);     }
+                       for (int i = 0; i < userTemplate.size(); i++) { if (m->control_pressed) { return userTemplate; }   databaseLeft->addSequence(*userTemplate[i]); }
                        databaseLeft->generateDB();
                        databaseLeft->setNumSeqs(userTemplate.size());
                }
                
-               return 0;
+               return userTemplate;
                
        }
        catch(exception& e) {
@@ -310,12 +329,6 @@ ChimeraSlayer::~ChimeraSlayer() {
        if (templateFileName != "self") {
                if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }   
                else if (searchMethod == "blast") {  delete databaseLeft; }
-       }else {
-               //delete userTemplate
-               for (int i = 0; i < userTemplate.size(); i++) {
-                       delete userTemplate[i];
-               }
-               userTemplate.clear();
        }
 }
 //***************************************************************************************************************
@@ -344,6 +357,8 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) {
                                        m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
                                        outAcc << querySeq->getName() << endl;
                                        
+                                       if (templateFileName == "self") {  chimericSeqs.insert(querySeq->getName()); }
+                                       
                                        if (trimChimera) {  
                                                int lengthLeft = spotMap[chimeraResults[0].winLEnd] - spotMap[chimeraResults[0].winLStart];
                                                int lengthRight = spotMap[chimeraResults[0].winREnd] - spotMap[chimeraResults[0].winRStart];
@@ -364,11 +379,6 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) {
                        out << endl;
                }else {  
                        out << querySeq->getName() << "\tno" << endl; 
-                       if (templateFileName == "self") {  
-                               Sequence* temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
-                               runFilter(temp);
-                               userTemplate.push_back(temp);
-                       }
                }
                
                return trim;
@@ -417,6 +427,8 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc, data_results leftP
                                        m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
                                        outAcc << querySeq->getName() << endl;
                                        
+                                       if (templateFileName == "self") {  chimericSeqs.insert(querySeq->getName()); }
+                                       
                                        if (trimChimera) {  
                                                string newAligned = trim->getAligned();
                                                                                                
@@ -470,11 +482,6 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc, data_results leftP
                        out << endl;
                }else {  
                        out << querySeq->getName() << "\tno" << endl;  
-                       if (templateFileName == "self") {  
-                               Sequence* temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
-                               runFilter(temp);
-                               userTemplate.push_back(temp);
-                       }
                }
                
                return trim;
@@ -532,6 +539,8 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc, data_results lef
                                        outAccString += querySeq->getName() + "\n";
                                        results = true;
                                        
+                                       if (templateFileName == "self") {  chimericSeqs.insert(querySeq->getName()); }
+                                       
                                        //write to accnos file
                                        int length = outAccString.length();
                                        char* buf2 = new char[length];
@@ -610,12 +619,6 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc, data_results lef
                                
                        MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
                        delete buf;
-                       
-                       if (template == "self") {  
-                               Sequence temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
-                               runFilter(temp);
-                               userTemplate.push_back(temp);
-                       }
                }
                
                
@@ -650,6 +653,8 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
                                        outAccString += querySeq->getName() + "\n";
                                        results = true;
                                        
+                                       if (templateFileName == "self") {  chimericSeqs.insert(querySeq->getName()); }
+                                       
                                        //write to accnos file
                                        int length = outAccString.length();
                                        char* buf2 = new char[length];
@@ -694,12 +699,6 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
                        
                        MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
                        delete buf;
-                       
-                       if (template == "self") {  
-                               Sequence temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
-                               runFilter(temp);
-                               userTemplate.push_back(temp);
-                       }
                }
                
                return trim;
@@ -730,7 +729,7 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
                //you must create a template
                vector<Sequence*> thisTemplate;
                if (templateFileName != "self") { thisTemplate = templateSeqs; }
-               else { getTemplate(query);  thisTemplate = userTemplate; } //fills this template and creates the databases
+               else {  thisTemplate = getTemplate(query);  } //fills this template and creates the databases
                
                if (m->control_pressed) {  return 0;  }
                
index cac96eb158a721ac5fe3d5c68a82d61979fd99e8..ace98ce615b7f8106a2b401db75499be2e55d534 100644 (file)
@@ -23,7 +23,7 @@ class ChimeraSlayer : public Chimera {
        
        public:
                ChimeraSlayer(string, string, bool, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
-               ChimeraSlayer(string, string, bool, string, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
+               ChimeraSlayer(string, string, bool, map<string, int>&, string,  int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
 
                ~ChimeraSlayer();
                
@@ -46,9 +46,8 @@ class ChimeraSlayer : public Chimera {
                map<int, int>  spotMap;
                Database* databaseRight;
                Database* databaseLeft;
-               vector<Sequence*> userTemplate;  //when template=self, the query file is sorted from most abundance to least abundant
-                                                                                //userTemplate grows as the query file is processed by adding sequences that are not chimeric
-               set<string> namesOfChimericSeqs; //only used when template=self
+               map<string, int> priority; //for template=self, seqname, seqAligned, abundance
+               set<string> chimericSeqs; //for template=self, so we don't add chimeric sequences to the userTemplate set
                
                vector<data_struct>  chimeraResults;
                data_results printResults;
@@ -62,7 +61,7 @@ class ChimeraSlayer : public Chimera {
                string getBlock(data_struct, string);
                string getBlock(data_results, data_results, bool, bool, string);
                //int readNameFile(string);
-               int getTemplate(Sequence*);
+               vector<Sequence*> getTemplate(Sequence*);
                
 };
 
index 1a9c1fb5ca3d56896e605803bd98d472428f9f01..f6baa0a2074d75e01c73eeb50cae1f77f291ee20 100644 (file)
@@ -381,12 +381,12 @@ int ChimeraSlayerCommand::execute(){
                                
                                //sort fastafile by abundance, returns new sorted fastafile name
                                m->mothurOut("Sorting fastafile according to abundance..."); cout.flush(); 
-                               fastaFileNames[s] = sortFastaFile(fastaFileNames[s], nameFile);
+                               map<string, int> priority = sortFastaFile(fastaFileNames[s], nameFile);
                                m->mothurOut("Done."); m->mothurOutEndLine();
                                
                                if (m->control_pressed) {  for (int j = 0; j < outputNames.size(); j++) {       remove(outputNames[j].c_str()); }  return 0;    }
 
-                               chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, nameFile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);   
+                               chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, priority, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);   
                        }
                                
                        if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]);  }//if user entered a file with a path then preserve it                               
@@ -456,36 +456,45 @@ int ChimeraSlayerCommand::execute(){
 
                                        MPIPos = m->setFilePosFasta(fastaFileNames[s], numSeqs); //fills MPIPos, returns numSeqs
                                        
-                                       //send file positions to all processes
-                                       for(int i = 1; i < processors; i++) { 
-                                               MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
-                                               MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+                                       if (templatefile != "self") { //if template=self we can only use 1 processor
+                                               //send file positions to all processes
+                                               for(int i = 1; i < processors; i++) { 
+                                                       MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
+                                                       MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+                                               }
                                        }
-                                       
                                        //figure out how many sequences you have to align
                                        numSeqsPerProcessor = numSeqs / processors;
                                        int startIndex =  pid * numSeqsPerProcessor;
                                        if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
-                               
+                                       
+                                       if (templatefile == "self") { //if template=self we can only use 1 processor
+                                               startIndex = 0;
+                                               numSeqsPerProcessor = numSeqs;
+                                       }
+                                       
                                        //do your part
                                        driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos);
                                        
                                        if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&inMPI);  MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); }  MPI_File_close(&outMPIAccnos);  for (int j = 0; j < outputNames.size(); j++) {   remove(outputNames[j].c_str()); }  remove(outputFileName.c_str());  remove(accnosFileName.c_str());  delete chimera; return 0;  }
 
                                }else{ //you are a child process
-                                       MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
-                                       MPIPos.resize(numSeqs+1);
-                                       MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
+                                       if (templatefile != "self") { //if template=self we can only use 1 processor
+                                               MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
+                                               MPIPos.resize(numSeqs+1);
+                                               MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
                                        
-                                       //figure out how many sequences you have to align
-                                       numSeqsPerProcessor = numSeqs / processors;
-                                       int startIndex =  pid * numSeqsPerProcessor;
-                                       if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
+                                               //figure out how many sequences you have to align
+                                               numSeqsPerProcessor = numSeqs / processors;
+                                               int startIndex =  pid * numSeqsPerProcessor;
+                                               if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
                                        
-                                       //do your part
-                                       driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos);
+                                               //do your part
+                                               driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos);
                                        
-                                       if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&inMPI);  MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); }  MPI_File_close(&outMPIAccnos);  for (int j = 0; j < outputNames.size(); j++) {   remove(outputNames[j].c_str()); }  delete chimera; return 0;  }
+                                               if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&inMPI);  MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); }  MPI_File_close(&outMPIAccnos);  for (int j = 0; j < outputNames.size(); j++) {   remove(outputNames[j].c_str()); }  delete chimera; return 0;  }
+                               
+                                       }
                                }
                                
                                //close files 
@@ -939,9 +948,9 @@ int ChimeraSlayerCommand::divideInHalf(Sequence querySeq, string& leftQuery, str
        }
 }
 /**************************************************************************************************/
-
-string ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) {
+map<string, int> ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) {
        try {
+               map<string, int> nameAbund;
                
                //read through fastafile and store info
                map<string, string> seqs;
@@ -950,7 +959,7 @@ string ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) {
                
                while (!in.eof()) {
                        
-                       if (m->control_pressed) { in.close(); return ""; }
+                       if (m->control_pressed) { in.close(); return nameAbund; }
                        
                        Sequence seq(in); m->gobble(in);
                        seqs[seq.getName()] = seq.getAligned();
@@ -962,10 +971,10 @@ string ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) {
                vector<seqPriorityNode> nameMapCount;
                int error = m->readNames(nameFile, nameMapCount, seqs);
                
-               if (m->control_pressed) { return ""; }
+               if (m->control_pressed) { return nameAbund; }
                
-               if (error == 1) { m->control_pressed = true; return ""; }
-               if (seqs.size() != nameMapCount.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your namefile, aborting."); m->mothurOutEndLine(); m->control_pressed = true; return ""; }
+               if (error == 1) { m->control_pressed = true; return nameAbund; }
+               if (seqs.size() != nameMapCount.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your namefile, aborting."); m->mothurOutEndLine(); m->control_pressed = true; return nameAbund; }
 
                sort(nameMapCount.begin(), nameMapCount.end(), compareSeqPriorityNodes);
                
@@ -976,12 +985,13 @@ string ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) {
                //print new file in order of
                for (int i = 0; i < nameMapCount.size(); i++) {
                        out << ">" << nameMapCount[i].name << endl << nameMapCount[i].seq << endl;
+                       nameAbund[nameMapCount[i].name] = nameMapCount[i].numIdentical;
                }
                out.close();
                
                rename(newFasta.c_str(), fastaFile.c_str());
                                
-               return fastaFile;
+               return nameAbund;
                
        }
        catch(exception& e) {
index 6ca0310d109c117cf5205ac0725e79338540676a..e5ffeb69c860aab7d08e60d134d6baacae4033f4 100644 (file)
@@ -44,7 +44,7 @@ private:
        int driver(linePair*, string, string, string, string);
        int createProcesses(string, string, string, string);
        int divideInHalf(Sequence, string&, string&);
-       string sortFastaFile(string, string);
+       map<string, int> sortFastaFile(string, string);
                
        #ifdef USE_MPI
        int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);