finished chimera.slayer template=self change

author westcott <westcott>

Tue, 19 Apr 2011 10:08:17 +0000 (10:08 +0000)

committer westcott <westcott>

Tue, 19 Apr 2011 10:08:17 +0000 (10:08 +0000)
author westcott <westcott>
Tue, 19 Apr 2011 10:08:17 +0000 (10:08 +0000)
committer westcott <westcott>
Tue, 19 Apr 2011 10:08:17 +0000 (10:08 +0000)
diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp

index 8c9417ad82cd0341a88aa216a84add2a701791e3..c6613bfd507c3dd74f00edeaaa452c8e9d39e950 100644 (file)
--- a/chimeraslayer.cpp
+++ b/chimeraslayer.cpp
@@ -45,7 +45,7 @@ int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int num
         }
  }
  //***************************************************************************************************************
-ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, string mode, int k, int ms, int mms, int win, float div, 
+ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, map<string, int>& prior, string mode, int k, int ms, int mms, int win, float div, 
                                                          int minsim, int mincov, int minbs, int minsnp, int par, int it, int inc, int numw, bool r) : Chimera()  {      
         try {
                 fastafile = file; templateSeqs = readSeqs(fastafile);
@@ -66,14 +66,16 @@ ChimeraSlayer::ChimeraSlayer(string file, string temp, bool trim, string name, s
                 numWanted = numw;
                 realign = r; 
                 trimChimera = trim;
+               priority = prior;
                 
                 decalc = new DeCalculator();    
                 
                 createFilter(templateSeqs, 0.0); //just removed columns where all seqs have a gap
                 
                 //run filter on template
-               for (int i = 0; i < templateSeqs.size(); i++) { delete templateSeqs[i];  } templateSeqs.clear();
-                
+               for (int i = 0; i < templateSeqs.size(); i++) {  if (m->control_pressed) {  break; }  runFilter(templateSeqs[i]);  }
+
+               
         }
         catch(exception& e) {
                 m->errorOut(e, "ChimeraSlayer", "ChimeraSlayer");
@@ -217,9 +219,26 @@ int ChimeraSlayer::doPrep() {
         }
  }
  //***************************************************************************************************************
-int ChimeraSlayer::getTemplate(Sequence* q) {
+vector<Sequence*> ChimeraSlayer::getTemplate(Sequence* q) {
         try {
                 
+               //when template=self, the query file is sorted from most abundance to least abundant
+               //userTemplate grows as the query file is processed by adding sequences that are not chimeric and more abundant
+               vector<Sequence*> userTemplate;
+               
+               int myAbund = priority[q->getName()];
+               
+               for (int i = 0; i < templateSeqs.size(); i++) {
+                       
+                       if (m->control_pressed) { return userTemplate; } 
+                       
+                       //have I reached a sequence with the same abundance as myself?
+                       if (!(priority[templateSeqs[i]->getName()] > myAbund)) { break; }
+                       
+                       //if its am not chimeric add it
+                       if (chimericSeqs.count(templateSeqs[i]->getName()) == 0) { userTemplate.push_back(templateSeqs[i]); }
+               }
+               
                 string  kmerDBNameLeft;
                 string  kmerDBNameRight;
                 
@@ -234,7 +253,7 @@ int ChimeraSlayer::getTemplate(Sequence* q) {
  #ifdef USE_MPI
                         for (int i = 0; i < userTemplate.size(); i++) {
                                 
-                               if (m->control_pressed) { return 0; } 
+                               if (m->control_pressed) { return userTemplate; } 
                                 
                                 string leftFrag = userTemplate[i]->getUnaligned();
                                 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
@@ -246,7 +265,7 @@ int ChimeraSlayer::getTemplate(Sequence* q) {
                         databaseLeft->setNumSeqs(userTemplate.size());
                         
                         for (int i = 0; i < userTemplate.size(); i++) {
-                               if (m->control_pressed) { return 0;  } 
+                               if (m->control_pressed) { return userTemplate; } 
                                 
                                 string rightFrag = userTemplate[i]->getUnaligned();
                                 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
@@ -262,7 +281,7 @@ int ChimeraSlayer::getTemplate(Sequence* q) {
                         
                         for (int i = 0; i < userTemplate.size(); i++) {
                                 
-                               if (m->control_pressed) { return 0; } 
+                               if (m->control_pressed) { return userTemplate; } 
                                 
                                 string leftFrag = userTemplate[i]->getUnaligned();
                                 leftFrag = leftFrag.substr(0, int(leftFrag.length() * 0.33));
@@ -274,7 +293,7 @@ int ChimeraSlayer::getTemplate(Sequence* q) {
                         databaseLeft->setNumSeqs(userTemplate.size());
                                 
                         for (int i = 0; i < userTemplate.size(); i++) {
-                               if (m->control_pressed) { return 0; } 
+                               if (m->control_pressed) { return userTemplate; }  
                                         
                                 string rightFrag = userTemplate[i]->getUnaligned();
                                 rightFrag = rightFrag.substr(int(rightFrag.length() * 0.66));
@@ -290,12 +309,12 @@ int ChimeraSlayer::getTemplate(Sequence* q) {
                         //generate blastdb
                         databaseLeft = new BlastDB(-1.0, -1.0, 1, -3);
  
-                       for (int i = 0; i < userTemplate.size(); i++) { if (m->control_pressed) { return 0; }  databaseLeft->addSequence(*userTemplate[i]);     }
+                       for (int i = 0; i < userTemplate.size(); i++) { if (m->control_pressed) { return userTemplate; }   databaseLeft->addSequence(*userTemplate[i]); }
                         databaseLeft->generateDB();
                         databaseLeft->setNumSeqs(userTemplate.size());
                 }
                 
-               return 0;
+               return userTemplate;
                 
         }
         catch(exception& e) {
@@ -310,12 +329,6 @@ ChimeraSlayer::~ChimeraSlayer() {
         if (templateFileName != "self") {
                 if (searchMethod == "kmer") {  delete databaseRight;  delete databaseLeft;  }   
                 else if (searchMethod == "blast") {  delete databaseLeft; }
-       }else {
-               //delete userTemplate
-               for (int i = 0; i < userTemplate.size(); i++) {
-                       delete userTemplate[i];
-               }
-               userTemplate.clear();
         }
  }
  //***************************************************************************************************************
@@ -344,6 +357,8 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) {
                                         m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
                                         outAcc << querySeq->getName() << endl;
                                         
+                                       if (templateFileName == "self") {  chimericSeqs.insert(querySeq->getName()); }
+                                       
                                         if (trimChimera) {  
                                                 int lengthLeft = spotMap[chimeraResults[0].winLEnd] - spotMap[chimeraResults[0].winLStart];
                                                 int lengthRight = spotMap[chimeraResults[0].winREnd] - spotMap[chimeraResults[0].winRStart];
@@ -364,11 +379,6 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc) {
                         out << endl;
                 }else {  
                         out << querySeq->getName() << "\tno" << endl; 
-                       if (templateFileName == "self") {  
-                               Sequence* temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
-                               runFilter(temp);
-                               userTemplate.push_back(temp);
-                       }
                 }
                 
                 return trim;
@@ -417,6 +427,8 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc, data_results leftP
                                         m->mothurOut(querySeq->getName() + "\tyes"); m->mothurOutEndLine();
                                         outAcc << querySeq->getName() << endl;
                                         
+                                       if (templateFileName == "self") {  chimericSeqs.insert(querySeq->getName()); }
+                                       
                                         if (trimChimera) {  
                                                 string newAligned = trim->getAligned();
                                                                                                 
@@ -470,11 +482,6 @@ Sequence* ChimeraSlayer::print(ostream& out, ostream& outAcc, data_results leftP
                         out << endl;
                 }else {  
                         out << querySeq->getName() << "\tno" << endl;  
-                       if (templateFileName == "self") {  
-                               Sequence* temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
-                               runFilter(temp);
-                               userTemplate.push_back(temp);
-                       }
                 }
                 
                 return trim;
@@ -532,6 +539,8 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc, data_results lef
                                         outAccString += querySeq->getName() + "\n";
                                         results = true;
                                         
+                                       if (templateFileName == "self") {  chimericSeqs.insert(querySeq->getName()); }
+                                       
                                         //write to accnos file
                                         int length = outAccString.length();
                                         char* buf2 = new char[length];
@@ -610,12 +619,6 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc, data_results lef
                                 
                         MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
                         delete buf;
-                       
-                       if (template == "self") {  
-                               Sequence temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
-                               runFilter(temp);
-                               userTemplate.push_back(temp);
-                       }
                 }
                 
                 
@@ -650,6 +653,8 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
                                         outAccString += querySeq->getName() + "\n";
                                         results = true;
                                         
+                                       if (templateFileName == "self") {  chimericSeqs.insert(querySeq->getName()); }
+                                       
                                         //write to accnos file
                                         int length = outAccString.length();
                                         char* buf2 = new char[length];
@@ -694,12 +699,6 @@ Sequence* ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
                         
                         MPI_File_write_shared(out, buf, length, MPI_CHAR, &status);
                         delete buf;
-                       
-                       if (template == "self") {  
-                               Sequence temp = new Sequence(trimQuery.getName(), trimQuery.getAligned());
-                               runFilter(temp);
-                               userTemplate.push_back(temp);
-                       }
                 }
                 
                 return trim;
@@ -730,7 +729,7 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
                 //you must create a template
                 vector<Sequence*> thisTemplate;
                 if (templateFileName != "self") { thisTemplate = templateSeqs; }
-               else { getTemplate(query);  thisTemplate = userTemplate; } //fills this template and creates the databases
+               else {  thisTemplate = getTemplate(query);  } //fills this template and creates the databases
                 
                 if (m->control_pressed) {  return 0;  }
                 
diff --git a/chimeraslayer.h b/chimeraslayer.h

index cac96eb158a721ac5fe3d5c68a82d61979fd99e8..ace98ce615b7f8106a2b401db75499be2e55d534 100644 (file)
--- a/chimeraslayer.h
+++ b/chimeraslayer.h
@@ -23,7 +23,7 @@ class ChimeraSlayer : public Chimera {
         
         public:
                 ChimeraSlayer(string, string, bool, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
-               ChimeraSlayer(string, string, bool, string, string, int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
+               ChimeraSlayer(string, string, bool, map<string, int>&, string,  int, int, int, int, float, int, int, int, int, int, int, int, int, bool);
  
                 ~ChimeraSlayer();
                 
@@ -46,9 +46,8 @@ class ChimeraSlayer : public Chimera {
                 map<int, int>  spotMap;
                 Database* databaseRight;
                 Database* databaseLeft;
-               vector<Sequence*> userTemplate;  //when template=self, the query file is sorted from most abundance to least abundant
-                                                                                //userTemplate grows as the query file is processed by adding sequences that are not chimeric
-               set<string> namesOfChimericSeqs; //only used when template=self
+               map<string, int> priority; //for template=self, seqname, seqAligned, abundance
+               set<string> chimericSeqs; //for template=self, so we don't add chimeric sequences to the userTemplate set
                 
                 vector<data_struct>  chimeraResults;
                 data_results printResults;
@@ -62,7 +61,7 @@ class ChimeraSlayer : public Chimera {
                 string getBlock(data_struct, string);
                 string getBlock(data_results, data_results, bool, bool, string);
                 //int readNameFile(string);
-               int getTemplate(Sequence*);
+               vector<Sequence*> getTemplate(Sequence*);
                 
  };
  
diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp

index 1a9c1fb5ca3d56896e605803bd98d472428f9f01..f6baa0a2074d75e01c73eeb50cae1f77f291ee20 100644 (file)
--- a/chimeraslayercommand.cpp
+++ b/chimeraslayercommand.cpp
@@ -381,12 +381,12 @@ int ChimeraSlayerCommand::execute(){
                                 
                                 //sort fastafile by abundance, returns new sorted fastafile name
                                 m->mothurOut("Sorting fastafile according to abundance..."); cout.flush(); 
-                               fastaFileNames[s] = sortFastaFile(fastaFileNames[s], nameFile);
+                               map<string, int> priority = sortFastaFile(fastaFileNames[s], nameFile);
                                 m->mothurOut("Done."); m->mothurOutEndLine();
                                 
                                 if (m->control_pressed) {  for (int j = 0; j < outputNames.size(); j++) {       remove(outputNames[j].c_str()); }  return 0;    }
  
-                               chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, nameFile, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);   
+                               chimera = new ChimeraSlayer(fastaFileNames[s], templatefile, trim, priority, search, ksize, match, mismatch, window, divR, minSimilarity, minCoverage, minBS, minSNP, parents, iters, increment, numwanted, realign);   
                         }
                                 
                         if (outputDir == "") { outputDir = m->hasPath(fastaFileNames[s]);  }//if user entered a file with a path then preserve it                               
@@ -456,36 +456,45 @@ int ChimeraSlayerCommand::execute(){
  
                                         MPIPos = m->setFilePosFasta(fastaFileNames[s], numSeqs); //fills MPIPos, returns numSeqs
                                         
-                                       //send file positions to all processes
-                                       for(int i = 1; i < processors; i++) { 
-                                               MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
-                                               MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+                                       if (templatefile != "self") { //if template=self we can only use 1 processor
+                                               //send file positions to all processes
+                                               for(int i = 1; i < processors; i++) { 
+                                                       MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
+                                                       MPI_Send(&MPIPos[0], (numSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD);
+                                               }
                                         }
-                                       
                                         //figure out how many sequences you have to align
                                         numSeqsPerProcessor = numSeqs / processors;
                                         int startIndex =  pid * numSeqsPerProcessor;
                                         if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
-                               
+                                       
+                                       if (templatefile == "self") { //if template=self we can only use 1 processor
+                                               startIndex = 0;
+                                               numSeqsPerProcessor = numSeqs;
+                                       }
+                                       
                                         //do your part
                                         driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos);
                                         
                                         if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&inMPI);  MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); }  MPI_File_close(&outMPIAccnos);  for (int j = 0; j < outputNames.size(); j++) {   remove(outputNames[j].c_str()); }  remove(outputFileName.c_str());  remove(accnosFileName.c_str());  delete chimera; return 0;  }
  
                                 }else{ //you are a child process
-                                       MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
-                                       MPIPos.resize(numSeqs+1);
-                                       MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
+                                       if (templatefile != "self") { //if template=self we can only use 1 processor
+                                               MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
+                                               MPIPos.resize(numSeqs+1);
+                                               MPI_Recv(&MPIPos[0], (numSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status);
                                         
-                                       //figure out how many sequences you have to align
-                                       numSeqsPerProcessor = numSeqs / processors;
-                                       int startIndex =  pid * numSeqsPerProcessor;
-                                       if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
+                                               //figure out how many sequences you have to align
+                                               numSeqsPerProcessor = numSeqs / processors;
+                                               int startIndex =  pid * numSeqsPerProcessor;
+                                               if(pid == (processors - 1)){    numSeqsPerProcessor = numSeqs - pid * numSeqsPerProcessor;      }
                                         
-                                       //do your part
-                                       driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos);
+                                               //do your part
+                                               driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPI, outMPIAccnos, outMPIFasta, MPIPos);
                                         
-                                       if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&inMPI);  MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); }  MPI_File_close(&outMPIAccnos);  for (int j = 0; j < outputNames.size(); j++) {   remove(outputNames[j].c_str()); }  delete chimera; return 0;  }
+                                               if (m->control_pressed) { outputTypes.clear();  MPI_File_close(&inMPI);  MPI_File_close(&outMPI); if (trim) { MPI_File_close(&outMPIFasta); }  MPI_File_close(&outMPIAccnos);  for (int j = 0; j < outputNames.size(); j++) {   remove(outputNames[j].c_str()); }  delete chimera; return 0;  }
+                               
+                                       }
                                 }
                                 
                                 //close files 
@@ -939,9 +948,9 @@ int ChimeraSlayerCommand::divideInHalf(Sequence querySeq, string& leftQuery, str
         }
  }
  /**************************************************************************************************/
-
-string ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) {
+map<string, int> ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) {
         try {
+               map<string, int> nameAbund;
                 
                 //read through fastafile and store info
                 map<string, string> seqs;
@@ -950,7 +959,7 @@ string ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) {
                 
                 while (!in.eof()) {
                         
-                       if (m->control_pressed) { in.close(); return ""; }
+                       if (m->control_pressed) { in.close(); return nameAbund; }
                         
                         Sequence seq(in); m->gobble(in);
                         seqs[seq.getName()] = seq.getAligned();
@@ -962,10 +971,10 @@ string ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) {
                 vector<seqPriorityNode> nameMapCount;
                 int error = m->readNames(nameFile, nameMapCount, seqs);
                 
-               if (m->control_pressed) { return ""; }
+               if (m->control_pressed) { return nameAbund; }
                 
-               if (error == 1) { m->control_pressed = true; return ""; }
-               if (seqs.size() != nameMapCount.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your namefile, aborting."); m->mothurOutEndLine(); m->control_pressed = true; return ""; }
+               if (error == 1) { m->control_pressed = true; return nameAbund; }
+               if (seqs.size() != nameMapCount.size()) { m->mothurOut( "The number of sequences in your fastafile does not match the number of sequences in your namefile, aborting."); m->mothurOutEndLine(); m->control_pressed = true; return nameAbund; }
  
                 sort(nameMapCount.begin(), nameMapCount.end(), compareSeqPriorityNodes);
                 
@@ -976,12 +985,13 @@ string ChimeraSlayerCommand::sortFastaFile(string fastaFile, string nameFile) {
                 //print new file in order of
                 for (int i = 0; i < nameMapCount.size(); i++) {
                         out << ">" << nameMapCount[i].name << endl << nameMapCount[i].seq << endl;
+                       nameAbund[nameMapCount[i].name] = nameMapCount[i].numIdentical;
                 }
                 out.close();
                 
                 rename(newFasta.c_str(), fastaFile.c_str());
                                 
-               return fastaFile;
+               return nameAbund;
                 
         }
         catch(exception& e) {
diff --git a/chimeraslayercommand.h b/chimeraslayercommand.h

index 6ca0310d109c117cf5205ac0725e79338540676a..e5ffeb69c860aab7d08e60d134d6baacae4033f4 100644 (file)
--- a/chimeraslayercommand.h
+++ b/chimeraslayercommand.h
@@ -44,7 +44,7 @@ private:
         int driver(linePair*, string, string, string, string);
         int createProcesses(string, string, string, string);
         int divideInHalf(Sequence, string&, string&);
-       string sortFastaFile(string, string);
+       map<string, int> sortFastaFile(string, string);
                 
         #ifdef USE_MPI
         int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector<unsigned long int>&);
author	westcott <westcott>
	Tue, 19 Apr 2011 10:08:17 +0000 (10:08 +0000)
committer	westcott <westcott>
	Tue, 19 Apr 2011 10:08:17 +0000 (10:08 +0000)
chimeraslayer.cpp		patch \| blob \| history
chimeraslayer.h		patch \| blob \| history
chimeraslayercommand.cpp		patch \| blob \| history
chimeraslayercommand.h		patch \| blob \| history