fixed bug in cluster.split

author westcott <westcott>

Mon, 14 Jun 2010 14:01:32 +0000 (14:01 +0000)

committer westcott <westcott>

Mon, 14 Jun 2010 14:01:32 +0000 (14:01 +0000)
author westcott <westcott>
Mon, 14 Jun 2010 14:01:32 +0000 (14:01 +0000)
committer westcott <westcott>
Mon, 14 Jun 2010 14:01:32 +0000 (14:01 +0000)
diff --git a/chimeraslayer.cpp b/chimeraslayer.cpp

index 5295eace3546e14db6488ceb41e144915531c830..b92e8c8d355f6d105c141857bd96c19538ba194b 100644 (file)
--- a/chimeraslayer.cpp
+++ b/chimeraslayer.cpp
@@ -196,7 +196,7 @@ int ChimeraSlayer::print(ostream& out, ostream& outAcc) {
                                 }
                         }
                         
-                       printBlock(chimeraResults[0], out);
+                       printBlock(chimeraResults[0], chimeraFlag, out);
                         out << endl;
                 }else {  out << querySeq->getName() << "\tno" << endl;  }
                 
@@ -240,7 +240,7 @@ int ChimeraSlayer::print(MPI_File& out, MPI_File& outAcc) {
                                 }
                         }
                         
-                       outputString = getBlock(chimeraResults[0]);
+                       outputString = getBlock(chimeraResults[0], chimeraFlag);
                         outputString += "\n";
         //cout << outputString << endl;         
                         //write to output file
@@ -387,7 +387,7 @@ int ChimeraSlayer::getChimeras(Sequence* query) {
         }
  }
  //***************************************************************************************************************
-void ChimeraSlayer::printBlock(data_struct data, ostream& out){
+void ChimeraSlayer::printBlock(data_struct data, string flag, ostream& out){
         try {
         //out << ":)\n";
                 
@@ -399,7 +399,7 @@ void ChimeraSlayer::printBlock(data_struct data, ostream& out){
                 out << data.divr_qla_qrb << '\t' << data.qla_qrb << '\t' << data.bsa << '\t';
                 out << data.divr_qlb_qra << '\t' << data.qlb_qra << '\t' << data.bsb << '\t';
                 
-               out << "yes\t" << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
+               out << flag << '\t' << spotMap[data.winLStart] << "-" << spotMap[data.winLEnd] << '\t' << spotMap[data.winRStart] << "-" << spotMap[data.winREnd] << '\t';
                 
                 //out << "Similarity of parents: " << data.ab << endl;
                 //out << "Similarity of query to parentA: " << data.qa << endl;
@@ -422,7 +422,7 @@ void ChimeraSlayer::printBlock(data_struct data, ostream& out){
         }
  }
  //***************************************************************************************************************
-string ChimeraSlayer::getBlock(data_struct data){
+string ChimeraSlayer::getBlock(data_struct data, string flag){
         try {
                 
                 string outputString = "";
@@ -433,7 +433,7 @@ string ChimeraSlayer::getBlock(data_struct data){
                 outputString += toString(data.divr_qla_qrb) + "\t" + toString(data.qla_qrb) + "\t" + toString(data.bsa) + "\t";
                 outputString += toString(data.divr_qlb_qra) + "\t" + toString(data.qlb_qra) + "\t" + toString(data.bsb) + "\t";
                 
-               outputString += "yes\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t";
+               outputString += flag + "\t" + toString(spotMap[data.winLStart]) + "-" + toString(spotMap[data.winLEnd]) + "\t" + toString(spotMap[data.winRStart]) + "-" + toString(spotMap[data.winREnd]) + "\t";
                 
                 return outputString;
         }
diff --git a/chimeraslayer.h b/chimeraslayer.h

index 3ce4cce4b40cbe331b260f08552fa34bad010d3d..e4615ee8787d481dfad122bd8120ae35e130cdb6 100644 (file)
--- a/chimeraslayer.h
+++ b/chimeraslayer.h
@@ -50,8 +50,8 @@ class ChimeraSlayer : public Chimera {
                 int window, numWanted, kmerSize, match, misMatch, minSim, minCov, minBS, minSNP, parents, iters, increment;
                 float divR;
         
-               void printBlock(data_struct, ostream&);
-               string getBlock(data_struct);
+               void printBlock(data_struct, string, ostream&);
+               string getBlock(data_struct, string);
                 
  };
  
diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp

index 363b695dfe7439f244f675b2c4c3736c4a210c6a..f6bf105cf915710947493d06b8d6461f309e9212 100644 (file)
--- a/clustersplitcommand.cpp
+++ b/clustersplitcommand.cpp
@@ -128,6 +128,7 @@ ClusterSplitCommand::ClusterSplitCommand(string option)  {
                         
                         if (fastafile != "") {
                                 if (taxFile == "") { m->mothurOut("You need to provide a taxonomy file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
+                               if (namefile == "") { m->mothurOut("You need to provide a names file if you are using a fasta file to generate the split."); m->mothurOutEndLine(); abort = true; }
                         }
                                         
                         //check for optional parameter and set defaults
@@ -195,7 +196,7 @@ void ClusterSplitCommand::help(){
                 m->mothurOut("For the distance file method, you need only provide your distance file and mothur will split the file into distinct groups. \n");
                 m->mothurOut("For the classification method, you need to provide your distance file and taxonomy file, and set the splitmethod to classify.  \n");
                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and split the distance file based on those groups. \n");
-               m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file and taxonomy file.  \n");
+               m->mothurOut("For the classification method using a fasta file, you need to provide your fasta file, names file and taxonomy file.  \n");
                 m->mothurOut("You will also need to set the taxlevel you want to split by. mothur will split the sequence into distinct taxonomy groups, and create distance files for each grouping. \n");
                 m->mothurOut("The phylip and column parameter allow you to enter your distance file. \n");
                 m->mothurOut("The fasta parameter allows you to enter your aligned fasta file. \n");
@@ -270,9 +271,9 @@ int ClusterSplitCommand::execute(){
                 
                 //split matrix into non-overlapping groups
                 SplitMatrix* split;
-               if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                       }
-               else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);   }
-               else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, taxFile, taxLevelCutoff, splitmethod);                                       }
+               if (splitmethod == "distance")                  {       split = new SplitMatrix(distfile, namefile, taxFile, cutoff, splitmethod, large);                                       }
+               else if (splitmethod == "classify")             {       split = new SplitMatrix(distfile, namefile, taxFile, taxLevelCutoff, splitmethod, large);                       }
+               else if (splitmethod == "fasta")                {       split = new SplitMatrix(fastafile, namefile, taxFile, taxLevelCutoff, splitmethod, processors);         }
                 else { m->mothurOut("Not a valid splitting method.  Valid splitting algorithms are distance, classify or fasta."); m->mothurOutEndLine(); return 0; }
                 
                 split->split();
@@ -775,8 +776,8 @@ vector<string> ClusterSplitCommand::cluster(vector< map<string, string> > distNa
                                 listFileNames.clear(); return listFileNames;
                         }
                         
-                       //remove(thisDistFile.c_str());
-                       //remove(thisNamefile.c_str());
+                       remove(thisDistFile.c_str());
+                       remove(thisNamefile.c_str());
                 }
                 
                                         
diff --git a/makefile b/makefile

index ef5cfe7ebb9550c5b5fa275e128f1ff013585e33..3e831124a7cc7a026dcad63df44da4a0674d186b 100644 (file)
--- a/makefile
+++ b/makefile
@@ -463,7 +463,7 @@ mothur : \
                 ./logsd.o\\r
                 ./geom.o\\r
                 ./setlogfilecommand.o\\r
-               -o mothur\r
+               -o ../Release/mothur\r
  \r
  clean : \r
                 rm \\r
diff --git a/splitmatrix.cpp b/splitmatrix.cpp

index cd05cc46c172122ab9ee23490d38eda5e7308c22..9e53c51a8f1b6fbba8e755279e68993fb7f03fa5 100644 (file)
--- a/splitmatrix.cpp
+++ b/splitmatrix.cpp
@@ -9,9 +9,7 @@
  
  #include "splitmatrix.h"
  #include "phylotree.h"
-#include "sequencedb.h"
-#include "onegapdist.h"
-#include "dist.h"
+#include "distancecommand.h"
  
  /***********************************************************************/
  
@@ -26,12 +24,14 @@ SplitMatrix::SplitMatrix(string distfile, string name, string tax, float c, stri
  }
  /***********************************************************************/
  
-SplitMatrix::SplitMatrix(string ffile, string tax, float c, string t){
+SplitMatrix::SplitMatrix(string ffile, string name, string tax, float c, string t, int p){
         m = MothurOut::getInstance();
         fastafile = ffile;
+       namefile = name;
         taxFile = tax;
         cutoff = c;
         method = t;
+       processors = p;
  }
  
  /***********************************************************************/
@@ -125,7 +125,6 @@ int SplitMatrix::splitClassify(){
                         createDistanceFilesFromTax(seqGroup, numGroups);
                 }
                 
-                               
                 return 0;
                         
         }
@@ -137,36 +136,114 @@ int SplitMatrix::splitClassify(){
  /***********************************************************************/
  int SplitMatrix::createDistanceFilesFromTax(map<string, int>& seqGroup, int numGroups){
         try {
+               map<string, int> copyGroups = seqGroup;
                 map<string, int>::iterator it;
-               map<string, int>::iterator it2;
-               map<string, int> seqIndexInFasta;
+               set<string> names;
+                               
+               for (int i = 0; i < numGroups; i++) { //remove old temp files, just in case
+                       remove((fastafile + "." + toString(i) + ".temp").c_str());
+               }
+                       
+               ifstream in;
+               openInputFile(fastafile, in);
+       
+               //parse fastafile
+               ofstream outFile;
+               while (!in.eof()) {
+                       Sequence query(in); gobble(in);
+                       if (query.getName() != "") {
+               
+                               it = seqGroup.find(query.getName());
                                 
-               //read fastafile
-               SequenceDB alignDB;
-               
-               ifstream filehandle;
-               openInputFile(fastafile, filehandle);
-               int numSeqs = 0;
-               while (!filehandle.eof()) {
-                       //input sequence info into sequencedb
-                       Sequence newSequence(filehandle);
+                               //save names in case no namefile is given
+                               if (namefile == "") {  names.insert(query.getName()); }
                         
-                       if (newSequence.getName() != "") {   
-                               alignDB.push_back(newSequence);  
-                               seqIndexInFasta[newSequence.getName()] = numSeqs;
-                               numSeqs++;
+                               if (it != seqGroup.end()) { //not singleton 
+                                       openOutputFileAppend((fastafile + "." + toString(it->second) + ".temp"), outFile);
+                                       query.printSequence(outFile); 
+                                       outFile.close();
+                                       
+                                       copyGroups.erase(it);
+                               }
                         }
+               }
+               in.close();
+               
+               //warn about sequence in groups that are not in fasta file
+               for(it = copyGroups.begin(); it != copyGroups.end(); it++) {
+                       m->mothurOut("ERROR: " + it->first + " is missing from your fastafile. This could happen if your taxonomy file is not unique and your fastafile is, or it could indicate and error."); m->mothurOutEndLine();
+                       exit(1);
+               }
+               
+               copyGroups.clear();
+               
+               //process each distance file
+               for (int i = 0; i < numGroups; i++) { 
+                       
+                       string options = "fasta=" + (fastafile + "." + toString(i) + ".temp") + ", processors=" + toString(processors);
                         
-                       //takes care of white space
-                       gobble(filehandle);
+                       Command* command = new DistanceCommand(options);
+                       command->execute();
+                       delete command;
+                       
+                       remove((fastafile + "." + toString(i) + ".temp").c_str());
+                       
+                       //remove old names files just in case
+                       remove((namefile + "." + toString(i) + ".temp").c_str());
                 }
-               filehandle.close();
                 
-               Dist* distCalculator = new oneGapDist();
+               singleton = namefile + ".extra.temp";
+               ofstream remainingNames;
+               openOutputFile(singleton, remainingNames);
                 
+               bool wroteExtra = false;
+
+               ifstream bigNameFile;
+               openInputFile(namefile, bigNameFile);
                 
-//still not done....           
+               string name, nameList;
+               while(!bigNameFile.eof()){
+                       bigNameFile >> name >> nameList;  gobble(bigNameFile);
+                       
+                       //did this sequence get assigned a group
+                       it = seqGroup.find(name);
+                       
+                       if (it != seqGroup.end()) {  
+                               openOutputFileAppend((namefile + "." + toString(it->second) + ".temp"), outFile);
+                               outFile << name << '\t' << nameList << endl;
+                               outFile.close();
+                       }else{
+                               wroteExtra = true;
+                               remainingNames << name << '\t' << nameList << endl;
+                       }
+               }
+               bigNameFile.close();
                 
+               remainingNames.close();
+               if (!wroteExtra) { 
+                       remove(singleton.c_str());
+                       singleton = "none";
+               }
+
+               for(int i=0;i<numGroups;i++){
+                       string tempNameFile = namefile + "." + toString(i) + ".temp";
+                       string tempDistFile = getRootName(getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist";
+
+                       //if there are valid distances
+                       ifstream fileHandle;
+                       fileHandle.open(tempDistFile.c_str());
+                       if(fileHandle)  {       
+                               gobble(fileHandle);
+                               if (!fileHandle.eof()) {  //check for blank file
+                                       map<string, string> temp;
+                                       temp[tempDistFile] = tempNameFile;
+                                       dists.push_back(temp);
+                               }
+                       }
+                       fileHandle.close();
+               }
+               
+               if (m->control_pressed)  {  for (int i = 0; i < dists.size(); i++) { remove((dists[i].begin()->first).c_str()); remove((dists[i].begin()->second).c_str()); } dists.clear(); }
                 
                 return 0;
         }
@@ -269,25 +346,37 @@ int SplitMatrix::splitDistanceFileByTax(map<string, int>& seqGroup, int numGroup
                         }
                 }
                 bigNameFile.close();
-               remainingNames.close();
-               
-               if (!wroteExtra) { 
-                       remove(singleton.c_str());
-                       singleton = "none";
-               }
-               
+                               
                 for(int i=0;i<numGroups;i++){
+                       string tempNameFile = namefile + "." + toString(i) + ".temp";
+                       string tempDistFile = distFile + "." + toString(i) + ".temp";
+
                         //if there are valid distances
                         if (validDistances[i]) {
-                               string tempNameFile = namefile + "." + toString(i) + ".temp";
-                               string tempDistFile = distFile + "." + toString(i) + ".temp";
-                               
                                 map<string, string> temp;
                                 temp[tempDistFile] = tempNameFile;
                                 dists.push_back(temp);
+                       }else{
+                               ifstream in;
+                               openInputFile(tempNameFile, in);
+                               
+                               while(!in.eof()) { 
+                                       in >> name >> nameList;  gobble(in);
+                                       wroteExtra = true;
+                                       remainingNames << name << '\t' << nameList << endl;
+                               }
+                               in.close();
+                               remove(tempNameFile.c_str());
                         }
                 }
                 
+               remainingNames.close();
+               
+               if (!wroteExtra) { 
+                       remove(singleton.c_str());
+                       singleton = "none";
+               }
+
                 if (m->control_pressed)  {  
                         for (int i = 0; i < dists.size(); i++) { 
                                 remove((dists[i].begin()->first).c_str());
diff --git a/splitmatrix.h b/splitmatrix.h

index 9467dc1a85b3738d27e21cdfc1ab8bd943314416..b98bdfb4b56b952d9ca0a2c7a784c22f02671e94 100644 (file)
--- a/splitmatrix.h
+++ b/splitmatrix.h
@@ -20,7 +20,7 @@ class SplitMatrix  {
         public:
  
                 SplitMatrix(string, string, string, float, string, bool); //column formatted distance file, namesfile, cutoff, method, large
-               SplitMatrix(string, string, float, string); //fastafile, taxFile, cutoff, method
+               SplitMatrix(string, string, string, float, string, int); //fastafile, namefile, taxFile, cutoff, method, processors
                 
                 ~SplitMatrix();
                 int split();
@@ -34,6 +34,7 @@ class SplitMatrix  {
                 vector< map< string, string> > dists;
                 float cutoff;
                 bool large;
+               int processors;
                                 
                 int splitDistance();
                 int splitClassify();
diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp

index 705ac25f5a5e727424a2567a0d358b4c1dc951e5..71e5c9277af61fef9884f42b2c8e9d54a2796e12 100644 (file)
--- a/trimseqscommand.cpp
+++ b/trimseqscommand.cpp
@@ -163,7 +163,7 @@ void TrimSeqsCommand::help(){
                 m->mothurOut("The trim.seqs command reads a fastaFile and creates .....\n");
                 m->mothurOut("The trim.seqs command parameters are fasta, flip, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim and allfiles.\n");
                 m->mothurOut("The fasta parameter is required.\n");
-               m->mothurOut("The flip parameter .... The default is 0.\n");
+               m->mothurOut("The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n");
                 m->mothurOut("The oligos parameter .... The default is "".\n");
                 m->mothurOut("The maxambig parameter .... The default is -1.\n");
                 m->mothurOut("The maxhomop parameter .... The default is 0.\n");
author	westcott <westcott>
	Mon, 14 Jun 2010 14:01:32 +0000 (14:01 +0000)
committer	westcott <westcott>
	Mon, 14 Jun 2010 14:01:32 +0000 (14:01 +0000)
chimeraslayer.cpp		patch \| blob \| history
chimeraslayer.h		patch \| blob \| history
clustersplitcommand.cpp		patch \| blob \| history
makefile		patch \| blob \| history
splitmatrix.cpp		patch \| blob \| history
splitmatrix.h		patch \| blob \| history
trimseqscommand.cpp		patch \| blob \| history