precluster command finished

author westcott <westcott>

Wed, 23 Dec 2009 18:47:52 +0000 (18:47 +0000)

committer westcott <westcott>

Wed, 23 Dec 2009 18:47:52 +0000 (18:47 +0000)
author westcott <westcott>
Wed, 23 Dec 2009 18:47:52 +0000 (18:47 +0000)
committer westcott <westcott>
Wed, 23 Dec 2009 18:47:52 +0000 (18:47 +0000)
diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj

index 5ae6f336987553e85d2841145f2f151008351554..1fbb5b7fec6768db74d64df3c32e05500f0bfb1b 100644 (file)
--- a/Mothur.xcodeproj/project.pbxproj
+++ b/Mothur.xcodeproj/project.pbxproj
@@ -158,6 +158,7 @@
                 7EC3D4550FA0FFF900338DA5 /* coverage.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7EC3D4500FA0FFF900338DA5 /* coverage.cpp */; };
                 7EC3D4560FA0FFF900338DA5 /* whittaker.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 7EC3D4530FA0FFF900338DA5 /* whittaker.cpp */; };
                 8DD76F6A0486A84900D96B5E /* Mothur.1 in CopyFiles */ = {isa = PBXBuildFile; fileRef = C6859E8B029090EE04C91782 /* Mothur.1 */; };
+               A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7027F2710DFF86D00BF45FE /* preclustercommand.cpp */; };
                 A70B53AA0F4CD7AD0064797E /* getgroupcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */; };
                 A70B53AB0F4CD7AD0064797E /* getlabelcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */; };
                 A70B53AC0F4CD7AD0064797E /* getlinecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70B53A80F4CD7AD0064797E /* getlinecommand.cpp */; };
@@ -526,6 +527,8 @@
                 7EC3D4530FA0FFF900338DA5 /* whittaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = whittaker.cpp; sourceTree = SOURCE_ROOT; };
                 7EC3D4540FA0FFF900338DA5 /* whittaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = whittaker.h; sourceTree = SOURCE_ROOT; };
                 8DD76F6C0486A84900D96B5E /* mothur */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = mothur; sourceTree = BUILT_PRODUCTS_DIR; };
+               A7027F2610DFF86D00BF45FE /* preclustercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = preclustercommand.h; sourceTree = SOURCE_ROOT; };
+               A7027F2710DFF86D00BF45FE /* preclustercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = preclustercommand.cpp; sourceTree = SOURCE_ROOT; };
                 A70B53A40F4CD7AD0064797E /* getgroupcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getgroupcommand.cpp; sourceTree = SOURCE_ROOT; };
                 A70B53A50F4CD7AD0064797E /* getgroupcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getgroupcommand.h; sourceTree = SOURCE_ROOT; };
                 A70B53A60F4CD7AD0064797E /* getlabelcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getlabelcommand.cpp; sourceTree = SOURCE_ROOT; };
@@ -931,6 +934,8 @@
                                 3792946F0F2E191800B9034A /* parsimonycommand.cpp */,
                                 A773808E10B6EFD1002A6A61 /* phylotypecommand.h */,
                                 A773808F10B6EFD1002A6A61 /* phylotypecommand.cpp */,
+                               A7027F2610DFF86D00BF45FE /* preclustercommand.h */,
+                               A7027F2710DFF86D00BF45FE /* preclustercommand.cpp */,
                                 37D927FE0F21331F001D4494 /* quitcommand.h */,
                                 37D927FD0F21331F001D4494 /* quitcommand.cpp */,
                                 37D928080F21331F001D4494 /* rarefactcommand.h */,
@@ -1299,6 +1304,7 @@
                                 A773809010B6EFD1002A6A61 /* phylotypecommand.cpp in Sources */,
                                 A727E84A10D14568001A8432 /* readblast.cpp in Sources */,
                                 A7E0AAB610D2886D00CF407B /* mgclustercommand.cpp in Sources */,
+                               A7027F2810DFF86D00BF45FE /* preclustercommand.cpp in Sources */,
                         );
                         runOnlyForDeploymentPostprocessing = 0;
                 };
diff --git a/commandfactory.cpp b/commandfactory.cpp

index b361b6600655ff57a987cf7bce9b2b543d96c854..d2b22d59b06c469a12c7fce92b132a7971453f50 100644 (file)
--- a/commandfactory.cpp
+++ b/commandfactory.cpp
@@ -61,6 +61,7 @@
  #include "classifyseqscommand.h"
  #include "phylotypecommand.h"
  #include "mgclustercommand.h"
+#include "preclustercommand.h"
  
  /*******************************************************/
  
@@ -131,6 +132,7 @@ CommandFactory::CommandFactory(){
         commands["classify.seqs"]               = "classify.seqs"; 
         commands["phylotype"]                   = "phylotype";
         commands["mgcluster"]                   = "mgcluster";
+       commands["pre.cluster"]                 = "pre.cluster";
  
  }
  /***********************************************************/
@@ -199,6 +201,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){
                 else if(commandName == "classify.seqs")                 {       command = new ClassifySeqsCommand(optionString);                }
                 else if(commandName == "phylotype")                             {       command = new PhylotypeCommand(optionString);                   }
                 else if(commandName == "mgcluster")                             {       command = new MGClusterCommand(optionString);                   }
+               else if(commandName == "pre.cluster")                   {       command = new PreClusterCommand(optionString);                  }
                 else                                                                                    {       command = new NoCommand(optionString);                                  }
  
                 return command;
diff --git a/distancecommand.cpp b/distancecommand.cpp

index f8a2a75220bef99557a896e34d8d8f5aa843687a..3ac39699918f6da277ee06a7a7dde1d22c6ae546 100644 (file)
--- a/distancecommand.cpp
+++ b/distancecommand.cpp
@@ -278,7 +278,7 @@ int DistanceCommand::driver(int startLine, int endLine, string dFileName, float
         }
  }
  
-/**************************************************************************************************/
+/**************************************************************************************************
  void DistanceCommand::appendFiles(string temp, string filename) {
         try{
                 ofstream output;
diff --git a/distancecommand.h b/distancecommand.h

index 6a7ac93fa1d7b28d9fe03e1e27f1e4a1ec3ef491..542f4283744024e58b297282f83212ad4b5b1292 100644 (file)
--- a/distancecommand.h
+++ b/distancecommand.h
@@ -43,7 +43,7 @@ private:
         bool abort;
         vector<string>  Estimators; //holds estimators to be used
         
-       void appendFiles(string, string);
+       //void appendFiles(string, string);
         void createProcesses(string);
         int driver(/*Dist*, SequenceDB, */int, int, string, float);
  
diff --git a/hcluster.cpp b/hcluster.cpp

index 2d88533757a1495d54423ad87354532a96d3e053..4e8c9945bd11574cfc7c15b6411343fd79576b6b 100644 (file)
--- a/hcluster.cpp
+++ b/hcluster.cpp
@@ -12,11 +12,13 @@
  #include "listvector.hpp"
  #include "sparsematrix.hpp"
  
+
  /***********************************************************************/
  
-HCluster::HCluster(RAbundVector* rav, ListVector* lv) :  rabund(rav), list(lv){
+HCluster::HCluster(RAbundVector* rav, ListVector* lv, string m) :  rabund(rav), list(lv), method(m){
         try {
                 mapWanted = false;
+               exitedBreak = false; 
                 numSeqs = list->getNumSeqs();
                 
                 //initialize cluster array
@@ -263,27 +265,32 @@ void HCluster::update(int row, int col, float distance){
                 smallRow = row;
                 smallCol = col;
                 smallDist = distance;
-               bool clustered = false;
                 
                 //find upmost parent of row and col
                 smallRow = getUpmostParent(smallRow);
                 smallCol = getUpmostParent(smallCol);
         //cout << "row = " << row << " smallRow = " << smallRow <<  " col = " << col << " smallCol = " << smallCol << " dist = " << distance << endl;
-               
-               //are they active in the link table
-               int linkValue = makeActive(); //after this point this nodes info is active in linkTable
-       //printInfo();                  
-//cout << "linkValue = " << linkValue << " times = " << (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq) << endl;
-               //can we cluster???
-               if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) {
-                       //printInfo();
-                       updateArrayandLinkTable();
-                       clusterBins();
-                       clusterNames();
-                       clustered = true;
-                       //printInfo();
+               //you don't want to cluster with yourself
+               if (smallRow != smallCol) {
+                       //are they active in the link table
+                       int linkValue = makeActive(); //after this point this nodes info is active in linkTable
+                       //printInfo();                  
+                       //cout << "linkValue = " << linkValue << " times = " << (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq) << endl;
+                       //can we cluster???
+                       bool cluster = false;
+                       
+                       if (method == "nearest") { cluster = true;  }
+                       else if (method == "average") { cout << "still working on this... " << endl; //got to figure this out 
+                       }else{ //assume furthest
+                               if (linkValue == (clusterArray[smallRow].numSeq * clusterArray[smallCol].numSeq)) { cluster = true; }
+                       }
+                       
+                       if (cluster) { 
+                               updateArrayandLinkTable();
+                               clusterBins();
+                               clusterNames();
+                       }
                 }
-               
                 //printInfo();
         }
         catch(exception& e) {
@@ -340,6 +347,64 @@ try {
                 exit(1);
         }
  }
+//**********************************************************************************************************************
+vector<seqDist> HCluster::getSeqs(ifstream& filehandle, NameAssignment* nameMap, float cutoff){
+       try {
+               string firstName, secondName;
+               float distance, prevDistance;
+               vector<seqDist> sameSeqs;
+               prevDistance = -1;
+       
+               //if you are not at the beginning of the file
+               if (exitedBreak) { 
+                       sameSeqs.push_back(next);
+                       prevDistance = next.dist;
+                       exitedBreak = false;
+               }
+       
+               //get entry
+               while (!filehandle.eof()) {
+                       
+                       filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
+       
+                       //save first one
+                       if (prevDistance == -1) { prevDistance = distance; }
+                       
+                       map<string,int>::iterator itA = nameMap->find(firstName);
+                       map<string,int>::iterator itB = nameMap->find(secondName);
+                       if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
+                       if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
+               
+                       //using cutoff
+                       if (distance > cutoff) { break; }
+               
+                       if (distance != -1) { //-1 means skip me
+                       
+                               //are the distances the same
+                               if (distance == prevDistance) { //save in vector
+                                       seqDist temp(itA->second, itB->second, distance);
+                                       sameSeqs.push_back(temp);
+                                       exitedBreak = false;
+                               }else{ 
+                                       next.seq1 = itA->second;
+                                       next.seq2 = itB->second;
+                                       next.dist = distance;
+                                       exitedBreak = true;
+                                       break;
+                               }
+                       }
+               }
+               
+               //rndomize matching dists
+               random_shuffle(sameSeqs.begin(), sameSeqs.end());
+               
+               return sameSeqs;
+       }
+       catch(exception& e) {
+               errorOut(e, "HCluster", "getSeqs");
+               exit(1);
+       }
+}
  /***********************************************************************/
  
  
diff --git a/hcluster.h b/hcluster.h

index dd4c679123f12d90fbc2ca1daaae5045be9a1dc1..065d559d1223865313b95a32f20dd6fb14fc5fc0 100644 (file)
--- a/hcluster.h
+++ b/hcluster.h
@@ -12,6 +12,7 @@
  
  
  #include "mothur.h"
+#include "nameassignment.hpp"
  
  class RAbundVector;
  class ListVector;
@@ -20,11 +21,12 @@ class ListVector;
  class HCluster {
         
  public:
-       HCluster(RAbundVector*, ListVector*);
+       HCluster(RAbundVector*, ListVector*, string);
         ~HCluster(){};
      void update(int, int, float);
         void setMapWanted(bool m); 
         map<string, int> getSeqtoBin()  {  return seq2Bin;      }
+       vector<seqDist> getSeqs(ifstream&, NameAssignment*, float);
  
  protected:     
         void clusterBins();
@@ -49,7 +51,9 @@ protected:
         int smallCol;
         float smallDist;
         map<string, int> seq2Bin;
-       bool mapWanted;
+       bool mapWanted, exitedBreak;
+       seqDist next;
+       string method;
         
  };
  
diff --git a/hclustercommand.cpp b/hclustercommand.cpp

index 3c79f649301f9e049e5ff8cf3eff454bf97fb4c4..506ba544045c7c14e8944ef02d2b99a6e061acf3 100644 (file)
--- a/hclustercommand.cpp
+++ b/hclustercommand.cpp
@@ -76,7 +76,7 @@ HClusterCommand::HClusterCommand(string option){
                         cutoff += (5 / (precision * 10.0));
                         
                         method = validParameter.validFile(parameters, "method", false);
-                       if (method == "not found") { method = "nearest"; }
+                       if (method == "not found") { method = "furthest"; }
                         
                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
                         else { mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
@@ -96,7 +96,9 @@ HClusterCommand::HClusterCommand(string option){
                                                                                         
                                 fileroot = getRootName(distfile);
                                 
-                               tag = "fn";  //until we figure out average and nearest methods
+                               if (method == "furthest")               { tag = "fn";  }
+                               else if (method == "nearest")   { tag = "nn";  }
+                               else                                                    { tag = "an";  }
                         
                                 openOutputFile(fileroot+ tag + ".sabund",       sabundFile);
                                 openOutputFile(fileroot+ tag + ".rabund",       rabundFile);
@@ -119,7 +121,7 @@ void HClusterCommand::help(){
                 mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
                 mothurOut("The hcluster command should be in the following format: \n");
                 mothurOut("hcluster(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
-               mothurOut("The acceptable hcluster methods is furthest, but we hope to add nearest and average in the future.\n\n");    
+               mothurOut("The acceptable hcluster methods are furthest and nearest, but we hope to add average in the future.\n\n");   
         }
         catch(exception& e) {
                 errorOut(e, "HClusterCommand", "help");
@@ -182,17 +184,13 @@ int HClusterCommand::execute(){
                 string firstName, secondName;
                 float distance;
                 
-               cluster = new HCluster(rabund, list);
+               cluster = new HCluster(rabund, list, method);
                 vector<seqDist> seqs; seqs.resize(1); // to start loop
-               exitedBreak = false;  //lets you know if there is a distance stored in next
-       
-               while (seqs.size() != 0){
                 
-                       seqs = getSeqs(in);
-                       random_shuffle(seqs.begin(), seqs.end());
-                       
-                       if (seqs.size() == 0) { break; } //there are no more distances
+               while (seqs.size() != 0){
                 
+                       seqs = cluster->getSeqs(in, globaldata->nameMap, cutoff);
+                               
                         for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
  
                                 if (print_start && isTrue(timing)) {
@@ -203,14 +201,12 @@ int HClusterCommand::execute(){
                                         print_start = false;
                                 }
                                 
-       //cout << "before cluster update" << endl;
+       
                                 if (seqs[i].seq1 != seqs[i].seq2) {
                                         cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
                                         
                                         float rndDist = roundDist(seqs[i].dist, precision);
-               //cout << "after cluster update clusterSomething = " << clusteredSomething << " rndDist = " << rndDist << " rndPreviousDist = " << rndPreviousDist << endl;                     
-                                       
-                                       
+                                                       
                                         if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
                                                 printData("unique");
                                         }
@@ -256,11 +252,10 @@ int HClusterCommand::execute(){
                 sabundFile.close();
                 rabundFile.close();
                 listFile.close();
-               
                 delete cluster;
-               //if (isTrue(timing)) {
-                       mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster. "); mothurOutEndLine();
-               //}
+       
+               mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster. "); mothurOutEndLine();
+               
                 return 0;
         }
         catch(exception& e) {
@@ -299,76 +294,4 @@ void HClusterCommand::printData(string label){
  
  }
  //**********************************************************************************************************************
-vector<seqDist> HClusterCommand::getSeqs(ifstream& filehandle){
-       try {
-               string firstName, secondName;
-               float distance, prevDistance;
-               vector<seqDist> sameSeqs;
-               prevDistance = -1;
-               
-               //if you are not at the beginning of the file
-               if (exitedBreak) { 
-                       sameSeqs.push_back(next);
-                       prevDistance = next.dist;
-                       exitedBreak = false;
-               }
-       
-               //get entry
-               while (filehandle) {
-                       
-                       filehandle >> firstName >> secondName >> distance;  
-//cout << firstName << '\t' << secondName << '\t' << distance << endl;
-                       gobble(filehandle);
-                       
-                       //save first one
-                       if (prevDistance == -1) { prevDistance = distance; }
-       //cout << prevDistance << endl; 
-//if (globaldata->nameMap == NULL) { cout << "null" << endl; }
-                       map<string,int>::iterator itA = globaldata->nameMap->find(firstName);
-                       map<string,int>::iterator itB = globaldata->nameMap->find(secondName);
-                       
-                       if(itA == globaldata->nameMap->end()){
-                               cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);
-                       }
-                       if(itB == globaldata->nameMap->end()){
-                               cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);
-                       }
-       //cout << "here" << endl;               
-                       //using cutoff
-                       if (distance > cutoff) { break; }
-                       
-                       if (distance != -1) { //-1 means skip me
-                               
-                               //are the distances the same
-                               if (distance == prevDistance) { //save in vector
-                                       seqDist temp;
-                                       temp.seq1 = itA->second;
-                                       temp.seq2 = itB->second;
-                                       temp.dist = distance;
-                                       sameSeqs.push_back(temp);
-                                       exitedBreak = false;
-                                       //what about precision??
-                                       
-                               }else{ 
-                                       next.seq1 = itA->second;
-                                       next.seq2 = itB->second;
-                                       next.dist = distance;
-                                       exitedBreak = true;
-                                       break;
-                               }
-                               
-                       }
-               }
-
-               return sameSeqs;
-       }
-       catch(exception& e) {
-               errorOut(e, "HClusterCommand", "getSeqs");
-               exit(1);
-       }
-
-
-}
-
-//**********************************************************************************************************************
  
diff --git a/hclustercommand.h b/hclustercommand.h

index 1dd46b9ad96890c5576e18ae500cff6359270795..4c8fa9c9aa7811d31dc52e3b050ac0c1d5305184 100644 (file)
--- a/hclustercommand.h
+++ b/hclustercommand.h
@@ -29,12 +29,6 @@
  //Applications Center, 1030 S. Highway A1A, Patrick AFB, FL 32925-3002, USA 
  //Received January 28, 2009; Revised April 14, 2009; Accepted April 15, 2009 
  /************************************************************/
-struct seqDist {
-       int seq1;
-       int seq2;
-       float dist;
-};
-/************************************************************/
  class HClusterCommand : public Command {
         
  public:
@@ -52,23 +46,15 @@ private:
         ListVector oldList;
         ReadCluster* read;
         
-       bool abort;
-
-       string method, fileroot, tag, distfile, format, phylipfile, columnfile, namefile, sort;
+       bool abort, sorted, print_start;
+       string method, fileroot, tag, distfile, format, phylipfile, columnfile, namefile, sort, showabund, timing;
         double cutoff;
-       string showabund, timing;
         int precision, length;
         ofstream sabundFile, rabundFile, listFile;
-       //ifstream in;
-       seqDist next;
-       bool exitedBreak, sorted;
-
-       bool print_start;
         time_t start;
         unsigned long loops;
         
         void printData(string label);
-       vector<seqDist> getSeqs(ifstream&);
  };
  
  /************************************************************/
diff --git a/mgclustercommand.cpp b/mgclustercommand.cpp

index d5b84a8505bd554dca154c2c4e90db10a401c352..07e3776e08adc3b85ee5b8c2bd3829d340eb9c82 100644 (file)
--- a/mgclustercommand.cpp
+++ b/mgclustercommand.cpp
@@ -8,7 +8,6 @@
   */
  
  #include "mgclustercommand.h"
-#include "readcluster.h"
  
  //**********************************************************************************************************************
  MGClusterCommand::MGClusterCommand(string option){
@@ -138,6 +137,15 @@ int MGClusterCommand::execute(){
                 start = time(NULL);
                 oldList = *list;
                 
+               if (method == "furthest")               { tag = "fn";  }
+               else if (method == "nearest")   { tag = "nn";  }
+               else                                                    { tag = "an";  }
+               
+               //open output files
+               openOutputFile(fileroot+ tag + ".list",  listFile);
+               openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
+               openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
+               
                 if (!hclusterWanted) {
                         //get distmatrix and overlap
                         SparseMatrix* distMatrix = read->getDistMatrix();
@@ -148,14 +156,8 @@ int MGClusterCommand::execute(){
                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix); }
                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix); }
                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix); }
-                       tag = cluster->getTag();
                         cluster->setMapWanted(true);
                         
-                       //open output files
-                       openOutputFile(fileroot+ tag + ".list",  listFile);
-                       openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
-                       openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
-                       
                         //cluster using cluster classes
                         while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
                                 
@@ -218,24 +220,16 @@ int MGClusterCommand::execute(){
                         sortHclusterFiles(distFile, overlapFile);
                 
                         //create cluster
-                       hcluster = new HCluster(rabund, list);
+                       hcluster = new HCluster(rabund, list, method);
                         hcluster->setMapWanted(true);
                         
-                       //open output files
-                       openOutputFile(fileroot+ tag + ".list",  listFile);
-                       openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
-                       openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
-                       
-                       vector<DistNode> seqs; seqs.resize(1); // to start loop
-                       exitedBreak = false;  //lets you know if there is a distance stored in next
-                       
+                       vector<seqDist> seqs; seqs.resize(1); // to start loop
                         ifstream inHcluster;
                         openInputFile(distFile, inHcluster);
  
                         while (seqs.size() != 0){
                 
-                               seqs = getSeqs(inHcluster);
-                               random_shuffle(seqs.begin(), seqs.end());
+                               seqs = hcluster->getSeqs(inHcluster, nameMap, cutoff);
                                 
                                 for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
                                         
@@ -345,7 +339,7 @@ ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
                 while (!done) {
                         
                         //get next overlap
-                       DistNode overlapNode;
+                       seqDist overlapNode;
                         if (!hclusterWanted) {  
                                 if (count < overlapMatrix.size()) { //do we have another node in the matrix
                                         overlapNode = overlapMatrix[count];
@@ -415,89 +409,21 @@ ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
  void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
         try {
                 //sort distFile
-               ReadCluster* readCluster = new ReadCluster(unsortedDist, cutoff);       
-               readCluster->setFormat("column");
-               readCluster->read(nameMap);
-               string sortedDistFile = readCluster->getOutputFile();
-               ListVector* extralist = readCluster->getListVector();  //we get this so we can delete it and not have a memory leak
-               
+               string sortedDistFile = sortFile(unsortedDist);
                 remove(unsortedDist.c_str());  //delete unsorted file
                 distFile = sortedDistFile;
-               delete extralist;
-               delete readCluster;
                 
                 //sort overlap file
-               readCluster = new ReadCluster(unsortedOverlap, cutoff);         
-               readCluster->setFormat("column");
-               readCluster->read(nameMap);
-               string sortedOverlapFile = readCluster->getOutputFile();
-               extralist = readCluster->getListVector();  //we get this so we can delete it and not have a memory leak
-               
+               string sortedOverlapFile = sortFile(unsortedOverlap);
                 remove(unsortedOverlap.c_str());  //delete unsorted file
                 overlapFile = sortedOverlapFile;
-               delete extralist;
-               delete readCluster;
         }
         catch(exception& e) {
                 errorOut(e, "MGClusterCommand", "sortHclusterFiles");
                 exit(1);
         }
  }
-//**********************************************************************************************************************
-vector<DistNode> MGClusterCommand::getSeqs(ifstream& filehandle){
-       try {
-               string firstName, secondName;
-               float distance, prevDistance;
-               vector<DistNode> sameSeqs;
-               prevDistance = -1;
-       
-               //if you are not at the beginning of the file
-               if (exitedBreak) { 
-                       sameSeqs.push_back(next);
-                       prevDistance = next.dist;
-                       exitedBreak = false;
-               }
-       
-               //get entry
-               while (!filehandle.eof()) {
-                       
-                       filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
-       
-                       //save first one
-                       if (prevDistance == -1) { prevDistance = distance; }
-                       
-                       map<string,int>::iterator itA = nameMap->find(firstName);
-                       map<string,int>::iterator itB = nameMap->find(secondName);
-                       if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
-                       if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
-               
-                       //using cutoff
-                       if (distance > cutoff) { break; }
-               
-                       if (distance != -1) { //-1 means skip me
-                       
-                               //are the distances the same
-                               if (distance == prevDistance) { //save in vector
-                                       DistNode temp(itA->second, itB->second, distance);
-                                       sameSeqs.push_back(temp);
-                                       exitedBreak = false;
-                               }else{ 
-                                       next.seq1 = itA->second;
-                                       next.seq2 = itB->second;
-                                       next.dist = distance;
-                                       exitedBreak = true;
-                                       break;
-                               }
-                       }
-               }
-               
-               return sameSeqs;
-       }
-       catch(exception& e) {
-               errorOut(e, "MGClusterCommand", "getSeqs");
-               exit(1);
-       }
-}
+
  //**********************************************************************************************************************
  
  
diff --git a/mgclustercommand.h b/mgclustercommand.h

index 9aa86d4441e5c5f1202a6d552fef31162337c00d..2ac80c91dd65fa7cf56f71331e4c4486c68c8b87 100644 (file)
--- a/mgclustercommand.h
+++ b/mgclustercommand.h
@@ -36,20 +36,18 @@ private:
         HCluster* hcluster;
         ListVector* list;
         ListVector oldList;
-       vector<DistNode> overlapMatrix;
-       DistNode next;
-
+       vector<seqDist> overlapMatrix;
         
         string blastfile, method, namefile, overlapFile, distFile;
         ofstream sabundFile, rabundFile, listFile;
         float cutoff, penalty;
         int precision, length, precisionLength;
-       bool abort, minWanted, hclusterWanted, exitedBreak, merge;
+       bool abort, minWanted, hclusterWanted, merge;
         
         void printData(ListVector*);
         ListVector* mergeOPFs(map<string, int>, float);
         void sortHclusterFiles(string, string);
-       vector<DistNode> getSeqs(ifstream&);
+       vector<seqDist> getSeqs(ifstream&);
  
  };
  
diff --git a/mothur.h b/mothur.h

index f839328cd2c304330fb18f31fb97eea531ef625a..cf6ae69dc85fc866c34588118575222772897dcf 100644 (file)
--- a/mothur.h
+++ b/mothur.h
@@ -100,7 +100,15 @@ struct clusterNode {
         int smallChild; //used to make linkTable work with list and rabund. represents bin number of this cluster node
         clusterNode(int num, int par, int kid) : numSeq(num), parent(par), smallChild(kid) {};
  };
-
+/************************************************************/
+struct seqDist {
+       int seq1;
+       int seq2;
+       float dist;
+       seqDist() {}
+       seqDist(int s1, int s2, float d) : seq1(s1), seq2(s2), dist(d) {}
+       ~seqDist() {}
+};
  /***********************************************************************/
  
  // snagged from http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.2
@@ -775,6 +783,92 @@ inline bool anyLabelsToProcess(string label, set<string>& userLabels, string err
         }
  }
  
+/**************************************************************************************************/
+inline void appendFiles(string temp, string filename) {
+       try{
+               ofstream output;
+               ifstream input;
+       
+               //open output file in append mode
+               openOutputFileAppend(filename, output);
+               openInputFile(temp, input);
+               
+               while(char c = input.get()){
+                       if(input.eof())         {       break;                  }
+                       else                            {       output << c;    }
+               }
+               
+               input.close();
+               output.close();
+       }
+       catch(exception& e) {
+               errorOut(e, "mothur", "appendFiles");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+inline string sortFile(string distFile){
+       try {   
+               string outfile = getRootName(distFile) + "sorted.dist";
+               
+               //if you can, use the unix sort since its been optimized for years
+               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+                       string command = "sort -n -k +3 " + distFile + " -o " + outfile;
+                       system(command.c_str());
+               #else //you are stuck with my best attempt...
+                       //windows sort does not have a way to specify a column, only a character in the line
+                       //since we cannot assume that the distance will always be at the the same character location on each line
+                       //due to variable sequence name lengths, I chose to force the distance into first position, then sort and then put it back.
+               
+                       //read in file line by file and put distance first
+                       string tempDistFile = distFile + ".temp";
+                       ifstream input;
+                       ofstream output;
+                       openInputFile(distFile, input);
+                       openOutputFile(tempDistFile, output);
+
+                       string firstName, secondName;
+                       float dist;
+                       while (input) {
+                               input >> firstName >> secondName >> dist;
+                               output << dist << '\t' << firstName << '\t' << secondName << endl;
+                               gobble(input);
+                       }
+                       input.close();
+                       output.close();
+               
+       
+                       //sort using windows sort
+                       string tempOutfile = outfile + ".temp";
+                       string command = "sort " + tempDistFile + " /O " + tempOutfile;
+                       system(command.c_str());
+               
+                       //read in sorted file and put distance at end again
+                       ifstream input2;
+                       openInputFile(tempOutfile, input2);
+                       openOutputFile(outfile, output);
+               
+                       while (input2) {
+                               input2 >> dist >> firstName >> secondName;
+                               output << firstName << '\t' << secondName << '\t' << dist << endl;
+                               gobble(input2);
+                       }
+                       input2.close();
+                       output.close();
+               
+                       //remove temp files
+                       remove(tempDistFile.c_str());
+                       remove(tempOutfile.c_str());
+               #endif
+               
+               return outfile;
+       }
+       catch(exception& e) {
+               errorOut(e, "mothur", "sortFile");
+               exit(1);
+       }
+}
  /**************************************************************************************************/
  #endif
  
diff --git a/preclustercommand.cpp b/preclustercommand.cpp

new file mode 100644 (file)

index 0000000..479a862
--- /dev/null
+++ b/preclustercommand.cpp
@@ -0,0 +1,269 @@
+/*
+ *  preclustercommand.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 12/21/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "preclustercommand.h"
+
+//**********************************************************************************************************************
+inline bool comparePriority(seqPNode first, seqPNode second) {  return (first.numIdentical > second.numIdentical); }
+//**********************************************************************************************************************
+
+PreClusterCommand::PreClusterCommand(string option){
+       try {
+               abort = false;
+               
+               //allow user to run help
+               if(option == "help") { help(); abort = true; }
+               
+               else {
+                       //valid paramters for this command
+                       string Array[] =  {"fasta", "name", "diffs"};
+                       vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
+                       
+                       OptionParser parser(option);
+                       map<string, string> parameters = parser.getParameters();
+                       
+                       ValidParameters validParameter;
+               
+                       //check to make sure all parameters are valid for command
+                       for (map<string, string>::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
+                               if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
+                       }
+                       
+                       //check for required parameters
+                       fastafile = validParameter.validFile(parameters, "fasta", true);
+                       if (fastafile == "not found") { mothurOut("fasta is a required parameter for the pre.cluster command."); mothurOutEndLine(); abort = true; }
+                       else if (fastafile == "not open") { abort = true; }     
+                       
+                       //check for optional parameter and set defaults
+                       // ...at some point should added some additional type checking...
+                       namefile = validParameter.validFile(parameters, "name", true);
+                       if (namefile == "not found") { namefile =  "";  }
+                       else if (namefile == "not open") { abort = true; }      
+                       else {  readNameFile();  }
+                       
+                       string temp     = validParameter.validFile(parameters, "diffs", false);                         if(temp == "not found"){        temp = "1"; }
+                       convert(temp, diffs); 
+               }
+                               
+       }
+       catch(exception& e) {
+               errorOut(e, "PreClusterCommand", "PreClusterCommand");
+               exit(1);
+       }
+}
+
+//**********************************************************************************************************************
+PreClusterCommand::~PreClusterCommand(){}      
+//**********************************************************************************************************************
+
+void PreClusterCommand::help(){
+       try {
+               mothurOut("The pre.cluster command groups sequences that are within a given number of base mismatches.\n");
+               mothurOut("The pre.cluster command outputs a new fasta and name file.\n");
+               mothurOut("The pre.cluster command parameters are fasta, names and diffs. The fasta parameter is required. \n");
+               mothurOut("The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n");
+               mothurOut("The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n");
+               mothurOut("The pre.cluster command should be in the following format: \n");
+               mothurOut("pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n");
+               mothurOut("Example pre.cluster(fasta=amazon.fasta, diffs=2).\n");
+               mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
+       }
+       catch(exception& e) {
+               errorOut(e, "PreClusterCommand", "help");
+               exit(1);
+       }
+}
+//**********************************************************************************************************************
+
+int PreClusterCommand::execute(){
+       try {
+               
+               if (abort == true) { return 0; }
+               
+               //reads fasta file and return number of seqs
+               int numSeqs = readSeqs(); //fills alignSeqs and makes all seqs active
+       
+               if (numSeqs == 0) { mothurOut("Error reading fasta file...please correct."); mothurOutEndLine(); return 0;  }
+               if (diffs > length) { mothurOut("Error: diffs is greater than your sequence length."); mothurOutEndLine(); return 0;  }
+               
+               //clear sizes since you only needed this info to build the alignSeqs seqPNode structs
+               sizes.clear();
+       
+               //sort seqs by number of identical seqs
+               sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
+       
+               //go through active list and cluster everthing you can, until all nodes are inactive
+               //taking advantage of the fact that maps are already sorted
+               map<string, bool>::iterator itActive;
+               map<string, bool>::iterator it2Active;
+               int count = 0;
+               
+               for (int i = 0; i < alignSeqs.size(); i++) {
+               
+                       //are you active
+                       itActive = active.find(alignSeqs[i].seq.getName());
+                       
+                       if (itActive != active.end()) {  //this sequence has not been merged yet
+                       
+                               //try to merge it with all smaller seqs
+                               for (int j = i; j < alignSeqs.size(); j++) {
+                                       
+                                       if (i != j) {
+                                               //are you active
+                                               it2Active = active.find(alignSeqs[j].seq.getName());
+                                               if (it2Active != active.end()) {  //this sequence has not been merged yet
+                                                       //are you within "diff" bases
+                                                       int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
+                                                       
+                                                       if (mismatch <= diffs) {
+                                                               //merge
+                                                               names[alignSeqs[i].seq.getName()] += "," + names[alignSeqs[j].seq.getName()];
+                                                       
+                                                               //remove from active list
+                                                               active.erase(it2Active);
+                                                               
+                                                               //set numIdentical to 0, so you only print the representative seqs in the fasta file
+                                                               alignSeqs[j].numIdentical = 0;
+                                                               count++;
+                                                       }
+                                               }//end if j active
+                                       }//end if i != j
+                               }//end for loop
+                               
+                               //remove from active list 
+                               active.erase(itActive);
+                       }//end if active i
+               }
+       
+               string newFastaFile = getRootName(fastafile) + "precluster" + getExtension(fastafile);
+               string newNamesFile = getRootName(fastafile) + "precluster.names";
+               
+               printData(newFastaFile, newNamesFile);
+               
+               mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); mothurOutEndLine();
+               mothurOut("pre.cluster removed " + toString(count) + " sequences."); mothurOutEndLine(); 
+               
+               return 0;
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "PreClusterCommand", "execute");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+int PreClusterCommand::readSeqs(){
+       try {
+               ifstream inFasta;
+               openInputFile(fastafile, inFasta);
+               length = 0;
+               map<string, string>::iterator it;
+                               
+               while (!inFasta.eof()) {
+                       Sequence temp(inFasta);  //read seq
+                       
+                       if (temp.getName() != "") {  
+                               if (namefile != "") {
+                                       //make sure fasta and name files match
+                                       it = names.find(temp.getName());
+                                       if (it == names.end()) { mothurOut(temp.getName() + " is not in your names file, please correct."); mothurOutEndLine(); exit(1); }
+                               }else { sizes[temp.getName()] = 1; }
+                               
+                               seqPNode tempNode(sizes[temp.getName()], temp);
+                               alignSeqs.push_back(tempNode); 
+                               active[temp.getName()] = true;
+                       }
+                       gobble(inFasta);
+               }
+               inFasta.close();
+               
+               if (alignSeqs.size() != 0) {  length = alignSeqs[0].seq.getAligned().length();  }
+               
+               return alignSeqs.size();
+       }
+       catch(exception& e) {
+               errorOut(e, "PreClusterCommand", "readSeqs");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+void PreClusterCommand::readNameFile(){
+       try {
+               ifstream in;
+               openInputFile(namefile, in);
+               string firstCol, secondCol;
+                               
+               while (!in.eof()) {
+                       in >> firstCol >> secondCol; gobble(in);
+                       names[firstCol] = secondCol;
+                       int size = 1;
+                       while (secondCol.find_first_of(',') != -1) { 
+                               size++;
+                               secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length());
+                       }
+                       sizes[firstCol] = size;
+               }
+               in.close();
+       }
+       catch(exception& e) {
+               errorOut(e, "PreClusterCommand", "readNameFile");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+int PreClusterCommand::calcMisMatches(string seq1, string seq2){
+       try {
+               int numBad = 0;
+               
+               for (int i = 0; i < seq1.length(); i++) {
+                       //do they match
+                       if (seq1[i] != seq2[i]) { numBad++; }
+                       if (numBad > diffs) { return length;  } //to far to cluster
+               }
+               
+               return numBad;
+       }
+       catch(exception& e) {
+               errorOut(e, "PreClusterCommand", "calcMisMatches");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+void PreClusterCommand::printData(string newfasta, string newname){
+       try {
+               ofstream outFasta;
+               ofstream outNames;
+               openOutputFile(newfasta, outFasta);
+               openOutputFile(newname, outNames);
+                               
+               map<string, string>::iterator itNames;
+               
+               for (int i = 0; i < alignSeqs.size(); i++) {
+                       if (alignSeqs[i].numIdentical != 0) {
+                               alignSeqs[i].seq.printSequence(outFasta); 
+                               
+                               itNames = names.find(alignSeqs[i].seq.getName());
+                               
+                               outNames << itNames->first << '\t' << itNames->second << endl;
+                       }
+               }
+               
+               outFasta.close();
+               outNames.close();
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "PreClusterCommand", "printData");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+
diff --git a/preclustercommand.h b/preclustercommand.h

new file mode 100644 (file)

index 0000000..bb666cd
--- /dev/null
+++ b/preclustercommand.h
@@ -0,0 +1,59 @@
+#ifndef PRECLUSTERCOMMAND_H
+#define PRECLUSTERCOMMAND_H
+
+
+/*
+ *  preclustercommand.h
+ *  Mothur
+ *
+ *  Created by westcott on 12/21/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+#include "command.hpp"
+#include "sequence.hpp"
+
+/************************************************************/
+struct seqPNode {
+       int numIdentical;
+       Sequence seq;
+       seqPNode() {}
+       seqPNode(int s, Sequence q) : numIdentical(s), seq(q) {}
+       ~seqPNode() {}
+};
+/************************************************************/
+
+class PreClusterCommand : public Command {
+       
+public:
+       PreClusterCommand(string);      
+       ~PreClusterCommand();
+       int execute();  
+       void help();
+       
+private:
+       int diffs, length;
+       bool abort;
+       string fastafile, namefile;
+       vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
+       map<string, string> names; //represents the names file first column maps to second column
+       map<string, int> sizes;  //this map a seq name to the number of identical seqs in the names file
+       map<string, bool> active; //maps sequence name to whether it has already been merged or not.
+       
+       int readSeqs();
+       int calcMisMatches(string, string);
+       void readNameFile();
+       void printData(string, string); //fasta filename, names file name
+};
+
+/************************************************************/
+
+
+
+
+
+#endif
+
+
diff --git a/readblast.cpp b/readblast.cpp

index 7a092f96cc8b92cd245decd37d388034737a73ed..a6d23e95bfddf71afb20a2d4c2b2d8d1377db71d 100644 (file)
--- a/readblast.cpp
+++ b/readblast.cpp
@@ -12,7 +12,7 @@
  
  //********************************************************************************************************************
  //sorts lowest to highest
-inline bool compareOverlap(DistNode left, DistNode right){
+inline bool compareOverlap(seqDist left, seqDist right){
         return (left.dist < right.dist);        
  } 
  /*********************************************************************************************/
@@ -91,7 +91,7 @@ void ReadBlast::read(NameAssignment* nameMap) {
                                 //if there is a valid overlap, add it
                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
                                         if (!hclusterWanted) {
-                                               DistNode overlapValue(itA->second, itB->second, thisoverlap);
+                                               seqDist overlapValue(itA->second, itB->second, thisoverlap);
                                                 overlap.push_back(overlapValue);
                                         }else {
                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
@@ -137,7 +137,7 @@ void ReadBlast::read(NameAssignment* nameMap) {
                                                 //if there is a valid overlap, add it
                                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
                                                         if (!hclusterWanted) {
-                                                               DistNode overlapValue(itA->second, itB->second, thisoverlap);
+                                                               seqDist overlapValue(itA->second, itB->second, thisoverlap);
                                                                 //cout << "overlap = " << itA->second << '\t' << itB->second << '\t' << thisoverlap << endl;
                                                                 overlap.push_back(overlapValue);
                                                         }else {
@@ -201,7 +201,7 @@ void ReadBlast::read(NameAssignment* nameMap) {
                                                 //if there is a valid overlap, add it
                                                 if ((startRef <= length) && ((endQuery+length) >= lengthThisSeq) && (thisoverlap < cutoff)) {
                                                         if (!hclusterWanted) {
-                                                               DistNode overlapValue(itA->second, itB->second, thisoverlap);
+                                                               seqDist overlapValue(itA->second, itB->second, thisoverlap);
                                                                 overlap.push_back(overlapValue);
                                                         }else {
                                                                 outOverlap << itA->first << '\t' << itB->first << '\t' << thisoverlap << endl;
diff --git a/readblast.h b/readblast.h

index 0b47b5942d7126cec59ce0923a66c1bc4c6ff4e1..128c144fc3d29109453331c059b478c18f7a6bfe 100644 (file)
--- a/readblast.h
+++ b/readblast.h
@@ -13,15 +13,6 @@
  #include "sparsematrix.hpp"
  #include "nameassignment.hpp"
  
-/****************************************************************************************/
-struct DistNode {
-       int seq1;
-       int seq2;
-       float dist;
-       DistNode(int s1, int s2, float d) : seq1(s1), seq2(s2), dist(d) {}
-       DistNode() {}
-       ~DistNode() {}
-};
  /****************************************************************************************/
  
  //Note: this class creates a sparsematrix and list if the read is executed, but does not delete them on deconstruction.
@@ -36,7 +27,7 @@ public:
         
         void read(NameAssignment*);
         SparseMatrix* getDistMatrix()           {       return matrix;          }
-       vector<DistNode> getOverlapMatrix()     {       return overlap;         }
+       vector<seqDist> getOverlapMatrix()      {       return overlap;         }
         string getOverlapFile()                         {       return overlapFile;     }
         string getDistFile()                            {       return distFile;        }
         
@@ -48,7 +39,7 @@ private:
         bool hclusterWanted;
         
         SparseMatrix* matrix;
-       vector<DistNode> overlap;
+       vector<seqDist> overlap;
         
         void readNames(NameAssignment*);
  };
diff --git a/readcluster.cpp b/readcluster.cpp

index 233b8e13094cf560ef295522a7cd6a8d552dd55a..11b8d012a22e592d99bb71e3063c0c89a710252a 100644 (file)
--- a/readcluster.cpp
+++ b/readcluster.cpp
@@ -25,7 +25,7 @@ void ReadCluster::read(NameAssignment* nameMap){
                 if (format == "phylip") { convertPhylip2Column(nameMap); }
                 else { list = new ListVector(nameMap->getListVector());  }
                 
-               createHClusterFile();
+               OutPutFile = sortFile(distFile);
                         
         }
         catch(exception& e) {
@@ -35,71 +35,6 @@ void ReadCluster::read(NameAssignment* nameMap){
  }
  /***********************************************************************/
  
-void ReadCluster::createHClusterFile(){
-       try {   
-               string outfile = getRootName(distFile) + "sorted.dist";
-               
-               //if you can, use the unix sort since its been optimized for years
-               #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
-                       string command = "sort -n -k +3 " + distFile + " -o " + outfile;
-                       system(command.c_str());
-               #else //you are stuck with my best attempt...
-                       //windows sort does not have a way to specify a column, only a character in the line
-                       //since we cannot assume that the distance will always be at the the same character location on each line
-                       //due to variable sequence name lengths, I chose to force the distance into first position, then sort and then put it back.
-               
-                       //read in file line by file and put distance first
-                       string tempDistFile = distFile + ".temp";
-                       ifstream input;
-                       ofstream output;
-                       openInputFile(distFile, input);
-                       openOutputFile(tempDistFile, output);
-
-                       string firstName, secondName;
-                       float dist;
-                       while (input) {
-                               input >> firstName >> secondName >> dist;
-                               output << dist << '\t' << firstName << '\t' << secondName << endl;
-                               gobble(input);
-                       }
-                       input.close();
-                       output.close();
-               
-       
-                       //sort using windows sort
-                       string tempOutfile = outfile + ".temp";
-                       string command = "sort " + tempDistFile + " /O " + tempOutfile;
-                       system(command.c_str());
-               
-                       //read in sorted file and put distance at end again
-                       ifstream input2;
-                       openInputFile(tempOutfile, input2);
-                       openOutputFile(outfile, output);
-               
-                       while (input2) {
-                               input2 >> dist >> firstName >> secondName;
-                               output << firstName << '\t' << secondName << '\t' << dist << endl;
-                               gobble(input2);
-                       }
-                       input2.close();
-                       output.close();
-               
-                       //remove temp files
-                       remove(tempDistFile.c_str());
-                       remove(tempOutfile.c_str());
-               #endif
-               
-               OutPutFile = outfile;
-       }
-       catch(exception& e) {
-               errorOut(e, "ReadCluster", "createHClusterFile");
-               exit(1);
-       }
-}
-
-
-/***********************************************************************/
-
  void ReadCluster::convertPhylip2Column(NameAssignment* nameMap){
         try {   
                 //convert phylip file to column file
diff --git a/readcluster.h b/readcluster.h

index 9e62e92aea46e04120c0992e12414d70b6207bba..bcd910cb1c032e06df74aae9acfd55e7b3be2142 100644 (file)
--- a/readcluster.h
+++ b/readcluster.h
@@ -27,7 +27,6 @@ public:
         string getOutputFile() { return OutPutFile; }
         void setFormat(string f) { format = f;  }
         ListVector* getListVector()             {       return list;    }
-       //NameAssignment* getNameMap()  {       return nameMap; }
         
  private:
         GlobalData* globaldata;
@@ -35,9 +34,7 @@ private:
         string OutPutFile, format;
         ListVector* list;
         float cutoff;
-       //NameAssignment* nameMap;
         
-       void createHClusterFile();
         void convertPhylip2Column(NameAssignment*);
  };
author	westcott <westcott>
	Wed, 23 Dec 2009 18:47:52 +0000 (18:47 +0000)
committer	westcott <westcott>
	Wed, 23 Dec 2009 18:47:52 +0000 (18:47 +0000)
Mothur.xcodeproj/project.pbxproj		patch \| blob \| history
commandfactory.cpp		patch \| blob \| history
distancecommand.cpp		patch \| blob \| history
distancecommand.h		patch \| blob \| history
hcluster.cpp		patch \| blob \| history
hcluster.h		patch \| blob \| history
hclustercommand.cpp		patch \| blob \| history
hclustercommand.h		patch \| blob \| history
mgclustercommand.cpp		patch \| blob \| history
mgclustercommand.h		patch \| blob \| history
mothur.h		patch \| blob \| history
preclustercommand.cpp	[new file with mode: 0644]	patch \| blob
preclustercommand.h	[new file with mode: 0644]	patch \| blob
readblast.cpp		patch \| blob \| history
readblast.h		patch \| blob \| history
readcluster.cpp		patch \| blob \| history
readcluster.h		patch \| blob \| history