]> git.donarmstrong.com Git - mothur.git/blobdiff - mgclustercommand.cpp
created mothurOut class to handle logfiles
[mothur.git] / mgclustercommand.cpp
index d5b84a8505bd554dca154c2c4e90db10a401c352..bf0b82b3973edce4353325d3364010bdadd20c54 100644 (file)
@@ -8,10 +8,9 @@
  */
 
 #include "mgclustercommand.h"
-#include "readcluster.h"
 
 //**********************************************************************************************************************
-MGClusterCommand::MGClusterCommand(string option){
+MGClusterCommand::MGClusterCommand(string option) {
        try {
                globaldata = GlobalData::getInstance();
                abort = false;
@@ -21,29 +20,59 @@ MGClusterCommand::MGClusterCommand(string option){
                
                else {
                        //valid paramters for this command
-                       string Array[] =  {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge"};
+                       string Array[] =  {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
                        vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
                        
                        OptionParser parser(option);
                        map<string, string> parameters = parser.getParameters();
                        
                        ValidParameters validParameter;
+                       map<string,string>::iterator it;
                
                        //check to make sure all parameters are valid for command
-                       for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
+                       for (it = parameters.begin(); it != parameters.end(); it++) { 
                                if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
                        }
                        
+                       //if the user changes the input directory command factory will send this info to us in the output parameter 
+                       string inputDir = validParameter.validFile(parameters, "inputdir", false);              
+                       if (inputDir == "not found"){   inputDir = "";          }
+                       else {
+                               string path;
+                               it = parameters.find("blast");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["blast"] = inputDir + it->second;            }
+                               }
+                               
+                               it = parameters.find("name");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["name"] = inputDir + it->second;             }
+                               }
+                       }
+
+                       
                        //check for required parameters
                        blastfile = validParameter.validFile(parameters, "blast", true);
                        if (blastfile == "not open") { abort = true; }  
                        else if (blastfile == "not found") { blastfile = ""; }
                        
+                       //if the user changes the output directory command factory will send this info to us in the output parameter 
+                       outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
+                               outputDir = ""; 
+                               outputDir += hasPath(blastfile); //if user entered a file with a path then preserve it  
+                       }
+                       
                        namefile = validParameter.validFile(parameters, "name", true);
                        if (namefile == "not open") { abort = true; }   
                        else if (namefile == "not found") { namefile = ""; }
                        
-                       if ((blastfile == "")) { mothurOut("When executing a mgcluster command you must provide a blastfile."); mothurOutEndLine(); abort = true; }
+                       if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
                        
                        //check for optional parameter and set defaults
                        string temp;
@@ -59,7 +88,7 @@ MGClusterCommand::MGClusterCommand(string option){
                        if (method == "not found") { method = "furthest"; }
                        
                        if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
-                       else { mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
+                       else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
 
                        temp = validParameter.validFile(parameters, "length", false);                   if (temp == "not found") { temp = "5"; }
                        convert(temp, length); 
@@ -79,7 +108,7 @@ MGClusterCommand::MGClusterCommand(string option){
 
        }
        catch(exception& e) {
-               errorOut(e, "MGClusterCommand", "MGClusterCommand");
+               m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
                exit(1);
        }
 }
@@ -87,23 +116,23 @@ MGClusterCommand::MGClusterCommand(string option){
 
 void MGClusterCommand::help(){
        try {
-               mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
-               mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
-               mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
-               mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
-               mothurOut("The precision parameter's default value is 100. \n");
-               mothurOut("The acceptable mgcluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");       
-               mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
-               mothurOut("The length parameter is used to specify the minimum overlap required.  The default is 5.\n");
-               mothurOut("The penalty parameter is used to adjust the error rate.  The default is 0.10.\n");
-               mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster.  By default merge is true, meaning you want to merge.\n");
-               mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering.  This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
-               mothurOut("The mgcluster command should be in the following format: \n");
-               mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
-               mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
+               m->mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
+               m->mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
+               m->mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
+               m->mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
+               m->mothurOut("The precision parameter's default value is 100. \n");
+               m->mothurOut("The acceptable mgcluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");    
+               m->mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
+               m->mothurOut("The length parameter is used to specify the minimum overlap required.  The default is 5.\n");
+               m->mothurOut("The penalty parameter is used to adjust the error rate.  The default is 0.10.\n");
+               m->mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster.  By default merge is true, meaning you want to merge.\n");
+               m->mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering.  This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
+               m->mothurOut("The mgcluster command should be in the following format: \n");
+               m->mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
+               m->mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
        }
        catch(exception& e) {
-               errorOut(e, "MGClusterCommand", "help");
+               m->errorOut(e, "MGClusterCommand", "help");
                exit(1);
        }
 }
@@ -121,7 +150,7 @@ int MGClusterCommand::execute(){
                        nameMap->readMap();
                }else{ nameMap= new NameAssignment(); }
                
-               string fileroot = getRootName(blastfile);
+               string fileroot = outputDir + getRootName(getSimpleName(blastfile));
                string tag = "";
                time_t start;
                float previousDist = 0.00000;
@@ -138,6 +167,15 @@ int MGClusterCommand::execute(){
                start = time(NULL);
                oldList = *list;
                
+               if (method == "furthest")               { tag = "fn";  }
+               else if (method == "nearest")   { tag = "nn";  }
+               else                                                    { tag = "an";  }
+               
+               //open output files
+               openOutputFile(fileroot+ tag + ".list",  listFile);
+               openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
+               openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
+               
                if (!hclusterWanted) {
                        //get distmatrix and overlap
                        SparseMatrix* distMatrix = read->getDistMatrix();
@@ -145,21 +183,15 @@ int MGClusterCommand::execute(){
                        delete read;
                
                        //create cluster
-                       if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix); }
-                       else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix); }
-                       else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix); }
-                       tag = cluster->getTag();
+                       if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
+                       else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
+                       else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
                        cluster->setMapWanted(true);
                        
-                       //open output files
-                       openOutputFile(fileroot+ tag + ".list",  listFile);
-                       openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
-                       openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
-                       
                        //cluster using cluster classes
                        while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
                                
-                               cluster->update();
+                               cluster->update(cutoff);
                                float dist = distMatrix->getSmallDist();
                                float rndDist = roundDist(dist, precision);
                                
@@ -208,7 +240,6 @@ int MGClusterCommand::execute(){
                        delete cluster;
                        
                }else { //use hcluster to cluster
-                       tag = "fn";
                        //get distmatrix and overlap
                        overlapFile = read->getOverlapFile();
                        distFile = read->getDistFile(); 
@@ -218,24 +249,16 @@ int MGClusterCommand::execute(){
                        sortHclusterFiles(distFile, overlapFile);
                
                        //create cluster
-                       hcluster = new HCluster(rabund, list);
+                       hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
                        hcluster->setMapWanted(true);
                        
-                       //open output files
-                       openOutputFile(fileroot+ tag + ".list",  listFile);
-                       openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
-                       openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
-                       
-                       vector<DistNode> seqs; seqs.resize(1); // to start loop
-                       exitedBreak = false;  //lets you know if there is a distance stored in next
-                       
-                       ifstream inHcluster;
-                       openInputFile(distFile, inHcluster);
+                       vector<seqDist> seqs; seqs.resize(1); // to start loop
+                       //ifstream inHcluster;
+                       //openInputFile(distFile, inHcluster);
 
                        while (seqs.size() != 0){
                
-                               seqs = getSeqs(inHcluster);
-                               random_shuffle(seqs.begin(), seqs.end());
+                               seqs = hcluster->getSeqs();
                                
                                for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
                                        
@@ -268,7 +291,7 @@ int MGClusterCommand::execute(){
                                        }
                                }
                        }
-                       inHcluster.close();
+                       //inHcluster.close();
                        
                        if(previousDist <= 0.0000){
                                oldList.setLabel("unique");
@@ -300,13 +323,20 @@ int MGClusterCommand::execute(){
        
                globaldata->setListFile(fileroot+ tag + ".list");
                globaldata->setFormat("list");
-                       
-               mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); mothurOutEndLine();
+               
+               m->mothurOutEndLine();
+               m->mothurOut("Output File Names: "); m->mothurOutEndLine();
+               m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine();   
+               m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine(); 
+               m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine(); 
+               m->mothurOutEndLine();
+               
+               m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
                        
                return 0;
        }
        catch(exception& e) {
-               errorOut(e, "MGClusterCommand", "execute");
+               m->errorOut(e, "MGClusterCommand", "execute");
                exit(1);
        }
 }
@@ -322,7 +352,7 @@ void MGClusterCommand::printData(ListVector* mergedList){
                sabund.print(sabundFile);
        }
        catch(exception& e) {
-               errorOut(e, "MGClusterCommand", "printData");
+               m->errorOut(e, "MGClusterCommand", "printData");
                exit(1);
        }
 }
@@ -345,7 +375,7 @@ ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
                while (!done) {
                        
                        //get next overlap
-                       DistNode overlapNode;
+                       seqDist overlapNode;
                        if (!hclusterWanted) {  
                                if (count < overlapMatrix.size()) { //do we have another node in the matrix
                                        overlapNode = overlapMatrix[count];
@@ -407,7 +437,7 @@ ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
                                
        }
        catch(exception& e) {
-               errorOut(e, "MGClusterCommand", "mergeOPFs");
+               m->errorOut(e, "MGClusterCommand", "mergeOPFs");
                exit(1);
        }
 }
@@ -415,89 +445,21 @@ ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
        try {
                //sort distFile
-               ReadCluster* readCluster = new ReadCluster(unsortedDist, cutoff);       
-               readCluster->setFormat("column");
-               readCluster->read(nameMap);
-               string sortedDistFile = readCluster->getOutputFile();
-               ListVector* extralist = readCluster->getListVector();  //we get this so we can delete it and not have a memory leak
-               
+               string sortedDistFile = sortFile(unsortedDist);
                remove(unsortedDist.c_str());  //delete unsorted file
                distFile = sortedDistFile;
-               delete extralist;
-               delete readCluster;
                
                //sort overlap file
-               readCluster = new ReadCluster(unsortedOverlap, cutoff);         
-               readCluster->setFormat("column");
-               readCluster->read(nameMap);
-               string sortedOverlapFile = readCluster->getOutputFile();
-               extralist = readCluster->getListVector();  //we get this so we can delete it and not have a memory leak
-               
+               string sortedOverlapFile = sortFile(unsortedOverlap);
                remove(unsortedOverlap.c_str());  //delete unsorted file
                overlapFile = sortedOverlapFile;
-               delete extralist;
-               delete readCluster;
        }
        catch(exception& e) {
-               errorOut(e, "MGClusterCommand", "sortHclusterFiles");
-               exit(1);
-       }
-}
-//**********************************************************************************************************************
-vector<DistNode> MGClusterCommand::getSeqs(ifstream& filehandle){
-       try {
-               string firstName, secondName;
-               float distance, prevDistance;
-               vector<DistNode> sameSeqs;
-               prevDistance = -1;
-       
-               //if you are not at the beginning of the file
-               if (exitedBreak) { 
-                       sameSeqs.push_back(next);
-                       prevDistance = next.dist;
-                       exitedBreak = false;
-               }
-       
-               //get entry
-               while (!filehandle.eof()) {
-                       
-                       filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
-       
-                       //save first one
-                       if (prevDistance == -1) { prevDistance = distance; }
-                       
-                       map<string,int>::iterator itA = nameMap->find(firstName);
-                       map<string,int>::iterator itB = nameMap->find(secondName);
-                       if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
-                       if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
-               
-                       //using cutoff
-                       if (distance > cutoff) { break; }
-               
-                       if (distance != -1) { //-1 means skip me
-                       
-                               //are the distances the same
-                               if (distance == prevDistance) { //save in vector
-                                       DistNode temp(itA->second, itB->second, distance);
-                                       sameSeqs.push_back(temp);
-                                       exitedBreak = false;
-                               }else{ 
-                                       next.seq1 = itA->second;
-                                       next.seq2 = itB->second;
-                                       next.dist = distance;
-                                       exitedBreak = true;
-                                       break;
-                               }
-                       }
-               }
-               
-               return sameSeqs;
-       }
-       catch(exception& e) {
-               errorOut(e, "MGClusterCommand", "getSeqs");
+               m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
                exit(1);
        }
 }
+
 //**********************************************************************************************************************