X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;ds=sidebyside;f=hclustercommand.cpp;h=b4d601751630af81741d64ab605c95d5650838a5;hb=bbdb72971ea4cd171abe34985af89f97bc2a31d3;hp=38559500b0a81f25716b983b2274b88332e3ff56;hpb=ef3f6d42fe720cd6d91419e5e32f8c04d8765010;p=mothur.git diff --git a/hclustercommand.cpp b/hclustercommand.cpp index 3855950..b4d6017 100644 --- a/hclustercommand.cpp +++ b/hclustercommand.cpp @@ -11,7 +11,7 @@ //********************************************************************************************************************** //This function checks to make sure the cluster command has no errors and then clusters based on the method chosen. -HClusterCommand::HClusterCommand(string option){ +HClusterCommand::HClusterCommand(string option) { try{ globaldata = GlobalData::getInstance(); abort = false; @@ -21,16 +21,17 @@ HClusterCommand::HClusterCommand(string option){ else { //valid paramters for this command - string Array[] = {"cutoff","precision","method","showabund","timing","phylip","column","name","sorted"}; + string Array[] = {"cutoff","hard","precision","method","phylip","column","name","sorted","showabund","timing","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); map parameters = parser.getParameters(); ValidParameters validParameter; + map::iterator it; //check to make sure all parameters are valid for command - for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + for (it = parameters.begin(); it != parameters.end(); it++) { if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } @@ -38,6 +39,39 @@ HClusterCommand::HClusterCommand(string option){ globaldata->newRead(); + //if the user changes the input directory command factory will send this info to us in the output parameter + string inputDir = validParameter.validFile(parameters, "inputdir", false); + if (inputDir == "not found"){ inputDir = ""; } + else { + string path; + it = parameters.find("phylip"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["phylip"] = inputDir + it->second; } + } + + it = parameters.find("column"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["column"] = inputDir + it->second; } + } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + } + + //if the user changes the output directory command factory will send this info to us in the output parameter + outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } + //check for required parameters phylipfile = validParameter.validFile(parameters, "phylip", true); if (phylipfile == "not open") { abort = true; } @@ -53,8 +87,8 @@ HClusterCommand::HClusterCommand(string option){ if (namefile == "not open") { abort = true; } else if (namefile == "not found") { namefile = ""; } - if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a hcluster command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; } - else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a hcluster command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; } + if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a hcluster command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; } + else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a hcluster command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; } if (columnfile != "") { if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; } @@ -70,16 +104,19 @@ HClusterCommand::HClusterCommand(string option){ length = temp.length(); convert(temp, precision); + temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; } + hard = isTrue(temp); + temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10"; } convert(temp, cutoff); - cutoff += (5 / (precision * 10.0)); + cutoff += (5 / (precision * 10.0)); method = validParameter.validFile(parameters, "method", false); - if (method == "not found") { method = "nearest"; } + if (method == "not found") { method = "furthest"; } if ((method == "furthest") || (method == "nearest") || (method == "average")) { } - else { mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; } + else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; } showabund = validParameter.validFile(parameters, "showabund", false); if (showabund == "not found") { showabund = "T"; } @@ -93,19 +130,26 @@ HClusterCommand::HClusterCommand(string option){ if (abort == false) { - - fileroot = getRootName(distfile); - tag = "fn"; //until we figure out average and nearest methods + if (outputDir == "") { outputDir += hasPath(distfile); } + fileroot = outputDir + getRootName(getSimpleName(distfile)); + + if (method == "furthest") { tag = "fn"; } + else if (method == "nearest") { tag = "nn"; } + else { tag = "an"; } openOutputFile(fileroot+ tag + ".sabund", sabundFile); openOutputFile(fileroot+ tag + ".rabund", rabundFile); openOutputFile(fileroot+ tag + ".list", listFile); + + outputNames.push_back(fileroot+ tag + ".sabund"); + outputNames.push_back(fileroot+ tag + ".rabund"); + outputNames.push_back(fileroot+ tag + ".list"); } } } catch(exception& e) { - errorOut(e, "HClusterCommand", "HClusterCommand"); + m->errorOut(e, "HClusterCommand", "HClusterCommand"); exit(1); } } @@ -114,15 +158,15 @@ HClusterCommand::HClusterCommand(string option){ void HClusterCommand::help(){ try { - mothurOut("The hcluster command parameter options are cutoff, precision, method, showabund, timing, phylip, column, name and sorted. Phylip or column and name are required.\n"); - mothurOut("The phylip and column parameter allow you to enter your distance file, and sorted indicates whether your column distance file is already sorted. \n"); - mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n"); - mothurOut("The hcluster command should be in the following format: \n"); - mothurOut("hcluster(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n"); - mothurOut("The acceptable hcluster methods is furthest, but we hope to add nearest and average in the future.\n\n"); + m->mothurOut("The hcluster command parameter options are cutoff, precision, method, phylip, column, name, showabund, timing and sorted. Phylip or column and name are required.\n"); + m->mothurOut("The phylip and column parameter allow you to enter your distance file, and sorted indicates whether your column distance file is already sorted. \n"); + m->mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n"); + m->mothurOut("The hcluster command should be in the following format: \n"); + m->mothurOut("hcluster(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n"); + m->mothurOut("The acceptable hcluster methods are furthest, nearest and average.\n\n"); } catch(exception& e) { - errorOut(e, "HClusterCommand", "help"); + m->errorOut(e, "HClusterCommand", "help"); exit(1); } } @@ -148,9 +192,19 @@ int HClusterCommand::execute(){ time_t estart = time(NULL); if (!sorted) { - read = new ReadCluster(distfile, cutoff); + read = new ReadCluster(distfile, cutoff, outputDir, true); read->setFormat(format); read->read(globaldata->nameMap); + + if (m->control_pressed) { + delete read; + sabundFile.close(); + rabundFile.close(); + listFile.close(); + for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + return 0; + } + distfile = read->getOutputFile(); list = read->getListVector(); @@ -158,15 +212,23 @@ int HClusterCommand::execute(){ }else { list = new ListVector(globaldata->nameMap->getListVector()); } - - mothurOut("It took " + toString(time(NULL) - estart) + " seconds to sort. "); mothurOutEndLine(); - estart = time(NULL); + if (m->control_pressed) { + sabundFile.close(); + rabundFile.close(); + listFile.close(); + for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + return 0; + } + + m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to sort. "); m->mothurOutEndLine(); + estart = time(NULL); + //list vector made by read contains all sequence names if(list != NULL){ rabund = new RAbundVector(list->getRAbundVector()); }else{ - mothurOut("Error: no list vector!"); mothurOutEndLine(); return 0; + m->mothurOut("Error: no list vector!"); m->mothurOutEndLine(); return 0; } float previousDist = 0.00000; @@ -176,42 +238,55 @@ int HClusterCommand::execute(){ print_start = true; start = time(NULL); - -//cout << "here" << endl; - ifstream in; - openInputFile(distfile, in); - string firstName, secondName; - float distance; - - cluster = new HCluster(rabund, list); - bool clusteredSomething; + + cluster = new HCluster(rabund, list, method, distfile, globaldata->nameMap, cutoff); vector seqs; seqs.resize(1); // to start loop - exitedBreak = false; //lets you know if there is a distance stored in next + + if (m->control_pressed) { + delete cluster; + sabundFile.close(); + rabundFile.close(); + listFile.close(); + for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + return 0; + } + while (seqs.size() != 0){ - seqs = getSeqs(in); - random_shuffle(seqs.begin(), seqs.end()); + seqs = cluster->getSeqs(); - if (seqs.size() == 0) { break; } //there are no more distances - - for (int i = 0; i < seqs.size(); i++) { //-1 means skip me + if (m->control_pressed) { + delete cluster; + sabundFile.close(); + rabundFile.close(); + listFile.close(); + for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + return 0; + } - if (print_start && isTrue(timing)) { - mothurOut("Clustering (" + tag + ") dist " + toString(distance) + "/" - + toString(roundDist(distance, precision)) - + "\t(precision: " + toString(precision) + ")"); - cout.flush(); - print_start = false; - } + for (int i = 0; i < seqs.size(); i++) { //-1 means skip me - ///cout << "before cluster update" << endl; if (seqs[i].seq1 != seqs[i].seq2) { - clusteredSomething = cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist); - - float rndDist = roundDist(seqs[i].dist, precision); - //cout << "after cluster update clusterSomething = " << clusteredSomething << " rndDist = " << rndDist << " rndPreviousDist = " << rndPreviousDist << endl; + cluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist); + if (m->control_pressed) { + delete cluster; + sabundFile.close(); + rabundFile.close(); + listFile.close(); + for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + return 0; + } + + + float rndDist; + if (hard) { + rndDist = ceilDist(seqs[i].dist, precision); + }else{ + rndDist = roundDist(seqs[i].dist, precision); + } + if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){ printData("unique"); @@ -219,7 +294,7 @@ int HClusterCommand::execute(){ else if((rndDist != rndPreviousDist)){ printData(toString(rndPreviousDist, length-1)); } - + previousDist = seqs[i].dist; rndPreviousDist = rndDist; oldRAbund = *rabund; @@ -227,16 +302,16 @@ int HClusterCommand::execute(){ } } } - - in.close(); - if (print_start && isTrue(timing)) { - //mothurOut("Clustering (" + tag + ") for distance " + toString(previousDist) + "/" + toString(rndPreviousDist) - //+ "\t(precision: " + toString(precision) + ", Nodes: " + toString(matrix->getNNodes()) + ")"); - cout.flush(); - print_start = false; + if (m->control_pressed) { + delete cluster; + sabundFile.close(); + rabundFile.close(); + listFile.close(); + for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + return 0; } - + if(previousDist <= 0.0000){ printData("unique"); } @@ -258,15 +333,25 @@ int HClusterCommand::execute(){ sabundFile.close(); rabundFile.close(); listFile.close(); - delete cluster; - //if (isTrue(timing)) { - mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster. "); mothurOutEndLine(); - //} + + if (m->control_pressed) { + for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } + return 0; + } + + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); + + m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to cluster. "); m->mothurOutEndLine(); + return 0; } catch(exception& e) { - errorOut(e, "HClusterCommand", "execute"); + m->errorOut(e, "HClusterCommand", "execute"); exit(1); } } @@ -276,8 +361,8 @@ int HClusterCommand::execute(){ void HClusterCommand::printData(string label){ try { if (isTrue(timing)) { - mothurOut("\tTime: " + toString(time(NULL) - start) + "\tsecs for " + toString(oldRAbund.getNumBins()) - + "\tclusters. Updates: " + toString(loops)); mothurOutEndLine(); + m->mothurOut("\tTime: " + toString(time(NULL) - start) + "\tsecs for " + toString(oldRAbund.getNumBins()) + + "\tclusters. Updates: " + toString(loops)); m->mothurOutEndLine(); } print_start = true; loops = 0; @@ -294,7 +379,7 @@ void HClusterCommand::printData(string label){ oldList.print(listFile); } catch(exception& e) { - errorOut(e, "HClusterCommand", "printData"); + m->errorOut(e, "HClusterCommand", "printData"); exit(1); } @@ -302,75 +387,3 @@ void HClusterCommand::printData(string label){ } //********************************************************************************************************************** -vector HClusterCommand::getSeqs(ifstream& filehandle){ - try { - string firstName, secondName; - float distance, prevDistance; - vector sameSeqs; - prevDistance = -1; - - //if you are not at the beginning of the file - if (exitedBreak) { - sameSeqs.push_back(next); - prevDistance = next.dist; - exitedBreak = false; - } - - //get entry - while (filehandle) { - - filehandle >> firstName >> secondName >> distance; -//cout << firstName << '\t' << secondName << '\t' << distance << endl; - gobble(filehandle); - - //save first one - if (prevDistance == -1) { prevDistance = distance; } - - map::iterator itA = globaldata->nameMap->find(firstName); - map::iterator itB = globaldata->nameMap->find(secondName); - - if(itA == globaldata->nameMap->end()){ - cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); - } - if(itB == globaldata->nameMap->end()){ - cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); - } - - //using cutoff - if (distance > cutoff) { break; } - - if (distance != -1) { //-1 means skip me - - //are the distances the same - if (distance == prevDistance) { //save in vector - seqDist temp; - temp.seq1 = itA->second; - temp.seq2 = itB->second; - temp.dist = distance; - sameSeqs.push_back(temp); - exitedBreak = false; - //what about precision?? - - }else{ - next.seq1 = itA->second; - next.seq2 = itB->second; - next.dist = distance; - exitedBreak = true; - break; - } - - } - } - - return sameSeqs; - } - catch(exception& e) { - errorOut(e, "HClusterCommand", "getSeqs"); - exit(1); - } - - -} - -//********************************************************************************************************************** -