X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=clustersplitcommand.h;h=e1f17b1bd0b8c4088d5deb3c06691fb3ccf26999;hp=28c7d237dccb8ce31542c76f01378b0f8e053d71;hb=615301e57c25e241356a9c2380648d117709458d;hpb=ca9ac1d80c62f57270b0dcd49410ebe08a8aecd6 diff --git a/clustersplitcommand.h b/clustersplitcommand.h index 28c7d23..e1f17b1 100644 --- a/clustersplitcommand.h +++ b/clustersplitcommand.h @@ -15,8 +15,15 @@ #include "sabundvector.hpp" #include "listvector.hpp" #include "cluster.hpp" -#include "sparsematrix.hpp" - +#include "sparsedistancematrix.h" +#include "readcluster.h" +#include "splitmatrix.h" +#include "readphylip.h" +#include "readcolumn.h" +#include "readmatrix.hpp" +#include "inputdata.h" +#include "clustercommand.h" +#include "clusterclassic.h" class ClusterSplitCommand : public Command { @@ -26,9 +33,13 @@ public: ~ClusterSplitCommand() {} vector setParameters(); - string getCommandName() { return "cluster.split"; } - string getCommandCategory() { return "OTU-Based Approaches"; } + string getCommandName() { return "cluster.split"; } + string getCommandCategory() { return "Clustering"; } + string getHelpString(); + string getOutputPattern(string); + string getCitation() { return "Schloss PD, Westcott SL (2011). Assessing and improving methods used in OTU-based approaches for 16S rRNA gene sequence analysis. Appl Environ Microbiol 77:3219. \nhttp://www.mothur.org/wiki/Cluster.split"; } + string getDescription() { return "splits your sequences by distance or taxonomy then clusters into OTUs"; } int execute(); void help() { m->mothurOut(getHelpString()); } @@ -37,20 +48,219 @@ private: vector processIDS; //processid vector outputNames; - string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, distfile, format, showabund, timing, splitmethod, taxFile, fastafile; + string method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, countfile, distfile, format, showabund, timing, splitmethod, taxFile, fastafile; double cutoff, splitcutoff; int precision, length, processors, taxLevelCutoff; - bool print_start, abort, hard, large; + bool print_start, abort, hard, large, classic, runCluster; time_t start; ofstream outList, outRabund, outSabund; void printData(ListVector*); - int createProcesses(vector < vector < map > >); + vector createProcesses(vector< map >, set&); vector cluster(vector< map >, set&); + string clusterFile(string, string, set&, double&); + string clusterClassicFile(string, string, set&, double&); int mergeLists(vector, map, ListVector*); map completeListFile(vector, string, set&, ListVector*&); int createMergedDistanceFile(vector< map >); + int createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund); +}; + +/////////////////not working for Windows//////////////////////////////////////////////////////////// +// getting an access violation error. This is most likely caused by the +// threads stepping on eachother's structures, as I can run the thread function and the cluster fuction +// in separately without errors occuring. I suspect it may be in the use of the +// static class mothurOut, but I can't pinpoint the problem. All other objects are made new +// within the thread. MothurOut is used by almost all the classes in mothur, so if this was +// really the cause I would expect to see all the windows threaded commands to have issues, but not +// all do. So far, shhh.flows and trim.flows have similiar problems. Other thoughts, could it have +// anything to do with mothur's use of copy constructors in many of our data structures. ie. listvector +// is copied by nameassignment and passed to read which passes to the thread? -westcott 2-8-12 +//////////////////////////////////////////////////////////////////////////////////////////////////// +/************************************************************************************************** +//custom data structure for threads to use. +// This is passed by void pointer so it can be any data type +// that can be passed using a single void pointer (LPVOID). +struct clusterData { + set labels; + vector < map > distNames; + string method; + MothurOut* m; + double cutoff, precision; + string tag, outputDir; + vector listFiles; + bool hard; + int length, threadID; + + + clusterData(){} + clusterData(vector < map > dv, MothurOut* mout, double cu, string me, string ou, bool hd, double pre, int len, int th) { + distNames = dv; + m = mout; + cutoff = cu; + method = me; + outputDir = ou; + hard = hd; + precision = pre; + length = len; + threadID = th; + } }; +/************************************************************************************************** +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) +#else +static DWORD WINAPI MyClusterThreadFunction(LPVOID lpParam){ + clusterData* pDataArray; + pDataArray = (clusterData*)lpParam; + + try { + cout << "starting " << endl; + + double smallestCutoff = pDataArray->cutoff; + + //cluster each distance file + for (int i = 0; i < pDataArray->distNames.size(); i++) { + + Cluster* mycluster = NULL; + SparseMatrix* mymatrix = NULL; + ListVector* mylist = NULL; + ListVector myoldList; + RAbundVector* myrabund = NULL; + + if (pDataArray->m->control_pressed) { break; } + + string thisNamefile = pDataArray->distNames[i].begin()->second; + string thisDistFile = pDataArray->distNames[i].begin()->first; + cout << thisNamefile << '\t' << thisDistFile << endl; + pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Reading " + thisDistFile); pDataArray->m->mothurOutEndLine(); + + ReadMatrix* myread = new ReadColumnMatrix(thisDistFile); + myread->setCutoff(pDataArray->cutoff); + NameAssignment* mynameMap = new NameAssignment(thisNamefile); + mynameMap->readMap(); + cout << "done reading " << thisNamefile << endl; + myread->read(mynameMap); + cout << "done reading " << thisDistFile << endl; + if (pDataArray->m->control_pressed) { delete myread; delete mynameMap; break; } + + mylist = myread->getListVector(); + myoldList = *mylist; + mymatrix = myread->getMatrix(); + cout << "here" << endl; + delete myread; myread = NULL; + delete mynameMap; mynameMap = NULL; + + pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Clustering " + thisDistFile); pDataArray->m->mothurOutEndLine(); + + myrabund = new RAbundVector(mylist->getRAbundVector()); + cout << "here" << endl; + //create cluster + if (pDataArray->method == "furthest") { mycluster = new CompleteLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); } + else if(pDataArray->method == "nearest"){ mycluster = new SingleLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); } + else if(pDataArray->method == "average"){ mycluster = new AverageLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); } + pDataArray->tag = mycluster->getTag(); + cout << "here" << endl; + if (pDataArray->outputDir == "") { pDataArray->outputDir += pDataArray->m->hasPath(thisDistFile); } + string fileroot = pDataArray->outputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisDistFile)); + cout << "here" << endl; + ofstream listFile; + pDataArray->m->openOutputFile(fileroot+ pDataArray->tag + ".list", listFile); + cout << "here" << endl; + pDataArray->listFiles.push_back(fileroot+ pDataArray->tag + ".list"); + + float previousDist = 0.00000; + float rndPreviousDist = 0.00000; + + myoldList = *mylist; + + bool print_start = true; + int start = time(NULL); + double saveCutoff = pDataArray->cutoff; + + while (mymatrix->getSmallDist() < pDataArray->cutoff && mymatrix->getNNodes() > 0){ + + if (pDataArray->m->control_pressed) { //clean up + delete mymatrix; delete mylist; delete mycluster; delete myrabund; + listFile.close(); + for (int i = 0; i < pDataArray->listFiles.size(); i++) { pDataArray->m->mothurRemove(pDataArray->listFiles[i]); } + pDataArray->listFiles.clear(); break; + } + + mycluster->update(saveCutoff); + + float dist = mymatrix->getSmallDist(); + float rndDist; + if (pDataArray->hard) { + rndDist = pDataArray->m->ceilDist(dist, pDataArray->precision); + }else{ + rndDist = pDataArray->m->roundDist(dist, pDataArray->precision); + } + + if(previousDist <= 0.0000 && dist != previousDist){ + myoldList.setLabel("unique"); + myoldList.print(listFile); + if (pDataArray->labels.count("unique") == 0) { pDataArray->labels.insert("unique"); } + } + else if(rndDist != rndPreviousDist){ + myoldList.setLabel(toString(rndPreviousDist, pDataArray->length-1)); + myoldList.print(listFile); + if (pDataArray->labels.count(toString(rndPreviousDist, pDataArray->length-1)) == 0) { pDataArray->labels.insert(toString(rndPreviousDist, pDataArray->length-1)); } + } + + previousDist = dist; + rndPreviousDist = rndDist; + myoldList = *mylist; + } + + cout << "here2" << endl; + if(previousDist <= 0.0000){ + myoldList.setLabel("unique"); + myoldList.print(listFile); + if (pDataArray->labels.count("unique") == 0) { pDataArray->labels.insert("unique"); } + } + else if(rndPreviousDistcutoff){ + myoldList.setLabel(toString(rndPreviousDist, pDataArray->length-1)); + myoldList.print(listFile); + if (pDataArray->labels.count(toString(rndPreviousDist, pDataArray->length-1)) == 0) { pDataArray->labels.insert(toString(rndPreviousDist, pDataArray->length-1)); } + } + + delete mymatrix; delete mylist; delete mycluster; delete myrabund; + mymatrix = NULL; mylist = NULL; mycluster = NULL; myrabund = NULL; + listFile.close(); + + if (pDataArray->m->control_pressed) { //clean up + for (int i = 0; i < pDataArray->listFiles.size(); i++) { pDataArray->m->mothurRemove(pDataArray->listFiles[i]); } + pDataArray->listFiles.clear(); break; + } + cout << "here3" << endl; + pDataArray->m->mothurRemove(thisDistFile); + pDataArray->m->mothurRemove(thisNamefile); + cout << "here4" << endl; + if (saveCutoff != pDataArray->cutoff) { + if (pDataArray->hard) { saveCutoff = pDataArray->m->ceilDist(saveCutoff, pDataArray->precision); } + else { saveCutoff = pDataArray->m->roundDist(saveCutoff, pDataArray->precision); } + + pDataArray->m->mothurOut("Cutoff was " + toString(pDataArray->cutoff) + " changed cutoff to " + toString(saveCutoff)); pDataArray->m->mothurOutEndLine(); + } + cout << "here5" << endl; + if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; } + } + + pDataArray->cutoff = smallestCutoff; + + return 0; + + } + catch(exception& e) { + pDataArray->m->errorOut(e, "ClusterSplitCommand", "MyClusterThreadFunction"); + exit(1); + } +} +#endif + +*/ + + #endif