1 #ifndef CLUSTERSPLITCOMMAND_H
2 #define CLUSTERSPLITCOMMAND_H
5 * clustersplitcommand.h
8 * Created by westcott on 5/19/10.
9 * Copyright 2010 Schloss Lab. All rights reserved.
13 #include "command.hpp"
14 #include "rabundvector.hpp"
15 #include "sabundvector.hpp"
16 #include "listvector.hpp"
17 #include "cluster.hpp"
18 #include "sparsedistancematrix.h"
19 #include "readcluster.h"
20 #include "splitmatrix.h"
21 #include "readphylip.h"
22 #include "readcolumn.h"
23 #include "readmatrix.hpp"
24 #include "inputdata.h"
25 #include "clustercommand.h"
26 #include "clusterclassic.h"
// ClusterSplitCommand: Command subclass that reports itself as "cluster.split"
// in the "Clustering" category. Per getDescription(), it splits sequences by
// distance or taxonomy and then clusters them into OTUs.
// NOTE(review): no access specifiers or closing brace are visible in this
// excerpt; non-inline members are only declared here, so their behavior must
// be confirmed against the implementation file.
28 class ClusterSplitCommand : public Command {
// Construction / destruction. The destructor body is empty.
// NOTE(review): the string argument is presumably the user's option string -- confirm in the .cpp.
31 ClusterSplitCommand(string);
32 ClusterSplitCommand();
33 ~ClusterSplitCommand() {}
// Command metadata accessors. The inline ones return fixed string literals.
35 vector<string> setParameters();
36 string getCommandName() { return "cluster.split"; }
37 string getCommandCategory() { return "Clustering"; }
39 string getHelpString();
40 string getOutputPattern(string);
// Returns the literature citation followed by the wiki URL for this command.
41 string getCitation() { return "Schloss PD, Westcott SL (2011). Assessing and improving methods used in OTU-based approaches for 16S rRNA gene sequence analysis. Appl Environ Microbiol 77:3219. \nhttp://www.mothur.org/wiki/Cluster.split"; }
42 string getDescription() { return "splits your sequences by distance or taxonomy then clusters into OTUs"; }
// Writes the help text through m->mothurOut.
// NOTE(review): `m` is not declared in this excerpt; presumably inherited from Command -- confirm.
45 void help() { m->mothurOut(getHelpString()); }
// State held by the command.
48 vector<int> processIDS; //processid
49 vector<string> outputNames;
// File names, method names and other string-valued settings.
51 string file, method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, countfile, distfile, format, showabund, timing, splitmethod, taxFile, fastafile;
// Numeric settings: distance cutoffs, precision, processor count, taxonomy level cutoff.
52 double cutoff, splitcutoff;
53 int precision, length, processors, taxLevelCutoff;
// Boolean flags.
54 bool print_start, abort, hard, large, classic, runCluster, deleteFiles;
// Output streams, named for list, rabund and sabund output.
56 ofstream outList, outRabund, outSabund;
// Helper declarations; their definitions are not part of this excerpt.
// NOTE(review): each vector< map<string, string> > argument presumably holds
// (distance file -> name file) pairs, as the commented-out thread function
// reads begin()->first as the distance file and begin()->second as the name
// file -- confirm against the callers.
58 void printData(ListVector*);
59 vector<string> createProcesses(vector< map<string, string> >, set<string>&);
60 vector<string> cluster(vector< map<string, string> >, set<string>&);
61 string clusterFile(string, string, set<string>&, double&);
62 string clusterClassicFile(string, string, set<string>&, double&);
63 int mergeLists(vector<string>, map<float, int>, ListVector*);
64 map<float, int> completeListFile(vector<string>, string, set<string>&, ListVector*&);
65 int createMergedDistanceFile(vector< map<string, string> >);
66 int createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund);
67 string readFile(vector< map<string, string> >&);
68 int printFile(string, vector< map<string, string> >&);
71 /////////////////not working for Windows////////////////////////////////////////////////////////////
72 // getting an access violation error. This is most likely caused by the
73 // threads stepping on each other's structures, as I can run the thread function and the cluster function
74 // separately without errors occurring. I suspect it may be in the use of the
75 // static class mothurOut, but I can't pinpoint the problem. All other objects are made new
76 // within the thread. MothurOut is used by almost all the classes in mothur, so if this was
77 // really the cause I would expect all the Windows threaded commands to have issues, but not
78 // all do. So far, shhh.flows and trim.flows have similar problems. Another thought: could it have
79 // anything to do with mothur's use of copy constructors in many of our data structures? i.e., listvector
80 // is copied by nameassignment and passed to read, which passes it to the thread? -westcott 2-8-12
81 ////////////////////////////////////////////////////////////////////////////////////////////////////
82 /**************************************************************************************************
83 //custom data structure for threads to use.
84 // This is passed by void pointer so it can be any data type
85 // that can be passed using a single void pointer (LPVOID).
88 vector < map<string, string> > distNames;
91 double cutoff, precision;
92 string tag, outputDir;
93 vector<string> listFiles;
99 clusterData(vector < map<string, string> > dv, MothurOut* mout, double cu, string me, string ou, bool hd, double pre, int len, int th) {
112 /**************************************************************************************************
113 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
115 static DWORD WINAPI MyClusterThreadFunction(LPVOID lpParam){
116 clusterData* pDataArray;
117 pDataArray = (clusterData*)lpParam;
120 cout << "starting " << endl;
122 double smallestCutoff = pDataArray->cutoff;
124 //cluster each distance file
125 for (int i = 0; i < pDataArray->distNames.size(); i++) {
127 Cluster* mycluster = NULL;
128 SparseMatrix* mymatrix = NULL;
129 ListVector* mylist = NULL;
130 ListVector myoldList;
131 RAbundVector* myrabund = NULL;
133 if (pDataArray->m->control_pressed) { break; }
135 string thisNamefile = pDataArray->distNames[i].begin()->second;
136 string thisDistFile = pDataArray->distNames[i].begin()->first;
137 cout << thisNamefile << '\t' << thisDistFile << endl;
138 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Reading " + thisDistFile); pDataArray->m->mothurOutEndLine();
140 ReadMatrix* myread = new ReadColumnMatrix(thisDistFile);
141 myread->setCutoff(pDataArray->cutoff);
142 NameAssignment* mynameMap = new NameAssignment(thisNamefile);
143 mynameMap->readMap();
144 cout << "done reading " << thisNamefile << endl;
145 myread->read(mynameMap);
146 cout << "done reading " << thisDistFile << endl;
147 if (pDataArray->m->control_pressed) { delete myread; delete mynameMap; break; }
149 mylist = myread->getListVector();
151 mymatrix = myread->getMatrix();
152 cout << "here" << endl;
153 delete myread; myread = NULL;
154 delete mynameMap; mynameMap = NULL;
156 pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Clustering " + thisDistFile); pDataArray->m->mothurOutEndLine();
158 myrabund = new RAbundVector(mylist->getRAbundVector());
159 cout << "here" << endl;
161 if (pDataArray->method == "furthest") { mycluster = new CompleteLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); }
162 else if(pDataArray->method == "nearest"){ mycluster = new SingleLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); }
163 else if(pDataArray->method == "average"){ mycluster = new AverageLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); }
164 pDataArray->tag = mycluster->getTag();
165 cout << "here" << endl;
166 if (pDataArray->outputDir == "") { pDataArray->outputDir += pDataArray->m->hasPath(thisDistFile); }
167 string fileroot = pDataArray->outputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisDistFile));
168 cout << "here" << endl;
170 pDataArray->m->openOutputFile(fileroot+ pDataArray->tag + ".list", listFile);
171 cout << "here" << endl;
172 pDataArray->listFiles.push_back(fileroot+ pDataArray->tag + ".list");
174 float previousDist = 0.00000;
175 float rndPreviousDist = 0.00000;
179 bool print_start = true;
180 int start = time(NULL);
181 double saveCutoff = pDataArray->cutoff;
183 while (mymatrix->getSmallDist() < pDataArray->cutoff && mymatrix->getNNodes() > 0){
185 if (pDataArray->m->control_pressed) { //clean up
186 delete mymatrix; delete mylist; delete mycluster; delete myrabund;
188 for (int i = 0; i < pDataArray->listFiles.size(); i++) { pDataArray->m->mothurRemove(pDataArray->listFiles[i]); }
189 pDataArray->listFiles.clear(); break;
192 mycluster->update(saveCutoff);
194 float dist = mymatrix->getSmallDist();
196 if (pDataArray->hard) {
197 rndDist = pDataArray->m->ceilDist(dist, pDataArray->precision);
199 rndDist = pDataArray->m->roundDist(dist, pDataArray->precision);
202 if(previousDist <= 0.0000 && dist != previousDist){
203 myoldList.setLabel("unique");
204 myoldList.print(listFile);
205 if (pDataArray->labels.count("unique") == 0) { pDataArray->labels.insert("unique"); }
207 else if(rndDist != rndPreviousDist){
208 myoldList.setLabel(toString(rndPreviousDist, pDataArray->length-1));
209 myoldList.print(listFile);
210 if (pDataArray->labels.count(toString(rndPreviousDist, pDataArray->length-1)) == 0) { pDataArray->labels.insert(toString(rndPreviousDist, pDataArray->length-1)); }
214 rndPreviousDist = rndDist;
218 cout << "here2" << endl;
219 if(previousDist <= 0.0000){
220 myoldList.setLabel("unique");
221 myoldList.print(listFile);
222 if (pDataArray->labels.count("unique") == 0) { pDataArray->labels.insert("unique"); }
224 else if(rndPreviousDist<pDataArray->cutoff){
225 myoldList.setLabel(toString(rndPreviousDist, pDataArray->length-1));
226 myoldList.print(listFile);
227 if (pDataArray->labels.count(toString(rndPreviousDist, pDataArray->length-1)) == 0) { pDataArray->labels.insert(toString(rndPreviousDist, pDataArray->length-1)); }
230 delete mymatrix; delete mylist; delete mycluster; delete myrabund;
231 mymatrix = NULL; mylist = NULL; mycluster = NULL; myrabund = NULL;
234 if (pDataArray->m->control_pressed) { //clean up
235 for (int i = 0; i < pDataArray->listFiles.size(); i++) { pDataArray->m->mothurRemove(pDataArray->listFiles[i]); }
236 pDataArray->listFiles.clear(); break;
238 cout << "here3" << endl;
239 pDataArray->m->mothurRemove(thisDistFile);
240 pDataArray->m->mothurRemove(thisNamefile);
241 cout << "here4" << endl;
242 if (saveCutoff != pDataArray->cutoff) {
243 if (pDataArray->hard) { saveCutoff = pDataArray->m->ceilDist(saveCutoff, pDataArray->precision); }
244 else { saveCutoff = pDataArray->m->roundDist(saveCutoff, pDataArray->precision); }
246 pDataArray->m->mothurOut("Cutoff was " + toString(pDataArray->cutoff) + " changed cutoff to " + toString(saveCutoff)); pDataArray->m->mothurOutEndLine();
248 cout << "here5" << endl;
249 if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff; }
252 pDataArray->cutoff = smallestCutoff;
257 catch(exception& e) {
258 pDataArray->m->errorOut(e, "ClusterSplitCommand", "MyClusterThreadFunction");