]> git.donarmstrong.com Git - mothur.git/blob - clustersplitcommand.h
update .gitignore
[mothur.git] / clustersplitcommand.h
1 #ifndef CLUSTERSPLITCOMMAND_H
2 #define CLUSTERSPLITCOMMAND_H
3
4 /*
5  *  clustersplitcommand.h
6  *  Mothur
7  *
8  *  Created by westcott on 5/19/10.
9  *  Copyright 2010 Schloss Lab. All rights reserved.
10  *
11  */
12  
13 #include "command.hpp"
14 #include "rabundvector.hpp"
15 #include "sabundvector.hpp"
16 #include "listvector.hpp"
17 #include "cluster.hpp"
18 #include "sparsedistancematrix.h"
19 #include "readcluster.h"
20 #include "splitmatrix.h"
21 #include "readphylip.h"
22 #include "readcolumn.h"
23 #include "readmatrix.hpp"
24 #include "inputdata.h"
25 #include "clustercommand.h"
26 #include "clusterclassic.h"
27
28 class ClusterSplitCommand : public Command {
29         // Command subclass registered under the name "cluster.split" (see getCommandName / getDescription below).
30 public:
31         ClusterSplitCommand(string);  // NOTE(review): presumably receives the user's option string; defined outside this header - confirm
32         ClusterSplitCommand();        // default constructor; defined outside this header
33         ~ClusterSplitCommand() {}     // nothing is released here
34         // --- command metadata ---
35         vector<string> setParameters();
36         string getCommandName()                 { return "cluster.split";               }
37         string getCommandCategory()             { return "Clustering";                  }
38         // --- help, output-pattern, citation and description text ---
39         string getHelpString(); 
40     string getOutputPattern(string);    
41         string getCitation() { return "Schloss PD, Westcott SL (2011). Assessing and improving methods used in OTU-based approaches for 16S rRNA gene sequence analysis. Appl Environ Microbiol 77:3219. \nhttp://www.mothur.org/wiki/Cluster.split"; }
42         string getDescription()         { return "splits your sequences by distance or taxonomy then clusters into OTUs"; }
43         // --- entry points; execute() is only declared here ---
44         int execute(); 
45         void help() { m->mothurOut(getHelpString()); }  // writes the help text via m->mothurOut; m is not declared in this class (presumably inherited from Command)
46
47 private:
48         vector<int> processIDS;   //processid
49         vector<string> outputNames;
50         // Option and state fields. Their meaning is assigned in the implementation file, which is not visible from this header.
51         string file, method, fileroot, tag, outputDir, phylipfile, columnfile, namefile, countfile, distfile, format, showabund, timing, splitmethod, taxFile, fastafile;
52         double cutoff, splitcutoff;
53         int precision, length, processors, taxLevelCutoff;
54         bool print_start, abort, hard, large, classic, runCluster, deleteFiles;
55         time_t start;
56         ofstream outList, outRabund, outSabund;
57         // Private helpers: declarations only, definitions live outside this header.
58         void printData(ListVector*);
59         vector<string> createProcesses(vector< map<string, string> >, set<string>&);
60         vector<string> cluster(vector< map<string, string> >, set<string>&);
61     string clusterFile(string, string, set<string>&, double&);
62     string clusterClassicFile(string, string, set<string>&, double&);
63         int mergeLists(vector<string>, map<float, int>, ListVector*);
64         map<float, int> completeListFile(vector<string>, string, set<string>&, ListVector*&);
65         int createMergedDistanceFile(vector< map<string, string> >);
66     int createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund);  // NOTE(review): no CountTable header is included directly above; presumably it arrives through another include - confirm
67     string readFile(vector< map<string, string> >&);
68     int printFile(string, vector< map<string, string> >&);
69 };
70
71 /////////////////not working for Windows////////////////////////////////////////////////////////////
72 // Getting an access violation error.  This is most likely caused by the 
73 // threads stepping on each other's structures, as I can run the thread function and the cluster function 
74 // separately without errors occurring.  I suspect it may be in the use of the
75 // static class mothurOut, but I can't pinpoint the problem.  All other objects are made new
76 // within the thread.  MothurOut is used by almost all the classes in mothur, so if this was 
77 // really the cause I would expect to see all the windows threaded commands to have issues, but not 
78 // all do. So far, shhh.flows and trim.flows have similar problems. Other thoughts: could it have 
79 // anything to do with mothur's use of copy constructors in many of our data structures, i.e. listvector 
80 // is copied by nameassignment and passed to read which passes to the thread?  -westcott 2-8-12
81 ////////////////////////////////////////////////////////////////////////////////////////////////////
82 /**************************************************************************************************
83 //custom data structure for threads to use.
84 // This is passed by void pointer so it can be any data type
85 // that can be passed using a single void pointer (LPVOID).
86 struct clusterData {
87         set<string> labels;
88         vector < map<string, string> > distNames; 
89         string method; 
90     MothurOut* m;
91         double cutoff, precision;
92     string tag, outputDir;
93     vector<string> listFiles;
94     bool hard;
95     int length, threadID;
96         
97         
98         clusterData(){}
99         clusterData(vector < map<string, string> > dv, MothurOut* mout, double cu, string me, string ou, bool hd, double pre, int len, int th) {
100                 distNames = dv;
101                 m = mout;
102                 cutoff = cu;
103         method = me;
104                 outputDir = ou;
105         hard = hd;
106         precision = pre;
107         length = len;
108         threadID = th;
109         }
110 };
111
112 /**************************************************************************************************
113 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
114 #else
115 static DWORD WINAPI MyClusterThreadFunction(LPVOID lpParam){ 
116         clusterData* pDataArray;
117         pDataArray = (clusterData*)lpParam;
118         
119         try {
120                 cout << "starting " << endl;            
121                 
122                 double smallestCutoff = pDataArray->cutoff;
123                 
124                 //cluster each distance file
125                 for (int i = 0; i < pDataArray->distNames.size(); i++) {
126             
127             Cluster* mycluster = NULL;
128             SparseMatrix* mymatrix = NULL;
129             ListVector* mylist = NULL;
130             ListVector myoldList;
131             RAbundVector* myrabund = NULL;
132                         
133                         if (pDataArray->m->control_pressed) { break; }
134                         
135                         string thisNamefile = pDataArray->distNames[i].begin()->second;
136                         string thisDistFile = pDataArray->distNames[i].begin()->first;
137             cout << thisNamefile << '\t' << thisDistFile << endl;       
138                         pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Reading " + thisDistFile); pDataArray->m->mothurOutEndLine();
139                         
140                         ReadMatrix* myread = new ReadColumnMatrix(thisDistFile);        
141                         myread->setCutoff(pDataArray->cutoff);
142                         NameAssignment* mynameMap = new NameAssignment(thisNamefile);
143                         mynameMap->readMap();
144             cout << "done reading " << thisNamefile << endl;  
145                         myread->read(mynameMap);
146                         cout << "done reading " << thisDistFile << endl;  
147                         if (pDataArray->m->control_pressed) {  delete myread; delete mynameMap; break; }
148             
149                         mylist = myread->getListVector();
150                         myoldList = *mylist;
151                         mymatrix = myread->getMatrix();
152             cout << "here" << endl;     
153                         delete myread; myread = NULL;
154                         delete mynameMap; mynameMap = NULL;
155                         
156             pDataArray->m->mothurOutEndLine(); pDataArray->m->mothurOut("Clustering " + thisDistFile); pDataArray->m->mothurOutEndLine();
157             
158                         myrabund = new RAbundVector(mylist->getRAbundVector());
159                          cout << "here" << endl;        
160                         //create cluster
161                         if (pDataArray->method == "furthest")   {       mycluster = new CompleteLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); }
162                         else if(pDataArray->method == "nearest"){       mycluster = new SingleLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method); }
163                         else if(pDataArray->method == "average"){       mycluster = new AverageLinkage(myrabund, mylist, mymatrix, pDataArray->cutoff, pDataArray->method);     }
164                         pDataArray->tag = mycluster->getTag();
165              cout << "here" << endl;    
166                         if (pDataArray->outputDir == "") { pDataArray->outputDir += pDataArray->m->hasPath(thisDistFile); }
167                         string fileroot = pDataArray->outputDir + pDataArray->m->getRootName(pDataArray->m->getSimpleName(thisDistFile));
168                          cout << "here" << endl;        
169                         ofstream listFile;
170                         pDataArray->m->openOutputFile(fileroot+ pDataArray->tag + ".list",      listFile);
171              cout << "here" << endl;    
172                         pDataArray->listFiles.push_back(fileroot+ pDataArray->tag + ".list");
173             
174                         float previousDist = 0.00000;
175                         float rndPreviousDist = 0.00000;
176                         
177                         myoldList = *mylist;
178         
179                         bool print_start = true;
180                         int start = time(NULL);
181                         double saveCutoff = pDataArray->cutoff;
182             
183                         while (mymatrix->getSmallDist() < pDataArray->cutoff && mymatrix->getNNodes() > 0){
184                 
185                                 if (pDataArray->m->control_pressed) { //clean up
186                                         delete mymatrix; delete mylist; delete mycluster; delete myrabund;
187                                         listFile.close();
188                                         for (int i = 0; i < pDataArray->listFiles.size(); i++) {        pDataArray->m->mothurRemove(pDataArray->listFiles[i]);  }
189                                         pDataArray->listFiles.clear(); break;
190                                 }
191                 
192                                 mycluster->update(saveCutoff);
193                 
194                                 float dist = mymatrix->getSmallDist();
195                                 float rndDist;
196                                 if (pDataArray->hard) {
197                                         rndDist = pDataArray->m->ceilDist(dist, pDataArray->precision); 
198                                 }else{
199                                         rndDist = pDataArray->m->roundDist(dist, pDataArray->precision); 
200                                 }
201                 
202                                 if(previousDist <= 0.0000 && dist != previousDist){
203                                         myoldList.setLabel("unique");
204                                         myoldList.print(listFile);
205                                         if (pDataArray->labels.count("unique") == 0) {  pDataArray->labels.insert("unique");  }
206                                 }
207                                 else if(rndDist != rndPreviousDist){
208                                         myoldList.setLabel(toString(rndPreviousDist,  pDataArray->length-1));
209                                         myoldList.print(listFile);
210                                         if (pDataArray->labels.count(toString(rndPreviousDist,  pDataArray->length-1)) == 0) { pDataArray->labels.insert(toString(rndPreviousDist,  pDataArray->length-1)); }
211                                 }
212                 
213                                 previousDist = dist;
214                                 rndPreviousDist = rndDist;
215                                 myoldList = *mylist;
216                         }
217             
218              cout << "here2" << endl;   
219                         if(previousDist <= 0.0000){
220                                 myoldList.setLabel("unique");
221                                 myoldList.print(listFile);
222                                 if (pDataArray->labels.count("unique") == 0) { pDataArray->labels.insert("unique"); }
223                         }
224                         else if(rndPreviousDist<pDataArray->cutoff){
225                                 myoldList.setLabel(toString(rndPreviousDist,  pDataArray->length-1));
226                                 myoldList.print(listFile);
227                                 if (pDataArray->labels.count(toString(rndPreviousDist,  pDataArray->length-1)) == 0) { pDataArray->labels.insert(toString(rndPreviousDist,  pDataArray->length-1)); }
228                         }
229             
230                         delete mymatrix; delete mylist; delete mycluster; delete myrabund; 
231             mymatrix = NULL; mylist = NULL; mycluster = NULL; myrabund = NULL;
232                         listFile.close();
233                         
234                         if (pDataArray->m->control_pressed) { //clean up
235                                 for (int i = 0; i < pDataArray->listFiles.size(); i++) {        pDataArray->m->mothurRemove(pDataArray->listFiles[i]);  }
236                                 pDataArray->listFiles.clear(); break;
237                         }
238                          cout << "here3" << endl;       
239                         pDataArray->m->mothurRemove(thisDistFile);
240                         pDataArray->m->mothurRemove(thisNamefile);
241                          cout << "here4" << endl;       
242                         if (saveCutoff != pDataArray->cutoff) { 
243                                 if (pDataArray->hard)   {  saveCutoff = pDataArray->m->ceilDist(saveCutoff, pDataArray->precision);     }
244                                 else            {       saveCutoff = pDataArray->m->roundDist(saveCutoff, pDataArray->precision);  }
245                 
246                                 pDataArray->m->mothurOut("Cutoff was " + toString(pDataArray->cutoff) + " changed cutoff to " + toString(saveCutoff)); pDataArray->m->mothurOutEndLine();  
247                         }
248                          cout << "here5" << endl;       
249                         if (saveCutoff < smallestCutoff) { smallestCutoff = saveCutoff;  }
250                 }
251                 
252                 pDataArray->cutoff = smallestCutoff;
253                 
254                 return 0;
255                 
256         }
257         catch(exception& e) {
258                 pDataArray->m->errorOut(e, "ClusterSplitCommand", "MyClusterThreadFunction");
259                 exit(1);
260         }
261
262 #endif
263
264 */
265
266
267 #endif
268