]> git.donarmstrong.com Git - mothur.git/blob - mgclustercommand.cpp
85db5e0e8ffc384960f7954d6db1266bac9ea3c1
[mothur.git] / mgclustercommand.cpp
1 /*
2  *  mgclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/11/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "mgclustercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> MGClusterCommand::setParameters(){       
14         try {
15                 CommandParameter pblast("blast", "InputTypes", "", "", "none", "none", "none","list",false,true,true); parameters.push_back(pblast);
16                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName","rabund-sabund",false,false,true); parameters.push_back(pname);
17                 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pcount);
18                 CommandParameter plength("length", "Number", "", "5", "", "", "","",false,false); parameters.push_back(plength);
19                 CommandParameter ppenalty("penalty", "Number", "", "0.10", "", "", "","",false,false); parameters.push_back(ppenalty);
20                 CommandParameter pcutoff("cutoff", "Number", "", "0.70", "", "", "","",false,false,true); parameters.push_back(pcutoff);
21                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
22                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average", "average", "", "", "","",false,false); parameters.push_back(pmethod);
23                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
24                 CommandParameter pmin("min", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pmin);
25                 CommandParameter pmerge("merge", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pmerge);
26         CommandParameter padjust("adjust", "String", "", "F", "", "", "","",false,false); parameters.push_back(padjust);
27                 CommandParameter phcluster("hcluster", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(phcluster);
28                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
29                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
30                 
31                 vector<string> myArray;
32                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
33                 return myArray;
34         }
35         catch(exception& e) {
36                 m->errorOut(e, "MGClusterCommand", "setParameters");
37                 exit(1);
38         }
39 }
40 //**********************************************************************************************************************
41 string MGClusterCommand::getHelpString(){       
42         try {
43                 string helpString = "";
44                 helpString += "The mgcluster command parameter options are blast, name, cutoff, precision, hard,  method, merge, min, length, penalty, adjust and hcluster. The blast parameter is required.\n";
45                 helpString += "The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n";
46                 helpString += "This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n";
47                 helpString += "The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n";
48                 helpString += "The precision parameter's default value is 100. \n";
49                 helpString += "The acceptable mgcluster methods are furthest, nearest and average.  If no method is provided then average is assumed.\n";       
50                 helpString += "The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n";
51                 helpString += "The length parameter is used to specify the minimum overlap required.  The default is 5.\n";
52         helpString += "The adjust parameter is used to handle missing distances.  If you set a cutoff, adjust=f by default.  If not, adjust=t by default. Adjust=f, means ignore missing distances and adjust cutoff as needed with the average neighbor method.  Adjust=t, will treat missing distances as 1.0. You can also set the value the missing distances should be set to, adjust=0.5 would give missing distances a value of 0.5.\n";
53                 helpString += "The penalty parameter is used to adjust the error rate.  The default is 0.10.\n";
54                 helpString += "The merge parameter allows you to shut off merging based on overlaps and just cluster.  By default merge is true, meaning you want to merge.\n";
55                 helpString += "The hcluster parameter allows you to use the hcluster algorithm when clustering.  This may be neccessary if your file is too large to fit into RAM. The default is false.\n";
56                 helpString += "The mgcluster command should be in the following format: \n";
57                 helpString += "mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n";
58                 helpString += "Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n";
59                 return helpString;
60         }
61         catch(exception& e) {
62                 m->errorOut(e, "MGClusterCommand", "getHelpString");
63                 exit(1);
64         }
65 }
66 //**********************************************************************************************************************
67 string MGClusterCommand::getOutputPattern(string type) {
68     try {
69         string pattern = "";
70         
71         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; } 
72         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; } 
73         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
74         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
75         
76         return pattern;
77     }
78     catch(exception& e) {
79         m->errorOut(e, "MGClusterCommand", "getOutputPattern");
80         exit(1);
81     }
82 }
83 //*******************************************************************************************************************
84 MGClusterCommand::MGClusterCommand(){   
85         try {
86                 abort = true; calledHelp = true; 
87                 setParameters();
88                 vector<string> tempOutNames;
89                 outputTypes["list"] = tempOutNames;
90                 outputTypes["rabund"] = tempOutNames;
91                 outputTypes["sabund"] = tempOutNames;
92         }
93         catch(exception& e) {
94                 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
95                 exit(1);
96         }
97 }
98 //**********************************************************************************************************************
99 MGClusterCommand::MGClusterCommand(string option) {
100         try {
101                 abort = false; calledHelp = false;   
102                 
103                 //allow user to run help
104                 if(option == "help") { help(); abort = true; calledHelp = true; }
105                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
106                 
107                 else {
108                         vector<string> myArray = setParameters();
109                         
110                         OptionParser parser(option);
111                         map<string, string> parameters = parser.getParameters();
112                         
113                         ValidParameters validParameter;
114                         map<string,string>::iterator it;
115                 
116                         //check to make sure all parameters are valid for command
117                         for (it = parameters.begin(); it != parameters.end(); it++) { 
118                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
119                         }
120                         
121                         //initialize outputTypes
122                         vector<string> tempOutNames;
123                         outputTypes["list"] = tempOutNames;
124                         outputTypes["rabund"] = tempOutNames;
125                         outputTypes["sabund"] = tempOutNames;
126                         
127                         //if the user changes the input directory command factory will send this info to us in the output parameter 
128                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
129                         if (inputDir == "not found"){   inputDir = "";          }
130                         else {
131                                 string path;
132                                 it = parameters.find("blast");
133                                 //user has given a template file
134                                 if(it != parameters.end()){ 
135                                         path = m->hasPath(it->second);
136                                         //if the user has not given a path then, add inputdir. else leave path alone.
137                                         if (path == "") {       parameters["blast"] = inputDir + it->second;            }
138                                 }
139                                 
140                                 it = parameters.find("name");
141                                 //user has given a template file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
146                                 }
147                 
148                 it = parameters.find("count");
149                                 //user has given a template file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
154                                 }
155                         }
156
157                         
158                         //check for required parameters
159                         blastfile = validParameter.validFile(parameters, "blast", true);
160                         if (blastfile == "not open") { blastfile = ""; abort = true; }  
161                         else if (blastfile == "not found") { blastfile = ""; }
162                         
163                         //if the user changes the output directory command factory will send this info to us in the output parameter 
164                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
165                                 outputDir = ""; 
166                                 outputDir += m->hasPath(blastfile); //if user entered a file with a path then preserve it       
167                         }
168                         
169                         namefile = validParameter.validFile(parameters, "name", true);
170                         if (namefile == "not open") { abort = true; }   
171                         else if (namefile == "not found") { namefile = ""; }
172                         else { m->setNameFile(namefile); }
173             
174             countfile = validParameter.validFile(parameters, "count", true);
175                         if (countfile == "not open") { abort = true; }  
176                         else if (countfile == "not found") { countfile = ""; }
177             else { m->setCountTableFile(countfile); }
178             
179             if (countfile != "" && namefile != "") { m->mothurOut("[ERROR]: Cannot have both a name file and count file. Please use one or the other."); m->mothurOutEndLine(); abort = true; }
180                         
181                         if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
182                         
183                         //check for optional parameter and set defaults
184                         string temp;
185             temp = validParameter.validFile(parameters, "precision", false);            if (temp == "not found") { temp = "100"; }
186                         precisionLength = temp.length();
187                         m->mothurConvert(temp, precision); 
188                         
189             cutoffSet = false;
190                         temp = validParameter.validFile(parameters, "cutoff", false);
191             if (temp == "not found") { temp = "0.70"; }
192             else { cutoffSet = true;  }
193                         m->mothurConvert(temp, cutoff); 
194                         cutoff += (5 / (precision * 10.0));
195                         
196                         method = validParameter.validFile(parameters, "method", false);
197                         if (method == "not found") { method = "average"; }
198                         
199                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
200                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
201
202                         temp = validParameter.validFile(parameters, "length", false);                   if (temp == "not found") { temp = "5"; }
203                         m->mothurConvert(temp, length); 
204                         
205                         temp = validParameter.validFile(parameters, "penalty", false);                  if (temp == "not found") { temp = "0.10"; }
206                         m->mothurConvert(temp, penalty); 
207                         
208                         temp = validParameter.validFile(parameters, "min", false);                              if (temp == "not found") { temp = "true"; }
209                         minWanted = m->isTrue(temp); 
210                         
211                         temp = validParameter.validFile(parameters, "merge", false);                    if (temp == "not found") { temp = "true"; }
212                         merge = m->isTrue(temp); 
213                         
214                         temp = validParameter.validFile(parameters, "hcluster", false);                 if (temp == "not found") { temp = "false"; }
215                         hclusterWanted = m->isTrue(temp); 
216                         
217                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
218                         hard = m->isTrue(temp);
219             
220             temp = validParameter.validFile(parameters, "adjust", false);                               if (temp == "not found") { if (cutoffSet) { temp = "F"; }else { temp="T"; } }
221             if (m->isNumeric1(temp))    { m->mothurConvert(temp, adjust);   }
222             else if (m->isTrue(temp))   { adjust = 1.0;                     }
223             else                        { adjust = -1.0;                    }
224                 }
225
226         }
227         catch(exception& e) {
228                 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
229                 exit(1);
230         }
231 }
232 //**********************************************************************************************************************
233 int MGClusterCommand::execute(){
234         try {
235                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
236                 
237                 //read names file
238                 if (namefile != "") {
239                         nameMap = new NameAssignment(namefile);
240                         nameMap->readMap();
241                 }else if (countfile != "") {
242             ct = new CountTable();
243             ct->readTable(countfile, false, false);
244             nameMap= new NameAssignment();
245             vector<string> tempNames = ct->getNamesOfSeqs();
246             for (int i = 0; i < tempNames.size(); i++) {  nameMap->push_back(tempNames[i]);  }
247         }else{ nameMap= new NameAssignment(); }
248                 
249                 string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile));
250                 string tag = "";
251                 time_t start;
252                 float previousDist = 0.00000;
253                 float rndPreviousDist = 0.00000; 
254         
255                 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
256                 //must remember to delete those objects here since readBlast does not
257                 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
258                 read->read(nameMap);
259         
260         list = new ListVector(nameMap->getListVector());
261         RAbundVector* rabund = NULL;
262         
263         if(countfile != "") {
264             rabund = new RAbundVector();
265             createRabund(ct, list, rabund);
266         }else {
267             rabund = new RAbundVector(list->getRAbundVector());
268         }
269         
270                 
271                 //list = new ListVector(nameMap->getListVector());
272                 //rabund = new RAbundVector(list->getRAbundVector());
273                 
274                 if (m->control_pressed) { outputTypes.clear(); delete nameMap; delete read; delete list; delete rabund; return 0; }
275                 
276                 start = time(NULL);
277                 oldList = *list;
278                 map<string, int> Seq2Bin;
279                 map<string, int> oldSeq2Bin;
280                 
281                 if (method == "furthest")               { tag = "fn";  }
282                 else if (method == "nearest")   { tag = "nn";  }
283                 else                                                    { tag = "an";  }
284                 
285         map<string, string> variables; 
286         variables["[filename]"] = fileroot;
287         variables["[clustertag]"] = tag;
288         string sabundFileName = getOutputFileName("sabund", variables);
289         string rabundFileName = getOutputFileName("rabund", variables);
290         if (countfile != "") { variables["[tag2]"] = "unique_list"; }
291         string listFileName = getOutputFileName("list", variables);
292         
293         if (countfile == "") {
294             m->openOutputFile(sabundFileName,   sabundFile);
295             m->openOutputFile(rabundFileName,   rabundFile);
296         }
297                 m->openOutputFile(listFileName, listFile);
298                 
299                 if (m->control_pressed) { 
300                         delete nameMap; delete read; delete list; delete rabund; 
301                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
302                         outputTypes.clear();
303                         return 0; 
304                 }
305                 
306                 double saveCutoff = cutoff;
307                 
308                 if (!hclusterWanted) {
309                         //get distmatrix and overlap
310                         SparseDistanceMatrix* distMatrix = read->getDistMatrix();
311                         overlapMatrix = read->getOverlapMatrix(); //already sorted by read 
312                         delete read;
313                 
314                         //create cluster
315                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method, adjust); }
316                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method, adjust); }
317                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method, adjust); }
318                         cluster->setMapWanted(true);
319                         Seq2Bin = cluster->getSeqtoBin();
320                         oldSeq2Bin = Seq2Bin;
321                         
322                         if (m->control_pressed) { 
323                                 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
324                                 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
325                                 outputTypes.clear();
326                                 return 0; 
327                         }
328             
329             
330                         //cluster using cluster classes
331                         while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
332                                 
333                 if (m->debug) {  cout << "numNodes=" << distMatrix->getNNodes() << " smallDist = " << distMatrix->getSmallDist() << endl; }
334                 
335                                 cluster->update(cutoff);
336                                 
337                                 if (m->control_pressed) { 
338                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
339                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
340                                         outputTypes.clear();
341                                         return 0; 
342                                 }
343                                 
344                                 float dist = distMatrix->getSmallDist();
345                                 float rndDist;
346                                 if (hard) {
347                                         rndDist = m->ceilDist(dist, precision); 
348                                 }else{
349                                         rndDist = m->roundDist(dist, precision); 
350                                 }
351                                 
352                                 if(previousDist <= 0.0000 && dist != previousDist){
353                                         oldList.setLabel("unique");
354                                         printData(&oldList);
355                                 }
356                                 else if(rndDist != rndPreviousDist){
357                                         if (merge) {
358                                                 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
359                                                 
360                                                 if (m->control_pressed) { 
361                                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
362                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
363                                                         outputTypes.clear();
364                                                         return 0; 
365                                                 }
366                                                 
367                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
368                                                 printData(temp);
369                                                 delete temp;
370                                         }else{
371                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
372                                                 printData(&oldList);
373                                         }
374                                 }
375         
376                                 previousDist = dist;
377                                 rndPreviousDist = rndDist;
378                                 oldList = *list;
379                                 Seq2Bin = cluster->getSeqtoBin();
380                                 oldSeq2Bin = Seq2Bin;
381                         }
382                         
383                         if(previousDist <= 0.0000){
384                                 oldList.setLabel("unique");
385                                 printData(&oldList);
386                         }
387                         else if(rndPreviousDist<cutoff){
388                                 if (merge) {
389                                         ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
390                                         
391                                         if (m->control_pressed) { 
392                                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
393                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
394                                                         outputTypes.clear();
395                                                         return 0; 
396                                         }
397                                         
398                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
399                                         printData(temp);
400                                         delete temp;
401                                 }else{
402                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
403                                         printData(&oldList);
404                                 }
405                         }
406                         
407                         //free memory
408                         overlapMatrix.clear();
409                         delete distMatrix;
410                         delete cluster;
411                         
412                 }else { //use hcluster to cluster
413                         //get distmatrix and overlap
414                         overlapFile = read->getOverlapFile();
415                         distFile = read->getDistFile(); 
416                         delete read;
417                 
418                         //sort the distance and overlap files
419                         sortHclusterFiles(distFile, overlapFile);
420                         
421                         if (m->control_pressed) { 
422                                 delete nameMap;  delete list; delete rabund; 
423                                 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
424                                 outputTypes.clear();
425                                 return 0; 
426                         }
427                 
428                         //create cluster
429                         hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
430                         hcluster->setMapWanted(true);
431                         Seq2Bin = cluster->getSeqtoBin();
432                         oldSeq2Bin = Seq2Bin;
433                         
434                         vector<seqDist> seqs; seqs.resize(1); // to start loop
435                         //ifstream inHcluster;
436                         //m->openInputFile(distFile, inHcluster);
437                         
438                         if (m->control_pressed) { 
439                                 delete nameMap;  delete list; delete rabund; delete hcluster;
440                                 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
441                                 outputTypes.clear();
442                                 return 0; 
443                         }
444
445                         while (seqs.size() != 0){
446                 
447                                 seqs = hcluster->getSeqs();
448                                 
449                                 //to account for cutoff change in average neighbor
450                                 if (seqs.size() != 0) {
451                                         if (seqs[0].dist > cutoff) { break; }
452                                 }
453                                 
454                                 if (m->control_pressed) { 
455                                         delete nameMap;  delete list; delete rabund; delete hcluster;
456                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
457                                         m->mothurRemove(distFile);
458                                         m->mothurRemove(overlapFile);
459                                         outputTypes.clear();
460                                         return 0; 
461                                 }
462                                 
463                                 for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
464                                         
465                                         if (seqs[i].seq1 != seqs[i].seq2) {
466                 
467                                                 cutoff = hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
468                                                 
469                                                 if (m->control_pressed) { 
470                                                         delete nameMap;  delete list; delete rabund; delete hcluster;
471                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
472                                                         m->mothurRemove(distFile);
473                                                         m->mothurRemove(overlapFile);
474                                                         outputTypes.clear();
475                                                         return 0; 
476                                                 }
477         
478                                                 float rndDist;
479                                                 if (hard) {
480                                                         rndDist = m->ceilDist(seqs[i].dist, precision); 
481                                                 }else{
482                                                         rndDist = m->roundDist(seqs[i].dist, precision); 
483                                                 }
484                                                                                                 
485                                                 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
486                                                         oldList.setLabel("unique");
487                                                         printData(&oldList);
488                                                 }
489                                                 else if((rndDist != rndPreviousDist)){
490                                                         if (merge) {
491                                                                 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
492                                                                 
493                                                                 if (m->control_pressed) { 
494                                                                         delete nameMap;  delete list; delete rabund; delete hcluster; delete temp;
495                                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
496                                                                         m->mothurRemove(distFile);
497                                                                         m->mothurRemove(overlapFile);
498                                                                         outputTypes.clear();
499                                                                         return 0; 
500                                                                 }
501
502                                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
503                                                                 printData(temp);
504                                                                 delete temp;
505                                                         }else{
506                                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
507                                                                 printData(&oldList);
508                                                         }
509                                                 }
510                                                 
511                                                 previousDist = seqs[i].dist;
512                                                 rndPreviousDist = rndDist;
513                                                 oldList = *list;
514                                                 Seq2Bin = cluster->getSeqtoBin();
515                                                 oldSeq2Bin = Seq2Bin;
516                                         }
517                                 }
518                         }
519                         //inHcluster.close();
520                         
521                         if(previousDist <= 0.0000){
522                                 oldList.setLabel("unique");
523                                 printData(&oldList);
524                         }
525                         else if(rndPreviousDist<cutoff){
526                                 if (merge) {
527                                         ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
528                                         
529                                         if (m->control_pressed) { 
530                                                         delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
531                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
532                                                         m->mothurRemove(distFile);
533                                                         m->mothurRemove(overlapFile);
534                                                         outputTypes.clear();
535                                                         return 0; 
536                                         }
537                                         
538                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
539                                         printData(temp);
540                                         delete temp;
541                                 }else{
542                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
543                                         printData(&oldList);
544                                 }
545                         }
546                         
547                         delete hcluster;
548                         m->mothurRemove(distFile);
549                         m->mothurRemove(overlapFile);
550                 }
551                 
552                 delete list;
553                 delete rabund;
554                 listFile.close();
555         if (countfile == "") {
556             sabundFile.close();
557             rabundFile.close();
558         }
559                 if (m->control_pressed) { 
560                         delete nameMap; 
561                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
562                         outputTypes.clear();
563                         return 0; 
564                 }
565                 
566                 m->mothurOutEndLine();
567                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
568                 m->mothurOut(listFileName); m->mothurOutEndLine();      outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
569                 if (countfile == "") {
570             m->mothurOut(rabundFileName); m->mothurOutEndLine();        outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
571             m->mothurOut(sabundFileName); m->mothurOutEndLine();        outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
572         }
573                 m->mothurOutEndLine();
574                 
575                 if (saveCutoff != cutoff) { 
576                         if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
577                         else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
578                         
579                         m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); 
580                 }
581                 
582                 //set list file as new current listfile
583                 string current = "";
584                 itTypes = outputTypes.find("list");
585                 if (itTypes != outputTypes.end()) {
586                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
587                 }
588                 
589                 //set rabund file as new current rabundfile
590                 itTypes = outputTypes.find("rabund");
591                 if (itTypes != outputTypes.end()) {
592                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
593                 }
594                 
595                 //set sabund file as new current sabundfile
596                 itTypes = outputTypes.find("sabund");
597                 if (itTypes != outputTypes.end()) {
598                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
599                 }
600                 
601                 
602                 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
603                         
604                 return 0;
605         }
606         catch(exception& e) {
607                 m->errorOut(e, "MGClusterCommand", "execute");
608                 exit(1);
609         }
610 }
611 //**********************************************************************************************************************
612 void MGClusterCommand::printData(ListVector* mergedList){
613         try {
614                 mergedList->print(listFile);
615         SAbundVector sabund = mergedList->getSAbundVector();
616         
617         if (countfile == "") {
618             mergedList->getRAbundVector().print(rabundFile);
619             sabund.print(sabundFile);
620         }
621
622                 sabund.print(cout);
623         }
624         catch(exception& e) {
625                 m->errorOut(e, "MGClusterCommand", "printData");
626                 exit(1);
627         }
628 }
629 //**********************************************************************************************************************
630 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
631 //that are used to cluster by distance.  this is done so that the overlapping data does not have more influenece than the distance data.
632 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
633         try {
634                 //create new listvector so you don't overwrite the clustering
635                 ListVector* newList = new ListVector(oldList);
636
637                 bool done = false;
638                 ifstream inOverlap;
639                 int count = 0;
640                 
641                 if (hclusterWanted) {  
642                         m->openInputFile(overlapFile, inOverlap);  
643                         if (inOverlap.eof()) {  done = true;  }
644                 }else { if (overlapMatrix.size() == 0)  {  done = true;  } } 
645                 
646                 while (!done) {
647                         if (m->control_pressed) { 
648                                 if (hclusterWanted) {   inOverlap.close();  }           
649                                 return newList;
650                         }
651                         
652                         //get next overlap
653                         seqDist overlapNode;
654                         if (!hclusterWanted) {  
655                                 if (count < overlapMatrix.size()) { //do we have another node in the matrix
656                                         overlapNode = overlapMatrix[count];
657                                         count++;
658                                 }else { break; }
659                         }else { 
660                                 if (!inOverlap.eof()) {
661                                         string firstName, secondName;
662                                         float overlapDistance;
663                                         inOverlap >> firstName >> secondName >> overlapDistance; m->gobble(inOverlap);
664                                         
665                                         //commented out because we check this in readblast already
666                                         //map<string,int>::iterator itA = nameMap->find(firstName);
667                                         //map<string,int>::iterator itB = nameMap->find(secondName);
668                                         //if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
669                                         //if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
670                                         
671                                         //overlapNode.seq1 = itA->second;
672                                         //overlapNode.seq2 = itB->second;
673                                         overlapNode.seq1 = nameMap->get(firstName);
674                                         overlapNode.seq2 = nameMap->get(secondName);
675                                         overlapNode.dist = overlapDistance;
676                                 }else { inOverlap.close(); break; }
677                         } 
678                 
679                         if (overlapNode.dist < dist) {
680                                 //get names of seqs that overlap
681                                 string name1 = nameMap->get(overlapNode.seq1);
682                                 string name2 = nameMap->get(overlapNode.seq2);
683                         
684                                 //use binInfo to find out if they are already in the same bin
685                                 //map<string, int>::iterator itBin1 = binInfo.find(name1);
686                                 //map<string, int>::iterator itBin2 = binInfo.find(name2);
687                                 
688                                 //if(itBin1 == binInfo.end()){  cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1);  }
689                                 //if(itBin2 == binInfo.end()){  cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1);  }
690
691                                 //int binKeep = itBin1->second;
692                                 //int binRemove = itBin2->second;
693                                 
694                                 int binKeep = binInfo[name1];
695                                 int binRemove = binInfo[name2];
696                         
697                                 //if not merge bins and update binInfo
698                                 if(binKeep != binRemove) {
699                 
700                                         //save names in old bin
701                                         string names = newList->get(binRemove);
702                 
703                                         //merge bins into name1s bin
704                                         newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
705                                         newList->set(binRemove, "");    
706                                         
707                                         //update binInfo
708                                         while (names.find_first_of(',') != -1) { 
709                                                 //get name from bin
710                                                 string name = names.substr(0,names.find_first_of(','));
711                                                 //save name and bin number
712                                                 binInfo[name] = binKeep;
713                                                 names = names.substr(names.find_first_of(',')+1, names.length());
714                                         }
715                                         
716                                         //get last name
717                                         binInfo[names] = binKeep;
718                                 }
719                                 
720                         }else { done = true; }
721                 }
722                 
723                 //return listvector
724                 return newList;
725                                 
726         }
727         catch(exception& e) {
728                 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
729                 exit(1);
730         }
731 }
732 //**********************************************************************************************************************
733 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
734         try {
735                 //sort distFile
736                 string sortedDistFile = m->sortFile(unsortedDist, outputDir);
737                 m->mothurRemove(unsortedDist);  //delete unsorted file
738                 distFile = sortedDistFile;
739                 
740                 //sort overlap file
741                 string sortedOverlapFile = m->sortFile(unsortedOverlap, outputDir);
742                 m->mothurRemove(unsortedOverlap);  //delete unsorted file
743                 overlapFile = sortedOverlapFile;
744         }
745         catch(exception& e) {
746                 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
747                 exit(1);
748         }
749 }
750
751 //**********************************************************************************************************************
752
753 void MGClusterCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
754     try {
755         //vector<string> names = ct.getNamesOfSeqs();
756
757         //for ( int i; i < ct.getNumGroups(); i++ ) {    rav.push_back( ct.getNumSeqs(names[i]) );    }
758         //return rav;
759         
760         for(int i = 0; i < list->getNumBins(); i++) { 
761            vector<string> binNames;
762            string bin = list->get(i);
763            m->splitAtComma(bin, binNames);
764            int total = 0;
765            for (int j = 0; j < binNames.size(); j++) { 
766                total += ct->getNumSeqs(binNames[j]);
767            }
768            rabund->push_back(total);   
769        }
770         
771         
772     }
773     catch(exception& e) {
774                 m->errorOut(e, "MGClusterCommand", "createRabund");
775                 exit(1);
776         }
777     
778 }
779
780 //**********************************************************************************************************************
781
782