]> git.donarmstrong.com Git - mothur.git/blob - mgclustercommand.cpp
added sparseDistanceMatrix class. Modified cluster commands to use the new sparse...
[mothur.git] / mgclustercommand.cpp
1 /*
2  *  mgclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/11/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "mgclustercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> MGClusterCommand::setParameters(){       
14         try {
15                 CommandParameter pblast("blast", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pblast);
16                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
17         CommandParameter pcount("count", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pcount);
18                 CommandParameter plength("length", "Number", "", "5", "", "", "",false,false); parameters.push_back(plength);
19                 CommandParameter ppenalty("penalty", "Number", "", "0.10", "", "", "",false,false); parameters.push_back(ppenalty);
20                 CommandParameter pcutoff("cutoff", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pcutoff);
21                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
22                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average", "average", "", "", "",false,false); parameters.push_back(pmethod);
23                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
24                 CommandParameter pmin("min", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pmin);
25                 CommandParameter pmerge("merge", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pmerge);
26                 CommandParameter phcluster("hcluster", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(phcluster);
27                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
28                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29                 
30                 vector<string> myArray;
31                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
32                 return myArray;
33         }
34         catch(exception& e) {
35                 m->errorOut(e, "MGClusterCommand", "setParameters");
36                 exit(1);
37         }
38 }
39 //**********************************************************************************************************************
40 string MGClusterCommand::getHelpString(){       
41         try {
42                 string helpString = "";
43                 helpString += "The mgcluster command parameter options are blast, name, cutoff, precision, hard,  method, merge, min, length, penalty and hcluster. The blast parameter is required.\n";
44                 helpString += "The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n";
45                 helpString += "This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n";
46                 helpString += "The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n";
47                 helpString += "The precision parameter's default value is 100. \n";
48                 helpString += "The acceptable mgcluster methods are furthest, nearest and average.  If no method is provided then average is assumed.\n";       
49                 helpString += "The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n";
50                 helpString += "The length parameter is used to specify the minimum overlap required.  The default is 5.\n";
51                 helpString += "The penalty parameter is used to adjust the error rate.  The default is 0.10.\n";
52                 helpString += "The merge parameter allows you to shut off merging based on overlaps and just cluster.  By default merge is true, meaning you want to merge.\n";
53                 helpString += "The hcluster parameter allows you to use the hcluster algorithm when clustering.  This may be neccessary if your file is too large to fit into RAM. The default is false.\n";
54                 helpString += "The mgcluster command should be in the following format: \n";
55                 helpString += "mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n";
56                 helpString += "Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n";
57                 return helpString;
58         }
59         catch(exception& e) {
60                 m->errorOut(e, "MGClusterCommand", "getHelpString");
61                 exit(1);
62         }
63 }
64 //**********************************************************************************************************************
65 string MGClusterCommand::getOutputFileNameTag(string type, string inputName=""){        
66         try {
67         string outputFileName = "";
68                 map<string, vector<string> >::iterator it;
69         
70         //is this a type this command creates
71         it = outputTypes.find(type);
72         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
73         else {
74             if (type == "list") {  outputFileName =  "list"; }
75             else if (type == "rabund") {  outputFileName =  "rabund"; }
76             else if (type == "sabund") {  outputFileName =  "sabund"; }
77             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
78         }
79         return outputFileName;
80         }
81         catch(exception& e) {
82                 m->errorOut(e, "MGClusterCommand", "getOutputFileNameTag");
83                 exit(1);
84         }
85 }
86 //**********************************************************************************************************************
87 MGClusterCommand::MGClusterCommand(){   
88         try {
89                 abort = true; calledHelp = true; 
90                 setParameters();
91                 vector<string> tempOutNames;
92                 outputTypes["list"] = tempOutNames;
93                 outputTypes["rabund"] = tempOutNames;
94                 outputTypes["sabund"] = tempOutNames;
95         }
96         catch(exception& e) {
97                 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
98                 exit(1);
99         }
100 }
101 //**********************************************************************************************************************
102 MGClusterCommand::MGClusterCommand(string option) {
103         try {
104                 abort = false; calledHelp = false;   
105                 
106                 //allow user to run help
107                 if(option == "help") { help(); abort = true; calledHelp = true; }
108                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
109                 
110                 else {
111                         vector<string> myArray = setParameters();
112                         
113                         OptionParser parser(option);
114                         map<string, string> parameters = parser.getParameters();
115                         
116                         ValidParameters validParameter;
117                         map<string,string>::iterator it;
118                 
119                         //check to make sure all parameters are valid for command
120                         for (it = parameters.begin(); it != parameters.end(); it++) { 
121                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
122                         }
123                         
124                         //initialize outputTypes
125                         vector<string> tempOutNames;
126                         outputTypes["list"] = tempOutNames;
127                         outputTypes["rabund"] = tempOutNames;
128                         outputTypes["sabund"] = tempOutNames;
129                         
130                         //if the user changes the input directory command factory will send this info to us in the output parameter 
131                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
132                         if (inputDir == "not found"){   inputDir = "";          }
133                         else {
134                                 string path;
135                                 it = parameters.find("blast");
136                                 //user has given a template file
137                                 if(it != parameters.end()){ 
138                                         path = m->hasPath(it->second);
139                                         //if the user has not given a path then, add inputdir. else leave path alone.
140                                         if (path == "") {       parameters["blast"] = inputDir + it->second;            }
141                                 }
142                                 
143                                 it = parameters.find("name");
144                                 //user has given a template file
145                                 if(it != parameters.end()){ 
146                                         path = m->hasPath(it->second);
147                                         //if the user has not given a path then, add inputdir. else leave path alone.
148                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
149                                 }
150                         }
151
152                         
153                         //check for required parameters
154                         blastfile = validParameter.validFile(parameters, "blast", true);
155                         if (blastfile == "not open") { blastfile = ""; abort = true; }  
156                         else if (blastfile == "not found") { blastfile = ""; }
157                         
158                         //if the user changes the output directory command factory will send this info to us in the output parameter 
159                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
160                                 outputDir = ""; 
161                                 outputDir += m->hasPath(blastfile); //if user entered a file with a path then preserve it       
162                         }
163                         
164                         namefile = validParameter.validFile(parameters, "name", true);
165                         if (namefile == "not open") { abort = true; }   
166                         else if (namefile == "not found") { namefile = ""; }
167                         else { m->setNameFile(namefile); }
168             
169             countfile = validParameter.validFile(parameters, "count", true);
170                         if (countfile == "not open") { abort = true; }  
171                         else if (countfile == "not found") { countfile = ""; }
172             else { m->setCountTableFile(countfile); }
173             
174             if (countfile != "" && namefile != "") { m->mothurOut("[ERROR]: Cannot have both a name file and count file. Please use one or the other."); m->mothurOutEndLine(); abort = true; }
175                         
176                         if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
177                         
178                         //check for optional parameter and set defaults
179                         string temp;
180             temp = validParameter.validFile(parameters, "precision", false);            if (temp == "not found") { temp = "100"; }
181                         precisionLength = temp.length();
182                         m->mothurConvert(temp, precision); 
183                         
184                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "0.70"; }
185                         m->mothurConvert(temp, cutoff); 
186                         cutoff += (5 / (precision * 10.0));
187                         
188                         method = validParameter.validFile(parameters, "method", false);
189                         if (method == "not found") { method = "average"; }
190                         
191                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
192                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
193
194                         temp = validParameter.validFile(parameters, "length", false);                   if (temp == "not found") { temp = "5"; }
195                         m->mothurConvert(temp, length); 
196                         
197                         temp = validParameter.validFile(parameters, "penalty", false);                  if (temp == "not found") { temp = "0.10"; }
198                         m->mothurConvert(temp, penalty); 
199                         
200                         temp = validParameter.validFile(parameters, "min", false);                              if (temp == "not found") { temp = "true"; }
201                         minWanted = m->isTrue(temp); 
202                         
203                         temp = validParameter.validFile(parameters, "merge", false);                    if (temp == "not found") { temp = "true"; }
204                         merge = m->isTrue(temp); 
205                         
206                         temp = validParameter.validFile(parameters, "hcluster", false);                 if (temp == "not found") { temp = "false"; }
207                         hclusterWanted = m->isTrue(temp); 
208                         
209                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
210                         hard = m->isTrue(temp);            
211                 }
212
213         }
214         catch(exception& e) {
215                 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
216                 exit(1);
217         }
218 }
219 //**********************************************************************************************************************
220 int MGClusterCommand::execute(){
221         try {
222                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
223                 
224                 //read names file
225                 if (namefile != "") {
226                         nameMap = new NameAssignment(namefile);
227                         nameMap->readMap();
228                 }else{ nameMap= new NameAssignment(); }
229                 
230                 string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile));
231                 string tag = "";
232                 time_t start;
233                 float previousDist = 0.00000;
234                 float rndPreviousDist = 0.00000; 
235         
236                 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
237                 //must remember to delete those objects here since readBlast does not
238                 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
239                 read->read(nameMap);
240         
241         list = new ListVector(nameMap->getListVector());
242         RAbundVector* rabund = NULL;
243         
244         if(countfile != "") {
245             //map<string, int> nameMapCounts = m->readNames(namefile);
246             ct = new CountTable();
247             ct->readTable(countfile);
248             rabund = new RAbundVector();
249             createRabund(ct, list, rabund);
250         }else {
251             rabund = new RAbundVector(list->getRAbundVector());
252         }
253         
254                 
255                 //list = new ListVector(nameMap->getListVector());
256                 //rabund = new RAbundVector(list->getRAbundVector());
257                 
258                 if (m->control_pressed) { outputTypes.clear(); delete nameMap; delete read; delete list; delete rabund; return 0; }
259                 
260                 start = time(NULL);
261                 oldList = *list;
262                 map<string, int> Seq2Bin;
263                 map<string, int> oldSeq2Bin;
264                 
265                 if (method == "furthest")               { tag = "fn";  }
266                 else if (method == "nearest")   { tag = "nn";  }
267                 else                                                    { tag = "an";  }
268                 
269         string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
270         string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
271         string listFileName = fileroot+ tag + "." + getOutputFileNameTag("list");
272         
273         if (countfile == "") {
274             m->openOutputFile(sabundFileName,   sabundFile);
275             m->openOutputFile(rabundFileName,   rabundFile);
276         }
277                 m->openOutputFile(listFileName, listFile);
278                 
279                 if (m->control_pressed) { 
280                         delete nameMap; delete read; delete list; delete rabund; 
281                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
282                         outputTypes.clear();
283                         return 0; 
284                 }
285                 
286                 double saveCutoff = cutoff;
287                 
288                 if (!hclusterWanted) {
289                         //get distmatrix and overlap
290                         SparseDistanceMatrix* distMatrix = read->getDistMatrix();
291                         overlapMatrix = read->getOverlapMatrix(); //already sorted by read 
292                         delete read;
293                 
294                         //create cluster
295                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
296                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
297                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
298                         cluster->setMapWanted(true);
299                         Seq2Bin = cluster->getSeqtoBin();
300                         oldSeq2Bin = Seq2Bin;
301                         
302                         if (m->control_pressed) { 
303                                 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
304                                 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
305                                 outputTypes.clear();
306                                 return 0; 
307                         }
308                 
309                         //cluster using cluster classes
310                         while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
311                                 
312                                 cluster->update(cutoff);
313                                 
314                                 if (m->control_pressed) { 
315                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
316                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
317                                         outputTypes.clear();
318                                         return 0; 
319                                 }
320                                 
321                                 float dist = distMatrix->getSmallDist();
322                                 float rndDist;
323                                 if (hard) {
324                                         rndDist = m->ceilDist(dist, precision); 
325                                 }else{
326                                         rndDist = m->roundDist(dist, precision); 
327                                 }
328                                 
329                                 if(previousDist <= 0.0000 && dist != previousDist){
330                                         oldList.setLabel("unique");
331                                         printData(&oldList);
332                                 }
333                                 else if(rndDist != rndPreviousDist){
334                                         if (merge) {
335                                                 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
336                                                 
337                                                 if (m->control_pressed) { 
338                                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
339                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
340                                                         outputTypes.clear();
341                                                         return 0; 
342                                                 }
343                                                 
344                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
345                                                 printData(temp);
346                                                 delete temp;
347                                         }else{
348                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
349                                                 printData(&oldList);
350                                         }
351                                 }
352         
353                                 previousDist = dist;
354                                 rndPreviousDist = rndDist;
355                                 oldList = *list;
356                                 Seq2Bin = cluster->getSeqtoBin();
357                                 oldSeq2Bin = Seq2Bin;
358                         }
359                         
360                         if(previousDist <= 0.0000){
361                                 oldList.setLabel("unique");
362                                 printData(&oldList);
363                         }
364                         else if(rndPreviousDist<cutoff){
365                                 if (merge) {
366                                         ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
367                                         
368                                         if (m->control_pressed) { 
369                                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
370                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
371                                                         outputTypes.clear();
372                                                         return 0; 
373                                         }
374                                         
375                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
376                                         printData(temp);
377                                         delete temp;
378                                 }else{
379                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
380                                         printData(&oldList);
381                                 }
382                         }
383                         
384                         //free memory
385                         overlapMatrix.clear();
386                         delete distMatrix;
387                         delete cluster;
388                         
389                 }else { //use hcluster to cluster
390                         //get distmatrix and overlap
391                         overlapFile = read->getOverlapFile();
392                         distFile = read->getDistFile(); 
393                         delete read;
394                 
395                         //sort the distance and overlap files
396                         sortHclusterFiles(distFile, overlapFile);
397                         
398                         if (m->control_pressed) { 
399                                 delete nameMap;  delete list; delete rabund; 
400                                 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
401                                 outputTypes.clear();
402                                 return 0; 
403                         }
404                 
405                         //create cluster
406                         hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
407                         hcluster->setMapWanted(true);
408                         Seq2Bin = cluster->getSeqtoBin();
409                         oldSeq2Bin = Seq2Bin;
410                         
411                         vector<seqDist> seqs; seqs.resize(1); // to start loop
412                         //ifstream inHcluster;
413                         //m->openInputFile(distFile, inHcluster);
414                         
415                         if (m->control_pressed) { 
416                                 delete nameMap;  delete list; delete rabund; delete hcluster;
417                                 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
418                                 outputTypes.clear();
419                                 return 0; 
420                         }
421
422                         while (seqs.size() != 0){
423                 
424                                 seqs = hcluster->getSeqs();
425                                 
426                                 //to account for cutoff change in average neighbor
427                                 if (seqs.size() != 0) {
428                                         if (seqs[0].dist > cutoff) { break; }
429                                 }
430                                 
431                                 if (m->control_pressed) { 
432                                         delete nameMap;  delete list; delete rabund; delete hcluster;
433                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
434                                         m->mothurRemove(distFile);
435                                         m->mothurRemove(overlapFile);
436                                         outputTypes.clear();
437                                         return 0; 
438                                 }
439                                 
440                                 for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
441                                         
442                                         if (seqs[i].seq1 != seqs[i].seq2) {
443                 
444                                                 cutoff = hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
445                                                 
446                                                 if (m->control_pressed) { 
447                                                         delete nameMap;  delete list; delete rabund; delete hcluster;
448                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
449                                                         m->mothurRemove(distFile);
450                                                         m->mothurRemove(overlapFile);
451                                                         outputTypes.clear();
452                                                         return 0; 
453                                                 }
454         
455                                                 float rndDist;
456                                                 if (hard) {
457                                                         rndDist = m->ceilDist(seqs[i].dist, precision); 
458                                                 }else{
459                                                         rndDist = m->roundDist(seqs[i].dist, precision); 
460                                                 }
461                                                                                                 
462                                                 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
463                                                         oldList.setLabel("unique");
464                                                         printData(&oldList);
465                                                 }
466                                                 else if((rndDist != rndPreviousDist)){
467                                                         if (merge) {
468                                                                 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
469                                                                 
470                                                                 if (m->control_pressed) { 
471                                                                         delete nameMap;  delete list; delete rabund; delete hcluster; delete temp;
472                                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
473                                                                         m->mothurRemove(distFile);
474                                                                         m->mothurRemove(overlapFile);
475                                                                         outputTypes.clear();
476                                                                         return 0; 
477                                                                 }
478
479                                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
480                                                                 printData(temp);
481                                                                 delete temp;
482                                                         }else{
483                                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
484                                                                 printData(&oldList);
485                                                         }
486                                                 }
487                                                 
488                                                 previousDist = seqs[i].dist;
489                                                 rndPreviousDist = rndDist;
490                                                 oldList = *list;
491                                                 Seq2Bin = cluster->getSeqtoBin();
492                                                 oldSeq2Bin = Seq2Bin;
493                                         }
494                                 }
495                         }
496                         //inHcluster.close();
497                         
498                         if(previousDist <= 0.0000){
499                                 oldList.setLabel("unique");
500                                 printData(&oldList);
501                         }
502                         else if(rndPreviousDist<cutoff){
503                                 if (merge) {
504                                         ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
505                                         
506                                         if (m->control_pressed) { 
507                                                         delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
508                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
509                                                         m->mothurRemove(distFile);
510                                                         m->mothurRemove(overlapFile);
511                                                         outputTypes.clear();
512                                                         return 0; 
513                                         }
514                                         
515                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
516                                         printData(temp);
517                                         delete temp;
518                                 }else{
519                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
520                                         printData(&oldList);
521                                 }
522                         }
523                         
524                         delete hcluster;
525                         m->mothurRemove(distFile);
526                         m->mothurRemove(overlapFile);
527                 }
528                 
529                 delete list;
530                 delete rabund;
531                 listFile.close();
532         if (countfile == "") {
533             sabundFile.close();
534             rabundFile.close();
535         }
536                 if (m->control_pressed) { 
537                         delete nameMap; 
538                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
539                         outputTypes.clear();
540                         return 0; 
541                 }
542                 
543                 m->mothurOutEndLine();
544                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
545                 m->mothurOut(listFileName); m->mothurOutEndLine();      outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
546                 if (countfile == "") {
547             m->mothurOut(rabundFileName); m->mothurOutEndLine();        outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
548             m->mothurOut(sabundFileName); m->mothurOutEndLine();        outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
549         }
550                 m->mothurOutEndLine();
551                 
552                 if (saveCutoff != cutoff) { 
553                         if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
554                         else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
555                         
556                         m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); 
557                 }
558                 
559                 //set list file as new current listfile
560                 string current = "";
561                 itTypes = outputTypes.find("list");
562                 if (itTypes != outputTypes.end()) {
563                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
564                 }
565                 
566                 //set rabund file as new current rabundfile
567                 itTypes = outputTypes.find("rabund");
568                 if (itTypes != outputTypes.end()) {
569                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
570                 }
571                 
572                 //set sabund file as new current sabundfile
573                 itTypes = outputTypes.find("sabund");
574                 if (itTypes != outputTypes.end()) {
575                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
576                 }
577                 
578                 
579                 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
580                         
581                 return 0;
582         }
583         catch(exception& e) {
584                 m->errorOut(e, "MGClusterCommand", "execute");
585                 exit(1);
586         }
587 }
588 //**********************************************************************************************************************
589 void MGClusterCommand::printData(ListVector* mergedList){
590         try {
591                 mergedList->print(listFile);
592         SAbundVector sabund = mergedList->getSAbundVector();
593         
594         if (countfile == "") {
595             mergedList->getRAbundVector().print(rabundFile);
596             sabund.print(sabundFile);
597         }
598
599                 sabund.print(cout);
600         }
601         catch(exception& e) {
602                 m->errorOut(e, "MGClusterCommand", "printData");
603                 exit(1);
604         }
605 }
606 //**********************************************************************************************************************
607 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
608 //that are used to cluster by distance.  this is done so that the overlapping data does not have more influenece than the distance data.
609 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
610         try {
611                 //create new listvector so you don't overwrite the clustering
612                 ListVector* newList = new ListVector(oldList);
613
614                 bool done = false;
615                 ifstream inOverlap;
616                 int count = 0;
617                 
618                 if (hclusterWanted) {  
619                         m->openInputFile(overlapFile, inOverlap);  
620                         if (inOverlap.eof()) {  done = true;  }
621                 }else { if (overlapMatrix.size() == 0)  {  done = true;  } } 
622                 
623                 while (!done) {
624                         if (m->control_pressed) { 
625                                 if (hclusterWanted) {   inOverlap.close();  }           
626                                 return newList;
627                         }
628                         
629                         //get next overlap
630                         seqDist overlapNode;
631                         if (!hclusterWanted) {  
632                                 if (count < overlapMatrix.size()) { //do we have another node in the matrix
633                                         overlapNode = overlapMatrix[count];
634                                         count++;
635                                 }else { break; }
636                         }else { 
637                                 if (!inOverlap.eof()) {
638                                         string firstName, secondName;
639                                         float overlapDistance;
640                                         inOverlap >> firstName >> secondName >> overlapDistance; m->gobble(inOverlap);
641                                         
642                                         //commented out because we check this in readblast already
643                                         //map<string,int>::iterator itA = nameMap->find(firstName);
644                                         //map<string,int>::iterator itB = nameMap->find(secondName);
645                                         //if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
646                                         //if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
647                                         
648                                         //overlapNode.seq1 = itA->second;
649                                         //overlapNode.seq2 = itB->second;
650                                         overlapNode.seq1 = nameMap->get(firstName);
651                                         overlapNode.seq2 = nameMap->get(secondName);
652                                         overlapNode.dist = overlapDistance;
653                                 }else { inOverlap.close(); break; }
654                         } 
655                 
656                         if (overlapNode.dist < dist) {
657                                 //get names of seqs that overlap
658                                 string name1 = nameMap->get(overlapNode.seq1);
659                                 string name2 = nameMap->get(overlapNode.seq2);
660                         
661                                 //use binInfo to find out if they are already in the same bin
662                                 //map<string, int>::iterator itBin1 = binInfo.find(name1);
663                                 //map<string, int>::iterator itBin2 = binInfo.find(name2);
664                                 
665                                 //if(itBin1 == binInfo.end()){  cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1);  }
666                                 //if(itBin2 == binInfo.end()){  cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1);  }
667
668                                 //int binKeep = itBin1->second;
669                                 //int binRemove = itBin2->second;
670                                 
671                                 int binKeep = binInfo[name1];
672                                 int binRemove = binInfo[name2];
673                         
674                                 //if not merge bins and update binInfo
675                                 if(binKeep != binRemove) {
676                 
677                                         //save names in old bin
678                                         string names = newList->get(binRemove);
679                 
680                                         //merge bins into name1s bin
681                                         newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
682                                         newList->set(binRemove, "");    
683                                         
684                                         //update binInfo
685                                         while (names.find_first_of(',') != -1) { 
686                                                 //get name from bin
687                                                 string name = names.substr(0,names.find_first_of(','));
688                                                 //save name and bin number
689                                                 binInfo[name] = binKeep;
690                                                 names = names.substr(names.find_first_of(',')+1, names.length());
691                                         }
692                                         
693                                         //get last name
694                                         binInfo[names] = binKeep;
695                                 }
696                                 
697                         }else { done = true; }
698                 }
699                 
700                 //return listvector
701                 return newList;
702                                 
703         }
704         catch(exception& e) {
705                 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
706                 exit(1);
707         }
708 }
709 //**********************************************************************************************************************
710 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
711         try {
712                 //sort distFile
713                 string sortedDistFile = m->sortFile(unsortedDist, outputDir);
714                 m->mothurRemove(unsortedDist);  //delete unsorted file
715                 distFile = sortedDistFile;
716                 
717                 //sort overlap file
718                 string sortedOverlapFile = m->sortFile(unsortedOverlap, outputDir);
719                 m->mothurRemove(unsortedOverlap);  //delete unsorted file
720                 overlapFile = sortedOverlapFile;
721         }
722         catch(exception& e) {
723                 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
724                 exit(1);
725         }
726 }
727
728 //**********************************************************************************************************************
729
730 void MGClusterCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
731     try {
732         //vector<string> names = ct.getNamesOfSeqs();
733
734         //for ( int i; i < ct.getNumGroups(); i++ ) {    rav.push_back( ct.getNumSeqs(names[i]) );    }
735         //return rav;
736         
737         for(int i = 0; i < list->getNumBins(); i++) { 
738            vector<string> binNames;
739            string bin = list->get(i);
740            m->splitAtComma(bin, binNames);
741            int total = 0;
742            for (int j = 0; j < binNames.size(); j++) { 
743                total += ct->getNumSeqs(binNames[j]);
744            }
745            rabund->push_back(total);   
746        }
747         
748         
749     }
750     catch(exception& e) {
751                 m->errorOut(e, "MGClusterCommand", "createRabund");
752                 exit(1);
753         }
754     
755 }
756
757 //**********************************************************************************************************************
758
759