]> git.donarmstrong.com Git - mothur.git/blob - mgclustercommand.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / mgclustercommand.cpp
1 /*
2  *  mgclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/11/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "mgclustercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> MGClusterCommand::setParameters(){       
14         try {
15                 CommandParameter pblast("blast", "InputTypes", "", "", "none", "none", "none","list",false,true,true); parameters.push_back(pblast);
16                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName","rabund-sabund",false,false,true); parameters.push_back(pname);
17                 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pcount);
18                 CommandParameter plength("length", "Number", "", "5", "", "", "","",false,false); parameters.push_back(plength);
19                 CommandParameter ppenalty("penalty", "Number", "", "0.10", "", "", "","",false,false); parameters.push_back(ppenalty);
20                 CommandParameter pcutoff("cutoff", "Number", "", "0.70", "", "", "","",false,false,true); parameters.push_back(pcutoff);
21                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
22                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average", "average", "", "", "","",false,false); parameters.push_back(pmethod);
23                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
24                 CommandParameter pmin("min", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pmin);
25                 CommandParameter pmerge("merge", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pmerge);
26                 CommandParameter phcluster("hcluster", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(phcluster);
27                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
28                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
29                 
30                 vector<string> myArray;
31                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
32                 return myArray;
33         }
34         catch(exception& e) {
35                 m->errorOut(e, "MGClusterCommand", "setParameters");
36                 exit(1);
37         }
38 }
39 //**********************************************************************************************************************
40 string MGClusterCommand::getHelpString(){       
41         try {
42                 string helpString = "";
43                 helpString += "The mgcluster command parameter options are blast, name, cutoff, precision, hard,  method, merge, min, length, penalty and hcluster. The blast parameter is required.\n";
44                 helpString += "The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n";
45                 helpString += "This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n";
46                 helpString += "The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n";
47                 helpString += "The precision parameter's default value is 100. \n";
48                 helpString += "The acceptable mgcluster methods are furthest, nearest and average.  If no method is provided then average is assumed.\n";       
49                 helpString += "The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n";
50                 helpString += "The length parameter is used to specify the minimum overlap required.  The default is 5.\n";
51                 helpString += "The penalty parameter is used to adjust the error rate.  The default is 0.10.\n";
52                 helpString += "The merge parameter allows you to shut off merging based on overlaps and just cluster.  By default merge is true, meaning you want to merge.\n";
53                 helpString += "The hcluster parameter allows you to use the hcluster algorithm when clustering.  This may be neccessary if your file is too large to fit into RAM. The default is false.\n";
54                 helpString += "The mgcluster command should be in the following format: \n";
55                 helpString += "mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n";
56                 helpString += "Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n";
57                 return helpString;
58         }
59         catch(exception& e) {
60                 m->errorOut(e, "MGClusterCommand", "getHelpString");
61                 exit(1);
62         }
63 }
64 //**********************************************************************************************************************
65 string MGClusterCommand::getOutputPattern(string type) {
66     try {
67         string pattern = "";
68         
69         if (type == "list") {  pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; } 
70         else if (type == "rabund") {  pattern = "[filename],[clustertag],rabund"; } 
71         else if (type == "sabund") {  pattern = "[filename],[clustertag],sabund"; }
72         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
73         
74         return pattern;
75     }
76     catch(exception& e) {
77         m->errorOut(e, "MGClusterCommand", "getOutputPattern");
78         exit(1);
79     }
80 }
81 //*******************************************************************************************************************
82 MGClusterCommand::MGClusterCommand(){   
83         try {
84                 abort = true; calledHelp = true; 
85                 setParameters();
86                 vector<string> tempOutNames;
87                 outputTypes["list"] = tempOutNames;
88                 outputTypes["rabund"] = tempOutNames;
89                 outputTypes["sabund"] = tempOutNames;
90         }
91         catch(exception& e) {
92                 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
93                 exit(1);
94         }
95 }
96 //**********************************************************************************************************************
97 MGClusterCommand::MGClusterCommand(string option) {
98         try {
99                 abort = false; calledHelp = false;   
100                 
101                 //allow user to run help
102                 if(option == "help") { help(); abort = true; calledHelp = true; }
103                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
104                 
105                 else {
106                         vector<string> myArray = setParameters();
107                         
108                         OptionParser parser(option);
109                         map<string, string> parameters = parser.getParameters();
110                         
111                         ValidParameters validParameter;
112                         map<string,string>::iterator it;
113                 
114                         //check to make sure all parameters are valid for command
115                         for (it = parameters.begin(); it != parameters.end(); it++) { 
116                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
117                         }
118                         
119                         //initialize outputTypes
120                         vector<string> tempOutNames;
121                         outputTypes["list"] = tempOutNames;
122                         outputTypes["rabund"] = tempOutNames;
123                         outputTypes["sabund"] = tempOutNames;
124                         
125                         //if the user changes the input directory command factory will send this info to us in the output parameter 
126                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
127                         if (inputDir == "not found"){   inputDir = "";          }
128                         else {
129                                 string path;
130                                 it = parameters.find("blast");
131                                 //user has given a template file
132                                 if(it != parameters.end()){ 
133                                         path = m->hasPath(it->second);
134                                         //if the user has not given a path then, add inputdir. else leave path alone.
135                                         if (path == "") {       parameters["blast"] = inputDir + it->second;            }
136                                 }
137                                 
138                                 it = parameters.find("name");
139                                 //user has given a template file
140                                 if(it != parameters.end()){ 
141                                         path = m->hasPath(it->second);
142                                         //if the user has not given a path then, add inputdir. else leave path alone.
143                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
144                                 }
145                 
146                 it = parameters.find("count");
147                                 //user has given a template file
148                                 if(it != parameters.end()){ 
149                                         path = m->hasPath(it->second);
150                                         //if the user has not given a path then, add inputdir. else leave path alone.
151                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
152                                 }
153                         }
154
155                         
156                         //check for required parameters
157                         blastfile = validParameter.validFile(parameters, "blast", true);
158                         if (blastfile == "not open") { blastfile = ""; abort = true; }  
159                         else if (blastfile == "not found") { blastfile = ""; }
160                         
161                         //if the user changes the output directory command factory will send this info to us in the output parameter 
162                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
163                                 outputDir = ""; 
164                                 outputDir += m->hasPath(blastfile); //if user entered a file with a path then preserve it       
165                         }
166                         
167                         namefile = validParameter.validFile(parameters, "name", true);
168                         if (namefile == "not open") { abort = true; }   
169                         else if (namefile == "not found") { namefile = ""; }
170                         else { m->setNameFile(namefile); }
171             
172             countfile = validParameter.validFile(parameters, "count", true);
173                         if (countfile == "not open") { abort = true; }  
174                         else if (countfile == "not found") { countfile = ""; }
175             else { m->setCountTableFile(countfile); }
176             
177             if (countfile != "" && namefile != "") { m->mothurOut("[ERROR]: Cannot have both a name file and count file. Please use one or the other."); m->mothurOutEndLine(); abort = true; }
178                         
179                         if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
180                         
181                         //check for optional parameter and set defaults
182                         string temp;
183             temp = validParameter.validFile(parameters, "precision", false);            if (temp == "not found") { temp = "100"; }
184                         precisionLength = temp.length();
185                         m->mothurConvert(temp, precision); 
186                         
187                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "0.70"; }
188                         m->mothurConvert(temp, cutoff); 
189                         cutoff += (5 / (precision * 10.0));
190                         
191                         method = validParameter.validFile(parameters, "method", false);
192                         if (method == "not found") { method = "average"; }
193                         
194                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
195                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
196
197                         temp = validParameter.validFile(parameters, "length", false);                   if (temp == "not found") { temp = "5"; }
198                         m->mothurConvert(temp, length); 
199                         
200                         temp = validParameter.validFile(parameters, "penalty", false);                  if (temp == "not found") { temp = "0.10"; }
201                         m->mothurConvert(temp, penalty); 
202                         
203                         temp = validParameter.validFile(parameters, "min", false);                              if (temp == "not found") { temp = "true"; }
204                         minWanted = m->isTrue(temp); 
205                         
206                         temp = validParameter.validFile(parameters, "merge", false);                    if (temp == "not found") { temp = "true"; }
207                         merge = m->isTrue(temp); 
208                         
209                         temp = validParameter.validFile(parameters, "hcluster", false);                 if (temp == "not found") { temp = "false"; }
210                         hclusterWanted = m->isTrue(temp); 
211                         
212                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
213                         hard = m->isTrue(temp);            
214                 }
215
216         }
217         catch(exception& e) {
218                 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
219                 exit(1);
220         }
221 }
222 //**********************************************************************************************************************
223 int MGClusterCommand::execute(){
224         try {
225                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
226                 
227                 //read names file
228                 if (namefile != "") {
229                         nameMap = new NameAssignment(namefile);
230                         nameMap->readMap();
231                 }else if (countfile != "") {
232             ct = new CountTable();
233             ct->readTable(countfile, false);
234             nameMap= new NameAssignment();
235             vector<string> tempNames = ct->getNamesOfSeqs();
236             for (int i = 0; i < tempNames.size(); i++) {  nameMap->push_back(tempNames[i]);  }
237         }else{ nameMap= new NameAssignment(); }
238                 
239                 string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile));
240                 string tag = "";
241                 time_t start;
242                 float previousDist = 0.00000;
243                 float rndPreviousDist = 0.00000; 
244         
245                 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
246                 //must remember to delete those objects here since readBlast does not
247                 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
248                 read->read(nameMap);
249         
250         list = new ListVector(nameMap->getListVector());
251         RAbundVector* rabund = NULL;
252         
253         if(countfile != "") {
254             rabund = new RAbundVector();
255             createRabund(ct, list, rabund);
256         }else {
257             rabund = new RAbundVector(list->getRAbundVector());
258         }
259         
260                 
261                 //list = new ListVector(nameMap->getListVector());
262                 //rabund = new RAbundVector(list->getRAbundVector());
263                 
264                 if (m->control_pressed) { outputTypes.clear(); delete nameMap; delete read; delete list; delete rabund; return 0; }
265                 
266                 start = time(NULL);
267                 oldList = *list;
268                 map<string, int> Seq2Bin;
269                 map<string, int> oldSeq2Bin;
270                 
271                 if (method == "furthest")               { tag = "fn";  }
272                 else if (method == "nearest")   { tag = "nn";  }
273                 else                                                    { tag = "an";  }
274                 
275         map<string, string> variables; 
276         variables["[filename]"] = fileroot;
277         variables["[clustertag]"] = tag;
278         string sabundFileName = getOutputFileName("sabund", variables);
279         string rabundFileName = getOutputFileName("rabund", variables);
280         if (countfile != "") { variables["[tag2]"] = "unique_list"; }
281         string listFileName = getOutputFileName("list", variables);
282         
283         if (countfile == "") {
284             m->openOutputFile(sabundFileName,   sabundFile);
285             m->openOutputFile(rabundFileName,   rabundFile);
286         }
287                 m->openOutputFile(listFileName, listFile);
288                 
289                 if (m->control_pressed) { 
290                         delete nameMap; delete read; delete list; delete rabund; 
291                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
292                         outputTypes.clear();
293                         return 0; 
294                 }
295                 
296                 double saveCutoff = cutoff;
297                 
298                 if (!hclusterWanted) {
299                         //get distmatrix and overlap
300                         SparseDistanceMatrix* distMatrix = read->getDistMatrix();
301                         overlapMatrix = read->getOverlapMatrix(); //already sorted by read 
302                         delete read;
303                 
304                         //create cluster
305                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
306                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
307                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
308                         cluster->setMapWanted(true);
309                         Seq2Bin = cluster->getSeqtoBin();
310                         oldSeq2Bin = Seq2Bin;
311                         
312                         if (m->control_pressed) { 
313                                 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
314                                 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
315                                 outputTypes.clear();
316                                 return 0; 
317                         }
318             
319             
320                         //cluster using cluster classes
321                         while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
322                                 
323                 if (m->debug) {  cout << "numNodes=" << distMatrix->getNNodes() << " smallDist = " << distMatrix->getSmallDist() << endl; }
324                 
325                                 cluster->update(cutoff);
326                                 
327                                 if (m->control_pressed) { 
328                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
329                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
330                                         outputTypes.clear();
331                                         return 0; 
332                                 }
333                                 
334                                 float dist = distMatrix->getSmallDist();
335                                 float rndDist;
336                                 if (hard) {
337                                         rndDist = m->ceilDist(dist, precision); 
338                                 }else{
339                                         rndDist = m->roundDist(dist, precision); 
340                                 }
341                                 
342                                 if(previousDist <= 0.0000 && dist != previousDist){
343                                         oldList.setLabel("unique");
344                                         printData(&oldList);
345                                 }
346                                 else if(rndDist != rndPreviousDist){
347                                         if (merge) {
348                                                 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
349                                                 
350                                                 if (m->control_pressed) { 
351                                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
352                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
353                                                         outputTypes.clear();
354                                                         return 0; 
355                                                 }
356                                                 
357                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
358                                                 printData(temp);
359                                                 delete temp;
360                                         }else{
361                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
362                                                 printData(&oldList);
363                                         }
364                                 }
365         
366                                 previousDist = dist;
367                                 rndPreviousDist = rndDist;
368                                 oldList = *list;
369                                 Seq2Bin = cluster->getSeqtoBin();
370                                 oldSeq2Bin = Seq2Bin;
371                         }
372                         
373                         if(previousDist <= 0.0000){
374                                 oldList.setLabel("unique");
375                                 printData(&oldList);
376                         }
377                         else if(rndPreviousDist<cutoff){
378                                 if (merge) {
379                                         ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
380                                         
381                                         if (m->control_pressed) { 
382                                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
383                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
384                                                         outputTypes.clear();
385                                                         return 0; 
386                                         }
387                                         
388                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
389                                         printData(temp);
390                                         delete temp;
391                                 }else{
392                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
393                                         printData(&oldList);
394                                 }
395                         }
396                         
397                         //free memory
398                         overlapMatrix.clear();
399                         delete distMatrix;
400                         delete cluster;
401                         
402                 }else { //use hcluster to cluster
403                         //get distmatrix and overlap
404                         overlapFile = read->getOverlapFile();
405                         distFile = read->getDistFile(); 
406                         delete read;
407                 
408                         //sort the distance and overlap files
409                         sortHclusterFiles(distFile, overlapFile);
410                         
411                         if (m->control_pressed) { 
412                                 delete nameMap;  delete list; delete rabund; 
413                                 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
414                                 outputTypes.clear();
415                                 return 0; 
416                         }
417                 
418                         //create cluster
419                         hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
420                         hcluster->setMapWanted(true);
421                         Seq2Bin = cluster->getSeqtoBin();
422                         oldSeq2Bin = Seq2Bin;
423                         
424                         vector<seqDist> seqs; seqs.resize(1); // to start loop
425                         //ifstream inHcluster;
426                         //m->openInputFile(distFile, inHcluster);
427                         
428                         if (m->control_pressed) { 
429                                 delete nameMap;  delete list; delete rabund; delete hcluster;
430                                 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
431                                 outputTypes.clear();
432                                 return 0; 
433                         }
434
435                         while (seqs.size() != 0){
436                 
437                                 seqs = hcluster->getSeqs();
438                                 
439                                 //to account for cutoff change in average neighbor
440                                 if (seqs.size() != 0) {
441                                         if (seqs[0].dist > cutoff) { break; }
442                                 }
443                                 
444                                 if (m->control_pressed) { 
445                                         delete nameMap;  delete list; delete rabund; delete hcluster;
446                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
447                                         m->mothurRemove(distFile);
448                                         m->mothurRemove(overlapFile);
449                                         outputTypes.clear();
450                                         return 0; 
451                                 }
452                                 
453                                 for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
454                                         
455                                         if (seqs[i].seq1 != seqs[i].seq2) {
456                 
457                                                 cutoff = hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
458                                                 
459                                                 if (m->control_pressed) { 
460                                                         delete nameMap;  delete list; delete rabund; delete hcluster;
461                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
462                                                         m->mothurRemove(distFile);
463                                                         m->mothurRemove(overlapFile);
464                                                         outputTypes.clear();
465                                                         return 0; 
466                                                 }
467         
468                                                 float rndDist;
469                                                 if (hard) {
470                                                         rndDist = m->ceilDist(seqs[i].dist, precision); 
471                                                 }else{
472                                                         rndDist = m->roundDist(seqs[i].dist, precision); 
473                                                 }
474                                                                                                 
475                                                 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
476                                                         oldList.setLabel("unique");
477                                                         printData(&oldList);
478                                                 }
479                                                 else if((rndDist != rndPreviousDist)){
480                                                         if (merge) {
481                                                                 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
482                                                                 
483                                                                 if (m->control_pressed) { 
484                                                                         delete nameMap;  delete list; delete rabund; delete hcluster; delete temp;
485                                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
486                                                                         m->mothurRemove(distFile);
487                                                                         m->mothurRemove(overlapFile);
488                                                                         outputTypes.clear();
489                                                                         return 0; 
490                                                                 }
491
492                                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
493                                                                 printData(temp);
494                                                                 delete temp;
495                                                         }else{
496                                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
497                                                                 printData(&oldList);
498                                                         }
499                                                 }
500                                                 
501                                                 previousDist = seqs[i].dist;
502                                                 rndPreviousDist = rndDist;
503                                                 oldList = *list;
504                                                 Seq2Bin = cluster->getSeqtoBin();
505                                                 oldSeq2Bin = Seq2Bin;
506                                         }
507                                 }
508                         }
509                         //inHcluster.close();
510                         
511                         if(previousDist <= 0.0000){
512                                 oldList.setLabel("unique");
513                                 printData(&oldList);
514                         }
515                         else if(rndPreviousDist<cutoff){
516                                 if (merge) {
517                                         ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
518                                         
519                                         if (m->control_pressed) { 
520                                                         delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
521                                                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
522                                                         m->mothurRemove(distFile);
523                                                         m->mothurRemove(overlapFile);
524                                                         outputTypes.clear();
525                                                         return 0; 
526                                         }
527                                         
528                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
529                                         printData(temp);
530                                         delete temp;
531                                 }else{
532                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
533                                         printData(&oldList);
534                                 }
535                         }
536                         
537                         delete hcluster;
538                         m->mothurRemove(distFile);
539                         m->mothurRemove(overlapFile);
540                 }
541                 
542                 delete list;
543                 delete rabund;
544                 listFile.close();
545         if (countfile == "") {
546             sabundFile.close();
547             rabundFile.close();
548         }
549                 if (m->control_pressed) { 
550                         delete nameMap; 
551                         listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close();  m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
552                         outputTypes.clear();
553                         return 0; 
554                 }
555                 
556                 m->mothurOutEndLine();
557                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
558                 m->mothurOut(listFileName); m->mothurOutEndLine();      outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
559                 if (countfile == "") {
560             m->mothurOut(rabundFileName); m->mothurOutEndLine();        outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
561             m->mothurOut(sabundFileName); m->mothurOutEndLine();        outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
562         }
563                 m->mothurOutEndLine();
564                 
565                 if (saveCutoff != cutoff) { 
566                         if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
567                         else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
568                         
569                         m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); 
570                 }
571                 
572                 //set list file as new current listfile
573                 string current = "";
574                 itTypes = outputTypes.find("list");
575                 if (itTypes != outputTypes.end()) {
576                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
577                 }
578                 
579                 //set rabund file as new current rabundfile
580                 itTypes = outputTypes.find("rabund");
581                 if (itTypes != outputTypes.end()) {
582                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
583                 }
584                 
585                 //set sabund file as new current sabundfile
586                 itTypes = outputTypes.find("sabund");
587                 if (itTypes != outputTypes.end()) {
588                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
589                 }
590                 
591                 
592                 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
593                         
594                 return 0;
595         }
596         catch(exception& e) {
597                 m->errorOut(e, "MGClusterCommand", "execute");
598                 exit(1);
599         }
600 }
601 //**********************************************************************************************************************
602 void MGClusterCommand::printData(ListVector* mergedList){
603         try {
604                 mergedList->print(listFile);
605         SAbundVector sabund = mergedList->getSAbundVector();
606         
607         if (countfile == "") {
608             mergedList->getRAbundVector().print(rabundFile);
609             sabund.print(sabundFile);
610         }
611
612                 sabund.print(cout);
613         }
614         catch(exception& e) {
615                 m->errorOut(e, "MGClusterCommand", "printData");
616                 exit(1);
617         }
618 }
619 //**********************************************************************************************************************
620 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
621 //that are used to cluster by distance.  this is done so that the overlapping data does not have more influenece than the distance data.
622 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
623         try {
624                 //create new listvector so you don't overwrite the clustering
625                 ListVector* newList = new ListVector(oldList);
626
627                 bool done = false;
628                 ifstream inOverlap;
629                 int count = 0;
630                 
631                 if (hclusterWanted) {  
632                         m->openInputFile(overlapFile, inOverlap);  
633                         if (inOverlap.eof()) {  done = true;  }
634                 }else { if (overlapMatrix.size() == 0)  {  done = true;  } } 
635                 
636                 while (!done) {
637                         if (m->control_pressed) { 
638                                 if (hclusterWanted) {   inOverlap.close();  }           
639                                 return newList;
640                         }
641                         
642                         //get next overlap
643                         seqDist overlapNode;
644                         if (!hclusterWanted) {  
645                                 if (count < overlapMatrix.size()) { //do we have another node in the matrix
646                                         overlapNode = overlapMatrix[count];
647                                         count++;
648                                 }else { break; }
649                         }else { 
650                                 if (!inOverlap.eof()) {
651                                         string firstName, secondName;
652                                         float overlapDistance;
653                                         inOverlap >> firstName >> secondName >> overlapDistance; m->gobble(inOverlap);
654                                         
655                                         //commented out because we check this in readblast already
656                                         //map<string,int>::iterator itA = nameMap->find(firstName);
657                                         //map<string,int>::iterator itB = nameMap->find(secondName);
658                                         //if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
659                                         //if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
660                                         
661                                         //overlapNode.seq1 = itA->second;
662                                         //overlapNode.seq2 = itB->second;
663                                         overlapNode.seq1 = nameMap->get(firstName);
664                                         overlapNode.seq2 = nameMap->get(secondName);
665                                         overlapNode.dist = overlapDistance;
666                                 }else { inOverlap.close(); break; }
667                         } 
668                 
669                         if (overlapNode.dist < dist) {
670                                 //get names of seqs that overlap
671                                 string name1 = nameMap->get(overlapNode.seq1);
672                                 string name2 = nameMap->get(overlapNode.seq2);
673                         
674                                 //use binInfo to find out if they are already in the same bin
675                                 //map<string, int>::iterator itBin1 = binInfo.find(name1);
676                                 //map<string, int>::iterator itBin2 = binInfo.find(name2);
677                                 
678                                 //if(itBin1 == binInfo.end()){  cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1);  }
679                                 //if(itBin2 == binInfo.end()){  cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1);  }
680
681                                 //int binKeep = itBin1->second;
682                                 //int binRemove = itBin2->second;
683                                 
684                                 int binKeep = binInfo[name1];
685                                 int binRemove = binInfo[name2];
686                         
687                                 //if not merge bins and update binInfo
688                                 if(binKeep != binRemove) {
689                 
690                                         //save names in old bin
691                                         string names = newList->get(binRemove);
692                 
693                                         //merge bins into name1s bin
694                                         newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
695                                         newList->set(binRemove, "");    
696                                         
697                                         //update binInfo
698                                         while (names.find_first_of(',') != -1) { 
699                                                 //get name from bin
700                                                 string name = names.substr(0,names.find_first_of(','));
701                                                 //save name and bin number
702                                                 binInfo[name] = binKeep;
703                                                 names = names.substr(names.find_first_of(',')+1, names.length());
704                                         }
705                                         
706                                         //get last name
707                                         binInfo[names] = binKeep;
708                                 }
709                                 
710                         }else { done = true; }
711                 }
712                 
713                 //return listvector
714                 return newList;
715                                 
716         }
717         catch(exception& e) {
718                 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
719                 exit(1);
720         }
721 }
722 //**********************************************************************************************************************
723 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
724         try {
725                 //sort distFile
726                 string sortedDistFile = m->sortFile(unsortedDist, outputDir);
727                 m->mothurRemove(unsortedDist);  //delete unsorted file
728                 distFile = sortedDistFile;
729                 
730                 //sort overlap file
731                 string sortedOverlapFile = m->sortFile(unsortedOverlap, outputDir);
732                 m->mothurRemove(unsortedOverlap);  //delete unsorted file
733                 overlapFile = sortedOverlapFile;
734         }
735         catch(exception& e) {
736                 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
737                 exit(1);
738         }
739 }
740
741 //**********************************************************************************************************************
742
743 void MGClusterCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
744     try {
745         //vector<string> names = ct.getNamesOfSeqs();
746
747         //for ( int i; i < ct.getNumGroups(); i++ ) {    rav.push_back( ct.getNumSeqs(names[i]) );    }
748         //return rav;
749         
750         for(int i = 0; i < list->getNumBins(); i++) { 
751            vector<string> binNames;
752            string bin = list->get(i);
753            m->splitAtComma(bin, binNames);
754            int total = 0;
755            for (int j = 0; j < binNames.size(); j++) { 
756                total += ct->getNumSeqs(binNames[j]);
757            }
758            rabund->push_back(total);   
759        }
760         
761         
762     }
763     catch(exception& e) {
764                 m->errorOut(e, "MGClusterCommand", "createRabund");
765                 exit(1);
766         }
767     
768 }
769
770 //**********************************************************************************************************************
771
772