]> git.donarmstrong.com Git - mothur.git/blob - mgclustercommand.cpp
added cutoff change to mgcluster
[mothur.git] / mgclustercommand.cpp
1 /*
2  *  mgclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/11/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "mgclustercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> MGClusterCommand::setParameters(){       
14         try {
15                 CommandParameter pblast("blast", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pblast);
16                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
17                 CommandParameter plength("length", "Number", "", "5", "", "", "",false,false); parameters.push_back(plength);
18                 CommandParameter ppenalty("penalty", "Number", "", "0.10", "", "", "",false,false); parameters.push_back(ppenalty);
19                 CommandParameter pcutoff("cutoff", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pcutoff);
20                 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
21                 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average", "average", "", "", "",false,false); parameters.push_back(pmethod);
22                 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
23                 CommandParameter pmin("min", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pmin);
24                 CommandParameter pmerge("merge", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pmerge);
25                 CommandParameter phcluster("hcluster", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(phcluster);
26                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
27                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
28                 
29                 vector<string> myArray;
30                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
31                 return myArray;
32         }
33         catch(exception& e) {
34                 m->errorOut(e, "MGClusterCommand", "setParameters");
35                 exit(1);
36         }
37 }
38 //**********************************************************************************************************************
39 string MGClusterCommand::getHelpString(){       
40         try {
41                 string helpString = "";
42                 helpString += "The mgcluster command parameter options are blast, name, cutoff, precision, hard,  method, merge, min, length, penalty and hcluster. The blast parameter is required.\n";
43                 helpString += "The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n";
44                 helpString += "This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n";
45                 helpString += "The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n";
46                 helpString += "The precision parameter's default value is 100. \n";
47                 helpString += "The acceptable mgcluster methods are furthest, nearest and average.  If no method is provided then average is assumed.\n";       
48                 helpString += "The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n";
49                 helpString += "The length parameter is used to specify the minimum overlap required.  The default is 5.\n";
50                 helpString += "The penalty parameter is used to adjust the error rate.  The default is 0.10.\n";
51                 helpString += "The merge parameter allows you to shut off merging based on overlaps and just cluster.  By default merge is true, meaning you want to merge.\n";
52                 helpString += "The hcluster parameter allows you to use the hcluster algorithm when clustering.  This may be neccessary if your file is too large to fit into RAM. The default is false.\n";
53                 helpString += "The mgcluster command should be in the following format: \n";
54                 helpString += "mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n";
55                 helpString += "Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n";
56                 return helpString;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "MGClusterCommand", "getHelpString");
60                 exit(1);
61         }
62 }
63 //**********************************************************************************************************************
64 MGClusterCommand::MGClusterCommand(){   
65         try {
66                 abort = true; calledHelp = true; 
67                 setParameters();
68                 vector<string> tempOutNames;
69                 outputTypes["list"] = tempOutNames;
70                 outputTypes["rabund"] = tempOutNames;
71                 outputTypes["sabund"] = tempOutNames;
72         }
73         catch(exception& e) {
74                 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
75                 exit(1);
76         }
77 }
78 //**********************************************************************************************************************
79 MGClusterCommand::MGClusterCommand(string option) {
80         try {
81                 abort = false; calledHelp = false;   
82                 
83                 //allow user to run help
84                 if(option == "help") { help(); abort = true; calledHelp = true; }
85                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
86                 
87                 else {
88                         vector<string> myArray = setParameters();
89                         
90                         OptionParser parser(option);
91                         map<string, string> parameters = parser.getParameters();
92                         
93                         ValidParameters validParameter;
94                         map<string,string>::iterator it;
95                 
96                         //check to make sure all parameters are valid for command
97                         for (it = parameters.begin(); it != parameters.end(); it++) { 
98                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
99                         }
100                         
101                         //initialize outputTypes
102                         vector<string> tempOutNames;
103                         outputTypes["list"] = tempOutNames;
104                         outputTypes["rabund"] = tempOutNames;
105                         outputTypes["sabund"] = tempOutNames;
106                         
107                         //if the user changes the input directory command factory will send this info to us in the output parameter 
108                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
109                         if (inputDir == "not found"){   inputDir = "";          }
110                         else {
111                                 string path;
112                                 it = parameters.find("blast");
113                                 //user has given a template file
114                                 if(it != parameters.end()){ 
115                                         path = m->hasPath(it->second);
116                                         //if the user has not given a path then, add inputdir. else leave path alone.
117                                         if (path == "") {       parameters["blast"] = inputDir + it->second;            }
118                                 }
119                                 
120                                 it = parameters.find("name");
121                                 //user has given a template file
122                                 if(it != parameters.end()){ 
123                                         path = m->hasPath(it->second);
124                                         //if the user has not given a path then, add inputdir. else leave path alone.
125                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
126                                 }
127                         }
128
129                         
130                         //check for required parameters
131                         blastfile = validParameter.validFile(parameters, "blast", true);
132                         if (blastfile == "not open") { blastfile = ""; abort = true; }  
133                         else if (blastfile == "not found") { blastfile = ""; }
134                         
135                         //if the user changes the output directory command factory will send this info to us in the output parameter 
136                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
137                                 outputDir = ""; 
138                                 outputDir += m->hasPath(blastfile); //if user entered a file with a path then preserve it       
139                         }
140                         
141                         namefile = validParameter.validFile(parameters, "name", true);
142                         if (namefile == "not open") { abort = true; }   
143                         else if (namefile == "not found") { namefile = ""; }
144                         else { m->setNameFile(namefile); }
145                         
146                         if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
147                         
148                         //check for optional parameter and set defaults
149                         string temp;
150                         temp = validParameter.validFile(parameters, "precision", false);                if (temp == "not found") { temp = "100"; }
151                         precisionLength = temp.length();
152                         convert(temp, precision); 
153                         
154                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "0.70"; }
155                         convert(temp, cutoff); 
156                         cutoff += (5 / (precision * 10.0));
157                         
158                         method = validParameter.validFile(parameters, "method", false);
159                         if (method == "not found") { method = "average"; }
160                         
161                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
162                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
163
164                         temp = validParameter.validFile(parameters, "length", false);                   if (temp == "not found") { temp = "5"; }
165                         convert(temp, length); 
166                         
167                         temp = validParameter.validFile(parameters, "penalty", false);                  if (temp == "not found") { temp = "0.10"; }
168                         convert(temp, penalty); 
169                         
170                         temp = validParameter.validFile(parameters, "min", false);                              if (temp == "not found") { temp = "true"; }
171                         minWanted = m->isTrue(temp); 
172                         
173                         temp = validParameter.validFile(parameters, "merge", false);                    if (temp == "not found") { temp = "true"; }
174                         merge = m->isTrue(temp); 
175                         
176                         temp = validParameter.validFile(parameters, "hcluster", false);                 if (temp == "not found") { temp = "false"; }
177                         hclusterWanted = m->isTrue(temp); 
178                         
179                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "T"; }
180                         hard = m->isTrue(temp);
181                 }
182
183         }
184         catch(exception& e) {
185                 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
186                 exit(1);
187         }
188 }
189 //**********************************************************************************************************************
190 int MGClusterCommand::execute(){
191         try {
192                 
193                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
194                 
195                 //read names file
196                 if (namefile != "") {
197                         nameMap = new NameAssignment(namefile);
198                         nameMap->readMap();
199                 }else{ nameMap= new NameAssignment(); }
200                 
201                 string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile));
202                 string tag = "";
203                 time_t start;
204                 float previousDist = 0.00000;
205                 float rndPreviousDist = 0.00000;
206                 
207                 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
208                 //must remember to delete those objects here since readBlast does not
209                 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
210                 read->read(nameMap);
211                 
212                 list = new ListVector(nameMap->getListVector());
213                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
214                 
215                 if (m->control_pressed) { outputTypes.clear(); delete nameMap; delete read; delete list; delete rabund; return 0; }
216                 
217                 start = time(NULL);
218                 oldList = *list;
219                 map<string, int> Seq2Bin;
220                 map<string, int> oldSeq2Bin;
221                 
222                 if (method == "furthest")               { tag = "fn";  }
223                 else if (method == "nearest")   { tag = "nn";  }
224                 else                                                    { tag = "an";  }
225                 
226                 //open output files
227                 m->openOutputFile(fileroot+ tag + ".list",  listFile);
228                 m->openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
229                 m->openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
230                 
231                 if (m->control_pressed) { 
232                         delete nameMap; delete read; delete list; delete rabund; 
233                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
234                         outputTypes.clear();
235                         return 0; 
236                 }
237                 
238                 double saveCutoff = cutoff;
239                 
240                 if (!hclusterWanted) {
241                         //get distmatrix and overlap
242                         SparseMatrix* distMatrix = read->getDistMatrix();
243                         overlapMatrix = read->getOverlapMatrix(); //already sorted by read 
244                         delete read;
245                 
246                         //create cluster
247                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
248                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
249                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
250                         cluster->setMapWanted(true);
251                         Seq2Bin = cluster->getSeqtoBin();
252                         oldSeq2Bin = Seq2Bin;
253                         
254                         if (m->control_pressed) { 
255                                 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
256                                 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
257                                 outputTypes.clear();
258                                 return 0; 
259                         }
260                 
261                         //cluster using cluster classes
262                         while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
263                                 
264                                 cluster->update(cutoff);
265                                 
266                                 if (m->control_pressed) { 
267                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
268                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
269                                         outputTypes.clear();
270                                         return 0; 
271                                 }
272                                 
273                                 float dist = distMatrix->getSmallDist();
274                                 float rndDist;
275                                 if (hard) {
276                                         rndDist = m->ceilDist(dist, precision); 
277                                 }else{
278                                         rndDist = m->roundDist(dist, precision); 
279                                 }
280                                 
281                                 if(previousDist <= 0.0000 && dist != previousDist){
282                                         oldList.setLabel("unique");
283                                         printData(&oldList);
284                                 }
285                                 else if(rndDist != rndPreviousDist){
286                                         if (merge) {
287                                                 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
288                                                 
289                                                 if (m->control_pressed) { 
290                                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
291                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
292                                                         outputTypes.clear();
293                                                         return 0; 
294                                                 }
295                                                 
296                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
297                                                 printData(temp);
298                                                 delete temp;
299                                         }else{
300                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
301                                                 printData(&oldList);
302                                         }
303                                 }
304         
305                                 previousDist = dist;
306                                 rndPreviousDist = rndDist;
307                                 oldList = *list;
308                                 Seq2Bin = cluster->getSeqtoBin();
309                                 oldSeq2Bin = Seq2Bin;
310                         }
311                         
312                         if(previousDist <= 0.0000){
313                                 oldList.setLabel("unique");
314                                 printData(&oldList);
315                         }
316                         else if(rndPreviousDist<cutoff){
317                                 if (merge) {
318                                         ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
319                                         
320                                         if (m->control_pressed) { 
321                                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
322                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
323                                                         outputTypes.clear();
324                                                         return 0; 
325                                         }
326                                         
327                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
328                                         printData(temp);
329                                         delete temp;
330                                 }else{
331                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
332                                         printData(&oldList);
333                                 }
334                         }
335                         
336                         //free memory
337                         overlapMatrix.clear();
338                         delete distMatrix;
339                         delete cluster;
340                         
341                 }else { //use hcluster to cluster
342                         //get distmatrix and overlap
343                         overlapFile = read->getOverlapFile();
344                         distFile = read->getDistFile(); 
345                         delete read;
346                 
347                         //sort the distance and overlap files
348                         sortHclusterFiles(distFile, overlapFile);
349                         
350                         if (m->control_pressed) { 
351                                 delete nameMap;  delete list; delete rabund; 
352                                 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
353                                 outputTypes.clear();
354                                 return 0; 
355                         }
356                 
357                         //create cluster
358                         hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
359                         hcluster->setMapWanted(true);
360                         Seq2Bin = cluster->getSeqtoBin();
361                         oldSeq2Bin = Seq2Bin;
362                         
363                         vector<seqDist> seqs; seqs.resize(1); // to start loop
364                         //ifstream inHcluster;
365                         //m->openInputFile(distFile, inHcluster);
366                         
367                         if (m->control_pressed) { 
368                                 delete nameMap;  delete list; delete rabund; delete hcluster;
369                                 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
370                                 outputTypes.clear();
371                                 return 0; 
372                         }
373
374                         while (seqs.size() != 0){
375                 
376                                 seqs = hcluster->getSeqs();
377                                 
378                                 //to account for cutoff change in average neighbor
379                                 if (seqs.size() != 0) {
380                                         if (seqs[0].dist > cutoff) { break; }
381                                 }
382                                 
383                                 if (m->control_pressed) { 
384                                         delete nameMap;  delete list; delete rabund; delete hcluster;
385                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
386                                         remove(distFile.c_str());
387                                         remove(overlapFile.c_str());
388                                         outputTypes.clear();
389                                         return 0; 
390                                 }
391                                 
392                                 for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
393                                         
394                                         if (seqs[i].seq1 != seqs[i].seq2) {
395                 
396                                                 cutoff = hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
397                                                 
398                                                 if (m->control_pressed) { 
399                                                         delete nameMap;  delete list; delete rabund; delete hcluster;
400                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
401                                                         remove(distFile.c_str());
402                                                         remove(overlapFile.c_str());
403                                                         outputTypes.clear();
404                                                         return 0; 
405                                                 }
406         
407                                                 float rndDist;
408                                                 if (hard) {
409                                                         rndDist = m->ceilDist(seqs[i].dist, precision); 
410                                                 }else{
411                                                         rndDist = m->roundDist(seqs[i].dist, precision); 
412                                                 }
413                                                                                                 
414                                                 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
415                                                         oldList.setLabel("unique");
416                                                         printData(&oldList);
417                                                 }
418                                                 else if((rndDist != rndPreviousDist)){
419                                                         if (merge) {
420                                                                 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
421                                                                 
422                                                                 if (m->control_pressed) { 
423                                                                         delete nameMap;  delete list; delete rabund; delete hcluster; delete temp;
424                                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
425                                                                         remove(distFile.c_str());
426                                                                         remove(overlapFile.c_str());
427                                                                         outputTypes.clear();
428                                                                         return 0; 
429                                                                 }
430
431                                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
432                                                                 printData(temp);
433                                                                 delete temp;
434                                                         }else{
435                                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
436                                                                 printData(&oldList);
437                                                         }
438                                                 }
439                                                 
440                                                 previousDist = seqs[i].dist;
441                                                 rndPreviousDist = rndDist;
442                                                 oldList = *list;
443                                                 Seq2Bin = cluster->getSeqtoBin();
444                                                 oldSeq2Bin = Seq2Bin;
445                                         }
446                                 }
447                         }
448                         //inHcluster.close();
449                         
450                         if(previousDist <= 0.0000){
451                                 oldList.setLabel("unique");
452                                 printData(&oldList);
453                         }
454                         else if(rndPreviousDist<cutoff){
455                                 if (merge) {
456                                         ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
457                                         
458                                         if (m->control_pressed) { 
459                                                         delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
460                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
461                                                         remove(distFile.c_str());
462                                                         remove(overlapFile.c_str());
463                                                         outputTypes.clear();
464                                                         return 0; 
465                                         }
466                                         
467                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
468                                         printData(temp);
469                                         delete temp;
470                                 }else{
471                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
472                                         printData(&oldList);
473                                 }
474                         }
475                         
476                         delete hcluster;
477                         remove(distFile.c_str());
478                         remove(overlapFile.c_str());
479                 }
480                 
481                 delete list; 
482                 delete rabund;
483                 listFile.close();
484                 sabundFile.close();
485                 rabundFile.close();
486         
487                 if (m->control_pressed) { 
488                         delete nameMap; 
489                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
490                         outputTypes.clear();
491                         return 0; 
492                 }
493                 
494                 m->mothurOutEndLine();
495                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
496                 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine();   outputNames.push_back(fileroot+ tag + ".list"); outputTypes["list"].push_back(fileroot+ tag + ".list");
497                 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
498                 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
499                 m->mothurOutEndLine();
500                 
501                 if (saveCutoff != cutoff) { 
502                         if (hard)       {  saveCutoff = m->ceilDist(saveCutoff, precision);     }
503                         else            {       saveCutoff = m->roundDist(saveCutoff, precision);  }
504                         
505                         m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine(); 
506                 }
507                 
508                 //set list file as new current listfile
509                 string current = "";
510                 itTypes = outputTypes.find("list");
511                 if (itTypes != outputTypes.end()) {
512                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
513                 }
514                 
515                 //set rabund file as new current rabundfile
516                 itTypes = outputTypes.find("rabund");
517                 if (itTypes != outputTypes.end()) {
518                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
519                 }
520                 
521                 //set sabund file as new current sabundfile
522                 itTypes = outputTypes.find("sabund");
523                 if (itTypes != outputTypes.end()) {
524                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
525                 }
526                 
527                 
528                 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
529                         
530                 return 0;
531         }
532         catch(exception& e) {
533                 m->errorOut(e, "MGClusterCommand", "execute");
534                 exit(1);
535         }
536 }
537 //**********************************************************************************************************************
538 void MGClusterCommand::printData(ListVector* mergedList){
539         try {
540                 mergedList->print(listFile);
541                 mergedList->getRAbundVector().print(rabundFile);
542                 
543                 SAbundVector sabund = mergedList->getSAbundVector();
544
545                 sabund.print(cout);
546                 sabund.print(sabundFile);
547         }
548         catch(exception& e) {
549                 m->errorOut(e, "MGClusterCommand", "printData");
550                 exit(1);
551         }
552 }
553 //**********************************************************************************************************************
554 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
555 //that are used to cluster by distance.  this is done so that the overlapping data does not have more influenece than the distance data.
556 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
557         try {
558                 //create new listvector so you don't overwrite the clustering
559                 ListVector* newList = new ListVector(oldList);
560
561                 bool done = false;
562                 ifstream inOverlap;
563                 int count = 0;
564                 
565                 if (hclusterWanted) {  
566                         m->openInputFile(overlapFile, inOverlap);  
567                         if (inOverlap.eof()) {  done = true;  }
568                 }else { if (overlapMatrix.size() == 0)  {  done = true;  } } 
569                 
570                 while (!done) {
571                         if (m->control_pressed) { 
572                                 if (hclusterWanted) {   inOverlap.close();  }           
573                                 return newList;
574                         }
575                         
576                         //get next overlap
577                         seqDist overlapNode;
578                         if (!hclusterWanted) {  
579                                 if (count < overlapMatrix.size()) { //do we have another node in the matrix
580                                         overlapNode = overlapMatrix[count];
581                                         count++;
582                                 }else { break; }
583                         }else { 
584                                 if (!inOverlap.eof()) {
585                                         string firstName, secondName;
586                                         float overlapDistance;
587                                         inOverlap >> firstName >> secondName >> overlapDistance; m->gobble(inOverlap);
588                                         
589                                         //commented out because we check this in readblast already
590                                         //map<string,int>::iterator itA = nameMap->find(firstName);
591                                         //map<string,int>::iterator itB = nameMap->find(secondName);
592                                         //if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
593                                         //if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
594                                         
595                                         //overlapNode.seq1 = itA->second;
596                                         //overlapNode.seq2 = itB->second;
597                                         overlapNode.seq1 = nameMap->get(firstName);
598                                         overlapNode.seq2 = nameMap->get(secondName);
599                                         overlapNode.dist = overlapDistance;
600                                 }else { inOverlap.close(); break; }
601                         } 
602                 
603                         if (overlapNode.dist < dist) {
604                                 //get names of seqs that overlap
605                                 string name1 = nameMap->get(overlapNode.seq1);
606                                 string name2 = nameMap->get(overlapNode.seq2);
607                         
608                                 //use binInfo to find out if they are already in the same bin
609                                 //map<string, int>::iterator itBin1 = binInfo.find(name1);
610                                 //map<string, int>::iterator itBin2 = binInfo.find(name2);
611                                 
612                                 //if(itBin1 == binInfo.end()){  cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1);  }
613                                 //if(itBin2 == binInfo.end()){  cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1);  }
614
615                                 //int binKeep = itBin1->second;
616                                 //int binRemove = itBin2->second;
617                                 
618                                 int binKeep = binInfo[name1];
619                                 int binRemove = binInfo[name2];
620                         
621                                 //if not merge bins and update binInfo
622                                 if(binKeep != binRemove) {
623                 
624                                         //save names in old bin
625                                         string names = newList->get(binRemove);
626                 
627                                         //merge bins into name1s bin
628                                         newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
629                                         newList->set(binRemove, "");    
630                                         
631                                         //update binInfo
632                                         while (names.find_first_of(',') != -1) { 
633                                                 //get name from bin
634                                                 string name = names.substr(0,names.find_first_of(','));
635                                                 //save name and bin number
636                                                 binInfo[name] = binKeep;
637                                                 names = names.substr(names.find_first_of(',')+1, names.length());
638                                         }
639                                         
640                                         //get last name
641                                         binInfo[names] = binKeep;
642                                 }
643                                 
644                         }else { done = true; }
645                 }
646                 
647                 //return listvector
648                 return newList;
649                                 
650         }
651         catch(exception& e) {
652                 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
653                 exit(1);
654         }
655 }
656 //**********************************************************************************************************************
657 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
658         try {
659                 //sort distFile
660                 string sortedDistFile = m->sortFile(unsortedDist, outputDir);
661                 remove(unsortedDist.c_str());  //delete unsorted file
662                 distFile = sortedDistFile;
663                 
664                 //sort overlap file
665                 string sortedOverlapFile = m->sortFile(unsortedOverlap, outputDir);
666                 remove(unsortedOverlap.c_str());  //delete unsorted file
667                 overlapFile = sortedOverlapFile;
668         }
669         catch(exception& e) {
670                 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
671                 exit(1);
672         }
673 }
674
675 //**********************************************************************************************************************
676
677
678
679
680