]> git.donarmstrong.com Git - mothur.git/blob - mgclustercommand.cpp
fixes while testing
[mothur.git] / mgclustercommand.cpp
1 /*
2  *  mgclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/11/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "mgclustercommand.h"
11
12
13 //**********************************************************************************************************************
14 vector<string> MGClusterCommand::getValidParameters(){  
15         try {
16                 string Array[] =  {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
17                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
18                 return myArray;
19         }
20         catch(exception& e) {
21                 m->errorOut(e, "MGClusterCommand", "getValidParameters");
22                 exit(1);
23         }
24 }
25 //**********************************************************************************************************************
26 MGClusterCommand::MGClusterCommand(){   
27         try {
28                 abort = true;
29                 //initialize outputTypes
30                 vector<string> tempOutNames;
31                 outputTypes["list"] = tempOutNames;
32                 outputTypes["rabund"] = tempOutNames;
33                 outputTypes["sabund"] = tempOutNames;
34         }
35         catch(exception& e) {
36                 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
37                 exit(1);
38         }
39 }
40 //**********************************************************************************************************************
41 vector<string> MGClusterCommand::getRequiredParameters(){       
42         try {
43                 string Array[] =  {"blast"};
44                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
45                 return myArray;
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "MGClusterCommand", "getRequiredParameters");
49                 exit(1);
50         }
51 }
52 //**********************************************************************************************************************
53 vector<string> MGClusterCommand::getRequiredFiles(){    
54         try {
55                 vector<string> myArray;
56                 return myArray;
57         }
58         catch(exception& e) {
59                 m->errorOut(e, "MGClusterCommand", "getRequiredFiles");
60                 exit(1);
61         }
62 }
63 //**********************************************************************************************************************
64 MGClusterCommand::MGClusterCommand(string option) {
65         try {
66                 globaldata = GlobalData::getInstance();
67                 abort = false;
68                 
69                 //allow user to run help
70                 if(option == "help") { help(); abort = true; }
71                 
72                 else {
73                         //valid paramters for this command
74                         string Array[] =  {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
75                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
76                         
77                         OptionParser parser(option);
78                         map<string, string> parameters = parser.getParameters();
79                         
80                         ValidParameters validParameter;
81                         map<string,string>::iterator it;
82                 
83                         //check to make sure all parameters are valid for command
84                         for (it = parameters.begin(); it != parameters.end(); it++) { 
85                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
86                         }
87                         
88                         //initialize outputTypes
89                         vector<string> tempOutNames;
90                         outputTypes["list"] = tempOutNames;
91                         outputTypes["rabund"] = tempOutNames;
92                         outputTypes["sabund"] = tempOutNames;
93                         
94                         //if the user changes the input directory command factory will send this info to us in the output parameter 
95                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
96                         if (inputDir == "not found"){   inputDir = "";          }
97                         else {
98                                 string path;
99                                 it = parameters.find("blast");
100                                 //user has given a template file
101                                 if(it != parameters.end()){ 
102                                         path = m->hasPath(it->second);
103                                         //if the user has not given a path then, add inputdir. else leave path alone.
104                                         if (path == "") {       parameters["blast"] = inputDir + it->second;            }
105                                 }
106                                 
107                                 it = parameters.find("name");
108                                 //user has given a template file
109                                 if(it != parameters.end()){ 
110                                         path = m->hasPath(it->second);
111                                         //if the user has not given a path then, add inputdir. else leave path alone.
112                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
113                                 }
114                         }
115
116                         
117                         //check for required parameters
118                         blastfile = validParameter.validFile(parameters, "blast", true);
119                         if (blastfile == "not open") { abort = true; }  
120                         else if (blastfile == "not found") { blastfile = ""; }
121                         
122                         //if the user changes the output directory command factory will send this info to us in the output parameter 
123                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
124                                 outputDir = ""; 
125                                 outputDir += m->hasPath(blastfile); //if user entered a file with a path then preserve it       
126                         }
127                         
128                         namefile = validParameter.validFile(parameters, "name", true);
129                         if (namefile == "not open") { abort = true; }   
130                         else if (namefile == "not found") { namefile = ""; }
131                         
132                         if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
133                         
134                         //check for optional parameter and set defaults
135                         string temp;
136                         temp = validParameter.validFile(parameters, "precision", false);                if (temp == "not found") { temp = "100"; }
137                         precisionLength = temp.length();
138                         convert(temp, precision); 
139                         
140                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "0.70"; }
141                         convert(temp, cutoff); 
142                         cutoff += (5 / (precision * 10.0));
143                         
144                         method = validParameter.validFile(parameters, "method", false);
145                         if (method == "not found") { method = "furthest"; }
146                         
147                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
148                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
149
150                         temp = validParameter.validFile(parameters, "length", false);                   if (temp == "not found") { temp = "5"; }
151                         convert(temp, length); 
152                         
153                         temp = validParameter.validFile(parameters, "penalty", false);                  if (temp == "not found") { temp = "0.10"; }
154                         convert(temp, penalty); 
155                         
156                         temp = validParameter.validFile(parameters, "min", false);                              if (temp == "not found") { temp = "true"; }
157                         minWanted = m->isTrue(temp); 
158                         
159                         temp = validParameter.validFile(parameters, "merge", false);                    if (temp == "not found") { temp = "true"; }
160                         merge = m->isTrue(temp); 
161                         
162                         temp = validParameter.validFile(parameters, "hcluster", false);                 if (temp == "not found") { temp = "false"; }
163                         hclusterWanted = m->isTrue(temp); 
164                         
165                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
166                         hard = m->isTrue(temp);
167                 }
168
169         }
170         catch(exception& e) {
171                 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
172                 exit(1);
173         }
174 }
175 //**********************************************************************************************************************
176
177 void MGClusterCommand::help(){
178         try {
179                 m->mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
180                 m->mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
181                 m->mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
182                 m->mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
183                 m->mothurOut("The precision parameter's default value is 100. \n");
184                 m->mothurOut("The acceptable mgcluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");    
185                 m->mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
186                 m->mothurOut("The length parameter is used to specify the minimum overlap required.  The default is 5.\n");
187                 m->mothurOut("The penalty parameter is used to adjust the error rate.  The default is 0.10.\n");
188                 m->mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster.  By default merge is true, meaning you want to merge.\n");
189                 m->mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering.  This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
190                 m->mothurOut("The mgcluster command should be in the following format: \n");
191                 m->mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
192                 m->mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
193         }
194         catch(exception& e) {
195                 m->errorOut(e, "MGClusterCommand", "help");
196                 exit(1);
197         }
198 }
199 //**********************************************************************************************************************
200 MGClusterCommand::~MGClusterCommand(){}
201 //**********************************************************************************************************************
202 int MGClusterCommand::execute(){
203         try {
204                 
205                 if (abort == true) {    return 0;       }
206                 
207                 //read names file
208                 if (namefile != "") {
209                         nameMap = new NameAssignment(namefile);
210                         nameMap->readMap();
211                 }else{ nameMap= new NameAssignment(); }
212                 
213                 string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile));
214                 string tag = "";
215                 time_t start;
216                 float previousDist = 0.00000;
217                 float rndPreviousDist = 0.00000;
218                 
219                 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
220                 //must remember to delete those objects here since readBlast does not
221                 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
222                 read->read(nameMap);
223                 
224                 list = new ListVector(nameMap->getListVector());
225                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
226                 
227                 if (m->control_pressed) { outputTypes.clear(); delete nameMap; delete read; delete list; delete rabund; return 0; }
228                 
229                 start = time(NULL);
230                 oldList = *list;
231                 map<string, int> Seq2Bin;
232                 map<string, int> oldSeq2Bin;
233                 
234                 if (method == "furthest")               { tag = "fn";  }
235                 else if (method == "nearest")   { tag = "nn";  }
236                 else                                                    { tag = "an";  }
237                 
238                 //open output files
239                 m->openOutputFile(fileroot+ tag + ".list",  listFile);
240                 m->openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
241                 m->openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
242                 
243                 if (m->control_pressed) { 
244                         delete nameMap; delete read; delete list; delete rabund; 
245                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
246                         outputTypes.clear();
247                         return 0; 
248                 }
249                 
250                 if (!hclusterWanted) {
251                         //get distmatrix and overlap
252                         SparseMatrix* distMatrix = read->getDistMatrix();
253                         overlapMatrix = read->getOverlapMatrix(); //already sorted by read 
254                         delete read;
255                 
256                         //create cluster
257                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
258                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
259                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
260                         cluster->setMapWanted(true);
261                         Seq2Bin = cluster->getSeqtoBin();
262                         oldSeq2Bin = Seq2Bin;
263                         
264                         if (m->control_pressed) { 
265                                 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
266                                 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
267                                 outputTypes.clear();
268                                 return 0; 
269                         }
270                 
271                         //cluster using cluster classes
272                         while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
273                                 
274                                 cluster->update(cutoff);
275                                 
276                                 if (m->control_pressed) { 
277                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
278                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
279                                         outputTypes.clear();
280                                         return 0; 
281                                 }
282                                 
283                                 float dist = distMatrix->getSmallDist();
284                                 float rndDist;
285                                 if (hard) {
286                                         rndDist = m->ceilDist(dist, precision); 
287                                 }else{
288                                         rndDist = m->roundDist(dist, precision); 
289                                 }
290                                 
291                                 if(previousDist <= 0.0000 && dist != previousDist){
292                                         oldList.setLabel("unique");
293                                         printData(&oldList);
294                                 }
295                                 else if(rndDist != rndPreviousDist){
296                                         if (merge) {
297                                                 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
298                                                 
299                                                 if (m->control_pressed) { 
300                                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
301                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
302                                                         outputTypes.clear();
303                                                         return 0; 
304                                                 }
305                                                 
306                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
307                                                 printData(temp);
308                                                 delete temp;
309                                         }else{
310                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
311                                                 printData(&oldList);
312                                         }
313                                 }
314         
315                                 previousDist = dist;
316                                 rndPreviousDist = rndDist;
317                                 oldList = *list;
318                                 Seq2Bin = cluster->getSeqtoBin();
319                                 oldSeq2Bin = Seq2Bin;
320                         }
321                         
322                         if(previousDist <= 0.0000){
323                                 oldList.setLabel("unique");
324                                 printData(&oldList);
325                         }
326                         else if(rndPreviousDist<cutoff){
327                                 if (merge) {
328                                         ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
329                                         
330                                         if (m->control_pressed) { 
331                                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
332                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
333                                                         outputTypes.clear();
334                                                         return 0; 
335                                         }
336                                         
337                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
338                                         printData(temp);
339                                         delete temp;
340                                 }else{
341                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
342                                         printData(&oldList);
343                                 }
344                         }
345                         
346                         //free memory
347                         overlapMatrix.clear();
348                         delete distMatrix;
349                         delete cluster;
350                         
351                 }else { //use hcluster to cluster
352                         //get distmatrix and overlap
353                         overlapFile = read->getOverlapFile();
354                         distFile = read->getDistFile(); 
355                         delete read;
356                 
357                         //sort the distance and overlap files
358                         sortHclusterFiles(distFile, overlapFile);
359                         
360                         if (m->control_pressed) { 
361                                 delete nameMap;  delete list; delete rabund; 
362                                 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
363                                 outputTypes.clear();
364                                 return 0; 
365                         }
366                 
367                         //create cluster
368                         hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
369                         hcluster->setMapWanted(true);
370                         Seq2Bin = cluster->getSeqtoBin();
371                         oldSeq2Bin = Seq2Bin;
372                         
373                         vector<seqDist> seqs; seqs.resize(1); // to start loop
374                         //ifstream inHcluster;
375                         //m->openInputFile(distFile, inHcluster);
376                         
377                         if (m->control_pressed) { 
378                                 delete nameMap;  delete list; delete rabund; delete hcluster;
379                                 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
380                                 outputTypes.clear();
381                                 return 0; 
382                         }
383
384                         while (seqs.size() != 0){
385                 
386                                 seqs = hcluster->getSeqs();
387                                 
388                                 if (m->control_pressed) { 
389                                         delete nameMap;  delete list; delete rabund; delete hcluster;
390                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
391                                         remove(distFile.c_str());
392                                         remove(overlapFile.c_str());
393                                         outputTypes.clear();
394                                         return 0; 
395                                 }
396                                 
397                                 for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
398                                         
399                                         if (seqs[i].seq1 != seqs[i].seq2) {
400                 
401                                                 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
402                                                 
403                                                 if (m->control_pressed) { 
404                                                         delete nameMap;  delete list; delete rabund; delete hcluster;
405                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
406                                                         remove(distFile.c_str());
407                                                         remove(overlapFile.c_str());
408                                                         outputTypes.clear();
409                                                         return 0; 
410                                                 }
411         
412                                                 float rndDist;
413                                                 if (hard) {
414                                                         rndDist = m->ceilDist(seqs[i].dist, precision); 
415                                                 }else{
416                                                         rndDist = m->roundDist(seqs[i].dist, precision); 
417                                                 }
418                                                                                                 
419                                                 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
420                                                         oldList.setLabel("unique");
421                                                         printData(&oldList);
422                                                 }
423                                                 else if((rndDist != rndPreviousDist)){
424                                                         if (merge) {
425                                                                 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
426                                                                 
427                                                                 if (m->control_pressed) { 
428                                                                         delete nameMap;  delete list; delete rabund; delete hcluster; delete temp;
429                                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
430                                                                         remove(distFile.c_str());
431                                                                         remove(overlapFile.c_str());
432                                                                         outputTypes.clear();
433                                                                         return 0; 
434                                                                 }
435
436                                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
437                                                                 printData(temp);
438                                                                 delete temp;
439                                                         }else{
440                                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
441                                                                 printData(&oldList);
442                                                         }
443                                                 }
444                                                 
445                                                 previousDist = seqs[i].dist;
446                                                 rndPreviousDist = rndDist;
447                                                 oldList = *list;
448                                                 Seq2Bin = cluster->getSeqtoBin();
449                                                 oldSeq2Bin = Seq2Bin;
450                                         }
451                                 }
452                         }
453                         //inHcluster.close();
454                         
455                         if(previousDist <= 0.0000){
456                                 oldList.setLabel("unique");
457                                 printData(&oldList);
458                         }
459                         else if(rndPreviousDist<cutoff){
460                                 if (merge) {
461                                         ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
462                                         
463                                         if (m->control_pressed) { 
464                                                         delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
465                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
466                                                         remove(distFile.c_str());
467                                                         remove(overlapFile.c_str());
468                                                         outputTypes.clear();
469                                                         return 0; 
470                                         }
471                                         
472                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
473                                         printData(temp);
474                                         delete temp;
475                                 }else{
476                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
477                                         printData(&oldList);
478                                 }
479                         }
480                         
481                         delete hcluster;
482                         remove(distFile.c_str());
483                         remove(overlapFile.c_str());
484                 }
485                 
486                 delete list; 
487                 delete rabund;
488                 listFile.close();
489                 sabundFile.close();
490                 rabundFile.close();
491         
492                 globaldata->setListFile(fileroot+ tag + ".list");
493                 globaldata->setFormat("list");
494                 
495                 if (m->control_pressed) { 
496                         delete nameMap; 
497                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
498                         globaldata->setListFile("");
499                         globaldata->setFormat("");
500                         outputTypes.clear();
501                         return 0; 
502                 }
503                 
504                 m->mothurOutEndLine();
505                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
506                 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine();   outputNames.push_back(fileroot+ tag + ".list"); outputTypes["list"].push_back(fileroot+ tag + ".list");
507                 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
508                 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
509                 m->mothurOutEndLine();
510                 
511                 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
512                         
513                 return 0;
514         }
515         catch(exception& e) {
516                 m->errorOut(e, "MGClusterCommand", "execute");
517                 exit(1);
518         }
519 }
520 //**********************************************************************************************************************
521 void MGClusterCommand::printData(ListVector* mergedList){
522         try {
523                 mergedList->print(listFile);
524                 mergedList->getRAbundVector().print(rabundFile);
525                 
526                 SAbundVector sabund = mergedList->getSAbundVector();
527
528                 sabund.print(cout);
529                 sabund.print(sabundFile);
530         }
531         catch(exception& e) {
532                 m->errorOut(e, "MGClusterCommand", "printData");
533                 exit(1);
534         }
535 }
536 //**********************************************************************************************************************
537 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
538 //that are used to cluster by distance.  this is done so that the overlapping data does not have more influenece than the distance data.
539 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
540         try {
541                 //create new listvector so you don't overwrite the clustering
542                 ListVector* newList = new ListVector(oldList);
543
544                 bool done = false;
545                 ifstream inOverlap;
546                 int count = 0;
547                 
548                 if (hclusterWanted) {  
549                         m->openInputFile(overlapFile, inOverlap);  
550                         if (inOverlap.eof()) {  done = true;  }
551                 }else { if (overlapMatrix.size() == 0)  {  done = true;  } } 
552                 
553                 while (!done) {
554                         if (m->control_pressed) { 
555                                 if (hclusterWanted) {   inOverlap.close();  }           
556                                 return newList;
557                         }
558                         
559                         //get next overlap
560                         seqDist overlapNode;
561                         if (!hclusterWanted) {  
562                                 if (count < overlapMatrix.size()) { //do we have another node in the matrix
563                                         overlapNode = overlapMatrix[count];
564                                         count++;
565                                 }else { break; }
566                         }else { 
567                                 if (!inOverlap.eof()) {
568                                         string firstName, secondName;
569                                         float overlapDistance;
570                                         inOverlap >> firstName >> secondName >> overlapDistance; m->gobble(inOverlap);
571                                         
572                                         //commented out because we check this in readblast already
573                                         //map<string,int>::iterator itA = nameMap->find(firstName);
574                                         //map<string,int>::iterator itB = nameMap->find(secondName);
575                                         //if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
576                                         //if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
577                                         
578                                         //overlapNode.seq1 = itA->second;
579                                         //overlapNode.seq2 = itB->second;
580                                         overlapNode.seq1 = nameMap->get(firstName);
581                                         overlapNode.seq2 = nameMap->get(secondName);
582                                         overlapNode.dist = overlapDistance;
583                                 }else { inOverlap.close(); break; }
584                         } 
585                 
586                         if (overlapNode.dist < dist) {
587                                 //get names of seqs that overlap
588                                 string name1 = nameMap->get(overlapNode.seq1);
589                                 string name2 = nameMap->get(overlapNode.seq2);
590                         
591                                 //use binInfo to find out if they are already in the same bin
592                                 //map<string, int>::iterator itBin1 = binInfo.find(name1);
593                                 //map<string, int>::iterator itBin2 = binInfo.find(name2);
594                                 
595                                 //if(itBin1 == binInfo.end()){  cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1);  }
596                                 //if(itBin2 == binInfo.end()){  cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1);  }
597
598                                 //int binKeep = itBin1->second;
599                                 //int binRemove = itBin2->second;
600                                 
601                                 int binKeep = binInfo[name1];
602                                 int binRemove = binInfo[name2];
603                         
604                                 //if not merge bins and update binInfo
605                                 if(binKeep != binRemove) {
606                 
607                                         //save names in old bin
608                                         string names = newList->get(binRemove);
609                 
610                                         //merge bins into name1s bin
611                                         newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
612                                         newList->set(binRemove, "");    
613                                         
614                                         //update binInfo
615                                         while (names.find_first_of(',') != -1) { 
616                                                 //get name from bin
617                                                 string name = names.substr(0,names.find_first_of(','));
618                                                 //save name and bin number
619                                                 binInfo[name] = binKeep;
620                                                 names = names.substr(names.find_first_of(',')+1, names.length());
621                                         }
622                                         
623                                         //get last name
624                                         binInfo[names] = binKeep;
625                                 }
626                                 
627                         }else { done = true; }
628                 }
629                 
630                 //return listvector
631                 return newList;
632                                 
633         }
634         catch(exception& e) {
635                 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
636                 exit(1);
637         }
638 }
639 //**********************************************************************************************************************
640 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
641         try {
642                 //sort distFile
643                 string sortedDistFile = m->sortFile(unsortedDist, outputDir);
644                 remove(unsortedDist.c_str());  //delete unsorted file
645                 distFile = sortedDistFile;
646                 
647                 //sort overlap file
648                 string sortedOverlapFile = m->sortFile(unsortedOverlap, outputDir);
649                 remove(unsortedOverlap.c_str());  //delete unsorted file
650                 overlapFile = sortedOverlapFile;
651         }
652         catch(exception& e) {
653                 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
654                 exit(1);
655         }
656 }
657
658 //**********************************************************************************************************************
659
660
661
662
663