]> git.donarmstrong.com Git - mothur.git/blob - mgclustercommand.cpp
created blank accnos in chimera commands if no seqs are deemed chimeric. modified...
[mothur.git] / mgclustercommand.cpp
1 /*
2  *  mgclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/11/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "mgclustercommand.h"
11
12 //**********************************************************************************************************************
13 MGClusterCommand::MGClusterCommand(string option) {
14         try {
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 
18                 //allow user to run help
19                 if(option == "help") { help(); abort = true; }
20                 
21                 else {
22                         //valid paramters for this command
23                         string Array[] =  {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
24                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25                         
26                         OptionParser parser(option);
27                         map<string, string> parameters = parser.getParameters();
28                         
29                         ValidParameters validParameter;
30                         map<string,string>::iterator it;
31                 
32                         //check to make sure all parameters are valid for command
33                         for (it = parameters.begin(); it != parameters.end(); it++) { 
34                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
35                         }
36                         
37                         //if the user changes the input directory command factory will send this info to us in the output parameter 
38                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
39                         if (inputDir == "not found"){   inputDir = "";          }
40                         else {
41                                 string path;
42                                 it = parameters.find("blast");
43                                 //user has given a template file
44                                 if(it != parameters.end()){ 
45                                         path = hasPath(it->second);
46                                         //if the user has not given a path then, add inputdir. else leave path alone.
47                                         if (path == "") {       parameters["blast"] = inputDir + it->second;            }
48                                 }
49                                 
50                                 it = parameters.find("name");
51                                 //user has given a template file
52                                 if(it != parameters.end()){ 
53                                         path = hasPath(it->second);
54                                         //if the user has not given a path then, add inputdir. else leave path alone.
55                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
56                                 }
57                         }
58
59                         
60                         //check for required parameters
61                         blastfile = validParameter.validFile(parameters, "blast", true);
62                         if (blastfile == "not open") { abort = true; }  
63                         else if (blastfile == "not found") { blastfile = ""; }
64                         
65                         //if the user changes the output directory command factory will send this info to us in the output parameter 
66                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
67                                 outputDir = ""; 
68                                 outputDir += hasPath(blastfile); //if user entered a file with a path then preserve it  
69                         }
70                         
71                         namefile = validParameter.validFile(parameters, "name", true);
72                         if (namefile == "not open") { abort = true; }   
73                         else if (namefile == "not found") { namefile = ""; }
74                         
75                         if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
76                         
77                         //check for optional parameter and set defaults
78                         string temp;
79                         temp = validParameter.validFile(parameters, "precision", false);                if (temp == "not found") { temp = "100"; }
80                         precisionLength = temp.length();
81                         convert(temp, precision); 
82                         
83                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "0.70"; }
84                         convert(temp, cutoff); 
85                         cutoff += (5 / (precision * 10.0));
86                         
87                         method = validParameter.validFile(parameters, "method", false);
88                         if (method == "not found") { method = "furthest"; }
89                         
90                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
91                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
92
93                         temp = validParameter.validFile(parameters, "length", false);                   if (temp == "not found") { temp = "5"; }
94                         convert(temp, length); 
95                         
96                         temp = validParameter.validFile(parameters, "penalty", false);                  if (temp == "not found") { temp = "0.10"; }
97                         convert(temp, penalty); 
98                         
99                         temp = validParameter.validFile(parameters, "min", false);                              if (temp == "not found") { temp = "true"; }
100                         minWanted = isTrue(temp); 
101                         
102                         temp = validParameter.validFile(parameters, "merge", false);                    if (temp == "not found") { temp = "true"; }
103                         merge = isTrue(temp); 
104                         
105                         temp = validParameter.validFile(parameters, "hcluster", false);                 if (temp == "not found") { temp = "false"; }
106                         hclusterWanted = isTrue(temp); 
107                         
108                         temp = validParameter.validFile(parameters, "hard", false);                     if (temp == "not found") { temp = "F"; }
109                         hard = isTrue(temp);
110                 }
111
112         }
113         catch(exception& e) {
114                 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
115                 exit(1);
116         }
117 }
118 //**********************************************************************************************************************
119
120 void MGClusterCommand::help(){
121         try {
122                 m->mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
123                 m->mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
124                 m->mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
125                 m->mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
126                 m->mothurOut("The precision parameter's default value is 100. \n");
127                 m->mothurOut("The acceptable mgcluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");    
128                 m->mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
129                 m->mothurOut("The length parameter is used to specify the minimum overlap required.  The default is 5.\n");
130                 m->mothurOut("The penalty parameter is used to adjust the error rate.  The default is 0.10.\n");
131                 m->mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster.  By default merge is true, meaning you want to merge.\n");
132                 m->mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering.  This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
133                 m->mothurOut("The mgcluster command should be in the following format: \n");
134                 m->mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
135                 m->mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
136         }
137         catch(exception& e) {
138                 m->errorOut(e, "MGClusterCommand", "help");
139                 exit(1);
140         }
141 }
142 //**********************************************************************************************************************
143 MGClusterCommand::~MGClusterCommand(){}
144 //**********************************************************************************************************************
145 int MGClusterCommand::execute(){
146         try {
147                 
148                 if (abort == true) {    return 0;       }
149                 
150                 //read names file
151                 if (namefile != "") {
152                         nameMap = new NameAssignment(namefile);
153                         nameMap->readMap();
154                 }else{ nameMap= new NameAssignment(); }
155                 
156                 string fileroot = outputDir + getRootName(getSimpleName(blastfile));
157                 string tag = "";
158                 time_t start;
159                 float previousDist = 0.00000;
160                 float rndPreviousDist = 0.00000;
161                 
162                 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
163                 //must remember to delete those objects here since readBlast does not
164                 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
165                 read->read(nameMap);
166                 
167                 list = new ListVector(nameMap->getListVector());
168                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
169 for (int i = 0; i < list->getNumBins(); i++) {
170 string bin = list->get(i);
171 if(bin == "") {
172 cout << "bin " << i << " is blank."<< endl;
173 }
174 }
175 cout << "after outputting blank bins." << endl;
176                 
177                 if (m->control_pressed) { delete nameMap; delete read; delete list; delete rabund; return 0; }
178                 
179                 start = time(NULL);
180                 oldList = *list;
181                 map<string, int> Seq2Bin;
182                 map<string, int> oldSeq2Bin;
183                 
184                 if (method == "furthest")               { tag = "fn";  }
185                 else if (method == "nearest")   { tag = "nn";  }
186                 else                                                    { tag = "an";  }
187                 
188                 //open output files
189                 openOutputFile(fileroot+ tag + ".list",  listFile);
190                 openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
191                 openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
192                 
193                 if (m->control_pressed) { 
194                         delete nameMap; delete read; delete list; delete rabund; 
195                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
196                         return 0; 
197                 }
198                 
199                 if (!hclusterWanted) {
200                         //get distmatrix and overlap
201                         SparseMatrix* distMatrix = read->getDistMatrix();
202                         overlapMatrix = read->getOverlapMatrix(); //already sorted by read 
203                         delete read;
204                 
205                         //create cluster
206                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
207                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
208                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
209                         cluster->setMapWanted(true);
210                         Seq2Bin = cluster->getSeqtoBin();
211                         oldSeq2Bin = Seq2Bin;
212                         
213                         if (m->control_pressed) { 
214                                 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
215                                 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
216                                 return 0; 
217                         }
218         int count = 0;          
219                         //cluster using cluster classes
220                         while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
221                                 
222                                 cluster->update(cutoff);
223                                 
224                                 if (m->control_pressed) { 
225                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
226                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
227                                         return 0; 
228                                 }
229                                 
230                                 float dist = distMatrix->getSmallDist();
231                                 float rndDist;
232                                 if (hard) {
233                                         rndDist = ceilDist(dist, precision); 
234                                 }else{
235                                         rndDist = roundDist(dist, precision); 
236                                 }
237 cout << "here " << count << '\t' << dist << endl;
238                                 
239                                 if(previousDist <= 0.0000 && dist != previousDist){
240                                         oldList.setLabel("unique");
241                                         printData(&oldList);
242                                         Seq2Bin = cluster->getSeqtoBin();
243                                 }
244                                 else if(rndDist != rndPreviousDist){
245                                         if (merge) {
246                                                 Seq2Bin = cluster->getSeqtoBin();
247                                                 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
248                                                 
249                                                 if (m->control_pressed) { 
250                                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
251                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
252                                                         return 0; 
253                                                 }
254                                                 
255                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
256                                                 printData(temp);
257                                                 delete temp;
258                                         }else{
259                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
260                                                 printData(&oldList);
261                                         }
262                                 }
263         //cout << "after merge " << count << '\t' << dist << endl;      
264         count++;                
265                                 previousDist = dist;
266                                 rndPreviousDist = rndDist;
267                                 oldList = *list;
268                                 oldSeq2Bin = Seq2Bin;
269                         }
270                         
271                         if(previousDist <= 0.0000){
272                                 oldList.setLabel("unique");
273                                 printData(&oldList);
274                                 Seq2Bin = cluster->getSeqtoBin();
275                         }
276                         else if(rndPreviousDist<cutoff){
277                                 if (merge) {
278                                         Seq2Bin = cluster->getSeqtoBin();
279                                         ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
280                                         
281                                         if (m->control_pressed) { 
282                                                         delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
283                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
284                                                         return 0; 
285                                         }
286                                         
287                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
288                                         printData(temp);
289                                         delete temp;
290                                 }else{
291                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
292                                         printData(&oldList);
293                                 }
294                         }
295                         
296                         //free memory
297                         overlapMatrix.clear();
298                         delete distMatrix;
299                         delete cluster;
300                         
301                 }else { //use hcluster to cluster
302                         //get distmatrix and overlap
303                         overlapFile = read->getOverlapFile();
304                         distFile = read->getDistFile(); 
305                         delete read;
306                 
307                         //sort the distance and overlap files
308                         sortHclusterFiles(distFile, overlapFile);
309                         
310                         if (m->control_pressed) { 
311                                 delete nameMap;  delete list; delete rabund; 
312                                 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
313                                 return 0; 
314                         }
315                 
316                         //create cluster
317                         hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
318                         hcluster->setMapWanted(true);
319                         Seq2Bin = cluster->getSeqtoBin();
320                         oldSeq2Bin = Seq2Bin;
321                         
322                         vector<seqDist> seqs; seqs.resize(1); // to start loop
323                         //ifstream inHcluster;
324                         //openInputFile(distFile, inHcluster);
325                         
326                         if (m->control_pressed) { 
327                                 delete nameMap;  delete list; delete rabund; delete hcluster;
328                                 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
329                                 return 0; 
330                         }
331
332                         while (seqs.size() != 0){
333                 
334                                 seqs = hcluster->getSeqs();
335                                 
336                                 if (m->control_pressed) { 
337                                         delete nameMap;  delete list; delete rabund; delete hcluster;
338                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
339                                         remove(distFile.c_str());
340                                         remove(overlapFile.c_str());
341                                         return 0; 
342                                 }
343                                 
344                                 for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
345                                         
346                                         if (seqs[i].seq1 != seqs[i].seq2) {
347                 
348                                                 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
349                                                 
350                                                 if (m->control_pressed) { 
351                                                         delete nameMap;  delete list; delete rabund; delete hcluster;
352                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
353                                                         remove(distFile.c_str());
354                                                         remove(overlapFile.c_str());
355                                                         return 0; 
356                                                 }
357         
358                                                 float rndDist;
359                                                 if (hard) {
360                                                         rndDist = ceilDist(seqs[i].dist, precision); 
361                                                 }else{
362                                                         rndDist = roundDist(seqs[i].dist, precision); 
363                                                 }
364                                                                                                 
365                                                 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
366                                                         oldList.setLabel("unique");
367                                                         printData(&oldList);
368                                                         Seq2Bin = hcluster->getSeqtoBin();
369                                                 }
370                                                 else if((rndDist != rndPreviousDist)){
371                                                         if (merge) {
372                                                                 Seq2Bin = hcluster->getSeqtoBin();
373                                                                 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
374                                                                 
375                                                                 if (m->control_pressed) { 
376                                                                         delete nameMap;  delete list; delete rabund; delete hcluster; delete temp;
377                                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
378                                                                         remove(distFile.c_str());
379                                                                         remove(overlapFile.c_str());
380                                                                         return 0; 
381                                                                 }
382
383                                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
384                                                                 printData(temp);
385                                                                 delete temp;
386                                                         }else{
387                                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
388                                                                 printData(&oldList);
389                                                         }
390                                                 }
391                                                 
392                                                 previousDist = seqs[i].dist;
393                                                 rndPreviousDist = rndDist;
394                                                 oldList = *list;
395                                                 oldSeq2Bin = Seq2Bin;
396                                         }
397                                 }
398                         }
399                         //inHcluster.close();
400                         
401                         if(previousDist <= 0.0000){
402                                 oldList.setLabel("unique");
403                                 printData(&oldList);
404                         }
405                         else if(rndPreviousDist<cutoff){
406                                 if (merge) {
407                                         Seq2Bin = hcluster->getSeqtoBin();
408                                         ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
409                                         
410                                         if (m->control_pressed) { 
411                                                         delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
412                                                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
413                                                         remove(distFile.c_str());
414                                                         remove(overlapFile.c_str());
415                                                         return 0; 
416                                         }
417                                         
418                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
419                                         printData(temp);
420                                         delete temp;
421                                 }else{
422                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
423                                         printData(&oldList);
424                                 }
425                         }
426                         
427                         delete hcluster;
428                         remove(distFile.c_str());
429                         remove(overlapFile.c_str());
430                 }
431                 
432                 delete list; 
433                 delete rabund;
434                 listFile.close();
435                 sabundFile.close();
436                 rabundFile.close();
437         
438                 globaldata->setListFile(fileroot+ tag + ".list");
439                 globaldata->setFormat("list");
440                 
441                 if (m->control_pressed) { 
442                         delete nameMap; 
443                         listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
444                         globaldata->setListFile("");
445                         globaldata->setFormat("");
446                         return 0; 
447                 }
448                 
449                 m->mothurOutEndLine();
450                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
451                 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine();   
452                 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine(); 
453                 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine(); 
454                 m->mothurOutEndLine();
455                 
456                 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
457                         
458                 return 0;
459         }
460         catch(exception& e) {
461                 m->errorOut(e, "MGClusterCommand", "execute");
462                 exit(1);
463         }
464 }
465 //**********************************************************************************************************************
466 void MGClusterCommand::printData(ListVector* mergedList){
467         try {
468                 mergedList->print(listFile);
469                 mergedList->getRAbundVector().print(rabundFile);
470                 
471                 SAbundVector sabund = mergedList->getSAbundVector();
472
473                 sabund.print(cout);
474                 sabund.print(sabundFile);
475         }
476         catch(exception& e) {
477                 m->errorOut(e, "MGClusterCommand", "printData");
478                 exit(1);
479         }
480 }
481 //**********************************************************************************************************************
482 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
483 //that are used to cluster by distance.  this is done so that the overlapping data does not have more influenece than the distance data.
484 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
485         try {
486                 //create new listvector so you don't overwrite the clustering
487                 ListVector* newList = new ListVector(oldList);
488 for (int i = 0; i < newList->getNumBins(); i++) {
489 string bin = newList->get(i);
490 if(bin == "") {
491 cout << "bin " << i << " is blank."<< endl;
492 for (map<string, int>::iterator itBin = binInfo.begin(); itBin != binInfo.end(); itBin++) {
493         if (itBin->second == i) { cout << itBin->first << " maps to an empty bin." << endl; }
494 }
495 }
496 }
497 cout << "after outputting blank bins." << endl;         
498
499                 bool done = false;
500                 ifstream inOverlap;
501                 int count = 0;
502                 
503                 if (hclusterWanted) {  
504                         openInputFile(overlapFile, inOverlap);  
505                         if (inOverlap.eof()) {  done = true;  }
506                 }else { if (overlapMatrix.size() == 0)  {  done = true;  } } 
507                 
508                 while (!done) {
509                         if (m->control_pressed) { 
510                                 if (hclusterWanted) {   inOverlap.close();  }           
511                                 return newList;
512                         }
513                         
514                         //get next overlap
515                         seqDist overlapNode;
516                         if (!hclusterWanted) {  
517                                 if (count < overlapMatrix.size()) { //do we have another node in the matrix
518                                         overlapNode = overlapMatrix[count];
519                                         count++;
520                                 }else { break; }
521                         }else { 
522                                 if (!inOverlap.eof()) {
523                                         string firstName, secondName;
524                                         float overlapDistance;
525                                         inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
526                                         
527                                         map<string,int>::iterator itA = nameMap->find(firstName);
528                                         map<string,int>::iterator itB = nameMap->find(secondName);
529                                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
530                                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
531                                         
532                                         overlapNode.seq1 = itA->second;
533                                         overlapNode.seq2 = itB->second;
534                                         overlapNode.dist = overlapDistance;
535                                 }else { inOverlap.close(); break; }
536                         } 
537                 
538                         if (overlapNode.dist < dist) {
539                                 //get names of seqs that overlap
540                                 string name1 = nameMap->get(overlapNode.seq1);
541                                 string name2 = nameMap->get(overlapNode.seq2);
542                         
543                                 //use binInfo to find out if they are already in the same bin
544                                 map<string, int>::iterator itBin1 = binInfo.find(name1);
545                                 map<string, int>::iterator itBin2 = binInfo.find(name2);
546                                 
547                                 if(itBin1 == binInfo.end()){  cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1);  }
548                                 if(itBin2 == binInfo.end()){  cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1);  }
549 cout << overlapNode.dist << '\t' << dist << endl;
550                                 int binKeep = itBin1->second;
551                                 int binRemove = itBin2->second;
552                         
553                                 //if not merge bins and update binInfo
554                                 if(binKeep != binRemove) {
555         cout << "bin keep = " << binKeep << " bin remove = " << binRemove << endl;              
556                                         //save names in old bin
557                                         string names = newList->get(binRemove);
558                         cout << names << endl << endl << endl;  
559                         cout << newList->get(binKeep) << endl << endl << endl;  
560                                         //merge bins into name1s bin
561                                         newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
562                                         newList->set(binRemove, "");    
563                                         
564                                         //update binInfo
565                                         while (names.find_first_of(',') != -1) { 
566                                                 //get name from bin
567                                                 string name = names.substr(0,names.find_first_of(','));
568                                                 //save name and bin number
569                                                 binInfo[name] = binKeep;
570                                                 names = names.substr(names.find_first_of(',')+1, names.length());
571                                         }
572                                         
573                                         //get last name
574                                         binInfo[names] = binKeep;
575                                 }
576                                 
577                         }else { done = true; }
578                 }
579                 
580                 //return listvector
581                 return newList;
582                                 
583         }
584         catch(exception& e) {
585                 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
586                 exit(1);
587         }
588 }
589 //**********************************************************************************************************************
590 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
591         try {
592                 //sort distFile
593                 string sortedDistFile = sortFile(unsortedDist, outputDir);
594                 remove(unsortedDist.c_str());  //delete unsorted file
595                 distFile = sortedDistFile;
596                 
597                 //sort overlap file
598                 string sortedOverlapFile = sortFile(unsortedOverlap, outputDir);
599                 remove(unsortedOverlap.c_str());  //delete unsorted file
600                 overlapFile = sortedOverlapFile;
601         }
602         catch(exception& e) {
603                 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
604                 exit(1);
605         }
606 }
607
608 //**********************************************************************************************************************
609
610
611
612
613