5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
12 //**********************************************************************************************************************
13 vector<string> MGClusterCommand::setParameters(){
15 CommandParameter pblast("blast", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pblast);
16 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
17 CommandParameter pcount("count", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pcount);
18 CommandParameter plength("length", "Number", "", "5", "", "", "",false,false); parameters.push_back(plength);
19 CommandParameter ppenalty("penalty", "Number", "", "0.10", "", "", "",false,false); parameters.push_back(ppenalty);
20 CommandParameter pcutoff("cutoff", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pcutoff);
21 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
22 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average", "average", "", "", "",false,false); parameters.push_back(pmethod);
23 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
24 CommandParameter pmin("min", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pmin);
25 CommandParameter pmerge("merge", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pmerge);
26 CommandParameter phcluster("hcluster", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(phcluster);
27 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
28 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
30 vector<string> myArray;
31 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
35 m->errorOut(e, "MGClusterCommand", "setParameters");
39 //**********************************************************************************************************************
40 string MGClusterCommand::getHelpString(){
42 string helpString = "";
43 helpString += "The mgcluster command parameter options are blast, name, cutoff, precision, hard, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n";
44 helpString += "The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n";
45 helpString += "This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n";
46 helpString += "The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n";
47 helpString += "The precision parameter's default value is 100. \n";
48 helpString += "The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then average is assumed.\n";
49 helpString += "The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n";
50 helpString += "The length parameter is used to specify the minimum overlap required. The default is 5.\n";
51 helpString += "The penalty parameter is used to adjust the error rate. The default is 0.10.\n";
52 helpString += "The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n";
53 helpString += "The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n";
54 helpString += "The mgcluster command should be in the following format: \n";
55 helpString += "mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n";
56 helpString += "Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n";
60 m->errorOut(e, "MGClusterCommand", "getHelpString");
64 //**********************************************************************************************************************
65 string MGClusterCommand::getOutputFileNameTag(string type, string inputName=""){
67 string outputFileName = "";
68 map<string, vector<string> >::iterator it;
70 //is this a type this command creates
71 it = outputTypes.find(type);
72 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
74 if (type == "list") { outputFileName = "list"; }
75 else if (type == "rabund") { outputFileName = "rabund"; }
76 else if (type == "sabund") { outputFileName = "sabund"; }
77 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
79 return outputFileName;
82 m->errorOut(e, "MGClusterCommand", "getOutputFileNameTag");
86 //**********************************************************************************************************************
87 MGClusterCommand::MGClusterCommand(){
89 abort = true; calledHelp = true;
91 vector<string> tempOutNames;
92 outputTypes["list"] = tempOutNames;
93 outputTypes["rabund"] = tempOutNames;
94 outputTypes["sabund"] = tempOutNames;
97 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
101 //**********************************************************************************************************************
102 MGClusterCommand::MGClusterCommand(string option) {
104 abort = false; calledHelp = false;
106 //allow user to run help
107 if(option == "help") { help(); abort = true; calledHelp = true; }
108 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
111 vector<string> myArray = setParameters();
113 OptionParser parser(option);
114 map<string, string> parameters = parser.getParameters();
116 ValidParameters validParameter;
117 map<string,string>::iterator it;
119 //check to make sure all parameters are valid for command
120 for (it = parameters.begin(); it != parameters.end(); it++) {
121 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
124 //initialize outputTypes
125 vector<string> tempOutNames;
126 outputTypes["list"] = tempOutNames;
127 outputTypes["rabund"] = tempOutNames;
128 outputTypes["sabund"] = tempOutNames;
130 //if the user changes the input directory command factory will send this info to us in the output parameter
131 string inputDir = validParameter.validFile(parameters, "inputdir", false);
132 if (inputDir == "not found"){ inputDir = ""; }
135 it = parameters.find("blast");
136 //user has given a template file
137 if(it != parameters.end()){
138 path = m->hasPath(it->second);
139 //if the user has not given a path then, add inputdir. else leave path alone.
140 if (path == "") { parameters["blast"] = inputDir + it->second; }
143 it = parameters.find("name");
144 //user has given a template file
145 if(it != parameters.end()){
146 path = m->hasPath(it->second);
147 //if the user has not given a path then, add inputdir. else leave path alone.
148 if (path == "") { parameters["name"] = inputDir + it->second; }
153 //check for required parameters
154 blastfile = validParameter.validFile(parameters, "blast", true);
155 if (blastfile == "not open") { blastfile = ""; abort = true; }
156 else if (blastfile == "not found") { blastfile = ""; }
158 //if the user changes the output directory command factory will send this info to us in the output parameter
159 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
161 outputDir += m->hasPath(blastfile); //if user entered a file with a path then preserve it
164 namefile = validParameter.validFile(parameters, "name", true);
165 if (namefile == "not open") { abort = true; }
166 else if (namefile == "not found") { namefile = ""; }
167 else { m->setNameFile(namefile); }
169 countfile = validParameter.validFile(parameters, "count", true);
170 if (countfile == "not open") { abort = true; }
171 else if (countfile == "not found") { countfile = ""; }
172 else { m->setCountTableFile(countfile); }
174 if (countfile != "" && namefile != "") { m->mothurOut("[ERROR]: Cannot have both a name file and count file. Please use one or the other."); m->mothurOutEndLine(); abort = true; }
176 if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
178 //check for optional parameter and set defaults
180 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
181 precisionLength = temp.length();
182 m->mothurConvert(temp, precision);
184 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
185 m->mothurConvert(temp, cutoff);
186 cutoff += (5 / (precision * 10.0));
188 method = validParameter.validFile(parameters, "method", false);
189 if (method == "not found") { method = "average"; }
191 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
192 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
194 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
195 m->mothurConvert(temp, length);
197 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
198 m->mothurConvert(temp, penalty);
200 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
201 minWanted = m->isTrue(temp);
203 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
204 merge = m->isTrue(temp);
206 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
207 hclusterWanted = m->isTrue(temp);
209 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
210 hard = m->isTrue(temp);
214 catch(exception& e) {
215 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
219 //**********************************************************************************************************************
220 int MGClusterCommand::execute(){
222 if (abort == true) { if (calledHelp) { return 0; } return 2; }
225 if (namefile != "") {
226 nameMap = new NameAssignment(namefile);
228 }else{ nameMap= new NameAssignment(); }
230 string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile));
233 float previousDist = 0.00000;
234 float rndPreviousDist = 0.00000;
236 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
237 //must remember to delete those objects here since readBlast does not
238 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
241 list = new ListVector(nameMap->getListVector());
242 RAbundVector* rabund = NULL;
244 if(countfile != "") {
245 //map<string, int> nameMapCounts = m->readNames(namefile);
246 ct = new CountTable();
247 ct->readTable(countfile);
248 rabund = new RAbundVector();
249 createRabund(ct, list, rabund);
251 rabund = new RAbundVector(list->getRAbundVector());
255 //list = new ListVector(nameMap->getListVector());
256 //rabund = new RAbundVector(list->getRAbundVector());
258 if (m->control_pressed) { outputTypes.clear(); delete nameMap; delete read; delete list; delete rabund; return 0; }
262 map<string, int> Seq2Bin;
263 map<string, int> oldSeq2Bin;
265 if (method == "furthest") { tag = "fn"; }
266 else if (method == "nearest") { tag = "nn"; }
269 string sabundFileName = fileroot+ tag + "." + getOutputFileNameTag("sabund");
270 string rabundFileName = fileroot+ tag + "." + getOutputFileNameTag("rabund");
271 string listFileName = fileroot+ tag + "." + getOutputFileNameTag("list");
273 if (countfile == "") {
274 m->openOutputFile(sabundFileName, sabundFile);
275 m->openOutputFile(rabundFileName, rabundFile);
277 m->openOutputFile(listFileName, listFile);
279 if (m->control_pressed) {
280 delete nameMap; delete read; delete list; delete rabund;
281 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
286 double saveCutoff = cutoff;
288 if (!hclusterWanted) {
289 //get distmatrix and overlap
290 SparseMatrix* distMatrix = read->getDistMatrix();
291 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
295 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
296 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
297 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
298 cluster->setMapWanted(true);
299 Seq2Bin = cluster->getSeqtoBin();
300 oldSeq2Bin = Seq2Bin;
302 if (m->control_pressed) {
303 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
304 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
309 //cluster using cluster classes
310 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
312 cluster->update(cutoff);
314 if (m->control_pressed) {
315 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
316 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
321 float dist = distMatrix->getSmallDist();
324 rndDist = m->ceilDist(dist, precision);
326 rndDist = m->roundDist(dist, precision);
329 if(previousDist <= 0.0000 && dist != previousDist){
330 oldList.setLabel("unique");
333 else if(rndDist != rndPreviousDist){
335 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
337 if (m->control_pressed) {
338 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
339 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
344 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
348 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
354 rndPreviousDist = rndDist;
356 Seq2Bin = cluster->getSeqtoBin();
357 oldSeq2Bin = Seq2Bin;
360 if(previousDist <= 0.0000){
361 oldList.setLabel("unique");
364 else if(rndPreviousDist<cutoff){
366 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
368 if (m->control_pressed) {
369 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
370 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
375 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
379 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
385 overlapMatrix.clear();
389 }else { //use hcluster to cluster
390 //get distmatrix and overlap
391 overlapFile = read->getOverlapFile();
392 distFile = read->getDistFile();
395 //sort the distance and overlap files
396 sortHclusterFiles(distFile, overlapFile);
398 if (m->control_pressed) {
399 delete nameMap; delete list; delete rabund;
400 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
406 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
407 hcluster->setMapWanted(true);
408 Seq2Bin = cluster->getSeqtoBin();
409 oldSeq2Bin = Seq2Bin;
411 vector<seqDist> seqs; seqs.resize(1); // to start loop
412 //ifstream inHcluster;
413 //m->openInputFile(distFile, inHcluster);
415 if (m->control_pressed) {
416 delete nameMap; delete list; delete rabund; delete hcluster;
417 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
422 while (seqs.size() != 0){
424 seqs = hcluster->getSeqs();
426 //to account for cutoff change in average neighbor
427 if (seqs.size() != 0) {
428 if (seqs[0].dist > cutoff) { break; }
431 if (m->control_pressed) {
432 delete nameMap; delete list; delete rabund; delete hcluster;
433 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
434 m->mothurRemove(distFile);
435 m->mothurRemove(overlapFile);
440 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
442 if (seqs[i].seq1 != seqs[i].seq2) {
444 cutoff = hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
446 if (m->control_pressed) {
447 delete nameMap; delete list; delete rabund; delete hcluster;
448 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
449 m->mothurRemove(distFile);
450 m->mothurRemove(overlapFile);
457 rndDist = m->ceilDist(seqs[i].dist, precision);
459 rndDist = m->roundDist(seqs[i].dist, precision);
462 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
463 oldList.setLabel("unique");
466 else if((rndDist != rndPreviousDist)){
468 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
470 if (m->control_pressed) {
471 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
472 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
473 m->mothurRemove(distFile);
474 m->mothurRemove(overlapFile);
479 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
483 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
488 previousDist = seqs[i].dist;
489 rndPreviousDist = rndDist;
491 Seq2Bin = cluster->getSeqtoBin();
492 oldSeq2Bin = Seq2Bin;
496 //inHcluster.close();
498 if(previousDist <= 0.0000){
499 oldList.setLabel("unique");
502 else if(rndPreviousDist<cutoff){
504 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
506 if (m->control_pressed) {
507 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
508 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
509 m->mothurRemove(distFile);
510 m->mothurRemove(overlapFile);
515 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
519 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
525 m->mothurRemove(distFile);
526 m->mothurRemove(overlapFile);
532 if (countfile == "") {
536 if (m->control_pressed) {
538 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
543 m->mothurOutEndLine();
544 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
545 m->mothurOut(listFileName); m->mothurOutEndLine(); outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
546 if (countfile == "") {
547 m->mothurOut(rabundFileName); m->mothurOutEndLine(); outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
548 m->mothurOut(sabundFileName); m->mothurOutEndLine(); outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
550 m->mothurOutEndLine();
552 if (saveCutoff != cutoff) {
553 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
554 else { saveCutoff = m->roundDist(saveCutoff, precision); }
556 m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();
559 //set list file as new current listfile
561 itTypes = outputTypes.find("list");
562 if (itTypes != outputTypes.end()) {
563 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
566 //set rabund file as new current rabundfile
567 itTypes = outputTypes.find("rabund");
568 if (itTypes != outputTypes.end()) {
569 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
572 //set sabund file as new current sabundfile
573 itTypes = outputTypes.find("sabund");
574 if (itTypes != outputTypes.end()) {
575 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
579 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
583 catch(exception& e) {
584 m->errorOut(e, "MGClusterCommand", "execute");
588 //**********************************************************************************************************************
589 void MGClusterCommand::printData(ListVector* mergedList){
591 mergedList->print(listFile);
592 SAbundVector sabund = mergedList->getSAbundVector();
594 if (countfile == "") {
595 mergedList->getRAbundVector().print(rabundFile);
596 sabund.print(sabundFile);
601 catch(exception& e) {
602 m->errorOut(e, "MGClusterCommand", "printData");
606 //**********************************************************************************************************************
607 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
608 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
609 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
611 //create new listvector so you don't overwrite the clustering
612 ListVector* newList = new ListVector(oldList);
618 if (hclusterWanted) {
619 m->openInputFile(overlapFile, inOverlap);
620 if (inOverlap.eof()) { done = true; }
621 }else { if (overlapMatrix.size() == 0) { done = true; } }
624 if (m->control_pressed) {
625 if (hclusterWanted) { inOverlap.close(); }
631 if (!hclusterWanted) {
632 if (count < overlapMatrix.size()) { //do we have another node in the matrix
633 overlapNode = overlapMatrix[count];
637 if (!inOverlap.eof()) {
638 string firstName, secondName;
639 float overlapDistance;
640 inOverlap >> firstName >> secondName >> overlapDistance; m->gobble(inOverlap);
642 //commented out because we check this in readblast already
643 //map<string,int>::iterator itA = nameMap->find(firstName);
644 //map<string,int>::iterator itB = nameMap->find(secondName);
645 //if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
646 //if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
648 //overlapNode.seq1 = itA->second;
649 //overlapNode.seq2 = itB->second;
650 overlapNode.seq1 = nameMap->get(firstName);
651 overlapNode.seq2 = nameMap->get(secondName);
652 overlapNode.dist = overlapDistance;
653 }else { inOverlap.close(); break; }
656 if (overlapNode.dist < dist) {
657 //get names of seqs that overlap
658 string name1 = nameMap->get(overlapNode.seq1);
659 string name2 = nameMap->get(overlapNode.seq2);
661 //use binInfo to find out if they are already in the same bin
662 //map<string, int>::iterator itBin1 = binInfo.find(name1);
663 //map<string, int>::iterator itBin2 = binInfo.find(name2);
665 //if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); }
666 //if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); }
668 //int binKeep = itBin1->second;
669 //int binRemove = itBin2->second;
671 int binKeep = binInfo[name1];
672 int binRemove = binInfo[name2];
674 //if not merge bins and update binInfo
675 if(binKeep != binRemove) {
677 //save names in old bin
678 string names = newList->get(binRemove);
680 //merge bins into name1s bin
681 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
682 newList->set(binRemove, "");
685 while (names.find_first_of(',') != -1) {
687 string name = names.substr(0,names.find_first_of(','));
688 //save name and bin number
689 binInfo[name] = binKeep;
690 names = names.substr(names.find_first_of(',')+1, names.length());
694 binInfo[names] = binKeep;
697 }else { done = true; }
704 catch(exception& e) {
705 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
709 //**********************************************************************************************************************
710 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
713 string sortedDistFile = m->sortFile(unsortedDist, outputDir);
714 m->mothurRemove(unsortedDist); //delete unsorted file
715 distFile = sortedDistFile;
718 string sortedOverlapFile = m->sortFile(unsortedOverlap, outputDir);
719 m->mothurRemove(unsortedOverlap); //delete unsorted file
720 overlapFile = sortedOverlapFile;
722 catch(exception& e) {
723 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
728 //**********************************************************************************************************************
730 void MGClusterCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
732 //vector<string> names = ct.getNamesOfSeqs();
734 //for ( int i; i < ct.getNumGroups(); i++ ) { rav.push_back( ct.getNumSeqs(names[i]) ); }
737 for(int i = 0; i < list->getNumBins(); i++) {
738 vector<string> binNames;
739 string bin = list->get(i);
740 m->splitAtComma(bin, binNames);
742 for (int j = 0; j < binNames.size(); j++) {
743 total += ct->getNumSeqs(binNames[j]);
745 rabund->push_back(total);
750 catch(exception& e) {
751 m->errorOut(e, "MGClusterCommand", "createRabund");
757 //**********************************************************************************************************************