5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
12 //**********************************************************************************************************************
13 vector<string> MGClusterCommand::setParameters(){
15 CommandParameter pblast("blast", "InputTypes", "", "", "none", "none", "none","list",false,true,true); parameters.push_back(pblast);
16 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName","rabund-sabund",false,false,true); parameters.push_back(pname);
17 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pcount);
18 CommandParameter plength("length", "Number", "", "5", "", "", "","",false,false); parameters.push_back(plength);
19 CommandParameter ppenalty("penalty", "Number", "", "0.10", "", "", "","",false,false); parameters.push_back(ppenalty);
20 CommandParameter pcutoff("cutoff", "Number", "", "0.70", "", "", "","",false,false,true); parameters.push_back(pcutoff);
21 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
22 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average", "average", "", "", "","",false,false); parameters.push_back(pmethod);
23 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
24 CommandParameter pmin("min", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pmin);
25 CommandParameter pmerge("merge", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pmerge);
26 CommandParameter padjust("adjust", "String", "", "F", "", "", "","",false,false); parameters.push_back(padjust);
27 CommandParameter phcluster("hcluster", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(phcluster);
28 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
29 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
31 vector<string> myArray;
32 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
36 m->errorOut(e, "MGClusterCommand", "setParameters");
40 //**********************************************************************************************************************
41 string MGClusterCommand::getHelpString(){
43 string helpString = "";
44 helpString += "The mgcluster command parameter options are blast, name, cutoff, precision, hard, method, merge, min, length, penalty, adjust and hcluster. The blast parameter is required.\n";
45 helpString += "The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n";
46 helpString += "This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n";
47 helpString += "The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n";
48 helpString += "The precision parameter's default value is 100. \n";
49 helpString += "The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then average is assumed.\n";
50 helpString += "The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n";
51 helpString += "The length parameter is used to specify the minimum overlap required. The default is 5.\n";
52 helpString += "The adjust parameter is used to handle missing distances. If you set a cutoff, adjust=f by default. If not, adjust=t by default. Adjust=f, means ignore missing distances and adjust cutoff as needed with the average neighbor method. Adjust=t, will treat missing distances as 1.0. You can also set the value the missing distances should be set to, adjust=0.5 would give missing distances a value of 0.5.\n";
53 helpString += "The penalty parameter is used to adjust the error rate. The default is 0.10.\n";
54 helpString += "The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n";
55 helpString += "The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n";
56 helpString += "The mgcluster command should be in the following format: \n";
57 helpString += "mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n";
58 helpString += "Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n";
62 m->errorOut(e, "MGClusterCommand", "getHelpString");
66 //**********************************************************************************************************************
67 string MGClusterCommand::getOutputPattern(string type) {
71 if (type == "list") { pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
72 else if (type == "rabund") { pattern = "[filename],[clustertag],rabund"; }
73 else if (type == "sabund") { pattern = "[filename],[clustertag],sabund"; }
74 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
79 m->errorOut(e, "MGClusterCommand", "getOutputPattern");
83 //*******************************************************************************************************************
84 MGClusterCommand::MGClusterCommand(){
86 abort = true; calledHelp = true;
88 vector<string> tempOutNames;
89 outputTypes["list"] = tempOutNames;
90 outputTypes["rabund"] = tempOutNames;
91 outputTypes["sabund"] = tempOutNames;
94 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
98 //**********************************************************************************************************************
99 MGClusterCommand::MGClusterCommand(string option) {
101 abort = false; calledHelp = false;
103 //allow user to run help
104 if(option == "help") { help(); abort = true; calledHelp = true; }
105 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
108 vector<string> myArray = setParameters();
110 OptionParser parser(option);
111 map<string, string> parameters = parser.getParameters();
113 ValidParameters validParameter;
114 map<string,string>::iterator it;
116 //check to make sure all parameters are valid for command
117 for (it = parameters.begin(); it != parameters.end(); it++) {
118 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
121 //initialize outputTypes
122 vector<string> tempOutNames;
123 outputTypes["list"] = tempOutNames;
124 outputTypes["rabund"] = tempOutNames;
125 outputTypes["sabund"] = tempOutNames;
127 //if the user changes the input directory command factory will send this info to us in the output parameter
128 string inputDir = validParameter.validFile(parameters, "inputdir", false);
129 if (inputDir == "not found"){ inputDir = ""; }
132 it = parameters.find("blast");
133 //user has given a template file
134 if(it != parameters.end()){
135 path = m->hasPath(it->second);
136 //if the user has not given a path then, add inputdir. else leave path alone.
137 if (path == "") { parameters["blast"] = inputDir + it->second; }
140 it = parameters.find("name");
141 //user has given a template file
142 if(it != parameters.end()){
143 path = m->hasPath(it->second);
144 //if the user has not given a path then, add inputdir. else leave path alone.
145 if (path == "") { parameters["name"] = inputDir + it->second; }
148 it = parameters.find("count");
149 //user has given a template file
150 if(it != parameters.end()){
151 path = m->hasPath(it->second);
152 //if the user has not given a path then, add inputdir. else leave path alone.
153 if (path == "") { parameters["count"] = inputDir + it->second; }
158 //check for required parameters
159 blastfile = validParameter.validFile(parameters, "blast", true);
160 if (blastfile == "not open") { blastfile = ""; abort = true; }
161 else if (blastfile == "not found") { blastfile = ""; }
163 //if the user changes the output directory command factory will send this info to us in the output parameter
164 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
166 outputDir += m->hasPath(blastfile); //if user entered a file with a path then preserve it
169 namefile = validParameter.validFile(parameters, "name", true);
170 if (namefile == "not open") { abort = true; }
171 else if (namefile == "not found") { namefile = ""; }
172 else { m->setNameFile(namefile); }
174 countfile = validParameter.validFile(parameters, "count", true);
175 if (countfile == "not open") { abort = true; }
176 else if (countfile == "not found") { countfile = ""; }
177 else { m->setCountTableFile(countfile); }
179 if (countfile != "" && namefile != "") { m->mothurOut("[ERROR]: Cannot have both a name file and count file. Please use one or the other."); m->mothurOutEndLine(); abort = true; }
181 if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
183 //check for optional parameter and set defaults
185 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
186 precisionLength = temp.length();
187 m->mothurConvert(temp, precision);
190 temp = validParameter.validFile(parameters, "cutoff", false);
191 if (temp == "not found") { temp = "0.70"; }
192 else { cutoffSet = true; }
193 m->mothurConvert(temp, cutoff);
194 cutoff += (5 / (precision * 10.0));
196 method = validParameter.validFile(parameters, "method", false);
197 if (method == "not found") { method = "average"; }
199 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
200 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
202 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
203 m->mothurConvert(temp, length);
205 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
206 m->mothurConvert(temp, penalty);
208 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
209 minWanted = m->isTrue(temp);
211 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
212 merge = m->isTrue(temp);
214 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
215 hclusterWanted = m->isTrue(temp);
217 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
218 hard = m->isTrue(temp);
220 temp = validParameter.validFile(parameters, "adjust", false); if (temp == "not found") { if (cutoffSet) { temp = "F"; }else { temp="T"; } }
221 if (m->isNumeric1(temp)) { m->mothurConvert(temp, adjust); }
222 else if (m->isTrue(temp)) { adjust = 1.0; }
223 else { adjust = -1.0; }
227 catch(exception& e) {
228 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
232 //**********************************************************************************************************************
233 int MGClusterCommand::execute(){
235 if (abort == true) { if (calledHelp) { return 0; } return 2; }
238 if (namefile != "") {
239 nameMap = new NameAssignment(namefile);
241 }else if (countfile != "") {
242 ct = new CountTable();
243 ct->readTable(countfile, false, false);
244 nameMap= new NameAssignment();
245 vector<string> tempNames = ct->getNamesOfSeqs();
246 for (int i = 0; i < tempNames.size(); i++) { nameMap->push_back(tempNames[i]); }
247 }else{ nameMap= new NameAssignment(); }
249 string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile));
252 float previousDist = 0.00000;
253 float rndPreviousDist = 0.00000;
255 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
256 //must remember to delete those objects here since readBlast does not
257 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
260 list = new ListVector(nameMap->getListVector());
261 RAbundVector* rabund = NULL;
263 if(countfile != "") {
264 rabund = new RAbundVector();
265 createRabund(ct, list, rabund);
267 rabund = new RAbundVector(list->getRAbundVector());
271 //list = new ListVector(nameMap->getListVector());
272 //rabund = new RAbundVector(list->getRAbundVector());
274 if (m->control_pressed) { outputTypes.clear(); delete nameMap; delete read; delete list; delete rabund; return 0; }
278 map<string, int> Seq2Bin;
279 map<string, int> oldSeq2Bin;
281 if (method == "furthest") { tag = "fn"; }
282 else if (method == "nearest") { tag = "nn"; }
285 map<string, string> variables;
286 variables["[filename]"] = fileroot;
287 variables["[clustertag]"] = tag;
288 string sabundFileName = getOutputFileName("sabund", variables);
289 string rabundFileName = getOutputFileName("rabund", variables);
290 if (countfile != "") { variables["[tag2]"] = "unique_list"; }
291 string listFileName = getOutputFileName("list", variables);
293 if (countfile == "") {
294 m->openOutputFile(sabundFileName, sabundFile);
295 m->openOutputFile(rabundFileName, rabundFile);
297 m->openOutputFile(listFileName, listFile);
298 list->printHeaders(listFile);
300 if (m->control_pressed) {
301 delete nameMap; delete read; delete list; delete rabund;
302 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
307 double saveCutoff = cutoff;
309 if (!hclusterWanted) {
310 //get distmatrix and overlap
311 SparseDistanceMatrix* distMatrix = read->getDistMatrix();
312 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
316 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method, adjust); }
317 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method, adjust); }
318 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method, adjust); }
319 cluster->setMapWanted(true);
320 Seq2Bin = cluster->getSeqtoBin();
321 oldSeq2Bin = Seq2Bin;
323 if (m->control_pressed) {
324 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
325 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
331 //cluster using cluster classes
332 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
334 if (m->debug) { cout << "numNodes=" << distMatrix->getNNodes() << " smallDist = " << distMatrix->getSmallDist() << endl; }
336 cluster->update(cutoff);
338 if (m->control_pressed) {
339 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
340 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
345 float dist = distMatrix->getSmallDist();
348 rndDist = m->ceilDist(dist, precision);
350 rndDist = m->roundDist(dist, precision);
353 if(previousDist <= 0.0000 && dist != previousDist){
354 oldList.setLabel("unique");
357 else if(rndDist != rndPreviousDist){
359 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
361 if (m->control_pressed) {
362 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
363 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
368 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
372 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
378 rndPreviousDist = rndDist;
380 Seq2Bin = cluster->getSeqtoBin();
381 oldSeq2Bin = Seq2Bin;
384 if(previousDist <= 0.0000){
385 oldList.setLabel("unique");
388 else if(rndPreviousDist<cutoff){
390 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
392 if (m->control_pressed) {
393 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
394 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
399 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
403 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
409 overlapMatrix.clear();
413 }else { //use hcluster to cluster
414 //get distmatrix and overlap
415 overlapFile = read->getOverlapFile();
416 distFile = read->getDistFile();
419 //sort the distance and overlap files
420 sortHclusterFiles(distFile, overlapFile);
422 if (m->control_pressed) {
423 delete nameMap; delete list; delete rabund;
424 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
430 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
431 hcluster->setMapWanted(true);
432 Seq2Bin = cluster->getSeqtoBin();
433 oldSeq2Bin = Seq2Bin;
435 vector<seqDist> seqs; seqs.resize(1); // to start loop
436 //ifstream inHcluster;
437 //m->openInputFile(distFile, inHcluster);
439 if (m->control_pressed) {
440 delete nameMap; delete list; delete rabund; delete hcluster;
441 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
446 while (seqs.size() != 0){
448 seqs = hcluster->getSeqs();
450 //to account for cutoff change in average neighbor
451 if (seqs.size() != 0) {
452 if (seqs[0].dist > cutoff) { break; }
455 if (m->control_pressed) {
456 delete nameMap; delete list; delete rabund; delete hcluster;
457 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
458 m->mothurRemove(distFile);
459 m->mothurRemove(overlapFile);
464 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
466 if (seqs[i].seq1 != seqs[i].seq2) {
468 cutoff = hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
470 if (m->control_pressed) {
471 delete nameMap; delete list; delete rabund; delete hcluster;
472 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
473 m->mothurRemove(distFile);
474 m->mothurRemove(overlapFile);
481 rndDist = m->ceilDist(seqs[i].dist, precision);
483 rndDist = m->roundDist(seqs[i].dist, precision);
486 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
487 oldList.setLabel("unique");
490 else if((rndDist != rndPreviousDist)){
492 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
494 if (m->control_pressed) {
495 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
496 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
497 m->mothurRemove(distFile);
498 m->mothurRemove(overlapFile);
503 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
507 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
512 previousDist = seqs[i].dist;
513 rndPreviousDist = rndDist;
515 Seq2Bin = cluster->getSeqtoBin();
516 oldSeq2Bin = Seq2Bin;
520 //inHcluster.close();
522 if(previousDist <= 0.0000){
523 oldList.setLabel("unique");
526 else if(rndPreviousDist<cutoff){
528 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
530 if (m->control_pressed) {
531 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
532 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
533 m->mothurRemove(distFile);
534 m->mothurRemove(overlapFile);
539 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
543 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
549 m->mothurRemove(distFile);
550 m->mothurRemove(overlapFile);
556 if (countfile == "") {
560 if (m->control_pressed) {
562 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
567 m->mothurOutEndLine();
568 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
569 m->mothurOut(listFileName); m->mothurOutEndLine(); outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
570 if (countfile == "") {
571 m->mothurOut(rabundFileName); m->mothurOutEndLine(); outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
572 m->mothurOut(sabundFileName); m->mothurOutEndLine(); outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
574 m->mothurOutEndLine();
576 if (saveCutoff != cutoff) {
577 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
578 else { saveCutoff = m->roundDist(saveCutoff, precision); }
580 m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();
583 //set list file as new current listfile
585 itTypes = outputTypes.find("list");
586 if (itTypes != outputTypes.end()) {
587 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
590 //set rabund file as new current rabundfile
591 itTypes = outputTypes.find("rabund");
592 if (itTypes != outputTypes.end()) {
593 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
596 //set sabund file as new current sabundfile
597 itTypes = outputTypes.find("sabund");
598 if (itTypes != outputTypes.end()) {
599 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
603 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
607 catch(exception& e) {
608 m->errorOut(e, "MGClusterCommand", "execute");
612 //**********************************************************************************************************************
613 void MGClusterCommand::printData(ListVector* mergedList){
615 mergedList->print(listFile);
616 SAbundVector sabund = mergedList->getSAbundVector();
618 if (countfile == "") {
619 mergedList->getRAbundVector().print(rabundFile);
620 sabund.print(sabundFile);
625 catch(exception& e) {
626 m->errorOut(e, "MGClusterCommand", "printData");
630 //**********************************************************************************************************************
631 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
632 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
633 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
635 //create new listvector so you don't overwrite the clustering
636 ListVector* newList = new ListVector(oldList);
642 if (hclusterWanted) {
643 m->openInputFile(overlapFile, inOverlap);
644 if (inOverlap.eof()) { done = true; }
645 }else { if (overlapMatrix.size() == 0) { done = true; } }
648 if (m->control_pressed) {
649 if (hclusterWanted) { inOverlap.close(); }
655 if (!hclusterWanted) {
656 if (count < overlapMatrix.size()) { //do we have another node in the matrix
657 overlapNode = overlapMatrix[count];
661 if (!inOverlap.eof()) {
662 string firstName, secondName;
663 float overlapDistance;
664 inOverlap >> firstName >> secondName >> overlapDistance; m->gobble(inOverlap);
666 //commented out because we check this in readblast already
667 //map<string,int>::iterator itA = nameMap->find(firstName);
668 //map<string,int>::iterator itB = nameMap->find(secondName);
669 //if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
670 //if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
672 //overlapNode.seq1 = itA->second;
673 //overlapNode.seq2 = itB->second;
674 overlapNode.seq1 = nameMap->get(firstName);
675 overlapNode.seq2 = nameMap->get(secondName);
676 overlapNode.dist = overlapDistance;
677 }else { inOverlap.close(); break; }
680 if (overlapNode.dist < dist) {
681 //get names of seqs that overlap
682 string name1 = nameMap->get(overlapNode.seq1);
683 string name2 = nameMap->get(overlapNode.seq2);
685 //use binInfo to find out if they are already in the same bin
686 //map<string, int>::iterator itBin1 = binInfo.find(name1);
687 //map<string, int>::iterator itBin2 = binInfo.find(name2);
689 //if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); }
690 //if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); }
692 //int binKeep = itBin1->second;
693 //int binRemove = itBin2->second;
695 int binKeep = binInfo[name1];
696 int binRemove = binInfo[name2];
698 //if not merge bins and update binInfo
699 if(binKeep != binRemove) {
701 //save names in old bin
702 string names = newList->get(binRemove);
704 //merge bins into name1s bin
705 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
706 newList->set(binRemove, "");
709 while (names.find_first_of(',') != -1) {
711 string name = names.substr(0,names.find_first_of(','));
712 //save name and bin number
713 binInfo[name] = binKeep;
714 names = names.substr(names.find_first_of(',')+1, names.length());
718 binInfo[names] = binKeep;
721 }else { done = true; }
728 catch(exception& e) {
729 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
733 //**********************************************************************************************************************
734 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
737 string sortedDistFile = m->sortFile(unsortedDist, outputDir);
738 m->mothurRemove(unsortedDist); //delete unsorted file
739 distFile = sortedDistFile;
742 string sortedOverlapFile = m->sortFile(unsortedOverlap, outputDir);
743 m->mothurRemove(unsortedOverlap); //delete unsorted file
744 overlapFile = sortedOverlapFile;
746 catch(exception& e) {
747 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
752 //**********************************************************************************************************************
754 void MGClusterCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
756 //vector<string> names = ct.getNamesOfSeqs();
758 //for ( int i; i < ct.getNumGroups(); i++ ) { rav.push_back( ct.getNumSeqs(names[i]) ); }
761 for(int i = 0; i < list->getNumBins(); i++) {
762 vector<string> binNames;
763 string bin = list->get(i);
764 m->splitAtComma(bin, binNames);
766 for (int j = 0; j < binNames.size(); j++) {
767 total += ct->getNumSeqs(binNames[j]);
769 rabund->push_back(total);
774 catch(exception& e) {
775 m->errorOut(e, "MGClusterCommand", "createRabund");
781 //**********************************************************************************************************************