5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
12 //**********************************************************************************************************************
13 vector<string> MGClusterCommand::setParameters(){
15 CommandParameter pblast("blast", "InputTypes", "", "", "none", "none", "none","list",false,true,true); parameters.push_back(pblast);
16 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName","rabund-sabund",false,false,true); parameters.push_back(pname);
17 CommandParameter pcount("count", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pcount);
18 CommandParameter plength("length", "Number", "", "5", "", "", "","",false,false); parameters.push_back(plength);
19 CommandParameter ppenalty("penalty", "Number", "", "0.10", "", "", "","",false,false); parameters.push_back(ppenalty);
20 CommandParameter pcutoff("cutoff", "Number", "", "0.70", "", "", "","",false,false,true); parameters.push_back(pcutoff);
21 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
22 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average", "average", "", "", "","",false,false); parameters.push_back(pmethod);
23 CommandParameter phard("hard", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(phard);
24 CommandParameter pmin("min", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pmin);
25 CommandParameter pmerge("merge", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pmerge);
26 CommandParameter padjust("adjust", "String", "", "F", "", "", "","",false,false); parameters.push_back(padjust);
27 CommandParameter phcluster("hcluster", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(phcluster);
28 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
29 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
31 vector<string> myArray;
32 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
36 m->errorOut(e, "MGClusterCommand", "setParameters");
40 //**********************************************************************************************************************
41 string MGClusterCommand::getHelpString(){
43 string helpString = "";
44 helpString += "The mgcluster command parameter options are blast, name, cutoff, precision, hard, method, merge, min, length, penalty, adjust and hcluster. The blast parameter is required.\n";
45 helpString += "The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n";
46 helpString += "This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n";
47 helpString += "The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n";
48 helpString += "The precision parameter's default value is 100. \n";
49 helpString += "The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then average is assumed.\n";
50 helpString += "The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n";
51 helpString += "The length parameter is used to specify the minimum overlap required. The default is 5.\n";
52 helpString += "The adjust parameter is used to handle missing distances. If you set a cutoff, adjust=f by default. If not, adjust=t by default. Adjust=f, means ignore missing distances and adjust cutoff as needed with the average neighbor method. Adjust=t, will treat missing distances as 1.0. You can also set the value the missing distances should be set to, adjust=0.5 would give missing distances a value of 0.5.\n";
53 helpString += "The penalty parameter is used to adjust the error rate. The default is 0.10.\n";
54 helpString += "The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n";
55 helpString += "The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n";
56 helpString += "The mgcluster command should be in the following format: \n";
57 helpString += "mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n";
58 helpString += "Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n";
62 m->errorOut(e, "MGClusterCommand", "getHelpString");
66 //**********************************************************************************************************************
67 string MGClusterCommand::getOutputPattern(string type) {
71 if (type == "list") { pattern = "[filename],[clustertag],list-[filename],[clustertag],[tag2],list"; }
72 else if (type == "rabund") { pattern = "[filename],[clustertag],rabund"; }
73 else if (type == "sabund") { pattern = "[filename],[clustertag],sabund"; }
74 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
79 m->errorOut(e, "MGClusterCommand", "getOutputPattern");
83 //*******************************************************************************************************************
84 MGClusterCommand::MGClusterCommand(){
86 abort = true; calledHelp = true;
88 vector<string> tempOutNames;
89 outputTypes["list"] = tempOutNames;
90 outputTypes["rabund"] = tempOutNames;
91 outputTypes["sabund"] = tempOutNames;
94 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
98 //**********************************************************************************************************************
99 MGClusterCommand::MGClusterCommand(string option) {
101 abort = false; calledHelp = false;
103 //allow user to run help
104 if(option == "help") { help(); abort = true; calledHelp = true; }
105 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
108 vector<string> myArray = setParameters();
110 OptionParser parser(option);
111 map<string, string> parameters = parser.getParameters();
113 ValidParameters validParameter;
114 map<string,string>::iterator it;
116 //check to make sure all parameters are valid for command
117 for (it = parameters.begin(); it != parameters.end(); it++) {
118 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
121 //initialize outputTypes
122 vector<string> tempOutNames;
123 outputTypes["list"] = tempOutNames;
124 outputTypes["rabund"] = tempOutNames;
125 outputTypes["sabund"] = tempOutNames;
127 //if the user changes the input directory command factory will send this info to us in the output parameter
128 string inputDir = validParameter.validFile(parameters, "inputdir", false);
129 if (inputDir == "not found"){ inputDir = ""; }
132 it = parameters.find("blast");
133 //user has given a template file
134 if(it != parameters.end()){
135 path = m->hasPath(it->second);
136 //if the user has not given a path then, add inputdir. else leave path alone.
137 if (path == "") { parameters["blast"] = inputDir + it->second; }
140 it = parameters.find("name");
141 //user has given a template file
142 if(it != parameters.end()){
143 path = m->hasPath(it->second);
144 //if the user has not given a path then, add inputdir. else leave path alone.
145 if (path == "") { parameters["name"] = inputDir + it->second; }
148 it = parameters.find("count");
149 //user has given a template file
150 if(it != parameters.end()){
151 path = m->hasPath(it->second);
152 //if the user has not given a path then, add inputdir. else leave path alone.
153 if (path == "") { parameters["count"] = inputDir + it->second; }
158 //check for required parameters
159 blastfile = validParameter.validFile(parameters, "blast", true);
160 if (blastfile == "not open") { blastfile = ""; abort = true; }
161 else if (blastfile == "not found") { blastfile = ""; }
163 //if the user changes the output directory command factory will send this info to us in the output parameter
164 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
166 outputDir += m->hasPath(blastfile); //if user entered a file with a path then preserve it
169 namefile = validParameter.validFile(parameters, "name", true);
170 if (namefile == "not open") { abort = true; }
171 else if (namefile == "not found") { namefile = ""; }
172 else { m->setNameFile(namefile); }
174 countfile = validParameter.validFile(parameters, "count", true);
175 if (countfile == "not open") { abort = true; }
176 else if (countfile == "not found") { countfile = ""; }
177 else { m->setCountTableFile(countfile); }
179 if (countfile != "" && namefile != "") { m->mothurOut("[ERROR]: Cannot have both a name file and count file. Please use one or the other."); m->mothurOutEndLine(); abort = true; }
181 if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
183 //check for optional parameter and set defaults
185 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
186 precisionLength = temp.length();
187 m->mothurConvert(temp, precision);
190 temp = validParameter.validFile(parameters, "cutoff", false);
191 if (temp == "not found") { temp = "0.70"; }
192 else { cutoffSet = true; }
193 m->mothurConvert(temp, cutoff);
194 cutoff += (5 / (precision * 10.0));
196 method = validParameter.validFile(parameters, "method", false);
197 if (method == "not found") { method = "average"; }
199 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
200 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
202 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
203 m->mothurConvert(temp, length);
205 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
206 m->mothurConvert(temp, penalty);
208 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
209 minWanted = m->isTrue(temp);
211 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
212 merge = m->isTrue(temp);
214 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
215 hclusterWanted = m->isTrue(temp);
217 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
218 hard = m->isTrue(temp);
220 temp = validParameter.validFile(parameters, "adjust", false); if (temp == "not found") { if (cutoffSet) { temp = "F"; }else { temp="T"; } }
221 if (m->isNumeric1(temp)) { m->mothurConvert(temp, adjust); }
222 else if (m->isTrue(temp)) { adjust = 1.0; }
223 else { adjust = -1.0; }
227 catch(exception& e) {
228 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
232 //**********************************************************************************************************************
233 int MGClusterCommand::execute(){
235 if (abort == true) { if (calledHelp) { return 0; } return 2; }
238 if (namefile != "") {
239 nameMap = new NameAssignment(namefile);
241 }else if (countfile != "") {
242 ct = new CountTable();
243 ct->readTable(countfile, false);
244 nameMap= new NameAssignment();
245 vector<string> tempNames = ct->getNamesOfSeqs();
246 for (int i = 0; i < tempNames.size(); i++) { nameMap->push_back(tempNames[i]); }
247 }else{ nameMap= new NameAssignment(); }
249 string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile));
252 float previousDist = 0.00000;
253 float rndPreviousDist = 0.00000;
255 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
256 //must remember to delete those objects here since readBlast does not
257 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
260 list = new ListVector(nameMap->getListVector());
261 RAbundVector* rabund = NULL;
263 if(countfile != "") {
264 rabund = new RAbundVector();
265 createRabund(ct, list, rabund);
267 rabund = new RAbundVector(list->getRAbundVector());
271 //list = new ListVector(nameMap->getListVector());
272 //rabund = new RAbundVector(list->getRAbundVector());
274 if (m->control_pressed) { outputTypes.clear(); delete nameMap; delete read; delete list; delete rabund; return 0; }
278 map<string, int> Seq2Bin;
279 map<string, int> oldSeq2Bin;
281 if (method == "furthest") { tag = "fn"; }
282 else if (method == "nearest") { tag = "nn"; }
285 map<string, string> variables;
286 variables["[filename]"] = fileroot;
287 variables["[clustertag]"] = tag;
288 string sabundFileName = getOutputFileName("sabund", variables);
289 string rabundFileName = getOutputFileName("rabund", variables);
290 if (countfile != "") { variables["[tag2]"] = "unique_list"; }
291 string listFileName = getOutputFileName("list", variables);
293 if (countfile == "") {
294 m->openOutputFile(sabundFileName, sabundFile);
295 m->openOutputFile(rabundFileName, rabundFile);
297 m->openOutputFile(listFileName, listFile);
299 if (m->control_pressed) {
300 delete nameMap; delete read; delete list; delete rabund;
301 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
306 double saveCutoff = cutoff;
308 if (!hclusterWanted) {
309 //get distmatrix and overlap
310 SparseDistanceMatrix* distMatrix = read->getDistMatrix();
311 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
315 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method, adjust); }
316 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method, adjust); }
317 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method, adjust); }
318 cluster->setMapWanted(true);
319 Seq2Bin = cluster->getSeqtoBin();
320 oldSeq2Bin = Seq2Bin;
322 if (m->control_pressed) {
323 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
324 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
330 //cluster using cluster classes
331 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
333 if (m->debug) { cout << "numNodes=" << distMatrix->getNNodes() << " smallDist = " << distMatrix->getSmallDist() << endl; }
335 cluster->update(cutoff);
337 if (m->control_pressed) {
338 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
339 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
344 float dist = distMatrix->getSmallDist();
347 rndDist = m->ceilDist(dist, precision);
349 rndDist = m->roundDist(dist, precision);
352 if(previousDist <= 0.0000 && dist != previousDist){
353 oldList.setLabel("unique");
356 else if(rndDist != rndPreviousDist){
358 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
360 if (m->control_pressed) {
361 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
362 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
367 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
371 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
377 rndPreviousDist = rndDist;
379 Seq2Bin = cluster->getSeqtoBin();
380 oldSeq2Bin = Seq2Bin;
383 if(previousDist <= 0.0000){
384 oldList.setLabel("unique");
387 else if(rndPreviousDist<cutoff){
389 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
391 if (m->control_pressed) {
392 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
393 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
398 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
402 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
408 overlapMatrix.clear();
412 }else { //use hcluster to cluster
413 //get distmatrix and overlap
414 overlapFile = read->getOverlapFile();
415 distFile = read->getDistFile();
418 //sort the distance and overlap files
419 sortHclusterFiles(distFile, overlapFile);
421 if (m->control_pressed) {
422 delete nameMap; delete list; delete rabund;
423 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
429 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
430 hcluster->setMapWanted(true);
431 Seq2Bin = cluster->getSeqtoBin();
432 oldSeq2Bin = Seq2Bin;
434 vector<seqDist> seqs; seqs.resize(1); // to start loop
435 //ifstream inHcluster;
436 //m->openInputFile(distFile, inHcluster);
438 if (m->control_pressed) {
439 delete nameMap; delete list; delete rabund; delete hcluster;
440 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
445 while (seqs.size() != 0){
447 seqs = hcluster->getSeqs();
449 //to account for cutoff change in average neighbor
450 if (seqs.size() != 0) {
451 if (seqs[0].dist > cutoff) { break; }
454 if (m->control_pressed) {
455 delete nameMap; delete list; delete rabund; delete hcluster;
456 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
457 m->mothurRemove(distFile);
458 m->mothurRemove(overlapFile);
463 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
465 if (seqs[i].seq1 != seqs[i].seq2) {
467 cutoff = hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
469 if (m->control_pressed) {
470 delete nameMap; delete list; delete rabund; delete hcluster;
471 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
472 m->mothurRemove(distFile);
473 m->mothurRemove(overlapFile);
480 rndDist = m->ceilDist(seqs[i].dist, precision);
482 rndDist = m->roundDist(seqs[i].dist, precision);
485 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
486 oldList.setLabel("unique");
489 else if((rndDist != rndPreviousDist)){
491 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
493 if (m->control_pressed) {
494 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
495 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
496 m->mothurRemove(distFile);
497 m->mothurRemove(overlapFile);
502 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
506 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
511 previousDist = seqs[i].dist;
512 rndPreviousDist = rndDist;
514 Seq2Bin = cluster->getSeqtoBin();
515 oldSeq2Bin = Seq2Bin;
519 //inHcluster.close();
521 if(previousDist <= 0.0000){
522 oldList.setLabel("unique");
525 else if(rndPreviousDist<cutoff){
527 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
529 if (m->control_pressed) {
530 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
531 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
532 m->mothurRemove(distFile);
533 m->mothurRemove(overlapFile);
538 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
542 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
548 m->mothurRemove(distFile);
549 m->mothurRemove(overlapFile);
555 if (countfile == "") {
559 if (m->control_pressed) {
561 listFile.close(); if (countfile == "") { rabundFile.close(); sabundFile.close(); m->mothurRemove((fileroot+ tag + ".rabund")); m->mothurRemove((fileroot+ tag + ".sabund")); } m->mothurRemove((fileroot+ tag + ".list"));
566 m->mothurOutEndLine();
567 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
568 m->mothurOut(listFileName); m->mothurOutEndLine(); outputNames.push_back(listFileName); outputTypes["list"].push_back(listFileName);
569 if (countfile == "") {
570 m->mothurOut(rabundFileName); m->mothurOutEndLine(); outputNames.push_back(rabundFileName); outputTypes["rabund"].push_back(rabundFileName);
571 m->mothurOut(sabundFileName); m->mothurOutEndLine(); outputNames.push_back(sabundFileName); outputTypes["sabund"].push_back(sabundFileName);
573 m->mothurOutEndLine();
575 if (saveCutoff != cutoff) {
576 if (hard) { saveCutoff = m->ceilDist(saveCutoff, precision); }
577 else { saveCutoff = m->roundDist(saveCutoff, precision); }
579 m->mothurOut("changed cutoff to " + toString(cutoff)); m->mothurOutEndLine();
582 //set list file as new current listfile
584 itTypes = outputTypes.find("list");
585 if (itTypes != outputTypes.end()) {
586 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
589 //set rabund file as new current rabundfile
590 itTypes = outputTypes.find("rabund");
591 if (itTypes != outputTypes.end()) {
592 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
595 //set sabund file as new current sabundfile
596 itTypes = outputTypes.find("sabund");
597 if (itTypes != outputTypes.end()) {
598 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
602 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
606 catch(exception& e) {
607 m->errorOut(e, "MGClusterCommand", "execute");
611 //**********************************************************************************************************************
612 void MGClusterCommand::printData(ListVector* mergedList){
614 mergedList->print(listFile);
615 SAbundVector sabund = mergedList->getSAbundVector();
617 if (countfile == "") {
618 mergedList->getRAbundVector().print(rabundFile);
619 sabund.print(sabundFile);
624 catch(exception& e) {
625 m->errorOut(e, "MGClusterCommand", "printData");
629 //**********************************************************************************************************************
630 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
631 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
632 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
634 //create new listvector so you don't overwrite the clustering
635 ListVector* newList = new ListVector(oldList);
641 if (hclusterWanted) {
642 m->openInputFile(overlapFile, inOverlap);
643 if (inOverlap.eof()) { done = true; }
644 }else { if (overlapMatrix.size() == 0) { done = true; } }
647 if (m->control_pressed) {
648 if (hclusterWanted) { inOverlap.close(); }
654 if (!hclusterWanted) {
655 if (count < overlapMatrix.size()) { //do we have another node in the matrix
656 overlapNode = overlapMatrix[count];
660 if (!inOverlap.eof()) {
661 string firstName, secondName;
662 float overlapDistance;
663 inOverlap >> firstName >> secondName >> overlapDistance; m->gobble(inOverlap);
665 //commented out because we check this in readblast already
666 //map<string,int>::iterator itA = nameMap->find(firstName);
667 //map<string,int>::iterator itB = nameMap->find(secondName);
668 //if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
669 //if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
671 //overlapNode.seq1 = itA->second;
672 //overlapNode.seq2 = itB->second;
673 overlapNode.seq1 = nameMap->get(firstName);
674 overlapNode.seq2 = nameMap->get(secondName);
675 overlapNode.dist = overlapDistance;
676 }else { inOverlap.close(); break; }
679 if (overlapNode.dist < dist) {
680 //get names of seqs that overlap
681 string name1 = nameMap->get(overlapNode.seq1);
682 string name2 = nameMap->get(overlapNode.seq2);
684 //use binInfo to find out if they are already in the same bin
685 //map<string, int>::iterator itBin1 = binInfo.find(name1);
686 //map<string, int>::iterator itBin2 = binInfo.find(name2);
688 //if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); }
689 //if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); }
691 //int binKeep = itBin1->second;
692 //int binRemove = itBin2->second;
694 int binKeep = binInfo[name1];
695 int binRemove = binInfo[name2];
697 //if not merge bins and update binInfo
698 if(binKeep != binRemove) {
700 //save names in old bin
701 string names = newList->get(binRemove);
703 //merge bins into name1s bin
704 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
705 newList->set(binRemove, "");
708 while (names.find_first_of(',') != -1) {
710 string name = names.substr(0,names.find_first_of(','));
711 //save name and bin number
712 binInfo[name] = binKeep;
713 names = names.substr(names.find_first_of(',')+1, names.length());
717 binInfo[names] = binKeep;
720 }else { done = true; }
727 catch(exception& e) {
728 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
732 //**********************************************************************************************************************
733 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
736 string sortedDistFile = m->sortFile(unsortedDist, outputDir);
737 m->mothurRemove(unsortedDist); //delete unsorted file
738 distFile = sortedDistFile;
741 string sortedOverlapFile = m->sortFile(unsortedOverlap, outputDir);
742 m->mothurRemove(unsortedOverlap); //delete unsorted file
743 overlapFile = sortedOverlapFile;
745 catch(exception& e) {
746 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
751 //**********************************************************************************************************************
753 void MGClusterCommand::createRabund(CountTable*& ct, ListVector*& list, RAbundVector*& rabund){
755 //vector<string> names = ct.getNamesOfSeqs();
757 //for ( int i; i < ct.getNumGroups(); i++ ) { rav.push_back( ct.getNumSeqs(names[i]) ); }
760 for(int i = 0; i < list->getNumBins(); i++) {
761 vector<string> binNames;
762 string bin = list->get(i);
763 m->splitAtComma(bin, binNames);
765 for (int j = 0; j < binNames.size(); j++) {
766 total += ct->getNumSeqs(binNames[j]);
768 rabund->push_back(total);
773 catch(exception& e) {
774 m->errorOut(e, "MGClusterCommand", "createRabund");
780 //**********************************************************************************************************************