5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
12 //**********************************************************************************************************************
13 vector<string> MGClusterCommand::setParameters(){
15 CommandParameter pblast("blast", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pblast);
16 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
17 CommandParameter plength("length", "Number", "", "5", "", "", "",false,false); parameters.push_back(plength);
18 CommandParameter ppenalty("penalty", "Number", "", "0.10", "", "", "",false,false); parameters.push_back(ppenalty);
19 CommandParameter pcutoff("cutoff", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pcutoff);
20 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
21 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average", "average", "", "", "",false,false); parameters.push_back(pmethod);
22 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
23 CommandParameter pmin("min", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pmin);
24 CommandParameter pmerge("merge", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pmerge);
25 CommandParameter phcluster("hcluster", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(phcluster);
26 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
27 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29 vector<string> myArray;
30 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
34 m->errorOut(e, "MGClusterCommand", "setParameters");
38 //**********************************************************************************************************************
39 string MGClusterCommand::getHelpString(){
41 string helpString = "";
42 helpString += "The mgcluster command parameter options are blast, name, cutoff, precision, hard, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n";
43 helpString += "The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n";
44 helpString += "This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n";
45 helpString += "The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n";
46 helpString += "The precision parameter's default value is 100. \n";
47 helpString += "The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then average is assumed.\n";
48 helpString += "The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n";
49 helpString += "The length parameter is used to specify the minimum overlap required. The default is 5.\n";
50 helpString += "The penalty parameter is used to adjust the error rate. The default is 0.10.\n";
51 helpString += "The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n";
52 helpString += "The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n";
53 helpString += "The mgcluster command should be in the following format: \n";
54 helpString += "mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n";
55 helpString += "Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n";
59 m->errorOut(e, "MGClusterCommand", "getHelpString");
63 //**********************************************************************************************************************
64 MGClusterCommand::MGClusterCommand(){
66 abort = true; calledHelp = true;
68 vector<string> tempOutNames;
69 outputTypes["list"] = tempOutNames;
70 outputTypes["rabund"] = tempOutNames;
71 outputTypes["sabund"] = tempOutNames;
74 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
78 //**********************************************************************************************************************
79 MGClusterCommand::MGClusterCommand(string option) {
81 abort = false; calledHelp = false;
83 //allow user to run help
84 if(option == "help") { help(); abort = true; calledHelp = true; }
85 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
88 vector<string> myArray = setParameters();
90 OptionParser parser(option);
91 map<string, string> parameters = parser.getParameters();
93 ValidParameters validParameter;
94 map<string,string>::iterator it;
96 //check to make sure all parameters are valid for command
97 for (it = parameters.begin(); it != parameters.end(); it++) {
98 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
101 //initialize outputTypes
102 vector<string> tempOutNames;
103 outputTypes["list"] = tempOutNames;
104 outputTypes["rabund"] = tempOutNames;
105 outputTypes["sabund"] = tempOutNames;
107 //if the user changes the input directory command factory will send this info to us in the output parameter
108 string inputDir = validParameter.validFile(parameters, "inputdir", false);
109 if (inputDir == "not found"){ inputDir = ""; }
112 it = parameters.find("blast");
113 //user has given a template file
114 if(it != parameters.end()){
115 path = m->hasPath(it->second);
116 //if the user has not given a path then, add inputdir. else leave path alone.
117 if (path == "") { parameters["blast"] = inputDir + it->second; }
120 it = parameters.find("name");
121 //user has given a template file
122 if(it != parameters.end()){
123 path = m->hasPath(it->second);
124 //if the user has not given a path then, add inputdir. else leave path alone.
125 if (path == "") { parameters["name"] = inputDir + it->second; }
130 //check for required parameters
131 blastfile = validParameter.validFile(parameters, "blast", true);
132 if (blastfile == "not open") { blastfile = ""; abort = true; }
133 else if (blastfile == "not found") { blastfile = ""; }
135 //if the user changes the output directory command factory will send this info to us in the output parameter
136 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
138 outputDir += m->hasPath(blastfile); //if user entered a file with a path then preserve it
141 namefile = validParameter.validFile(parameters, "name", true);
142 if (namefile == "not open") { abort = true; }
143 else if (namefile == "not found") { namefile = ""; }
144 else { m->setNameFile(namefile); }
146 if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
148 //check for optional parameter and set defaults
150 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
151 precisionLength = temp.length();
152 convert(temp, precision);
154 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
155 convert(temp, cutoff);
156 cutoff += (5 / (precision * 10.0));
158 method = validParameter.validFile(parameters, "method", false);
159 if (method == "not found") { method = "average"; }
161 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
162 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
164 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
165 convert(temp, length);
167 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
168 convert(temp, penalty);
170 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
171 minWanted = m->isTrue(temp);
173 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
174 merge = m->isTrue(temp);
176 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
177 hclusterWanted = m->isTrue(temp);
179 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
180 hard = m->isTrue(temp);
184 catch(exception& e) {
185 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
189 //**********************************************************************************************************************
190 int MGClusterCommand::execute(){
193 if (abort == true) { if (calledHelp) { return 0; } return 2; }
196 if (namefile != "") {
197 nameMap = new NameAssignment(namefile);
199 }else{ nameMap= new NameAssignment(); }
201 string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile));
204 float previousDist = 0.00000;
205 float rndPreviousDist = 0.00000;
207 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
208 //must remember to delete those objects here since readBlast does not
209 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
212 list = new ListVector(nameMap->getListVector());
213 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
215 if (m->control_pressed) { outputTypes.clear(); delete nameMap; delete read; delete list; delete rabund; return 0; }
219 map<string, int> Seq2Bin;
220 map<string, int> oldSeq2Bin;
222 if (method == "furthest") { tag = "fn"; }
223 else if (method == "nearest") { tag = "nn"; }
227 m->openOutputFile(fileroot+ tag + ".list", listFile);
228 m->openOutputFile(fileroot+ tag + ".rabund", rabundFile);
229 m->openOutputFile(fileroot+ tag + ".sabund", sabundFile);
231 if (m->control_pressed) {
232 delete nameMap; delete read; delete list; delete rabund;
233 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
238 if (!hclusterWanted) {
239 //get distmatrix and overlap
240 SparseMatrix* distMatrix = read->getDistMatrix();
241 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
245 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
246 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
247 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
248 cluster->setMapWanted(true);
249 Seq2Bin = cluster->getSeqtoBin();
250 oldSeq2Bin = Seq2Bin;
252 if (m->control_pressed) {
253 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
254 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
259 //cluster using cluster classes
260 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
262 cluster->update(cutoff);
264 if (m->control_pressed) {
265 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
266 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
271 float dist = distMatrix->getSmallDist();
274 rndDist = m->ceilDist(dist, precision);
276 rndDist = m->roundDist(dist, precision);
279 if(previousDist <= 0.0000 && dist != previousDist){
280 oldList.setLabel("unique");
283 else if(rndDist != rndPreviousDist){
285 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
287 if (m->control_pressed) {
288 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
289 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
294 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
298 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
304 rndPreviousDist = rndDist;
306 Seq2Bin = cluster->getSeqtoBin();
307 oldSeq2Bin = Seq2Bin;
310 if(previousDist <= 0.0000){
311 oldList.setLabel("unique");
314 else if(rndPreviousDist<cutoff){
316 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
318 if (m->control_pressed) {
319 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
320 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
325 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
329 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
335 overlapMatrix.clear();
339 }else { //use hcluster to cluster
340 //get distmatrix and overlap
341 overlapFile = read->getOverlapFile();
342 distFile = read->getDistFile();
345 //sort the distance and overlap files
346 sortHclusterFiles(distFile, overlapFile);
348 if (m->control_pressed) {
349 delete nameMap; delete list; delete rabund;
350 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
356 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
357 hcluster->setMapWanted(true);
358 Seq2Bin = cluster->getSeqtoBin();
359 oldSeq2Bin = Seq2Bin;
361 vector<seqDist> seqs; seqs.resize(1); // to start loop
362 //ifstream inHcluster;
363 //m->openInputFile(distFile, inHcluster);
365 if (m->control_pressed) {
366 delete nameMap; delete list; delete rabund; delete hcluster;
367 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
372 while (seqs.size() != 0){
374 seqs = hcluster->getSeqs();
376 if (m->control_pressed) {
377 delete nameMap; delete list; delete rabund; delete hcluster;
378 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
379 remove(distFile.c_str());
380 remove(overlapFile.c_str());
385 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
387 if (seqs[i].seq1 != seqs[i].seq2) {
389 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
391 if (m->control_pressed) {
392 delete nameMap; delete list; delete rabund; delete hcluster;
393 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
394 remove(distFile.c_str());
395 remove(overlapFile.c_str());
402 rndDist = m->ceilDist(seqs[i].dist, precision);
404 rndDist = m->roundDist(seqs[i].dist, precision);
407 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
408 oldList.setLabel("unique");
411 else if((rndDist != rndPreviousDist)){
413 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
415 if (m->control_pressed) {
416 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
417 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
418 remove(distFile.c_str());
419 remove(overlapFile.c_str());
424 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
428 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
433 previousDist = seqs[i].dist;
434 rndPreviousDist = rndDist;
436 Seq2Bin = cluster->getSeqtoBin();
437 oldSeq2Bin = Seq2Bin;
441 //inHcluster.close();
443 if(previousDist <= 0.0000){
444 oldList.setLabel("unique");
447 else if(rndPreviousDist<cutoff){
449 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
451 if (m->control_pressed) {
452 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
453 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
454 remove(distFile.c_str());
455 remove(overlapFile.c_str());
460 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
464 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
470 remove(distFile.c_str());
471 remove(overlapFile.c_str());
480 if (m->control_pressed) {
482 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
487 m->mothurOutEndLine();
488 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
489 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".list"); outputTypes["list"].push_back(fileroot+ tag + ".list");
490 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
491 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
492 m->mothurOutEndLine();
494 //set list file as new current listfile
496 itTypes = outputTypes.find("list");
497 if (itTypes != outputTypes.end()) {
498 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
501 //set rabund file as new current rabundfile
502 itTypes = outputTypes.find("rabund");
503 if (itTypes != outputTypes.end()) {
504 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
507 //set sabund file as new current sabundfile
508 itTypes = outputTypes.find("sabund");
509 if (itTypes != outputTypes.end()) {
510 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
514 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
518 catch(exception& e) {
519 m->errorOut(e, "MGClusterCommand", "execute");
523 //**********************************************************************************************************************
524 void MGClusterCommand::printData(ListVector* mergedList){
526 mergedList->print(listFile);
527 mergedList->getRAbundVector().print(rabundFile);
529 SAbundVector sabund = mergedList->getSAbundVector();
532 sabund.print(sabundFile);
534 catch(exception& e) {
535 m->errorOut(e, "MGClusterCommand", "printData");
539 //**********************************************************************************************************************
540 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
541 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
542 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
544 //create new listvector so you don't overwrite the clustering
545 ListVector* newList = new ListVector(oldList);
551 if (hclusterWanted) {
552 m->openInputFile(overlapFile, inOverlap);
553 if (inOverlap.eof()) { done = true; }
554 }else { if (overlapMatrix.size() == 0) { done = true; } }
557 if (m->control_pressed) {
558 if (hclusterWanted) { inOverlap.close(); }
564 if (!hclusterWanted) {
565 if (count < overlapMatrix.size()) { //do we have another node in the matrix
566 overlapNode = overlapMatrix[count];
570 if (!inOverlap.eof()) {
571 string firstName, secondName;
572 float overlapDistance;
573 inOverlap >> firstName >> secondName >> overlapDistance; m->gobble(inOverlap);
575 //commented out because we check this in readblast already
576 //map<string,int>::iterator itA = nameMap->find(firstName);
577 //map<string,int>::iterator itB = nameMap->find(secondName);
578 //if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
579 //if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
581 //overlapNode.seq1 = itA->second;
582 //overlapNode.seq2 = itB->second;
583 overlapNode.seq1 = nameMap->get(firstName);
584 overlapNode.seq2 = nameMap->get(secondName);
585 overlapNode.dist = overlapDistance;
586 }else { inOverlap.close(); break; }
589 if (overlapNode.dist < dist) {
590 //get names of seqs that overlap
591 string name1 = nameMap->get(overlapNode.seq1);
592 string name2 = nameMap->get(overlapNode.seq2);
594 //use binInfo to find out if they are already in the same bin
595 //map<string, int>::iterator itBin1 = binInfo.find(name1);
596 //map<string, int>::iterator itBin2 = binInfo.find(name2);
598 //if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); }
599 //if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); }
601 //int binKeep = itBin1->second;
602 //int binRemove = itBin2->second;
604 int binKeep = binInfo[name1];
605 int binRemove = binInfo[name2];
607 //if not merge bins and update binInfo
608 if(binKeep != binRemove) {
610 //save names in old bin
611 string names = newList->get(binRemove);
613 //merge bins into name1s bin
614 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
615 newList->set(binRemove, "");
618 while (names.find_first_of(',') != -1) {
620 string name = names.substr(0,names.find_first_of(','));
621 //save name and bin number
622 binInfo[name] = binKeep;
623 names = names.substr(names.find_first_of(',')+1, names.length());
627 binInfo[names] = binKeep;
630 }else { done = true; }
637 catch(exception& e) {
638 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
642 //**********************************************************************************************************************
643 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
646 string sortedDistFile = m->sortFile(unsortedDist, outputDir);
647 remove(unsortedDist.c_str()); //delete unsorted file
648 distFile = sortedDistFile;
651 string sortedOverlapFile = m->sortFile(unsortedOverlap, outputDir);
652 remove(unsortedOverlap.c_str()); //delete unsorted file
653 overlapFile = sortedOverlapFile;
655 catch(exception& e) {
656 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
661 //**********************************************************************************************************************