5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
12 //**********************************************************************************************************************
13 vector<string> MGClusterCommand::setParameters(){
15 CommandParameter pblast("blast", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pblast);
16 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
17 CommandParameter plength("length", "Number", "", "5", "", "", "",false,false); parameters.push_back(plength);
18 CommandParameter ppenalty("penalty", "Number", "", "0.10", "", "", "",false,false); parameters.push_back(ppenalty);
19 CommandParameter pcutoff("cutoff", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pcutoff);
20 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
21 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average", "average", "", "", "",false,false); parameters.push_back(pmethod);
22 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
23 CommandParameter pmin("min", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pmin);
24 CommandParameter pmerge("merge", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pmerge);
25 CommandParameter phcluster("hcluster", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(phcluster);
26 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
27 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29 vector<string> myArray;
30 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
34 m->errorOut(e, "MGClusterCommand", "setParameters");
38 //**********************************************************************************************************************
39 string MGClusterCommand::getHelpString(){
41 string helpString = "";
42 helpString += "The mgcluster command parameter options are blast, name, cutoff, precision, hard, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n";
43 helpString += "The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n";
44 helpString += "This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n";
45 helpString += "The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n";
46 helpString += "The precision parameter's default value is 100. \n";
47 helpString += "The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then average is assumed.\n";
48 helpString += "The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n";
49 helpString += "The length parameter is used to specify the minimum overlap required. The default is 5.\n";
50 helpString += "The penalty parameter is used to adjust the error rate. The default is 0.10.\n";
51 helpString += "The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n";
52 helpString += "The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n";
53 helpString += "The mgcluster command should be in the following format: \n";
54 helpString += "mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n";
55 helpString += "Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n";
59 m->errorOut(e, "MGClusterCommand", "getHelpString");
63 //**********************************************************************************************************************
64 MGClusterCommand::MGClusterCommand(){
66 abort = true; calledHelp = true;
68 vector<string> tempOutNames;
69 outputTypes["list"] = tempOutNames;
70 outputTypes["rabund"] = tempOutNames;
71 outputTypes["sabund"] = tempOutNames;
74 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
78 //**********************************************************************************************************************
79 MGClusterCommand::MGClusterCommand(string option) {
81 abort = false; calledHelp = false;
83 //allow user to run help
84 if(option == "help") { help(); abort = true; calledHelp = true; }
85 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
88 vector<string> myArray = setParameters();
90 OptionParser parser(option);
91 map<string, string> parameters = parser.getParameters();
93 ValidParameters validParameter;
94 map<string,string>::iterator it;
96 //check to make sure all parameters are valid for command
97 for (it = parameters.begin(); it != parameters.end(); it++) {
98 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
101 //initialize outputTypes
102 vector<string> tempOutNames;
103 outputTypes["list"] = tempOutNames;
104 outputTypes["rabund"] = tempOutNames;
105 outputTypes["sabund"] = tempOutNames;
107 //if the user changes the input directory command factory will send this info to us in the output parameter
108 string inputDir = validParameter.validFile(parameters, "inputdir", false);
109 if (inputDir == "not found"){ inputDir = ""; }
112 it = parameters.find("blast");
113 //user has given a template file
114 if(it != parameters.end()){
115 path = m->hasPath(it->second);
116 //if the user has not given a path then, add inputdir. else leave path alone.
117 if (path == "") { parameters["blast"] = inputDir + it->second; }
120 it = parameters.find("name");
121 //user has given a template file
122 if(it != parameters.end()){
123 path = m->hasPath(it->second);
124 //if the user has not given a path then, add inputdir. else leave path alone.
125 if (path == "") { parameters["name"] = inputDir + it->second; }
130 //check for required parameters
131 blastfile = validParameter.validFile(parameters, "blast", true);
132 if (blastfile == "not open") { blastfile = ""; abort = true; }
133 else if (blastfile == "not found") { blastfile = ""; }
135 //if the user changes the output directory command factory will send this info to us in the output parameter
136 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
138 outputDir += m->hasPath(blastfile); //if user entered a file with a path then preserve it
141 namefile = validParameter.validFile(parameters, "name", true);
142 if (namefile == "not open") { abort = true; }
143 else if (namefile == "not found") { namefile = ""; }
145 if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
147 //check for optional parameter and set defaults
149 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
150 precisionLength = temp.length();
151 convert(temp, precision);
153 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
154 convert(temp, cutoff);
155 cutoff += (5 / (precision * 10.0));
157 method = validParameter.validFile(parameters, "method", false);
158 if (method == "not found") { method = "average"; }
160 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
161 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
163 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
164 convert(temp, length);
166 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
167 convert(temp, penalty);
169 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
170 minWanted = m->isTrue(temp);
172 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
173 merge = m->isTrue(temp);
175 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
176 hclusterWanted = m->isTrue(temp);
178 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
179 hard = m->isTrue(temp);
183 catch(exception& e) {
184 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
188 //**********************************************************************************************************************
189 int MGClusterCommand::execute(){
192 if (abort == true) { if (calledHelp) { return 0; } return 2; }
195 if (namefile != "") {
196 nameMap = new NameAssignment(namefile);
198 }else{ nameMap= new NameAssignment(); }
200 string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile));
203 float previousDist = 0.00000;
204 float rndPreviousDist = 0.00000;
206 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
207 //must remember to delete those objects here since readBlast does not
208 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
211 list = new ListVector(nameMap->getListVector());
212 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
214 if (m->control_pressed) { outputTypes.clear(); delete nameMap; delete read; delete list; delete rabund; return 0; }
218 map<string, int> Seq2Bin;
219 map<string, int> oldSeq2Bin;
221 if (method == "furthest") { tag = "fn"; }
222 else if (method == "nearest") { tag = "nn"; }
226 m->openOutputFile(fileroot+ tag + ".list", listFile);
227 m->openOutputFile(fileroot+ tag + ".rabund", rabundFile);
228 m->openOutputFile(fileroot+ tag + ".sabund", sabundFile);
230 if (m->control_pressed) {
231 delete nameMap; delete read; delete list; delete rabund;
232 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
237 if (!hclusterWanted) {
238 //get distmatrix and overlap
239 SparseMatrix* distMatrix = read->getDistMatrix();
240 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
244 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
245 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
246 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
247 cluster->setMapWanted(true);
248 Seq2Bin = cluster->getSeqtoBin();
249 oldSeq2Bin = Seq2Bin;
251 if (m->control_pressed) {
252 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
253 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
258 //cluster using cluster classes
259 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
261 cluster->update(cutoff);
263 if (m->control_pressed) {
264 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
265 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
270 float dist = distMatrix->getSmallDist();
273 rndDist = m->ceilDist(dist, precision);
275 rndDist = m->roundDist(dist, precision);
278 if(previousDist <= 0.0000 && dist != previousDist){
279 oldList.setLabel("unique");
282 else if(rndDist != rndPreviousDist){
284 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
286 if (m->control_pressed) {
287 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
288 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
293 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
297 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
303 rndPreviousDist = rndDist;
305 Seq2Bin = cluster->getSeqtoBin();
306 oldSeq2Bin = Seq2Bin;
309 if(previousDist <= 0.0000){
310 oldList.setLabel("unique");
313 else if(rndPreviousDist<cutoff){
315 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
317 if (m->control_pressed) {
318 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
319 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
324 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
328 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
334 overlapMatrix.clear();
338 }else { //use hcluster to cluster
339 //get distmatrix and overlap
340 overlapFile = read->getOverlapFile();
341 distFile = read->getDistFile();
344 //sort the distance and overlap files
345 sortHclusterFiles(distFile, overlapFile);
347 if (m->control_pressed) {
348 delete nameMap; delete list; delete rabund;
349 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
355 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
356 hcluster->setMapWanted(true);
357 Seq2Bin = cluster->getSeqtoBin();
358 oldSeq2Bin = Seq2Bin;
360 vector<seqDist> seqs; seqs.resize(1); // to start loop
361 //ifstream inHcluster;
362 //m->openInputFile(distFile, inHcluster);
364 if (m->control_pressed) {
365 delete nameMap; delete list; delete rabund; delete hcluster;
366 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
371 while (seqs.size() != 0){
373 seqs = hcluster->getSeqs();
375 if (m->control_pressed) {
376 delete nameMap; delete list; delete rabund; delete hcluster;
377 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
378 remove(distFile.c_str());
379 remove(overlapFile.c_str());
384 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
386 if (seqs[i].seq1 != seqs[i].seq2) {
388 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
390 if (m->control_pressed) {
391 delete nameMap; delete list; delete rabund; delete hcluster;
392 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
393 remove(distFile.c_str());
394 remove(overlapFile.c_str());
401 rndDist = m->ceilDist(seqs[i].dist, precision);
403 rndDist = m->roundDist(seqs[i].dist, precision);
406 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
407 oldList.setLabel("unique");
410 else if((rndDist != rndPreviousDist)){
412 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
414 if (m->control_pressed) {
415 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
416 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
417 remove(distFile.c_str());
418 remove(overlapFile.c_str());
423 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
427 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
432 previousDist = seqs[i].dist;
433 rndPreviousDist = rndDist;
435 Seq2Bin = cluster->getSeqtoBin();
436 oldSeq2Bin = Seq2Bin;
440 //inHcluster.close();
442 if(previousDist <= 0.0000){
443 oldList.setLabel("unique");
446 else if(rndPreviousDist<cutoff){
448 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
450 if (m->control_pressed) {
451 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
452 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
453 remove(distFile.c_str());
454 remove(overlapFile.c_str());
459 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
463 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
469 remove(distFile.c_str());
470 remove(overlapFile.c_str());
479 if (m->control_pressed) {
481 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
486 m->mothurOutEndLine();
487 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
488 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".list"); outputTypes["list"].push_back(fileroot+ tag + ".list");
489 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
490 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
491 m->mothurOutEndLine();
493 //set list file as new current listfile
495 itTypes = outputTypes.find("list");
496 if (itTypes != outputTypes.end()) {
497 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
500 //set rabund file as new current rabundfile
501 itTypes = outputTypes.find("rabund");
502 if (itTypes != outputTypes.end()) {
503 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
506 //set sabund file as new current sabundfile
507 itTypes = outputTypes.find("sabund");
508 if (itTypes != outputTypes.end()) {
509 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
513 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
517 catch(exception& e) {
518 m->errorOut(e, "MGClusterCommand", "execute");
522 //**********************************************************************************************************************
523 void MGClusterCommand::printData(ListVector* mergedList){
525 mergedList->print(listFile);
526 mergedList->getRAbundVector().print(rabundFile);
528 SAbundVector sabund = mergedList->getSAbundVector();
531 sabund.print(sabundFile);
533 catch(exception& e) {
534 m->errorOut(e, "MGClusterCommand", "printData");
538 //**********************************************************************************************************************
539 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
540 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
541 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
543 //create new listvector so you don't overwrite the clustering
544 ListVector* newList = new ListVector(oldList);
550 if (hclusterWanted) {
551 m->openInputFile(overlapFile, inOverlap);
552 if (inOverlap.eof()) { done = true; }
553 }else { if (overlapMatrix.size() == 0) { done = true; } }
556 if (m->control_pressed) {
557 if (hclusterWanted) { inOverlap.close(); }
563 if (!hclusterWanted) {
564 if (count < overlapMatrix.size()) { //do we have another node in the matrix
565 overlapNode = overlapMatrix[count];
569 if (!inOverlap.eof()) {
570 string firstName, secondName;
571 float overlapDistance;
572 inOverlap >> firstName >> secondName >> overlapDistance; m->gobble(inOverlap);
574 //commented out because we check this in readblast already
575 //map<string,int>::iterator itA = nameMap->find(firstName);
576 //map<string,int>::iterator itB = nameMap->find(secondName);
577 //if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
578 //if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
580 //overlapNode.seq1 = itA->second;
581 //overlapNode.seq2 = itB->second;
582 overlapNode.seq1 = nameMap->get(firstName);
583 overlapNode.seq2 = nameMap->get(secondName);
584 overlapNode.dist = overlapDistance;
585 }else { inOverlap.close(); break; }
588 if (overlapNode.dist < dist) {
589 //get names of seqs that overlap
590 string name1 = nameMap->get(overlapNode.seq1);
591 string name2 = nameMap->get(overlapNode.seq2);
593 //use binInfo to find out if they are already in the same bin
594 //map<string, int>::iterator itBin1 = binInfo.find(name1);
595 //map<string, int>::iterator itBin2 = binInfo.find(name2);
597 //if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); }
598 //if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); }
600 //int binKeep = itBin1->second;
601 //int binRemove = itBin2->second;
603 int binKeep = binInfo[name1];
604 int binRemove = binInfo[name2];
606 //if not merge bins and update binInfo
607 if(binKeep != binRemove) {
609 //save names in old bin
610 string names = newList->get(binRemove);
612 //merge bins into name1s bin
613 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
614 newList->set(binRemove, "");
617 while (names.find_first_of(',') != -1) {
619 string name = names.substr(0,names.find_first_of(','));
620 //save name and bin number
621 binInfo[name] = binKeep;
622 names = names.substr(names.find_first_of(',')+1, names.length());
626 binInfo[names] = binKeep;
629 }else { done = true; }
636 catch(exception& e) {
637 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
641 //**********************************************************************************************************************
642 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
645 string sortedDistFile = m->sortFile(unsortedDist, outputDir);
646 remove(unsortedDist.c_str()); //delete unsorted file
647 distFile = sortedDistFile;
650 string sortedOverlapFile = m->sortFile(unsortedOverlap, outputDir);
651 remove(unsortedOverlap.c_str()); //delete unsorted file
652 overlapFile = sortedOverlapFile;
654 catch(exception& e) {
655 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
660 //**********************************************************************************************************************