5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
12 //**********************************************************************************************************************
13 vector<string> MGClusterCommand::setParameters(){
15 CommandParameter pblast("blast", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pblast);
16 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
17 CommandParameter plength("length", "Number", "", "5", "", "", "",false,false); parameters.push_back(plength);
18 CommandParameter ppenalty("penalty", "Number", "", "0.10", "", "", "",false,false); parameters.push_back(ppenalty);
19 CommandParameter pcutoff("cutoff", "Number", "", "0.70", "", "", "",false,false); parameters.push_back(pcutoff);
20 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
21 CommandParameter pmethod("method", "Multiple", "furthest-nearest-average", "average", "", "", "",false,false); parameters.push_back(pmethod);
22 CommandParameter phard("hard", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(phard);
23 CommandParameter pmin("min", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pmin);
24 CommandParameter pmerge("merge", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pmerge);
25 CommandParameter phcluster("hcluster", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(phcluster);
26 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
27 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29 vector<string> myArray;
30 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
34 m->errorOut(e, "MGClusterCommand", "setParameters");
38 //**********************************************************************************************************************
39 string MGClusterCommand::getHelpString(){
41 string helpString = "";
42 helpString += "The mgcluster command parameter options are blast, name, cutoff, precision, hard, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n";
43 helpString += "The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n";
44 helpString += "This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n";
45 helpString += "The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n";
46 helpString += "The precision parameter's default value is 100. \n";
47 helpString += "The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then average is assumed.\n";
48 helpString += "The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n";
49 helpString += "The length parameter is used to specify the minimum overlap required. The default is 5.\n";
50 helpString += "The penalty parameter is used to adjust the error rate. The default is 0.10.\n";
51 helpString += "The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n";
52 helpString += "The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n";
53 helpString += "The mgcluster command should be in the following format: \n";
54 helpString += "mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n";
55 helpString += "Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n";
59 m->errorOut(e, "MGClusterCommand", "getHelpString");
63 //**********************************************************************************************************************
64 MGClusterCommand::MGClusterCommand(){
66 abort = true; calledHelp = true;
68 vector<string> tempOutNames;
69 outputTypes["list"] = tempOutNames;
70 outputTypes["rabund"] = tempOutNames;
71 outputTypes["sabund"] = tempOutNames;
74 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
78 //**********************************************************************************************************************
79 MGClusterCommand::MGClusterCommand(string option) {
81 abort = false; calledHelp = false;
83 //allow user to run help
84 if(option == "help") { help(); abort = true; calledHelp = true; }
87 vector<string> myArray = setParameters();
89 OptionParser parser(option);
90 map<string, string> parameters = parser.getParameters();
92 ValidParameters validParameter;
93 map<string,string>::iterator it;
95 //check to make sure all parameters are valid for command
96 for (it = parameters.begin(); it != parameters.end(); it++) {
97 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
100 //initialize outputTypes
101 vector<string> tempOutNames;
102 outputTypes["list"] = tempOutNames;
103 outputTypes["rabund"] = tempOutNames;
104 outputTypes["sabund"] = tempOutNames;
106 //if the user changes the input directory command factory will send this info to us in the output parameter
107 string inputDir = validParameter.validFile(parameters, "inputdir", false);
108 if (inputDir == "not found"){ inputDir = ""; }
111 it = parameters.find("blast");
112 //user has given a template file
113 if(it != parameters.end()){
114 path = m->hasPath(it->second);
115 //if the user has not given a path then, add inputdir. else leave path alone.
116 if (path == "") { parameters["blast"] = inputDir + it->second; }
119 it = parameters.find("name");
120 //user has given a template file
121 if(it != parameters.end()){
122 path = m->hasPath(it->second);
123 //if the user has not given a path then, add inputdir. else leave path alone.
124 if (path == "") { parameters["name"] = inputDir + it->second; }
129 //check for required parameters
130 blastfile = validParameter.validFile(parameters, "blast", true);
131 if (blastfile == "not open") { blastfile = ""; abort = true; }
132 else if (blastfile == "not found") { blastfile = ""; }
134 //if the user changes the output directory command factory will send this info to us in the output parameter
135 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
137 outputDir += m->hasPath(blastfile); //if user entered a file with a path then preserve it
140 namefile = validParameter.validFile(parameters, "name", true);
141 if (namefile == "not open") { abort = true; }
142 else if (namefile == "not found") { namefile = ""; }
144 if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
146 //check for optional parameter and set defaults
148 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
149 precisionLength = temp.length();
150 convert(temp, precision);
152 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
153 convert(temp, cutoff);
154 cutoff += (5 / (precision * 10.0));
156 method = validParameter.validFile(parameters, "method", false);
157 if (method == "not found") { method = "average"; }
159 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
160 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
162 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
163 convert(temp, length);
165 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
166 convert(temp, penalty);
168 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
169 minWanted = m->isTrue(temp);
171 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
172 merge = m->isTrue(temp);
174 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
175 hclusterWanted = m->isTrue(temp);
177 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "T"; }
178 hard = m->isTrue(temp);
182 catch(exception& e) {
183 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
187 //**********************************************************************************************************************
188 int MGClusterCommand::execute(){
191 if (abort == true) { if (calledHelp) { return 0; } return 2; }
194 if (namefile != "") {
195 nameMap = new NameAssignment(namefile);
197 }else{ nameMap= new NameAssignment(); }
199 string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile));
202 float previousDist = 0.00000;
203 float rndPreviousDist = 0.00000;
205 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
206 //must remember to delete those objects here since readBlast does not
207 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
210 list = new ListVector(nameMap->getListVector());
211 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
213 if (m->control_pressed) { outputTypes.clear(); delete nameMap; delete read; delete list; delete rabund; return 0; }
217 map<string, int> Seq2Bin;
218 map<string, int> oldSeq2Bin;
220 if (method == "furthest") { tag = "fn"; }
221 else if (method == "nearest") { tag = "nn"; }
225 m->openOutputFile(fileroot+ tag + ".list", listFile);
226 m->openOutputFile(fileroot+ tag + ".rabund", rabundFile);
227 m->openOutputFile(fileroot+ tag + ".sabund", sabundFile);
229 if (m->control_pressed) {
230 delete nameMap; delete read; delete list; delete rabund;
231 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
236 if (!hclusterWanted) {
237 //get distmatrix and overlap
238 SparseMatrix* distMatrix = read->getDistMatrix();
239 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
243 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
244 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
245 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
246 cluster->setMapWanted(true);
247 Seq2Bin = cluster->getSeqtoBin();
248 oldSeq2Bin = Seq2Bin;
250 if (m->control_pressed) {
251 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
252 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
257 //cluster using cluster classes
258 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
260 cluster->update(cutoff);
262 if (m->control_pressed) {
263 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
264 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
269 float dist = distMatrix->getSmallDist();
272 rndDist = m->ceilDist(dist, precision);
274 rndDist = m->roundDist(dist, precision);
277 if(previousDist <= 0.0000 && dist != previousDist){
278 oldList.setLabel("unique");
281 else if(rndDist != rndPreviousDist){
283 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
285 if (m->control_pressed) {
286 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
287 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
292 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
296 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
302 rndPreviousDist = rndDist;
304 Seq2Bin = cluster->getSeqtoBin();
305 oldSeq2Bin = Seq2Bin;
308 if(previousDist <= 0.0000){
309 oldList.setLabel("unique");
312 else if(rndPreviousDist<cutoff){
314 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
316 if (m->control_pressed) {
317 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
318 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
323 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
327 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
333 overlapMatrix.clear();
337 }else { //use hcluster to cluster
338 //get distmatrix and overlap
339 overlapFile = read->getOverlapFile();
340 distFile = read->getDistFile();
343 //sort the distance and overlap files
344 sortHclusterFiles(distFile, overlapFile);
346 if (m->control_pressed) {
347 delete nameMap; delete list; delete rabund;
348 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
354 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
355 hcluster->setMapWanted(true);
356 Seq2Bin = cluster->getSeqtoBin();
357 oldSeq2Bin = Seq2Bin;
359 vector<seqDist> seqs; seqs.resize(1); // to start loop
360 //ifstream inHcluster;
361 //m->openInputFile(distFile, inHcluster);
363 if (m->control_pressed) {
364 delete nameMap; delete list; delete rabund; delete hcluster;
365 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
370 while (seqs.size() != 0){
372 seqs = hcluster->getSeqs();
374 if (m->control_pressed) {
375 delete nameMap; delete list; delete rabund; delete hcluster;
376 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
377 remove(distFile.c_str());
378 remove(overlapFile.c_str());
383 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
385 if (seqs[i].seq1 != seqs[i].seq2) {
387 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
389 if (m->control_pressed) {
390 delete nameMap; delete list; delete rabund; delete hcluster;
391 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
392 remove(distFile.c_str());
393 remove(overlapFile.c_str());
400 rndDist = m->ceilDist(seqs[i].dist, precision);
402 rndDist = m->roundDist(seqs[i].dist, precision);
405 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
406 oldList.setLabel("unique");
409 else if((rndDist != rndPreviousDist)){
411 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
413 if (m->control_pressed) {
414 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
415 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
416 remove(distFile.c_str());
417 remove(overlapFile.c_str());
422 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
426 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
431 previousDist = seqs[i].dist;
432 rndPreviousDist = rndDist;
434 Seq2Bin = cluster->getSeqtoBin();
435 oldSeq2Bin = Seq2Bin;
439 //inHcluster.close();
441 if(previousDist <= 0.0000){
442 oldList.setLabel("unique");
445 else if(rndPreviousDist<cutoff){
447 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
449 if (m->control_pressed) {
450 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
451 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
452 remove(distFile.c_str());
453 remove(overlapFile.c_str());
458 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
462 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
468 remove(distFile.c_str());
469 remove(overlapFile.c_str());
478 if (m->control_pressed) {
480 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
485 m->mothurOutEndLine();
486 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
487 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".list"); outputTypes["list"].push_back(fileroot+ tag + ".list");
488 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
489 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
490 m->mothurOutEndLine();
492 //set list file as new current listfile
494 itTypes = outputTypes.find("list");
495 if (itTypes != outputTypes.end()) {
496 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
499 //set rabund file as new current rabundfile
500 itTypes = outputTypes.find("rabund");
501 if (itTypes != outputTypes.end()) {
502 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setRabundFile(current); }
505 //set sabund file as new current sabundfile
506 itTypes = outputTypes.find("sabund");
507 if (itTypes != outputTypes.end()) {
508 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setSabundFile(current); }
512 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
516 catch(exception& e) {
517 m->errorOut(e, "MGClusterCommand", "execute");
521 //**********************************************************************************************************************
522 void MGClusterCommand::printData(ListVector* mergedList){
524 mergedList->print(listFile);
525 mergedList->getRAbundVector().print(rabundFile);
527 SAbundVector sabund = mergedList->getSAbundVector();
530 sabund.print(sabundFile);
532 catch(exception& e) {
533 m->errorOut(e, "MGClusterCommand", "printData");
537 //**********************************************************************************************************************
538 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
539 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
540 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
542 //create new listvector so you don't overwrite the clustering
543 ListVector* newList = new ListVector(oldList);
549 if (hclusterWanted) {
550 m->openInputFile(overlapFile, inOverlap);
551 if (inOverlap.eof()) { done = true; }
552 }else { if (overlapMatrix.size() == 0) { done = true; } }
555 if (m->control_pressed) {
556 if (hclusterWanted) { inOverlap.close(); }
562 if (!hclusterWanted) {
563 if (count < overlapMatrix.size()) { //do we have another node in the matrix
564 overlapNode = overlapMatrix[count];
568 if (!inOverlap.eof()) {
569 string firstName, secondName;
570 float overlapDistance;
571 inOverlap >> firstName >> secondName >> overlapDistance; m->gobble(inOverlap);
573 //commented out because we check this in readblast already
574 //map<string,int>::iterator itA = nameMap->find(firstName);
575 //map<string,int>::iterator itB = nameMap->find(secondName);
576 //if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
577 //if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
579 //overlapNode.seq1 = itA->second;
580 //overlapNode.seq2 = itB->second;
581 overlapNode.seq1 = nameMap->get(firstName);
582 overlapNode.seq2 = nameMap->get(secondName);
583 overlapNode.dist = overlapDistance;
584 }else { inOverlap.close(); break; }
587 if (overlapNode.dist < dist) {
588 //get names of seqs that overlap
589 string name1 = nameMap->get(overlapNode.seq1);
590 string name2 = nameMap->get(overlapNode.seq2);
592 //use binInfo to find out if they are already in the same bin
593 //map<string, int>::iterator itBin1 = binInfo.find(name1);
594 //map<string, int>::iterator itBin2 = binInfo.find(name2);
596 //if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); }
597 //if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); }
599 //int binKeep = itBin1->second;
600 //int binRemove = itBin2->second;
602 int binKeep = binInfo[name1];
603 int binRemove = binInfo[name2];
605 //if not merge bins and update binInfo
606 if(binKeep != binRemove) {
608 //save names in old bin
609 string names = newList->get(binRemove);
611 //merge bins into name1s bin
612 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
613 newList->set(binRemove, "");
616 while (names.find_first_of(',') != -1) {
618 string name = names.substr(0,names.find_first_of(','));
619 //save name and bin number
620 binInfo[name] = binKeep;
621 names = names.substr(names.find_first_of(',')+1, names.length());
625 binInfo[names] = binKeep;
628 }else { done = true; }
635 catch(exception& e) {
636 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
640 //**********************************************************************************************************************
641 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
644 string sortedDistFile = m->sortFile(unsortedDist, outputDir);
645 remove(unsortedDist.c_str()); //delete unsorted file
646 distFile = sortedDistFile;
649 string sortedOverlapFile = m->sortFile(unsortedOverlap, outputDir);
650 remove(unsortedOverlap.c_str()); //delete unsorted file
651 overlapFile = sortedOverlapFile;
653 catch(exception& e) {
654 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
659 //**********************************************************************************************************************