5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
13 //**********************************************************************************************************************
14 vector<string> MGClusterCommand::getValidParameters(){
16 string Array[] = {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
17 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
21 m->errorOut(e, "MGClusterCommand", "getValidParameters");
25 //**********************************************************************************************************************
26 MGClusterCommand::MGClusterCommand(){
29 //initialize outputTypes
30 vector<string> tempOutNames;
31 outputTypes["list"] = tempOutNames;
32 outputTypes["rabund"] = tempOutNames;
33 outputTypes["sabund"] = tempOutNames;
36 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
40 //**********************************************************************************************************************
41 vector<string> MGClusterCommand::getRequiredParameters(){
43 string Array[] = {"blast"};
44 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
48 m->errorOut(e, "MGClusterCommand", "getRequiredParameters");
52 //**********************************************************************************************************************
53 vector<string> MGClusterCommand::getRequiredFiles(){
55 vector<string> myArray;
59 m->errorOut(e, "MGClusterCommand", "getRequiredFiles");
63 //**********************************************************************************************************************
64 MGClusterCommand::MGClusterCommand(string option) {
66 globaldata = GlobalData::getInstance();
69 //allow user to run help
70 if(option == "help") { help(); abort = true; }
73 //valid paramters for this command
74 string Array[] = {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
75 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
77 OptionParser parser(option);
78 map<string, string> parameters = parser.getParameters();
80 ValidParameters validParameter;
81 map<string,string>::iterator it;
83 //check to make sure all parameters are valid for command
84 for (it = parameters.begin(); it != parameters.end(); it++) {
85 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
88 //initialize outputTypes
89 vector<string> tempOutNames;
90 outputTypes["list"] = tempOutNames;
91 outputTypes["rabund"] = tempOutNames;
92 outputTypes["sabund"] = tempOutNames;
94 //if the user changes the input directory command factory will send this info to us in the output parameter
95 string inputDir = validParameter.validFile(parameters, "inputdir", false);
96 if (inputDir == "not found"){ inputDir = ""; }
99 it = parameters.find("blast");
100 //user has given a template file
101 if(it != parameters.end()){
102 path = m->hasPath(it->second);
103 //if the user has not given a path then, add inputdir. else leave path alone.
104 if (path == "") { parameters["blast"] = inputDir + it->second; }
107 it = parameters.find("name");
108 //user has given a template file
109 if(it != parameters.end()){
110 path = m->hasPath(it->second);
111 //if the user has not given a path then, add inputdir. else leave path alone.
112 if (path == "") { parameters["name"] = inputDir + it->second; }
117 //check for required parameters
118 blastfile = validParameter.validFile(parameters, "blast", true);
119 if (blastfile == "not open") { abort = true; }
120 else if (blastfile == "not found") { blastfile = ""; }
122 //if the user changes the output directory command factory will send this info to us in the output parameter
123 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
125 outputDir += m->hasPath(blastfile); //if user entered a file with a path then preserve it
128 namefile = validParameter.validFile(parameters, "name", true);
129 if (namefile == "not open") { abort = true; }
130 else if (namefile == "not found") { namefile = ""; }
132 if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
134 //check for optional parameter and set defaults
136 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
137 precisionLength = temp.length();
138 convert(temp, precision);
140 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
141 convert(temp, cutoff);
142 cutoff += (5 / (precision * 10.0));
144 method = validParameter.validFile(parameters, "method", false);
145 if (method == "not found") { method = "furthest"; }
147 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
148 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
150 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
151 convert(temp, length);
153 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
154 convert(temp, penalty);
156 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
157 minWanted = m->isTrue(temp);
159 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
160 merge = m->isTrue(temp);
162 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
163 hclusterWanted = m->isTrue(temp);
165 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
166 hard = m->isTrue(temp);
170 catch(exception& e) {
171 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
175 //**********************************************************************************************************************
177 void MGClusterCommand::help(){
179 m->mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
180 m->mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
181 m->mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
182 m->mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
183 m->mothurOut("The precision parameter's default value is 100. \n");
184 m->mothurOut("The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
185 m->mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
186 m->mothurOut("The length parameter is used to specify the minimum overlap required. The default is 5.\n");
187 m->mothurOut("The penalty parameter is used to adjust the error rate. The default is 0.10.\n");
188 m->mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n");
189 m->mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
190 m->mothurOut("The mgcluster command should be in the following format: \n");
191 m->mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
192 m->mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
194 catch(exception& e) {
195 m->errorOut(e, "MGClusterCommand", "help");
199 //**********************************************************************************************************************
200 MGClusterCommand::~MGClusterCommand(){}
201 //**********************************************************************************************************************
202 int MGClusterCommand::execute(){
205 if (abort == true) { return 0; }
208 if (namefile != "") {
209 nameMap = new NameAssignment(namefile);
211 }else{ nameMap= new NameAssignment(); }
213 string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile));
216 float previousDist = 0.00000;
217 float rndPreviousDist = 0.00000;
219 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
220 //must remember to delete those objects here since readBlast does not
221 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
224 list = new ListVector(nameMap->getListVector());
225 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
227 if (m->control_pressed) { outputTypes.clear(); delete nameMap; delete read; delete list; delete rabund; return 0; }
231 map<string, int> Seq2Bin;
232 map<string, int> oldSeq2Bin;
234 if (method == "furthest") { tag = "fn"; }
235 else if (method == "nearest") { tag = "nn"; }
239 m->openOutputFile(fileroot+ tag + ".list", listFile);
240 m->openOutputFile(fileroot+ tag + ".rabund", rabundFile);
241 m->openOutputFile(fileroot+ tag + ".sabund", sabundFile);
243 if (m->control_pressed) {
244 delete nameMap; delete read; delete list; delete rabund;
245 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
250 if (!hclusterWanted) {
251 //get distmatrix and overlap
252 SparseMatrix* distMatrix = read->getDistMatrix();
253 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
257 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
258 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
259 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
260 cluster->setMapWanted(true);
261 Seq2Bin = cluster->getSeqtoBin();
262 oldSeq2Bin = Seq2Bin;
264 if (m->control_pressed) {
265 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
266 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
271 //cluster using cluster classes
272 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
274 cluster->update(cutoff);
276 if (m->control_pressed) {
277 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
278 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
283 float dist = distMatrix->getSmallDist();
286 rndDist = m->ceilDist(dist, precision);
288 rndDist = m->roundDist(dist, precision);
291 if(previousDist <= 0.0000 && dist != previousDist){
292 oldList.setLabel("unique");
295 else if(rndDist != rndPreviousDist){
297 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
299 if (m->control_pressed) {
300 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
301 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
306 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
310 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
316 rndPreviousDist = rndDist;
318 Seq2Bin = cluster->getSeqtoBin();
319 oldSeq2Bin = Seq2Bin;
322 if(previousDist <= 0.0000){
323 oldList.setLabel("unique");
326 else if(rndPreviousDist<cutoff){
328 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
330 if (m->control_pressed) {
331 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
332 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
337 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
341 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
347 overlapMatrix.clear();
351 }else { //use hcluster to cluster
352 //get distmatrix and overlap
353 overlapFile = read->getOverlapFile();
354 distFile = read->getDistFile();
357 //sort the distance and overlap files
358 sortHclusterFiles(distFile, overlapFile);
360 if (m->control_pressed) {
361 delete nameMap; delete list; delete rabund;
362 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
368 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
369 hcluster->setMapWanted(true);
370 Seq2Bin = cluster->getSeqtoBin();
371 oldSeq2Bin = Seq2Bin;
373 vector<seqDist> seqs; seqs.resize(1); // to start loop
374 //ifstream inHcluster;
375 //m->openInputFile(distFile, inHcluster);
377 if (m->control_pressed) {
378 delete nameMap; delete list; delete rabund; delete hcluster;
379 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
384 while (seqs.size() != 0){
386 seqs = hcluster->getSeqs();
388 if (m->control_pressed) {
389 delete nameMap; delete list; delete rabund; delete hcluster;
390 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
391 remove(distFile.c_str());
392 remove(overlapFile.c_str());
397 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
399 if (seqs[i].seq1 != seqs[i].seq2) {
401 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
403 if (m->control_pressed) {
404 delete nameMap; delete list; delete rabund; delete hcluster;
405 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
406 remove(distFile.c_str());
407 remove(overlapFile.c_str());
414 rndDist = m->ceilDist(seqs[i].dist, precision);
416 rndDist = m->roundDist(seqs[i].dist, precision);
419 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
420 oldList.setLabel("unique");
423 else if((rndDist != rndPreviousDist)){
425 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
427 if (m->control_pressed) {
428 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
429 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
430 remove(distFile.c_str());
431 remove(overlapFile.c_str());
436 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
440 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
445 previousDist = seqs[i].dist;
446 rndPreviousDist = rndDist;
448 Seq2Bin = cluster->getSeqtoBin();
449 oldSeq2Bin = Seq2Bin;
453 //inHcluster.close();
455 if(previousDist <= 0.0000){
456 oldList.setLabel("unique");
459 else if(rndPreviousDist<cutoff){
461 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
463 if (m->control_pressed) {
464 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
465 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
466 remove(distFile.c_str());
467 remove(overlapFile.c_str());
472 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
476 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
482 remove(distFile.c_str());
483 remove(overlapFile.c_str());
492 globaldata->setListFile(fileroot+ tag + ".list");
493 globaldata->setFormat("list");
495 if (m->control_pressed) {
497 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
498 globaldata->setListFile("");
499 globaldata->setFormat("");
504 m->mothurOutEndLine();
505 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
506 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".list"); outputTypes["list"].push_back(fileroot+ tag + ".list");
507 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
508 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
509 m->mothurOutEndLine();
511 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
515 catch(exception& e) {
516 m->errorOut(e, "MGClusterCommand", "execute");
520 //**********************************************************************************************************************
521 void MGClusterCommand::printData(ListVector* mergedList){
523 mergedList->print(listFile);
524 mergedList->getRAbundVector().print(rabundFile);
526 SAbundVector sabund = mergedList->getSAbundVector();
529 sabund.print(sabundFile);
531 catch(exception& e) {
532 m->errorOut(e, "MGClusterCommand", "printData");
536 //**********************************************************************************************************************
537 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
538 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
539 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
541 //create new listvector so you don't overwrite the clustering
542 ListVector* newList = new ListVector(oldList);
548 if (hclusterWanted) {
549 m->openInputFile(overlapFile, inOverlap);
550 if (inOverlap.eof()) { done = true; }
551 }else { if (overlapMatrix.size() == 0) { done = true; } }
554 if (m->control_pressed) {
555 if (hclusterWanted) { inOverlap.close(); }
561 if (!hclusterWanted) {
562 if (count < overlapMatrix.size()) { //do we have another node in the matrix
563 overlapNode = overlapMatrix[count];
567 if (!inOverlap.eof()) {
568 string firstName, secondName;
569 float overlapDistance;
570 inOverlap >> firstName >> secondName >> overlapDistance; m->gobble(inOverlap);
572 //commented out because we check this in readblast already
573 //map<string,int>::iterator itA = nameMap->find(firstName);
574 //map<string,int>::iterator itB = nameMap->find(secondName);
575 //if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
576 //if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
578 //overlapNode.seq1 = itA->second;
579 //overlapNode.seq2 = itB->second;
580 overlapNode.seq1 = nameMap->get(firstName);
581 overlapNode.seq2 = nameMap->get(secondName);
582 overlapNode.dist = overlapDistance;
583 }else { inOverlap.close(); break; }
586 if (overlapNode.dist < dist) {
587 //get names of seqs that overlap
588 string name1 = nameMap->get(overlapNode.seq1);
589 string name2 = nameMap->get(overlapNode.seq2);
591 //use binInfo to find out if they are already in the same bin
592 //map<string, int>::iterator itBin1 = binInfo.find(name1);
593 //map<string, int>::iterator itBin2 = binInfo.find(name2);
595 //if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); }
596 //if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); }
598 //int binKeep = itBin1->second;
599 //int binRemove = itBin2->second;
601 int binKeep = binInfo[name1];
602 int binRemove = binInfo[name2];
604 //if not merge bins and update binInfo
605 if(binKeep != binRemove) {
607 //save names in old bin
608 string names = newList->get(binRemove);
610 //merge bins into name1s bin
611 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
612 newList->set(binRemove, "");
615 while (names.find_first_of(',') != -1) {
617 string name = names.substr(0,names.find_first_of(','));
618 //save name and bin number
619 binInfo[name] = binKeep;
620 names = names.substr(names.find_first_of(',')+1, names.length());
624 binInfo[names] = binKeep;
627 }else { done = true; }
634 catch(exception& e) {
635 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
639 //**********************************************************************************************************************
640 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
643 string sortedDistFile = m->sortFile(unsortedDist, outputDir);
644 remove(unsortedDist.c_str()); //delete unsorted file
645 distFile = sortedDistFile;
648 string sortedOverlapFile = m->sortFile(unsortedOverlap, outputDir);
649 remove(unsortedOverlap.c_str()); //delete unsorted file
650 overlapFile = sortedOverlapFile;
652 catch(exception& e) {
653 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
658 //**********************************************************************************************************************