5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
13 //**********************************************************************************************************************
14 vector<string> MGClusterCommand::getValidParameters(){
16 string Array[] = {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
17 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
21 m->errorOut(e, "MGClusterCommand", "getValidParameters");
25 //**********************************************************************************************************************
26 MGClusterCommand::MGClusterCommand(){
28 abort = true; calledHelp = true;
29 vector<string> tempOutNames;
30 outputTypes["list"] = tempOutNames;
31 outputTypes["rabund"] = tempOutNames;
32 outputTypes["sabund"] = tempOutNames;
35 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
39 //**********************************************************************************************************************
40 vector<string> MGClusterCommand::getRequiredParameters(){
42 string Array[] = {"blast"};
43 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
47 m->errorOut(e, "MGClusterCommand", "getRequiredParameters");
51 //**********************************************************************************************************************
52 vector<string> MGClusterCommand::getRequiredFiles(){
54 vector<string> myArray;
58 m->errorOut(e, "MGClusterCommand", "getRequiredFiles");
62 //**********************************************************************************************************************
63 MGClusterCommand::MGClusterCommand(string option) {
65 globaldata = GlobalData::getInstance();
66 abort = false; calledHelp = false;
68 //allow user to run help
69 if(option == "help") { help(); abort = true; calledHelp = true; }
72 //valid paramters for this command
73 string Array[] = {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
74 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
76 OptionParser parser(option);
77 map<string, string> parameters = parser.getParameters();
79 ValidParameters validParameter;
80 map<string,string>::iterator it;
82 //check to make sure all parameters are valid for command
83 for (it = parameters.begin(); it != parameters.end(); it++) {
84 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
87 //initialize outputTypes
88 vector<string> tempOutNames;
89 outputTypes["list"] = tempOutNames;
90 outputTypes["rabund"] = tempOutNames;
91 outputTypes["sabund"] = tempOutNames;
93 //if the user changes the input directory command factory will send this info to us in the output parameter
94 string inputDir = validParameter.validFile(parameters, "inputdir", false);
95 if (inputDir == "not found"){ inputDir = ""; }
98 it = parameters.find("blast");
99 //user has given a template file
100 if(it != parameters.end()){
101 path = m->hasPath(it->second);
102 //if the user has not given a path then, add inputdir. else leave path alone.
103 if (path == "") { parameters["blast"] = inputDir + it->second; }
106 it = parameters.find("name");
107 //user has given a template file
108 if(it != parameters.end()){
109 path = m->hasPath(it->second);
110 //if the user has not given a path then, add inputdir. else leave path alone.
111 if (path == "") { parameters["name"] = inputDir + it->second; }
116 //check for required parameters
117 blastfile = validParameter.validFile(parameters, "blast", true);
118 if (blastfile == "not open") { abort = true; }
119 else if (blastfile == "not found") { blastfile = ""; }
121 //if the user changes the output directory command factory will send this info to us in the output parameter
122 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
124 outputDir += m->hasPath(blastfile); //if user entered a file with a path then preserve it
127 namefile = validParameter.validFile(parameters, "name", true);
128 if (namefile == "not open") { abort = true; }
129 else if (namefile == "not found") { namefile = ""; }
131 if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
133 //check for optional parameter and set defaults
135 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
136 precisionLength = temp.length();
137 convert(temp, precision);
139 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
140 convert(temp, cutoff);
141 cutoff += (5 / (precision * 10.0));
143 method = validParameter.validFile(parameters, "method", false);
144 if (method == "not found") { method = "furthest"; }
146 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
147 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
149 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
150 convert(temp, length);
152 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
153 convert(temp, penalty);
155 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
156 minWanted = m->isTrue(temp);
158 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
159 merge = m->isTrue(temp);
161 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
162 hclusterWanted = m->isTrue(temp);
164 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
165 hard = m->isTrue(temp);
169 catch(exception& e) {
170 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
174 //**********************************************************************************************************************
176 void MGClusterCommand::help(){
178 m->mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
179 m->mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
180 m->mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
181 m->mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
182 m->mothurOut("The precision parameter's default value is 100. \n");
183 m->mothurOut("The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
184 m->mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
185 m->mothurOut("The length parameter is used to specify the minimum overlap required. The default is 5.\n");
186 m->mothurOut("The penalty parameter is used to adjust the error rate. The default is 0.10.\n");
187 m->mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n");
188 m->mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
189 m->mothurOut("The mgcluster command should be in the following format: \n");
190 m->mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
191 m->mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
193 catch(exception& e) {
194 m->errorOut(e, "MGClusterCommand", "help");
198 //**********************************************************************************************************************
199 MGClusterCommand::~MGClusterCommand(){}
200 //**********************************************************************************************************************
201 int MGClusterCommand::execute(){
204 if (abort == true) { if (calledHelp) { return 0; } return 2; }
207 if (namefile != "") {
208 nameMap = new NameAssignment(namefile);
210 }else{ nameMap= new NameAssignment(); }
212 string fileroot = outputDir + m->getRootName(m->getSimpleName(blastfile));
215 float previousDist = 0.00000;
216 float rndPreviousDist = 0.00000;
218 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
219 //must remember to delete those objects here since readBlast does not
220 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
223 list = new ListVector(nameMap->getListVector());
224 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
226 if (m->control_pressed) { outputTypes.clear(); delete nameMap; delete read; delete list; delete rabund; return 0; }
230 map<string, int> Seq2Bin;
231 map<string, int> oldSeq2Bin;
233 if (method == "furthest") { tag = "fn"; }
234 else if (method == "nearest") { tag = "nn"; }
238 m->openOutputFile(fileroot+ tag + ".list", listFile);
239 m->openOutputFile(fileroot+ tag + ".rabund", rabundFile);
240 m->openOutputFile(fileroot+ tag + ".sabund", sabundFile);
242 if (m->control_pressed) {
243 delete nameMap; delete read; delete list; delete rabund;
244 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
249 if (!hclusterWanted) {
250 //get distmatrix and overlap
251 SparseMatrix* distMatrix = read->getDistMatrix();
252 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
256 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
257 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
258 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
259 cluster->setMapWanted(true);
260 Seq2Bin = cluster->getSeqtoBin();
261 oldSeq2Bin = Seq2Bin;
263 if (m->control_pressed) {
264 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
265 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
270 //cluster using cluster classes
271 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
273 cluster->update(cutoff);
275 if (m->control_pressed) {
276 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
277 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
282 float dist = distMatrix->getSmallDist();
285 rndDist = m->ceilDist(dist, precision);
287 rndDist = m->roundDist(dist, precision);
290 if(previousDist <= 0.0000 && dist != previousDist){
291 oldList.setLabel("unique");
294 else if(rndDist != rndPreviousDist){
296 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
298 if (m->control_pressed) {
299 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
300 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
305 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
309 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
315 rndPreviousDist = rndDist;
317 Seq2Bin = cluster->getSeqtoBin();
318 oldSeq2Bin = Seq2Bin;
321 if(previousDist <= 0.0000){
322 oldList.setLabel("unique");
325 else if(rndPreviousDist<cutoff){
327 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
329 if (m->control_pressed) {
330 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
331 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
336 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
340 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
346 overlapMatrix.clear();
350 }else { //use hcluster to cluster
351 //get distmatrix and overlap
352 overlapFile = read->getOverlapFile();
353 distFile = read->getDistFile();
356 //sort the distance and overlap files
357 sortHclusterFiles(distFile, overlapFile);
359 if (m->control_pressed) {
360 delete nameMap; delete list; delete rabund;
361 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
367 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
368 hcluster->setMapWanted(true);
369 Seq2Bin = cluster->getSeqtoBin();
370 oldSeq2Bin = Seq2Bin;
372 vector<seqDist> seqs; seqs.resize(1); // to start loop
373 //ifstream inHcluster;
374 //m->openInputFile(distFile, inHcluster);
376 if (m->control_pressed) {
377 delete nameMap; delete list; delete rabund; delete hcluster;
378 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
383 while (seqs.size() != 0){
385 seqs = hcluster->getSeqs();
387 if (m->control_pressed) {
388 delete nameMap; delete list; delete rabund; delete hcluster;
389 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
390 remove(distFile.c_str());
391 remove(overlapFile.c_str());
396 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
398 if (seqs[i].seq1 != seqs[i].seq2) {
400 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
402 if (m->control_pressed) {
403 delete nameMap; delete list; delete rabund; delete hcluster;
404 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
405 remove(distFile.c_str());
406 remove(overlapFile.c_str());
413 rndDist = m->ceilDist(seqs[i].dist, precision);
415 rndDist = m->roundDist(seqs[i].dist, precision);
418 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
419 oldList.setLabel("unique");
422 else if((rndDist != rndPreviousDist)){
424 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
426 if (m->control_pressed) {
427 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
428 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
429 remove(distFile.c_str());
430 remove(overlapFile.c_str());
435 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
439 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
444 previousDist = seqs[i].dist;
445 rndPreviousDist = rndDist;
447 Seq2Bin = cluster->getSeqtoBin();
448 oldSeq2Bin = Seq2Bin;
452 //inHcluster.close();
454 if(previousDist <= 0.0000){
455 oldList.setLabel("unique");
458 else if(rndPreviousDist<cutoff){
460 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
462 if (m->control_pressed) {
463 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
464 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
465 remove(distFile.c_str());
466 remove(overlapFile.c_str());
471 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
475 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
481 remove(distFile.c_str());
482 remove(overlapFile.c_str());
491 globaldata->setListFile(fileroot+ tag + ".list");
492 globaldata->setFormat("list");
494 if (m->control_pressed) {
496 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
497 globaldata->setListFile("");
498 globaldata->setFormat("");
503 m->mothurOutEndLine();
504 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
505 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".list"); outputTypes["list"].push_back(fileroot+ tag + ".list");
506 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".rabund"); outputTypes["rabund"].push_back(fileroot+ tag + ".rabund");
507 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine(); outputNames.push_back(fileroot+ tag + ".sabund"); outputTypes["sabund"].push_back(fileroot+ tag + ".sabund");
508 m->mothurOutEndLine();
510 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
514 catch(exception& e) {
515 m->errorOut(e, "MGClusterCommand", "execute");
519 //**********************************************************************************************************************
520 void MGClusterCommand::printData(ListVector* mergedList){
522 mergedList->print(listFile);
523 mergedList->getRAbundVector().print(rabundFile);
525 SAbundVector sabund = mergedList->getSAbundVector();
528 sabund.print(sabundFile);
530 catch(exception& e) {
531 m->errorOut(e, "MGClusterCommand", "printData");
535 //**********************************************************************************************************************
536 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
537 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
538 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
540 //create new listvector so you don't overwrite the clustering
541 ListVector* newList = new ListVector(oldList);
547 if (hclusterWanted) {
548 m->openInputFile(overlapFile, inOverlap);
549 if (inOverlap.eof()) { done = true; }
550 }else { if (overlapMatrix.size() == 0) { done = true; } }
553 if (m->control_pressed) {
554 if (hclusterWanted) { inOverlap.close(); }
560 if (!hclusterWanted) {
561 if (count < overlapMatrix.size()) { //do we have another node in the matrix
562 overlapNode = overlapMatrix[count];
566 if (!inOverlap.eof()) {
567 string firstName, secondName;
568 float overlapDistance;
569 inOverlap >> firstName >> secondName >> overlapDistance; m->gobble(inOverlap);
571 //commented out because we check this in readblast already
572 //map<string,int>::iterator itA = nameMap->find(firstName);
573 //map<string,int>::iterator itB = nameMap->find(secondName);
574 //if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
575 //if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
577 //overlapNode.seq1 = itA->second;
578 //overlapNode.seq2 = itB->second;
579 overlapNode.seq1 = nameMap->get(firstName);
580 overlapNode.seq2 = nameMap->get(secondName);
581 overlapNode.dist = overlapDistance;
582 }else { inOverlap.close(); break; }
585 if (overlapNode.dist < dist) {
586 //get names of seqs that overlap
587 string name1 = nameMap->get(overlapNode.seq1);
588 string name2 = nameMap->get(overlapNode.seq2);
590 //use binInfo to find out if they are already in the same bin
591 //map<string, int>::iterator itBin1 = binInfo.find(name1);
592 //map<string, int>::iterator itBin2 = binInfo.find(name2);
594 //if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); }
595 //if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); }
597 //int binKeep = itBin1->second;
598 //int binRemove = itBin2->second;
600 int binKeep = binInfo[name1];
601 int binRemove = binInfo[name2];
603 //if not merge bins and update binInfo
604 if(binKeep != binRemove) {
606 //save names in old bin
607 string names = newList->get(binRemove);
609 //merge bins into name1s bin
610 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
611 newList->set(binRemove, "");
614 while (names.find_first_of(',') != -1) {
616 string name = names.substr(0,names.find_first_of(','));
617 //save name and bin number
618 binInfo[name] = binKeep;
619 names = names.substr(names.find_first_of(',')+1, names.length());
623 binInfo[names] = binKeep;
626 }else { done = true; }
633 catch(exception& e) {
634 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
638 //**********************************************************************************************************************
639 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
642 string sortedDistFile = m->sortFile(unsortedDist, outputDir);
643 remove(unsortedDist.c_str()); //delete unsorted file
644 distFile = sortedDistFile;
647 string sortedOverlapFile = m->sortFile(unsortedOverlap, outputDir);
648 remove(unsortedOverlap.c_str()); //delete unsorted file
649 overlapFile = sortedOverlapFile;
651 catch(exception& e) {
652 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
657 //**********************************************************************************************************************