5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
12 //**********************************************************************************************************************
13 MGClusterCommand::MGClusterCommand(string option) {
15 globaldata = GlobalData::getInstance();
18 //allow user to run help
19 if(option == "help") { help(); abort = true; }
22 //valid paramters for this command
23 string Array[] = {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
24 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26 OptionParser parser(option);
27 map<string, string> parameters = parser.getParameters();
29 ValidParameters validParameter;
30 map<string,string>::iterator it;
32 //check to make sure all parameters are valid for command
33 for (it = parameters.begin(); it != parameters.end(); it++) {
34 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
37 //if the user changes the input directory command factory will send this info to us in the output parameter
38 string inputDir = validParameter.validFile(parameters, "inputdir", false);
39 if (inputDir == "not found"){ inputDir = ""; }
42 it = parameters.find("blast");
43 //user has given a template file
44 if(it != parameters.end()){
45 path = hasPath(it->second);
46 //if the user has not given a path then, add inputdir. else leave path alone.
47 if (path == "") { parameters["blast"] = inputDir + it->second; }
50 it = parameters.find("name");
51 //user has given a template file
52 if(it != parameters.end()){
53 path = hasPath(it->second);
54 //if the user has not given a path then, add inputdir. else leave path alone.
55 if (path == "") { parameters["name"] = inputDir + it->second; }
60 //check for required parameters
61 blastfile = validParameter.validFile(parameters, "blast", true);
62 if (blastfile == "not open") { abort = true; }
63 else if (blastfile == "not found") { blastfile = ""; }
65 //if the user changes the output directory command factory will send this info to us in the output parameter
66 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
68 outputDir += hasPath(blastfile); //if user entered a file with a path then preserve it
71 namefile = validParameter.validFile(parameters, "name", true);
72 if (namefile == "not open") { abort = true; }
73 else if (namefile == "not found") { namefile = ""; }
75 if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
77 //check for optional parameter and set defaults
79 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
80 precisionLength = temp.length();
81 convert(temp, precision);
83 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
84 convert(temp, cutoff);
85 cutoff += (5 / (precision * 10.0));
87 method = validParameter.validFile(parameters, "method", false);
88 if (method == "not found") { method = "furthest"; }
90 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
91 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
93 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
94 convert(temp, length);
96 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
97 convert(temp, penalty);
99 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
100 minWanted = isTrue(temp);
102 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
103 merge = isTrue(temp);
105 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
106 hclusterWanted = isTrue(temp);
108 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
113 catch(exception& e) {
114 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
118 //**********************************************************************************************************************
120 void MGClusterCommand::help(){
122 m->mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
123 m->mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
124 m->mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
125 m->mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
126 m->mothurOut("The precision parameter's default value is 100. \n");
127 m->mothurOut("The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
128 m->mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
129 m->mothurOut("The length parameter is used to specify the minimum overlap required. The default is 5.\n");
130 m->mothurOut("The penalty parameter is used to adjust the error rate. The default is 0.10.\n");
131 m->mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n");
132 m->mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
133 m->mothurOut("The mgcluster command should be in the following format: \n");
134 m->mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
135 m->mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
137 catch(exception& e) {
138 m->errorOut(e, "MGClusterCommand", "help");
142 //**********************************************************************************************************************
143 MGClusterCommand::~MGClusterCommand(){}
144 //**********************************************************************************************************************
145 int MGClusterCommand::execute(){
148 if (abort == true) { return 0; }
151 if (namefile != "") {
152 nameMap = new NameAssignment(namefile);
154 }else{ nameMap= new NameAssignment(); }
156 string fileroot = outputDir + getRootName(getSimpleName(blastfile));
159 float previousDist = 0.00000;
160 float rndPreviousDist = 0.00000;
162 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
163 //must remember to delete those objects here since readBlast does not
164 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
167 list = new ListVector(nameMap->getListVector());
168 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
170 if (m->control_pressed) { delete nameMap; delete read; delete list; delete rabund; return 0; }
174 map<string, int> Seq2Bin;
175 map<string, int> oldSeq2Bin;
177 if (method == "furthest") { tag = "fn"; }
178 else if (method == "nearest") { tag = "nn"; }
182 openOutputFile(fileroot+ tag + ".list", listFile);
183 openOutputFile(fileroot+ tag + ".rabund", rabundFile);
184 openOutputFile(fileroot+ tag + ".sabund", sabundFile);
186 if (m->control_pressed) {
187 delete nameMap; delete read; delete list; delete rabund;
188 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
192 if (!hclusterWanted) {
193 //get distmatrix and overlap
194 SparseMatrix* distMatrix = read->getDistMatrix();
195 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
199 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
200 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
201 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
202 cluster->setMapWanted(true);
203 Seq2Bin = cluster->getSeqtoBin();
204 oldSeq2Bin = Seq2Bin;
206 if (m->control_pressed) {
207 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
208 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
212 //cluster using cluster classes
213 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
215 cluster->update(cutoff);
217 if (m->control_pressed) {
218 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
219 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
223 float dist = distMatrix->getSmallDist();
226 rndDist = ceilDist(dist, precision);
228 rndDist = roundDist(dist, precision);
232 if(previousDist <= 0.0000 && dist != previousDist){
233 oldList.setLabel("unique");
236 else if(rndDist != rndPreviousDist){
238 Seq2Bin = cluster->getSeqtoBin();
239 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
241 if (m->control_pressed) {
242 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
243 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
247 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
251 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
257 cout << "prev distance = " << previousDist << " dist = " << dist << endl;
258 rndPreviousDist = rndDist;
260 oldSeq2Bin = Seq2Bin;
263 if(previousDist <= 0.0000){
264 oldList.setLabel("unique");
267 else if(rndPreviousDist<cutoff){
269 Seq2Bin = cluster->getSeqtoBin();
270 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
272 if (m->control_pressed) {
273 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
274 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
278 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
282 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
288 overlapMatrix.clear();
292 }else { //use hcluster to cluster
293 //get distmatrix and overlap
294 overlapFile = read->getOverlapFile();
295 distFile = read->getDistFile();
298 //sort the distance and overlap files
299 sortHclusterFiles(distFile, overlapFile);
301 if (m->control_pressed) {
302 delete nameMap; delete list; delete rabund;
303 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
308 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
309 hcluster->setMapWanted(true);
310 Seq2Bin = cluster->getSeqtoBin();
311 oldSeq2Bin = Seq2Bin;
313 vector<seqDist> seqs; seqs.resize(1); // to start loop
314 //ifstream inHcluster;
315 //openInputFile(distFile, inHcluster);
317 if (m->control_pressed) {
318 delete nameMap; delete list; delete rabund; delete hcluster;
319 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
323 while (seqs.size() != 0){
325 seqs = hcluster->getSeqs();
327 if (m->control_pressed) {
328 delete nameMap; delete list; delete rabund; delete hcluster;
329 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
330 remove(distFile.c_str());
331 remove(overlapFile.c_str());
335 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
337 if (seqs[i].seq1 != seqs[i].seq2) {
339 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
341 if (m->control_pressed) {
342 delete nameMap; delete list; delete rabund; delete hcluster;
343 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
344 remove(distFile.c_str());
345 remove(overlapFile.c_str());
351 rndDist = ceilDist(seqs[i].dist, precision);
353 rndDist = roundDist(seqs[i].dist, precision);
356 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
357 oldList.setLabel("unique");
360 else if((rndDist != rndPreviousDist)){
362 Seq2Bin = hcluster->getSeqtoBin();
363 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
365 if (m->control_pressed) {
366 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
367 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
368 remove(distFile.c_str());
369 remove(overlapFile.c_str());
373 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
377 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
382 previousDist = seqs[i].dist;
383 rndPreviousDist = rndDist;
385 oldSeq2Bin = Seq2Bin;
389 //inHcluster.close();
391 if(previousDist <= 0.0000){
392 oldList.setLabel("unique");
395 else if(rndPreviousDist<cutoff){
397 Seq2Bin = hcluster->getSeqtoBin();
398 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
400 if (m->control_pressed) {
401 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
402 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
403 remove(distFile.c_str());
404 remove(overlapFile.c_str());
408 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
412 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
418 //remove(distFile.c_str());
419 //remove(overlapFile.c_str());
428 globaldata->setListFile(fileroot+ tag + ".list");
429 globaldata->setFormat("list");
431 if (m->control_pressed) {
433 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
434 globaldata->setListFile("");
435 globaldata->setFormat("");
439 m->mothurOutEndLine();
440 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
441 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine();
442 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine();
443 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine();
444 m->mothurOutEndLine();
446 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
450 catch(exception& e) {
451 m->errorOut(e, "MGClusterCommand", "execute");
455 //**********************************************************************************************************************
456 void MGClusterCommand::printData(ListVector* mergedList){
458 mergedList->print(listFile);
459 mergedList->getRAbundVector().print(rabundFile);
461 SAbundVector sabund = mergedList->getSAbundVector();
464 sabund.print(sabundFile);
466 catch(exception& e) {
467 m->errorOut(e, "MGClusterCommand", "printData");
471 //**********************************************************************************************************************
472 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
473 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
474 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
476 //create new listvector so you don't overwrite the clustering
477 ListVector* newList = new ListVector(oldList);
482 if (hclusterWanted) {
483 openInputFile(overlapFile, inOverlap);
484 if (inOverlap.eof()) { done = true; }
485 }else { if (overlapMatrix.size() == 0) { done = true; } }
488 if (m->control_pressed) {
489 if (hclusterWanted) { inOverlap.close(); }
495 if (!hclusterWanted) {
496 if (count < overlapMatrix.size()) { //do we have another node in the matrix
497 overlapNode = overlapMatrix[count];
501 if (!inOverlap.eof()) {
502 string firstName, secondName;
503 float overlapDistance;
504 inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
506 map<string,int>::iterator itA = nameMap->find(firstName);
507 map<string,int>::iterator itB = nameMap->find(secondName);
508 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
509 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
511 overlapNode.seq1 = itA->second;
512 overlapNode.seq2 = itB->second;
513 overlapNode.dist = overlapDistance;
514 }else { inOverlap.close(); break; }
517 if (overlapNode.dist < dist) {
518 //get names of seqs that overlap
519 string name1 = nameMap->get(overlapNode.seq1);
520 string name2 = nameMap->get(overlapNode.seq2);
522 //use binInfo to find out if they are already in the same bin
523 map<string, int>::iterator itBin1 = binInfo.find(name1);
524 map<string, int>::iterator itBin2 = binInfo.find(name2);
526 if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); }
527 if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); }
529 int binKeep = itBin1->second;
530 int binRemove = itBin2->second;
532 //if not merge bins and update binInfo
533 if(binKeep != binRemove) {
535 //save names in old bin
536 string names = newList->get(binRemove);
538 //merge bins into name1s bin
539 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
540 newList->set(binRemove, "");
543 while (names.find_first_of(',') != -1) {
545 string name = names.substr(0,names.find_first_of(','));
546 //save name and bin number
547 binInfo[name] = binKeep;
548 names = names.substr(names.find_first_of(',')+1, names.length());
552 binInfo[names] = binKeep;
555 }else { done = true; }
562 catch(exception& e) {
563 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
567 //**********************************************************************************************************************
568 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
571 string sortedDistFile = sortFile(unsortedDist, outputDir);
572 remove(unsortedDist.c_str()); //delete unsorted file
573 distFile = sortedDistFile;
576 string sortedOverlapFile = sortFile(unsortedOverlap, outputDir);
577 remove(unsortedOverlap.c_str()); //delete unsorted file
578 overlapFile = sortedOverlapFile;
580 catch(exception& e) {
581 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
586 //**********************************************************************************************************************