5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
12 //**********************************************************************************************************************
13 MGClusterCommand::MGClusterCommand(string option) {
15 globaldata = GlobalData::getInstance();
18 //allow user to run help
19 if(option == "help") { help(); abort = true; }
22 //valid paramters for this command
23 string Array[] = {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
24 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26 OptionParser parser(option);
27 map<string, string> parameters = parser.getParameters();
29 ValidParameters validParameter;
30 map<string,string>::iterator it;
32 //check to make sure all parameters are valid for command
33 for (it = parameters.begin(); it != parameters.end(); it++) {
34 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
37 //if the user changes the input directory command factory will send this info to us in the output parameter
38 string inputDir = validParameter.validFile(parameters, "inputdir", false);
39 if (inputDir == "not found"){ inputDir = ""; }
42 it = parameters.find("blast");
43 //user has given a template file
44 if(it != parameters.end()){
45 path = hasPath(it->second);
46 //if the user has not given a path then, add inputdir. else leave path alone.
47 if (path == "") { parameters["blast"] = inputDir + it->second; }
50 it = parameters.find("name");
51 //user has given a template file
52 if(it != parameters.end()){
53 path = hasPath(it->second);
54 //if the user has not given a path then, add inputdir. else leave path alone.
55 if (path == "") { parameters["name"] = inputDir + it->second; }
60 //check for required parameters
61 blastfile = validParameter.validFile(parameters, "blast", true);
62 if (blastfile == "not open") { abort = true; }
63 else if (blastfile == "not found") { blastfile = ""; }
65 //if the user changes the output directory command factory will send this info to us in the output parameter
66 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
68 outputDir += hasPath(blastfile); //if user entered a file with a path then preserve it
71 namefile = validParameter.validFile(parameters, "name", true);
72 if (namefile == "not open") { abort = true; }
73 else if (namefile == "not found") { namefile = ""; }
75 if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
77 //check for optional parameter and set defaults
79 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
80 precisionLength = temp.length();
81 convert(temp, precision);
83 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
84 convert(temp, cutoff);
85 cutoff += (5 / (precision * 10.0));
87 method = validParameter.validFile(parameters, "method", false);
88 if (method == "not found") { method = "furthest"; }
90 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
91 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
93 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
94 convert(temp, length);
96 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
97 convert(temp, penalty);
99 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
100 minWanted = isTrue(temp);
102 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
103 merge = isTrue(temp);
105 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
106 hclusterWanted = isTrue(temp);
108 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
113 catch(exception& e) {
114 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
118 //**********************************************************************************************************************
120 void MGClusterCommand::help(){
122 m->mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
123 m->mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
124 m->mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
125 m->mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
126 m->mothurOut("The precision parameter's default value is 100. \n");
127 m->mothurOut("The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
128 m->mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
129 m->mothurOut("The length parameter is used to specify the minimum overlap required. The default is 5.\n");
130 m->mothurOut("The penalty parameter is used to adjust the error rate. The default is 0.10.\n");
131 m->mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n");
132 m->mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
133 m->mothurOut("The mgcluster command should be in the following format: \n");
134 m->mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
135 m->mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
137 catch(exception& e) {
138 m->errorOut(e, "MGClusterCommand", "help");
142 //**********************************************************************************************************************
143 MGClusterCommand::~MGClusterCommand(){}
144 //**********************************************************************************************************************
145 int MGClusterCommand::execute(){
148 if (abort == true) { return 0; }
151 if (namefile != "") {
152 nameMap = new NameAssignment(namefile);
154 }else{ nameMap= new NameAssignment(); }
156 string fileroot = outputDir + getRootName(getSimpleName(blastfile));
159 float previousDist = 0.00000;
160 float rndPreviousDist = 0.00000;
162 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
163 //must remember to delete those objects here since readBlast does not
164 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
167 list = new ListVector(nameMap->getListVector());
168 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
170 if (m->control_pressed) { delete nameMap; delete read; delete list; delete rabund; return 0; }
175 if (method == "furthest") { tag = "fn"; }
176 else if (method == "nearest") { tag = "nn"; }
180 openOutputFile(fileroot+ tag + ".list", listFile);
181 openOutputFile(fileroot+ tag + ".rabund", rabundFile);
182 openOutputFile(fileroot+ tag + ".sabund", sabundFile);
184 if (m->control_pressed) {
185 delete nameMap; delete read; delete list; delete rabund;
186 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
190 if (!hclusterWanted) {
191 //get distmatrix and overlap
192 SparseMatrix* distMatrix = read->getDistMatrix();
193 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
197 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
198 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
199 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
200 cluster->setMapWanted(true);
202 if (m->control_pressed) {
203 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
204 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
208 //cluster using cluster classes
209 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
211 cluster->update(cutoff);
213 if (m->control_pressed) {
214 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
215 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
219 float dist = distMatrix->getSmallDist();
222 rndDist = ceilDist(dist, precision);
224 rndDist = roundDist(dist, precision);
228 if(previousDist <= 0.0000 && dist != previousDist){
229 oldList.setLabel("unique");
232 else if(rndDist != rndPreviousDist){
234 map<string, int> seq2Bin = cluster->getSeqtoBin();
235 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
237 if (m->control_pressed) {
238 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
239 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
243 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
247 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
253 rndPreviousDist = rndDist;
257 if(previousDist <= 0.0000){
258 oldList.setLabel("unique");
261 else if(rndPreviousDist<cutoff){
263 map<string, int> seq2Bin = cluster->getSeqtoBin();
264 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
266 if (m->control_pressed) {
267 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
268 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
272 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
276 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
282 overlapMatrix.clear();
286 }else { //use hcluster to cluster
287 //get distmatrix and overlap
288 overlapFile = read->getOverlapFile();
289 distFile = read->getDistFile();
292 //sort the distance and overlap files
293 sortHclusterFiles(distFile, overlapFile);
295 if (m->control_pressed) {
296 delete nameMap; delete list; delete rabund;
297 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
302 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
303 hcluster->setMapWanted(true);
305 vector<seqDist> seqs; seqs.resize(1); // to start loop
306 //ifstream inHcluster;
307 //openInputFile(distFile, inHcluster);
309 if (m->control_pressed) {
310 delete nameMap; delete list; delete rabund; delete hcluster;
311 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
315 while (seqs.size() != 0){
317 seqs = hcluster->getSeqs();
319 if (m->control_pressed) {
320 delete nameMap; delete list; delete rabund; delete hcluster;
321 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
322 remove(distFile.c_str());
323 remove(overlapFile.c_str());
327 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
329 if (seqs[i].seq1 != seqs[i].seq2) {
331 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
333 if (m->control_pressed) {
334 delete nameMap; delete list; delete rabund; delete hcluster;
335 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
336 remove(distFile.c_str());
337 remove(overlapFile.c_str());
343 rndDist = ceilDist(seqs[i].dist, precision);
345 rndDist = roundDist(seqs[i].dist, precision);
348 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
349 oldList.setLabel("unique");
352 else if((rndDist != rndPreviousDist)){
354 map<string, int> seq2Bin = hcluster->getSeqtoBin();
355 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
357 if (m->control_pressed) {
358 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
359 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
360 remove(distFile.c_str());
361 remove(overlapFile.c_str());
365 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
369 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
374 previousDist = seqs[i].dist;
375 rndPreviousDist = rndDist;
380 //inHcluster.close();
382 if(previousDist <= 0.0000){
383 oldList.setLabel("unique");
386 else if(rndPreviousDist<cutoff){
388 map<string, int> seq2Bin = hcluster->getSeqtoBin();
389 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
391 if (m->control_pressed) {
392 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
393 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
394 remove(distFile.c_str());
395 remove(overlapFile.c_str());
399 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
403 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
409 remove(distFile.c_str());
410 remove(overlapFile.c_str());
419 globaldata->setListFile(fileroot+ tag + ".list");
420 globaldata->setFormat("list");
422 if (m->control_pressed) {
424 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
425 globaldata->setListFile("");
426 globaldata->setFormat("");
430 m->mothurOutEndLine();
431 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
432 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine();
433 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine();
434 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine();
435 m->mothurOutEndLine();
437 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
441 catch(exception& e) {
442 m->errorOut(e, "MGClusterCommand", "execute");
446 //**********************************************************************************************************************
447 void MGClusterCommand::printData(ListVector* mergedList){
449 mergedList->print(listFile);
450 mergedList->getRAbundVector().print(rabundFile);
452 SAbundVector sabund = mergedList->getSAbundVector();
455 sabund.print(sabundFile);
457 catch(exception& e) {
458 m->errorOut(e, "MGClusterCommand", "printData");
462 //**********************************************************************************************************************
463 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
464 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
465 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
467 //create new listvector so you don't overwrite the clustering
468 ListVector* newList = new ListVector(oldList);
473 if (hclusterWanted) {
474 openInputFile(overlapFile, inOverlap);
475 if (inOverlap.eof()) { done = true; }
476 }else { if (overlapMatrix.size() == 0) { done = true; } }
479 if (m->control_pressed) {
480 if (hclusterWanted) { inOverlap.close(); }
486 if (!hclusterWanted) {
487 if (count < overlapMatrix.size()) { //do we have another node in the matrix
488 overlapNode = overlapMatrix[count];
492 if (!inOverlap.eof()) {
493 string firstName, secondName;
494 float overlapDistance;
495 inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
497 map<string,int>::iterator itA = nameMap->find(firstName);
498 map<string,int>::iterator itB = nameMap->find(secondName);
499 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
500 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
502 overlapNode.seq1 = itA->second;
503 overlapNode.seq2 = itB->second;
504 overlapNode.dist = overlapDistance;
505 }else { inOverlap.close(); break; }
508 if (overlapNode.dist < dist) {
509 //get names of seqs that overlap
510 string name1 = nameMap->get(overlapNode.seq1);
511 string name2 = nameMap->get(overlapNode.seq2);
513 //use binInfo to find out if they are already in the same bin
514 int binKeep = binInfo[name1];
515 int binRemove = binInfo[name2];
517 //if not merge bins and update binInfo
518 if(binKeep != binRemove) {
519 //save names in old bin
520 string names = list->get(binRemove);
522 //merge bins into name1s bin
523 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
524 newList->set(binRemove, "");
527 while (names.find_first_of(',') != -1) {
529 string name = names.substr(0,names.find_first_of(','));
530 //save name and bin number
531 binInfo[name] = binKeep;
532 names = names.substr(names.find_first_of(',')+1, names.length());
536 binInfo[names] = binKeep;
539 }else { done = true; }
546 catch(exception& e) {
547 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
551 //**********************************************************************************************************************
552 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
555 string sortedDistFile = sortFile(unsortedDist, outputDir);
556 remove(unsortedDist.c_str()); //delete unsorted file
557 distFile = sortedDistFile;
560 string sortedOverlapFile = sortFile(unsortedOverlap, outputDir);
561 remove(unsortedOverlap.c_str()); //delete unsorted file
562 overlapFile = sortedOverlapFile;
564 catch(exception& e) {
565 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
570 //**********************************************************************************************************************