5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
12 //**********************************************************************************************************************
13 MGClusterCommand::MGClusterCommand(string option) {
15 globaldata = GlobalData::getInstance();
18 //allow user to run help
19 if(option == "help") { help(); abort = true; }
22 //valid paramters for this command
23 string Array[] = {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
24 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26 OptionParser parser(option);
27 map<string, string> parameters = parser.getParameters();
29 ValidParameters validParameter;
30 map<string,string>::iterator it;
32 //check to make sure all parameters are valid for command
33 for (it = parameters.begin(); it != parameters.end(); it++) {
34 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
37 //if the user changes the input directory command factory will send this info to us in the output parameter
38 string inputDir = validParameter.validFile(parameters, "inputdir", false);
39 if (inputDir == "not found"){ inputDir = ""; }
42 it = parameters.find("blast");
43 //user has given a template file
44 if(it != parameters.end()){
45 path = hasPath(it->second);
46 //if the user has not given a path then, add inputdir. else leave path alone.
47 if (path == "") { parameters["blast"] = inputDir + it->second; }
50 it = parameters.find("name");
51 //user has given a template file
52 if(it != parameters.end()){
53 path = hasPath(it->second);
54 //if the user has not given a path then, add inputdir. else leave path alone.
55 if (path == "") { parameters["name"] = inputDir + it->second; }
60 //check for required parameters
61 blastfile = validParameter.validFile(parameters, "blast", true);
62 if (blastfile == "not open") { abort = true; }
63 else if (blastfile == "not found") { blastfile = ""; }
65 //if the user changes the output directory command factory will send this info to us in the output parameter
66 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
68 outputDir += hasPath(blastfile); //if user entered a file with a path then preserve it
71 namefile = validParameter.validFile(parameters, "name", true);
72 if (namefile == "not open") { abort = true; }
73 else if (namefile == "not found") { namefile = ""; }
75 if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
77 //check for optional parameter and set defaults
79 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
80 precisionLength = temp.length();
81 convert(temp, precision);
83 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
84 convert(temp, cutoff);
85 cutoff += (5 / (precision * 10.0));
87 method = validParameter.validFile(parameters, "method", false);
88 if (method == "not found") { method = "furthest"; }
90 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
91 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
93 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
94 convert(temp, length);
96 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
97 convert(temp, penalty);
99 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
100 minWanted = isTrue(temp);
102 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
103 merge = isTrue(temp);
105 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
106 hclusterWanted = isTrue(temp);
108 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
113 catch(exception& e) {
114 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
118 //**********************************************************************************************************************
120 void MGClusterCommand::help(){
122 m->mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
123 m->mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
124 m->mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
125 m->mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
126 m->mothurOut("The precision parameter's default value is 100. \n");
127 m->mothurOut("The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
128 m->mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
129 m->mothurOut("The length parameter is used to specify the minimum overlap required. The default is 5.\n");
130 m->mothurOut("The penalty parameter is used to adjust the error rate. The default is 0.10.\n");
131 m->mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n");
132 m->mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
133 m->mothurOut("The mgcluster command should be in the following format: \n");
134 m->mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
135 m->mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
137 catch(exception& e) {
138 m->errorOut(e, "MGClusterCommand", "help");
142 //**********************************************************************************************************************
143 MGClusterCommand::~MGClusterCommand(){}
144 //**********************************************************************************************************************
145 int MGClusterCommand::execute(){
148 if (abort == true) { return 0; }
151 if (namefile != "") {
152 nameMap = new NameAssignment(namefile);
154 }else{ nameMap= new NameAssignment(); }
156 string fileroot = outputDir + getRootName(getSimpleName(blastfile));
159 float previousDist = 0.00000;
160 float rndPreviousDist = 0.00000;
162 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
163 //must remember to delete those objects here since readBlast does not
164 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
167 list = new ListVector(nameMap->getListVector());
168 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
170 if (m->control_pressed) { delete nameMap; delete read; delete list; delete rabund; return 0; }
174 map<string, int> Seq2Bin;
175 map<string, int> oldSeq2Bin;
177 if (method == "furthest") { tag = "fn"; }
178 else if (method == "nearest") { tag = "nn"; }
182 openOutputFile(fileroot+ tag + ".list", listFile);
183 openOutputFile(fileroot+ tag + ".rabund", rabundFile);
184 openOutputFile(fileroot+ tag + ".sabund", sabundFile);
186 if (m->control_pressed) {
187 delete nameMap; delete read; delete list; delete rabund;
188 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
192 if (!hclusterWanted) {
193 //get distmatrix and overlap
194 SparseMatrix* distMatrix = read->getDistMatrix();
195 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
199 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
200 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
201 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
202 cluster->setMapWanted(true);
203 Seq2Bin = cluster->getSeqtoBin();
204 oldSeq2Bin = Seq2Bin;
206 if (m->control_pressed) {
207 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
208 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
212 //cluster using cluster classes
213 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
215 cluster->update(cutoff);
217 if (m->control_pressed) {
218 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
219 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
223 float dist = distMatrix->getSmallDist();
226 rndDist = ceilDist(dist, precision);
228 rndDist = roundDist(dist, precision);
231 if(previousDist <= 0.0000 && dist != previousDist){
232 oldList.setLabel("unique");
235 else if(rndDist != rndPreviousDist){
237 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
239 if (m->control_pressed) {
240 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
241 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
245 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
249 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
255 rndPreviousDist = rndDist;
257 Seq2Bin = cluster->getSeqtoBin();
258 oldSeq2Bin = Seq2Bin;
261 if(previousDist <= 0.0000){
262 oldList.setLabel("unique");
265 else if(rndPreviousDist<cutoff){
267 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
269 if (m->control_pressed) {
270 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
271 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
275 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
279 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
285 overlapMatrix.clear();
289 }else { //use hcluster to cluster
290 //get distmatrix and overlap
291 overlapFile = read->getOverlapFile();
292 distFile = read->getDistFile();
295 //sort the distance and overlap files
296 sortHclusterFiles(distFile, overlapFile);
298 if (m->control_pressed) {
299 delete nameMap; delete list; delete rabund;
300 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
305 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
306 hcluster->setMapWanted(true);
307 Seq2Bin = cluster->getSeqtoBin();
308 oldSeq2Bin = Seq2Bin;
310 vector<seqDist> seqs; seqs.resize(1); // to start loop
311 //ifstream inHcluster;
312 //openInputFile(distFile, inHcluster);
314 if (m->control_pressed) {
315 delete nameMap; delete list; delete rabund; delete hcluster;
316 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
320 while (seqs.size() != 0){
322 seqs = hcluster->getSeqs();
324 if (m->control_pressed) {
325 delete nameMap; delete list; delete rabund; delete hcluster;
326 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
327 remove(distFile.c_str());
328 remove(overlapFile.c_str());
332 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
334 if (seqs[i].seq1 != seqs[i].seq2) {
336 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
338 if (m->control_pressed) {
339 delete nameMap; delete list; delete rabund; delete hcluster;
340 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
341 remove(distFile.c_str());
342 remove(overlapFile.c_str());
348 rndDist = ceilDist(seqs[i].dist, precision);
350 rndDist = roundDist(seqs[i].dist, precision);
353 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
354 oldList.setLabel("unique");
357 else if((rndDist != rndPreviousDist)){
359 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
361 if (m->control_pressed) {
362 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
363 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
364 remove(distFile.c_str());
365 remove(overlapFile.c_str());
369 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
373 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
378 previousDist = seqs[i].dist;
379 rndPreviousDist = rndDist;
381 Seq2Bin = cluster->getSeqtoBin();
382 oldSeq2Bin = Seq2Bin;
386 //inHcluster.close();
388 if(previousDist <= 0.0000){
389 oldList.setLabel("unique");
392 else if(rndPreviousDist<cutoff){
394 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
396 if (m->control_pressed) {
397 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
398 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
399 remove(distFile.c_str());
400 remove(overlapFile.c_str());
404 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
408 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
414 remove(distFile.c_str());
415 remove(overlapFile.c_str());
424 globaldata->setListFile(fileroot+ tag + ".list");
425 globaldata->setFormat("list");
427 if (m->control_pressed) {
429 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
430 globaldata->setListFile("");
431 globaldata->setFormat("");
435 m->mothurOutEndLine();
436 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
437 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine();
438 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine();
439 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine();
440 m->mothurOutEndLine();
442 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
446 catch(exception& e) {
447 m->errorOut(e, "MGClusterCommand", "execute");
451 //**********************************************************************************************************************
452 void MGClusterCommand::printData(ListVector* mergedList){
454 mergedList->print(listFile);
455 mergedList->getRAbundVector().print(rabundFile);
457 SAbundVector sabund = mergedList->getSAbundVector();
460 sabund.print(sabundFile);
462 catch(exception& e) {
463 m->errorOut(e, "MGClusterCommand", "printData");
467 //**********************************************************************************************************************
468 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
469 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
470 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
472 //create new listvector so you don't overwrite the clustering
473 ListVector* newList = new ListVector(oldList);
479 if (hclusterWanted) {
480 openInputFile(overlapFile, inOverlap);
481 if (inOverlap.eof()) { done = true; }
482 }else { if (overlapMatrix.size() == 0) { done = true; } }
485 if (m->control_pressed) {
486 if (hclusterWanted) { inOverlap.close(); }
492 if (!hclusterWanted) {
493 if (count < overlapMatrix.size()) { //do we have another node in the matrix
494 overlapNode = overlapMatrix[count];
498 if (!inOverlap.eof()) {
499 string firstName, secondName;
500 float overlapDistance;
501 inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
503 //commented out because we check this in readblast already
504 //map<string,int>::iterator itA = nameMap->find(firstName);
505 //map<string,int>::iterator itB = nameMap->find(secondName);
506 //if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
507 //if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
509 //overlapNode.seq1 = itA->second;
510 //overlapNode.seq2 = itB->second;
511 overlapNode.seq1 = nameMap->get(firstName);
512 overlapNode.seq2 = nameMap->get(secondName);
513 overlapNode.dist = overlapDistance;
514 }else { inOverlap.close(); break; }
517 if (overlapNode.dist < dist) {
518 //get names of seqs that overlap
519 string name1 = nameMap->get(overlapNode.seq1);
520 string name2 = nameMap->get(overlapNode.seq2);
522 //use binInfo to find out if they are already in the same bin
523 //map<string, int>::iterator itBin1 = binInfo.find(name1);
524 //map<string, int>::iterator itBin2 = binInfo.find(name2);
526 //if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); }
527 //if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); }
529 //int binKeep = itBin1->second;
530 //int binRemove = itBin2->second;
532 int binKeep = binInfo[name1];
533 int binRemove = binInfo[name2];
535 //if not merge bins and update binInfo
536 if(binKeep != binRemove) {
538 //save names in old bin
539 string names = newList->get(binRemove);
541 //merge bins into name1s bin
542 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
543 newList->set(binRemove, "");
546 while (names.find_first_of(',') != -1) {
548 string name = names.substr(0,names.find_first_of(','));
549 //save name and bin number
550 binInfo[name] = binKeep;
551 names = names.substr(names.find_first_of(',')+1, names.length());
555 binInfo[names] = binKeep;
558 }else { done = true; }
565 catch(exception& e) {
566 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
570 //**********************************************************************************************************************
571 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
574 string sortedDistFile = sortFile(unsortedDist, outputDir);
575 remove(unsortedDist.c_str()); //delete unsorted file
576 distFile = sortedDistFile;
579 string sortedOverlapFile = sortFile(unsortedOverlap, outputDir);
580 remove(unsortedOverlap.c_str()); //delete unsorted file
581 overlapFile = sortedOverlapFile;
583 catch(exception& e) {
584 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
589 //**********************************************************************************************************************