5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
12 //**********************************************************************************************************************
13 MGClusterCommand::MGClusterCommand(string option) {
15 globaldata = GlobalData::getInstance();
18 //allow user to run help
19 if(option == "help") { help(); abort = true; }
22 //valid paramters for this command
23 string Array[] = {"blast", "method", "name", "hard", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
24 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26 OptionParser parser(option);
27 map<string, string> parameters = parser.getParameters();
29 ValidParameters validParameter;
30 map<string,string>::iterator it;
32 //check to make sure all parameters are valid for command
33 for (it = parameters.begin(); it != parameters.end(); it++) {
34 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
37 //if the user changes the input directory command factory will send this info to us in the output parameter
38 string inputDir = validParameter.validFile(parameters, "inputdir", false);
39 if (inputDir == "not found"){ inputDir = ""; }
42 it = parameters.find("blast");
43 //user has given a template file
44 if(it != parameters.end()){
45 path = hasPath(it->second);
46 //if the user has not given a path then, add inputdir. else leave path alone.
47 if (path == "") { parameters["blast"] = inputDir + it->second; }
50 it = parameters.find("name");
51 //user has given a template file
52 if(it != parameters.end()){
53 path = hasPath(it->second);
54 //if the user has not given a path then, add inputdir. else leave path alone.
55 if (path == "") { parameters["name"] = inputDir + it->second; }
60 //check for required parameters
61 blastfile = validParameter.validFile(parameters, "blast", true);
62 if (blastfile == "not open") { abort = true; }
63 else if (blastfile == "not found") { blastfile = ""; }
65 //if the user changes the output directory command factory will send this info to us in the output parameter
66 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
68 outputDir += hasPath(blastfile); //if user entered a file with a path then preserve it
71 namefile = validParameter.validFile(parameters, "name", true);
72 if (namefile == "not open") { abort = true; }
73 else if (namefile == "not found") { namefile = ""; }
75 if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
77 //check for optional parameter and set defaults
79 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
80 precisionLength = temp.length();
81 convert(temp, precision);
83 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
84 convert(temp, cutoff);
85 cutoff += (5 / (precision * 10.0));
87 method = validParameter.validFile(parameters, "method", false);
88 if (method == "not found") { method = "furthest"; }
90 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
91 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
93 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
94 convert(temp, length);
96 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
97 convert(temp, penalty);
99 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
100 minWanted = isTrue(temp);
102 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
103 merge = isTrue(temp);
105 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
106 hclusterWanted = isTrue(temp);
108 temp = validParameter.validFile(parameters, "hard", false); if (temp == "not found") { temp = "F"; }
113 catch(exception& e) {
114 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
118 //**********************************************************************************************************************
120 void MGClusterCommand::help(){
122 m->mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
123 m->mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
124 m->mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
125 m->mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
126 m->mothurOut("The precision parameter's default value is 100. \n");
127 m->mothurOut("The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
128 m->mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
129 m->mothurOut("The length parameter is used to specify the minimum overlap required. The default is 5.\n");
130 m->mothurOut("The penalty parameter is used to adjust the error rate. The default is 0.10.\n");
131 m->mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n");
132 m->mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
133 m->mothurOut("The mgcluster command should be in the following format: \n");
134 m->mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
135 m->mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
137 catch(exception& e) {
138 m->errorOut(e, "MGClusterCommand", "help");
142 //**********************************************************************************************************************
143 MGClusterCommand::~MGClusterCommand(){}
144 //**********************************************************************************************************************
145 int MGClusterCommand::execute(){
148 if (abort == true) { return 0; }
151 if (namefile != "") {
152 nameMap = new NameAssignment(namefile);
154 }else{ nameMap= new NameAssignment(); }
156 string fileroot = outputDir + getRootName(getSimpleName(blastfile));
159 float previousDist = 0.00000;
160 float rndPreviousDist = 0.00000;
162 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
163 //must remember to delete those objects here since readBlast does not
164 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
167 list = new ListVector(nameMap->getListVector());
168 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
169 for (int i = 0; i < list->getNumBins(); i++) {
170 string bin = list->get(i);
172 cout << "bin " << i << " is blank."<< endl;
175 cout << "after outputting blank bins." << endl;
177 if (m->control_pressed) { delete nameMap; delete read; delete list; delete rabund; return 0; }
181 map<string, int> Seq2Bin;
182 map<string, int> oldSeq2Bin;
184 if (method == "furthest") { tag = "fn"; }
185 else if (method == "nearest") { tag = "nn"; }
189 openOutputFile(fileroot+ tag + ".list", listFile);
190 openOutputFile(fileroot+ tag + ".rabund", rabundFile);
191 openOutputFile(fileroot+ tag + ".sabund", sabundFile);
193 if (m->control_pressed) {
194 delete nameMap; delete read; delete list; delete rabund;
195 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
199 if (!hclusterWanted) {
200 //get distmatrix and overlap
201 SparseMatrix* distMatrix = read->getDistMatrix();
202 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
206 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
207 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
208 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
209 cluster->setMapWanted(true);
210 Seq2Bin = cluster->getSeqtoBin();
211 oldSeq2Bin = Seq2Bin;
213 if (m->control_pressed) {
214 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
215 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
219 //cluster using cluster classes
220 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
222 cluster->update(cutoff);
224 if (m->control_pressed) {
225 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
226 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
230 float dist = distMatrix->getSmallDist();
233 rndDist = ceilDist(dist, precision);
235 rndDist = roundDist(dist, precision);
237 cout << "here " << count << '\t' << dist << endl;
239 if(previousDist <= 0.0000 && dist != previousDist){
240 oldList.setLabel("unique");
242 Seq2Bin = cluster->getSeqtoBin();
244 else if(rndDist != rndPreviousDist){
246 Seq2Bin = cluster->getSeqtoBin();
247 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
249 if (m->control_pressed) {
250 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
251 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
255 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
259 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
263 //cout << "after merge " << count << '\t' << dist << endl;
266 rndPreviousDist = rndDist;
268 oldSeq2Bin = Seq2Bin;
271 if(previousDist <= 0.0000){
272 oldList.setLabel("unique");
274 Seq2Bin = cluster->getSeqtoBin();
276 else if(rndPreviousDist<cutoff){
278 Seq2Bin = cluster->getSeqtoBin();
279 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
281 if (m->control_pressed) {
282 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
283 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
287 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
291 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
297 overlapMatrix.clear();
301 }else { //use hcluster to cluster
302 //get distmatrix and overlap
303 overlapFile = read->getOverlapFile();
304 distFile = read->getDistFile();
307 //sort the distance and overlap files
308 sortHclusterFiles(distFile, overlapFile);
310 if (m->control_pressed) {
311 delete nameMap; delete list; delete rabund;
312 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
317 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
318 hcluster->setMapWanted(true);
319 Seq2Bin = cluster->getSeqtoBin();
320 oldSeq2Bin = Seq2Bin;
322 vector<seqDist> seqs; seqs.resize(1); // to start loop
323 //ifstream inHcluster;
324 //openInputFile(distFile, inHcluster);
326 if (m->control_pressed) {
327 delete nameMap; delete list; delete rabund; delete hcluster;
328 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
332 while (seqs.size() != 0){
334 seqs = hcluster->getSeqs();
336 if (m->control_pressed) {
337 delete nameMap; delete list; delete rabund; delete hcluster;
338 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
339 remove(distFile.c_str());
340 remove(overlapFile.c_str());
344 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
346 if (seqs[i].seq1 != seqs[i].seq2) {
348 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
350 if (m->control_pressed) {
351 delete nameMap; delete list; delete rabund; delete hcluster;
352 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
353 remove(distFile.c_str());
354 remove(overlapFile.c_str());
360 rndDist = ceilDist(seqs[i].dist, precision);
362 rndDist = roundDist(seqs[i].dist, precision);
365 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
366 oldList.setLabel("unique");
368 Seq2Bin = hcluster->getSeqtoBin();
370 else if((rndDist != rndPreviousDist)){
372 Seq2Bin = hcluster->getSeqtoBin();
373 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
375 if (m->control_pressed) {
376 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
377 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
378 remove(distFile.c_str());
379 remove(overlapFile.c_str());
383 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
387 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
392 previousDist = seqs[i].dist;
393 rndPreviousDist = rndDist;
395 oldSeq2Bin = Seq2Bin;
399 //inHcluster.close();
401 if(previousDist <= 0.0000){
402 oldList.setLabel("unique");
405 else if(rndPreviousDist<cutoff){
407 Seq2Bin = hcluster->getSeqtoBin();
408 ListVector* temp = mergeOPFs(oldSeq2Bin, rndPreviousDist);
410 if (m->control_pressed) {
411 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
412 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
413 remove(distFile.c_str());
414 remove(overlapFile.c_str());
418 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
422 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
428 remove(distFile.c_str());
429 remove(overlapFile.c_str());
438 globaldata->setListFile(fileroot+ tag + ".list");
439 globaldata->setFormat("list");
441 if (m->control_pressed) {
443 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
444 globaldata->setListFile("");
445 globaldata->setFormat("");
449 m->mothurOutEndLine();
450 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
451 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine();
452 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine();
453 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine();
454 m->mothurOutEndLine();
456 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
460 catch(exception& e) {
461 m->errorOut(e, "MGClusterCommand", "execute");
465 //**********************************************************************************************************************
466 void MGClusterCommand::printData(ListVector* mergedList){
468 mergedList->print(listFile);
469 mergedList->getRAbundVector().print(rabundFile);
471 SAbundVector sabund = mergedList->getSAbundVector();
474 sabund.print(sabundFile);
476 catch(exception& e) {
477 m->errorOut(e, "MGClusterCommand", "printData");
481 //**********************************************************************************************************************
482 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
483 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
484 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
486 //create new listvector so you don't overwrite the clustering
487 ListVector* newList = new ListVector(oldList);
488 for (int i = 0; i < newList->getNumBins(); i++) {
489 string bin = newList->get(i);
491 cout << "bin " << i << " is blank."<< endl;
492 for (map<string, int>::iterator itBin = binInfo.begin(); itBin != binInfo.end(); itBin++) {
493 if (itBin->second == i) { cout << itBin->first << " maps to an empty bin." << endl; }
497 cout << "after outputting blank bins." << endl;
503 if (hclusterWanted) {
504 openInputFile(overlapFile, inOverlap);
505 if (inOverlap.eof()) { done = true; }
506 }else { if (overlapMatrix.size() == 0) { done = true; } }
509 if (m->control_pressed) {
510 if (hclusterWanted) { inOverlap.close(); }
516 if (!hclusterWanted) {
517 if (count < overlapMatrix.size()) { //do we have another node in the matrix
518 overlapNode = overlapMatrix[count];
522 if (!inOverlap.eof()) {
523 string firstName, secondName;
524 float overlapDistance;
525 inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
527 map<string,int>::iterator itA = nameMap->find(firstName);
528 map<string,int>::iterator itB = nameMap->find(secondName);
529 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
530 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
532 overlapNode.seq1 = itA->second;
533 overlapNode.seq2 = itB->second;
534 overlapNode.dist = overlapDistance;
535 }else { inOverlap.close(); break; }
538 if (overlapNode.dist < dist) {
539 //get names of seqs that overlap
540 string name1 = nameMap->get(overlapNode.seq1);
541 string name2 = nameMap->get(overlapNode.seq2);
543 //use binInfo to find out if they are already in the same bin
544 map<string, int>::iterator itBin1 = binInfo.find(name1);
545 map<string, int>::iterator itBin2 = binInfo.find(name2);
547 if(itBin1 == binInfo.end()){ cerr << "AAError: Sequence '" << name1 << "' does not have any bin info.\n"; exit(1); }
548 if(itBin2 == binInfo.end()){ cerr << "ABError: Sequence '" << name2 << "' does not have any bin info.\n"; exit(1); }
549 cout << overlapNode.dist << '\t' << dist << endl;
550 int binKeep = itBin1->second;
551 int binRemove = itBin2->second;
553 //if not merge bins and update binInfo
554 if(binKeep != binRemove) {
555 cout << "bin keep = " << binKeep << " bin remove = " << binRemove << endl;
556 //save names in old bin
557 string names = newList->get(binRemove);
558 cout << names << endl << endl << endl;
559 cout << newList->get(binKeep) << endl << endl << endl;
560 //merge bins into name1s bin
561 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
562 newList->set(binRemove, "");
565 while (names.find_first_of(',') != -1) {
567 string name = names.substr(0,names.find_first_of(','));
568 //save name and bin number
569 binInfo[name] = binKeep;
570 names = names.substr(names.find_first_of(',')+1, names.length());
574 binInfo[names] = binKeep;
577 }else { done = true; }
584 catch(exception& e) {
585 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
589 //**********************************************************************************************************************
590 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
593 string sortedDistFile = sortFile(unsortedDist, outputDir);
594 remove(unsortedDist.c_str()); //delete unsorted file
595 distFile = sortedDistFile;
598 string sortedOverlapFile = sortFile(unsortedOverlap, outputDir);
599 remove(unsortedOverlap.c_str()); //delete unsorted file
600 overlapFile = sortedOverlapFile;
602 catch(exception& e) {
603 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
608 //**********************************************************************************************************************