5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
12 //**********************************************************************************************************************
13 MGClusterCommand::MGClusterCommand(string option) {
15 globaldata = GlobalData::getInstance();
18 //allow user to run help
19 if(option == "help") { help(); abort = true; }
22 //valid paramters for this command
23 string Array[] = {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
24 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26 OptionParser parser(option);
27 map<string, string> parameters = parser.getParameters();
29 ValidParameters validParameter;
30 map<string,string>::iterator it;
32 //check to make sure all parameters are valid for command
33 for (it = parameters.begin(); it != parameters.end(); it++) {
34 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
37 //if the user changes the input directory command factory will send this info to us in the output parameter
38 string inputDir = validParameter.validFile(parameters, "inputdir", false);
39 if (inputDir == "not found"){ inputDir = ""; }
42 it = parameters.find("blast");
43 //user has given a template file
44 if(it != parameters.end()){
45 path = hasPath(it->second);
46 //if the user has not given a path then, add inputdir. else leave path alone.
47 if (path == "") { parameters["blast"] = inputDir + it->second; }
50 it = parameters.find("name");
51 //user has given a template file
52 if(it != parameters.end()){
53 path = hasPath(it->second);
54 //if the user has not given a path then, add inputdir. else leave path alone.
55 if (path == "") { parameters["name"] = inputDir + it->second; }
60 //check for required parameters
61 blastfile = validParameter.validFile(parameters, "blast", true);
62 if (blastfile == "not open") { abort = true; }
63 else if (blastfile == "not found") { blastfile = ""; }
65 //if the user changes the output directory command factory will send this info to us in the output parameter
66 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
68 outputDir += hasPath(blastfile); //if user entered a file with a path then preserve it
71 namefile = validParameter.validFile(parameters, "name", true);
72 if (namefile == "not open") { abort = true; }
73 else if (namefile == "not found") { namefile = ""; }
75 if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
77 //check for optional parameter and set defaults
79 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
80 precisionLength = temp.length();
81 convert(temp, precision);
83 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
84 convert(temp, cutoff);
85 cutoff += (5 / (precision * 10.0));
87 method = validParameter.validFile(parameters, "method", false);
88 if (method == "not found") { method = "furthest"; }
90 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
91 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
93 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
94 convert(temp, length);
96 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
97 convert(temp, penalty);
99 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
100 minWanted = isTrue(temp);
102 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
103 merge = isTrue(temp);
105 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
106 hclusterWanted = isTrue(temp);
110 catch(exception& e) {
111 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
115 //**********************************************************************************************************************
117 void MGClusterCommand::help(){
119 m->mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
120 m->mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
121 m->mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
122 m->mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
123 m->mothurOut("The precision parameter's default value is 100. \n");
124 m->mothurOut("The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
125 m->mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
126 m->mothurOut("The length parameter is used to specify the minimum overlap required. The default is 5.\n");
127 m->mothurOut("The penalty parameter is used to adjust the error rate. The default is 0.10.\n");
128 m->mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n");
129 m->mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
130 m->mothurOut("The mgcluster command should be in the following format: \n");
131 m->mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
132 m->mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
134 catch(exception& e) {
135 m->errorOut(e, "MGClusterCommand", "help");
139 //**********************************************************************************************************************
140 MGClusterCommand::~MGClusterCommand(){}
141 //**********************************************************************************************************************
142 int MGClusterCommand::execute(){
145 if (abort == true) { return 0; }
148 if (namefile != "") {
149 nameMap = new NameAssignment(namefile);
151 }else{ nameMap= new NameAssignment(); }
153 string fileroot = outputDir + getRootName(getSimpleName(blastfile));
156 float previousDist = 0.00000;
157 float rndPreviousDist = 0.00000;
159 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
160 //must remember to delete those objects here since readBlast does not
161 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
164 list = new ListVector(nameMap->getListVector());
165 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
167 if (m->control_pressed) { delete nameMap; delete read; delete list; delete rabund; return 0; }
172 if (method == "furthest") { tag = "fn"; }
173 else if (method == "nearest") { tag = "nn"; }
177 openOutputFile(fileroot+ tag + ".list", listFile);
178 openOutputFile(fileroot+ tag + ".rabund", rabundFile);
179 openOutputFile(fileroot+ tag + ".sabund", sabundFile);
181 if (m->control_pressed) {
182 delete nameMap; delete read; delete list; delete rabund;
183 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
187 if (!hclusterWanted) {
188 //get distmatrix and overlap
189 SparseMatrix* distMatrix = read->getDistMatrix();
190 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
194 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
195 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
196 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
197 cluster->setMapWanted(true);
199 if (m->control_pressed) {
200 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
201 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
205 //cluster using cluster classes
206 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
208 cluster->update(cutoff);
210 if (m->control_pressed) {
211 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster;
212 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
216 float dist = distMatrix->getSmallDist();
217 float rndDist = roundDist(dist, precision);
219 if(previousDist <= 0.0000 && dist != previousDist){
220 oldList.setLabel("unique");
223 else if(rndDist != rndPreviousDist){
225 map<string, int> seq2Bin = cluster->getSeqtoBin();
226 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
228 if (m->control_pressed) {
229 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
230 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
234 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
238 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
244 rndPreviousDist = rndDist;
248 if(previousDist <= 0.0000){
249 oldList.setLabel("unique");
252 else if(rndPreviousDist<cutoff){
254 map<string, int> seq2Bin = cluster->getSeqtoBin();
255 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
257 if (m->control_pressed) {
258 delete nameMap; delete distMatrix; delete list; delete rabund; delete cluster; delete temp;
259 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
263 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
267 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
273 overlapMatrix.clear();
277 }else { //use hcluster to cluster
278 //get distmatrix and overlap
279 overlapFile = read->getOverlapFile();
280 distFile = read->getDistFile();
283 //sort the distance and overlap files
284 sortHclusterFiles(distFile, overlapFile);
286 if (m->control_pressed) {
287 delete nameMap; delete list; delete rabund;
288 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
293 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
294 hcluster->setMapWanted(true);
296 vector<seqDist> seqs; seqs.resize(1); // to start loop
297 //ifstream inHcluster;
298 //openInputFile(distFile, inHcluster);
300 if (m->control_pressed) {
301 delete nameMap; delete list; delete rabund; delete hcluster;
302 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
306 while (seqs.size() != 0){
308 seqs = hcluster->getSeqs();
310 if (m->control_pressed) {
311 delete nameMap; delete list; delete rabund; delete hcluster;
312 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
313 remove(distFile.c_str());
314 remove(overlapFile.c_str());
318 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
320 if (seqs[i].seq1 != seqs[i].seq2) {
322 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
324 if (m->control_pressed) {
325 delete nameMap; delete list; delete rabund; delete hcluster;
326 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
327 remove(distFile.c_str());
328 remove(overlapFile.c_str());
332 float rndDist = roundDist(seqs[i].dist, precision);
334 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
335 oldList.setLabel("unique");
338 else if((rndDist != rndPreviousDist)){
340 map<string, int> seq2Bin = hcluster->getSeqtoBin();
341 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
343 if (m->control_pressed) {
344 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
345 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
346 remove(distFile.c_str());
347 remove(overlapFile.c_str());
351 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
355 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
360 previousDist = seqs[i].dist;
361 rndPreviousDist = rndDist;
366 //inHcluster.close();
368 if(previousDist <= 0.0000){
369 oldList.setLabel("unique");
372 else if(rndPreviousDist<cutoff){
374 map<string, int> seq2Bin = hcluster->getSeqtoBin();
375 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
377 if (m->control_pressed) {
378 delete nameMap; delete list; delete rabund; delete hcluster; delete temp;
379 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
380 remove(distFile.c_str());
381 remove(overlapFile.c_str());
385 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
389 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
395 remove(distFile.c_str());
396 remove(overlapFile.c_str());
405 globaldata->setListFile(fileroot+ tag + ".list");
406 globaldata->setFormat("list");
408 if (m->control_pressed) {
410 listFile.close(); rabundFile.close(); sabundFile.close(); remove((fileroot+ tag + ".list").c_str()); remove((fileroot+ tag + ".rabund").c_str()); remove((fileroot+ tag + ".sabund").c_str());
411 globaldata->setListFile("");
412 globaldata->setFormat("");
416 m->mothurOutEndLine();
417 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
418 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine();
419 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine();
420 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine();
421 m->mothurOutEndLine();
423 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
427 catch(exception& e) {
428 m->errorOut(e, "MGClusterCommand", "execute");
432 //**********************************************************************************************************************
433 void MGClusterCommand::printData(ListVector* mergedList){
435 mergedList->print(listFile);
436 mergedList->getRAbundVector().print(rabundFile);
438 SAbundVector sabund = mergedList->getSAbundVector();
441 sabund.print(sabundFile);
443 catch(exception& e) {
444 m->errorOut(e, "MGClusterCommand", "printData");
448 //**********************************************************************************************************************
449 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
450 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
451 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
453 //create new listvector so you don't overwrite the clustering
454 ListVector* newList = new ListVector(oldList);
459 if (hclusterWanted) {
460 openInputFile(overlapFile, inOverlap);
461 if (inOverlap.eof()) { done = true; }
462 }else { if (overlapMatrix.size() == 0) { done = true; } }
465 if (m->control_pressed) {
466 if (hclusterWanted) { inOverlap.close(); }
472 if (!hclusterWanted) {
473 if (count < overlapMatrix.size()) { //do we have another node in the matrix
474 overlapNode = overlapMatrix[count];
478 if (!inOverlap.eof()) {
479 string firstName, secondName;
480 float overlapDistance;
481 inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
483 map<string,int>::iterator itA = nameMap->find(firstName);
484 map<string,int>::iterator itB = nameMap->find(secondName);
485 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
486 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
488 overlapNode.seq1 = itA->second;
489 overlapNode.seq2 = itB->second;
490 overlapNode.dist = overlapDistance;
491 }else { inOverlap.close(); break; }
494 if (overlapNode.dist < dist) {
495 //get names of seqs that overlap
496 string name1 = nameMap->get(overlapNode.seq1);
497 string name2 = nameMap->get(overlapNode.seq2);
499 //use binInfo to find out if they are already in the same bin
500 int binKeep = binInfo[name1];
501 int binRemove = binInfo[name2];
503 //if not merge bins and update binInfo
504 if(binKeep != binRemove) {
505 //save names in old bin
506 string names = list->get(binRemove);
508 //merge bins into name1s bin
509 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
510 newList->set(binRemove, "");
513 while (names.find_first_of(',') != -1) {
515 string name = names.substr(0,names.find_first_of(','));
516 //save name and bin number
517 binInfo[name] = binKeep;
518 names = names.substr(names.find_first_of(',')+1, names.length());
522 binInfo[names] = binKeep;
525 }else { done = true; }
532 catch(exception& e) {
533 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
537 //**********************************************************************************************************************
538 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
541 string sortedDistFile = sortFile(unsortedDist, outputDir);
542 remove(unsortedDist.c_str()); //delete unsorted file
543 distFile = sortedDistFile;
546 string sortedOverlapFile = sortFile(unsortedOverlap, outputDir);
547 remove(unsortedOverlap.c_str()); //delete unsorted file
548 overlapFile = sortedOverlapFile;
550 catch(exception& e) {
551 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
556 //**********************************************************************************************************************