5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
12 //**********************************************************************************************************************
13 MGClusterCommand::MGClusterCommand(string option) {
15 globaldata = GlobalData::getInstance();
18 //allow user to run help
19 if(option == "help") { help(); abort = true; }
22 //valid paramters for this command
23 string Array[] = {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
24 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26 OptionParser parser(option);
27 map<string, string> parameters = parser.getParameters();
29 ValidParameters validParameter;
30 map<string,string>::iterator it;
32 //check to make sure all parameters are valid for command
33 for (it = parameters.begin(); it != parameters.end(); it++) {
34 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
37 //if the user changes the input directory command factory will send this info to us in the output parameter
38 string inputDir = validParameter.validFile(parameters, "inputdir", false);
39 if (inputDir == "not found"){ inputDir = ""; }
42 it = parameters.find("blast");
43 //user has given a template file
44 if(it != parameters.end()){
45 path = hasPath(it->second);
46 //if the user has not given a path then, add inputdir. else leave path alone.
47 if (path == "") { parameters["blast"] = inputDir + it->second; }
50 it = parameters.find("name");
51 //user has given a template file
52 if(it != parameters.end()){
53 path = hasPath(it->second);
54 //if the user has not given a path then, add inputdir. else leave path alone.
55 if (path == "") { parameters["name"] = inputDir + it->second; }
60 //check for required parameters
61 blastfile = validParameter.validFile(parameters, "blast", true);
62 if (blastfile == "not open") { abort = true; }
63 else if (blastfile == "not found") { blastfile = ""; }
65 //if the user changes the output directory command factory will send this info to us in the output parameter
66 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
68 outputDir += hasPath(blastfile); //if user entered a file with a path then preserve it
71 namefile = validParameter.validFile(parameters, "name", true);
72 if (namefile == "not open") { abort = true; }
73 else if (namefile == "not found") { namefile = ""; }
75 if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
77 //check for optional parameter and set defaults
79 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
80 precisionLength = temp.length();
81 convert(temp, precision);
83 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
84 convert(temp, cutoff);
85 cutoff += (5 / (precision * 10.0));
87 method = validParameter.validFile(parameters, "method", false);
88 if (method == "not found") { method = "furthest"; }
90 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
91 else { m->mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
93 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
94 convert(temp, length);
96 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
97 convert(temp, penalty);
99 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
100 minWanted = isTrue(temp);
102 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
103 merge = isTrue(temp);
105 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
106 hclusterWanted = isTrue(temp);
110 catch(exception& e) {
111 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
115 //**********************************************************************************************************************
117 void MGClusterCommand::help(){
119 m->mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
120 m->mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
121 m->mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
122 m->mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
123 m->mothurOut("The precision parameter's default value is 100. \n");
124 m->mothurOut("The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
125 m->mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
126 m->mothurOut("The length parameter is used to specify the minimum overlap required. The default is 5.\n");
127 m->mothurOut("The penalty parameter is used to adjust the error rate. The default is 0.10.\n");
128 m->mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n");
129 m->mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
130 m->mothurOut("The mgcluster command should be in the following format: \n");
131 m->mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
132 m->mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
134 catch(exception& e) {
135 m->errorOut(e, "MGClusterCommand", "help");
139 //**********************************************************************************************************************
140 MGClusterCommand::~MGClusterCommand(){}
141 //**********************************************************************************************************************
142 int MGClusterCommand::execute(){
145 if (abort == true) { return 0; }
148 if (namefile != "") {
149 nameMap = new NameAssignment(namefile);
151 }else{ nameMap= new NameAssignment(); }
153 string fileroot = outputDir + getRootName(getSimpleName(blastfile));
156 float previousDist = 0.00000;
157 float rndPreviousDist = 0.00000;
159 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
160 //must remember to delete those objects here since readBlast does not
161 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
164 list = new ListVector(nameMap->getListVector());
165 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
170 if (method == "furthest") { tag = "fn"; }
171 else if (method == "nearest") { tag = "nn"; }
175 openOutputFile(fileroot+ tag + ".list", listFile);
176 openOutputFile(fileroot+ tag + ".rabund", rabundFile);
177 openOutputFile(fileroot+ tag + ".sabund", sabundFile);
179 if (!hclusterWanted) {
180 //get distmatrix and overlap
181 SparseMatrix* distMatrix = read->getDistMatrix();
182 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
186 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
187 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
188 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
189 cluster->setMapWanted(true);
191 //cluster using cluster classes
192 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
194 cluster->update(cutoff);
195 float dist = distMatrix->getSmallDist();
196 float rndDist = roundDist(dist, precision);
198 if(previousDist <= 0.0000 && dist != previousDist){
199 oldList.setLabel("unique");
202 else if(rndDist != rndPreviousDist){
204 map<string, int> seq2Bin = cluster->getSeqtoBin();
205 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
206 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
210 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
216 rndPreviousDist = rndDist;
220 if(previousDist <= 0.0000){
221 oldList.setLabel("unique");
224 else if(rndPreviousDist<cutoff){
226 map<string, int> seq2Bin = cluster->getSeqtoBin();
227 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
228 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
232 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
238 overlapMatrix.clear();
242 }else { //use hcluster to cluster
243 //get distmatrix and overlap
244 overlapFile = read->getOverlapFile();
245 distFile = read->getDistFile();
248 //sort the distance and overlap files
249 sortHclusterFiles(distFile, overlapFile);
252 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
253 hcluster->setMapWanted(true);
255 vector<seqDist> seqs; seqs.resize(1); // to start loop
256 //ifstream inHcluster;
257 //openInputFile(distFile, inHcluster);
259 while (seqs.size() != 0){
261 seqs = hcluster->getSeqs();
263 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
265 if (seqs[i].seq1 != seqs[i].seq2) {
267 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
269 float rndDist = roundDist(seqs[i].dist, precision);
271 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
272 oldList.setLabel("unique");
275 else if((rndDist != rndPreviousDist)){
277 map<string, int> seq2Bin = hcluster->getSeqtoBin();
278 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
279 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
283 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
288 previousDist = seqs[i].dist;
289 rndPreviousDist = rndDist;
294 //inHcluster.close();
296 if(previousDist <= 0.0000){
297 oldList.setLabel("unique");
300 else if(rndPreviousDist<cutoff){
302 map<string, int> seq2Bin = hcluster->getSeqtoBin();
303 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
304 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
308 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
314 remove(distFile.c_str());
315 remove(overlapFile.c_str());
324 globaldata->setListFile(fileroot+ tag + ".list");
325 globaldata->setFormat("list");
327 m->mothurOutEndLine();
328 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
329 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine();
330 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine();
331 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine();
332 m->mothurOutEndLine();
334 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
338 catch(exception& e) {
339 m->errorOut(e, "MGClusterCommand", "execute");
343 //**********************************************************************************************************************
344 void MGClusterCommand::printData(ListVector* mergedList){
346 mergedList->print(listFile);
347 mergedList->getRAbundVector().print(rabundFile);
349 SAbundVector sabund = mergedList->getSAbundVector();
352 sabund.print(sabundFile);
354 catch(exception& e) {
355 m->errorOut(e, "MGClusterCommand", "printData");
359 //**********************************************************************************************************************
360 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
361 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
362 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
364 //create new listvector so you don't overwrite the clustering
365 ListVector* newList = new ListVector(oldList);
370 if (hclusterWanted) {
371 openInputFile(overlapFile, inOverlap);
372 if (inOverlap.eof()) { done = true; }
373 }else { if (overlapMatrix.size() == 0) { done = true; } }
379 if (!hclusterWanted) {
380 if (count < overlapMatrix.size()) { //do we have another node in the matrix
381 overlapNode = overlapMatrix[count];
385 if (!inOverlap.eof()) {
386 string firstName, secondName;
387 float overlapDistance;
388 inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
390 map<string,int>::iterator itA = nameMap->find(firstName);
391 map<string,int>::iterator itB = nameMap->find(secondName);
392 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
393 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
395 overlapNode.seq1 = itA->second;
396 overlapNode.seq2 = itB->second;
397 overlapNode.dist = overlapDistance;
398 }else { inOverlap.close(); break; }
401 if (overlapNode.dist < dist) {
402 //get names of seqs that overlap
403 string name1 = nameMap->get(overlapNode.seq1);
404 string name2 = nameMap->get(overlapNode.seq2);
406 //use binInfo to find out if they are already in the same bin
407 int binKeep = binInfo[name1];
408 int binRemove = binInfo[name2];
410 //if not merge bins and update binInfo
411 if(binKeep != binRemove) {
412 //save names in old bin
413 string names = list->get(binRemove);
415 //merge bins into name1s bin
416 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
417 newList->set(binRemove, "");
420 while (names.find_first_of(',') != -1) {
422 string name = names.substr(0,names.find_first_of(','));
423 //save name and bin number
424 binInfo[name] = binKeep;
425 names = names.substr(names.find_first_of(',')+1, names.length());
429 binInfo[names] = binKeep;
432 }else { done = true; }
439 catch(exception& e) {
440 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
444 //**********************************************************************************************************************
445 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
448 string sortedDistFile = sortFile(unsortedDist);
449 remove(unsortedDist.c_str()); //delete unsorted file
450 distFile = sortedDistFile;
453 string sortedOverlapFile = sortFile(unsortedOverlap);
454 remove(unsortedOverlap.c_str()); //delete unsorted file
455 overlapFile = sortedOverlapFile;
457 catch(exception& e) {
458 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
463 //**********************************************************************************************************************