5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
12 //**********************************************************************************************************************
13 MGClusterCommand::MGClusterCommand(string option){
15 globaldata = GlobalData::getInstance();
18 //allow user to run help
19 if(option == "help") { help(); abort = true; }
22 //valid paramters for this command
23 string Array[] = {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
24 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26 OptionParser parser(option);
27 map<string, string> parameters = parser.getParameters();
29 ValidParameters validParameter;
30 map<string,string>::iterator it;
32 //check to make sure all parameters are valid for command
33 for (it = parameters.begin(); it != parameters.end(); it++) {
34 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
37 //if the user changes the input directory command factory will send this info to us in the output parameter
38 string inputDir = validParameter.validFile(parameters, "inputdir", false);
39 if (inputDir == "not found"){ inputDir = ""; }
42 it = parameters.find("blast");
43 //user has given a template file
44 if(it != parameters.end()){
45 path = hasPath(it->second);
46 //if the user has not given a path then, add inputdir. else leave path alone.
47 if (path == "") { parameters["blast"] = inputDir + it->second; }
50 it = parameters.find("name");
51 //user has given a template file
52 if(it != parameters.end()){
53 path = hasPath(it->second);
54 //if the user has not given a path then, add inputdir. else leave path alone.
55 if (path == "") { parameters["name"] = inputDir + it->second; }
60 //check for required parameters
61 blastfile = validParameter.validFile(parameters, "blast", true);
62 if (blastfile == "not open") { abort = true; }
63 else if (blastfile == "not found") { blastfile = ""; }
65 //if the user changes the output directory command factory will send this info to us in the output parameter
66 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
68 outputDir += hasPath(blastfile); //if user entered a file with a path then preserve it
71 namefile = validParameter.validFile(parameters, "name", true);
72 if (namefile == "not open") { abort = true; }
73 else if (namefile == "not found") { namefile = ""; }
75 if ((blastfile == "")) { mothurOut("When executing a mgcluster command you must provide a blastfile."); mothurOutEndLine(); abort = true; }
77 //check for optional parameter and set defaults
79 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
80 precisionLength = temp.length();
81 convert(temp, precision);
83 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
84 convert(temp, cutoff);
85 cutoff += (5 / (precision * 10.0));
87 method = validParameter.validFile(parameters, "method", false);
88 if (method == "not found") { method = "furthest"; }
90 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
91 else { mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
93 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
94 convert(temp, length);
96 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
97 convert(temp, penalty);
99 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
100 minWanted = isTrue(temp);
102 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
103 merge = isTrue(temp);
105 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
106 hclusterWanted = isTrue(temp);
110 catch(exception& e) {
111 errorOut(e, "MGClusterCommand", "MGClusterCommand");
115 //**********************************************************************************************************************
117 void MGClusterCommand::help(){
119 mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
120 mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
121 mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
122 mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
123 mothurOut("The precision parameter's default value is 100. \n");
124 mothurOut("The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
125 mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
126 mothurOut("The length parameter is used to specify the minimum overlap required. The default is 5.\n");
127 mothurOut("The penalty parameter is used to adjust the error rate. The default is 0.10.\n");
128 mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n");
129 mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
130 mothurOut("The mgcluster command should be in the following format: \n");
131 mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
132 mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
134 catch(exception& e) {
135 errorOut(e, "MGClusterCommand", "help");
139 //**********************************************************************************************************************
140 MGClusterCommand::~MGClusterCommand(){}
141 //**********************************************************************************************************************
142 int MGClusterCommand::execute(){
145 if (abort == true) { return 0; }
148 if (namefile != "") {
149 nameMap = new NameAssignment(namefile);
151 }else{ nameMap= new NameAssignment(); }
153 string fileroot = outputDir + getRootName(getSimpleName(blastfile));
156 float previousDist = 0.00000;
157 float rndPreviousDist = 0.00000;
159 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
160 //must remember to delete those objects here since readBlast does not
161 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
164 list = new ListVector(nameMap->getListVector());
165 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
170 if (method == "furthest") { tag = "fn"; }
171 else if (method == "nearest") { tag = "nn"; }
175 openOutputFile(fileroot+ tag + ".list", listFile);
176 openOutputFile(fileroot+ tag + ".rabund", rabundFile);
177 openOutputFile(fileroot+ tag + ".sabund", sabundFile);
179 if (!hclusterWanted) {
180 //get distmatrix and overlap
181 SparseMatrix* distMatrix = read->getDistMatrix();
182 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
186 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
187 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
188 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
189 cluster->setMapWanted(true);
191 //cluster using cluster classes
192 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
194 cluster->update(cutoff);
195 float dist = distMatrix->getSmallDist();
196 float rndDist = roundDist(dist, precision);
198 if(previousDist <= 0.0000 && dist != previousDist){
199 oldList.setLabel("unique");
202 else if(rndDist != rndPreviousDist){
204 map<string, int> seq2Bin = cluster->getSeqtoBin();
205 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
206 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
210 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
216 rndPreviousDist = rndDist;
220 if(previousDist <= 0.0000){
221 oldList.setLabel("unique");
224 else if(rndPreviousDist<cutoff){
226 map<string, int> seq2Bin = cluster->getSeqtoBin();
227 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
228 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
232 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
238 overlapMatrix.clear();
242 }else { //use hcluster to cluster
243 //get distmatrix and overlap
244 overlapFile = read->getOverlapFile();
245 distFile = read->getDistFile();
248 //sort the distance and overlap files
249 sortHclusterFiles(distFile, overlapFile);
252 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
253 hcluster->setMapWanted(true);
255 vector<seqDist> seqs; seqs.resize(1); // to start loop
256 //ifstream inHcluster;
257 //openInputFile(distFile, inHcluster);
259 while (seqs.size() != 0){
261 seqs = hcluster->getSeqs();
263 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
265 if (seqs[i].seq1 != seqs[i].seq2) {
267 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
269 float rndDist = roundDist(seqs[i].dist, precision);
271 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
272 oldList.setLabel("unique");
275 else if((rndDist != rndPreviousDist)){
277 map<string, int> seq2Bin = hcluster->getSeqtoBin();
278 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
279 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
283 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
288 previousDist = seqs[i].dist;
289 rndPreviousDist = rndDist;
294 //inHcluster.close();
296 if(previousDist <= 0.0000){
297 oldList.setLabel("unique");
300 else if(rndPreviousDist<cutoff){
302 map<string, int> seq2Bin = hcluster->getSeqtoBin();
303 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
304 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
308 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
314 remove(distFile.c_str());
315 remove(overlapFile.c_str());
324 globaldata->setListFile(fileroot+ tag + ".list");
325 globaldata->setFormat("list");
327 mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); mothurOutEndLine();
331 catch(exception& e) {
332 errorOut(e, "MGClusterCommand", "execute");
336 //**********************************************************************************************************************
337 void MGClusterCommand::printData(ListVector* mergedList){
339 mergedList->print(listFile);
340 mergedList->getRAbundVector().print(rabundFile);
342 SAbundVector sabund = mergedList->getSAbundVector();
345 sabund.print(sabundFile);
347 catch(exception& e) {
348 errorOut(e, "MGClusterCommand", "printData");
352 //**********************************************************************************************************************
353 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
354 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
355 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
357 //create new listvector so you don't overwrite the clustering
358 ListVector* newList = new ListVector(oldList);
363 if (hclusterWanted) {
364 openInputFile(overlapFile, inOverlap);
365 if (inOverlap.eof()) { done = true; }
366 }else { if (overlapMatrix.size() == 0) { done = true; } }
372 if (!hclusterWanted) {
373 if (count < overlapMatrix.size()) { //do we have another node in the matrix
374 overlapNode = overlapMatrix[count];
378 if (!inOverlap.eof()) {
379 string firstName, secondName;
380 float overlapDistance;
381 inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
383 map<string,int>::iterator itA = nameMap->find(firstName);
384 map<string,int>::iterator itB = nameMap->find(secondName);
385 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
386 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
388 overlapNode.seq1 = itA->second;
389 overlapNode.seq2 = itB->second;
390 overlapNode.dist = overlapDistance;
391 }else { inOverlap.close(); break; }
394 if (overlapNode.dist < dist) {
395 //get names of seqs that overlap
396 string name1 = nameMap->get(overlapNode.seq1);
397 string name2 = nameMap->get(overlapNode.seq2);
399 //use binInfo to find out if they are already in the same bin
400 int binKeep = binInfo[name1];
401 int binRemove = binInfo[name2];
403 //if not merge bins and update binInfo
404 if(binKeep != binRemove) {
405 //save names in old bin
406 string names = list->get(binRemove);
408 //merge bins into name1s bin
409 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
410 newList->set(binRemove, "");
413 while (names.find_first_of(',') != -1) {
415 string name = names.substr(0,names.find_first_of(','));
416 //save name and bin number
417 binInfo[name] = binKeep;
418 names = names.substr(names.find_first_of(',')+1, names.length());
422 binInfo[names] = binKeep;
425 }else { done = true; }
432 catch(exception& e) {
433 errorOut(e, "MGClusterCommand", "mergeOPFs");
437 //**********************************************************************************************************************
438 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
441 string sortedDistFile = sortFile(unsortedDist);
442 remove(unsortedDist.c_str()); //delete unsorted file
443 distFile = sortedDistFile;
446 string sortedOverlapFile = sortFile(unsortedOverlap);
447 remove(unsortedOverlap.c_str()); //delete unsorted file
448 overlapFile = sortedOverlapFile;
450 catch(exception& e) {
451 errorOut(e, "MGClusterCommand", "sortHclusterFiles");
456 //**********************************************************************************************************************