5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
12 //**********************************************************************************************************************
13 MGClusterCommand::MGClusterCommand(string option){
15 globaldata = GlobalData::getInstance();
18 //allow user to run help
19 if(option == "help") { help(); abort = true; }
22 //valid paramters for this command
23 string Array[] = {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge"};
24 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26 OptionParser parser(option);
27 map<string, string> parameters = parser.getParameters();
29 ValidParameters validParameter;
31 //check to make sure all parameters are valid for command
32 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
33 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
36 //check for required parameters
37 blastfile = validParameter.validFile(parameters, "blast", true);
38 if (blastfile == "not open") { abort = true; }
39 else if (blastfile == "not found") { blastfile = ""; }
41 namefile = validParameter.validFile(parameters, "name", true);
42 if (namefile == "not open") { abort = true; }
43 else if (namefile == "not found") { namefile = ""; }
45 if ((blastfile == "")) { mothurOut("When executing a mgcluster command you must provide a blastfile."); mothurOutEndLine(); abort = true; }
47 //check for optional parameter and set defaults
49 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
50 precisionLength = temp.length();
51 convert(temp, precision);
53 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
54 convert(temp, cutoff);
55 cutoff += (5 / (precision * 10.0));
57 method = validParameter.validFile(parameters, "method", false);
58 if (method == "not found") { method = "furthest"; }
60 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
61 else { mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
63 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
64 convert(temp, length);
66 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
67 convert(temp, penalty);
69 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
70 minWanted = isTrue(temp);
72 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
75 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
76 hclusterWanted = isTrue(temp);
81 errorOut(e, "MGClusterCommand", "MGClusterCommand");
85 //**********************************************************************************************************************
87 void MGClusterCommand::help(){
89 mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
90 mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
91 mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
92 mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
93 mothurOut("The precision parameter's default value is 100. \n");
94 mothurOut("The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
95 mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
96 mothurOut("The length parameter is used to specify the minimum overlap required. The default is 5.\n");
97 mothurOut("The penalty parameter is used to adjust the error rate. The default is 0.10.\n");
98 mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n");
99 mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
100 mothurOut("The mgcluster command should be in the following format: \n");
101 mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
102 mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
104 catch(exception& e) {
105 errorOut(e, "MGClusterCommand", "help");
109 //**********************************************************************************************************************
110 MGClusterCommand::~MGClusterCommand(){}
111 //**********************************************************************************************************************
112 int MGClusterCommand::execute(){
115 if (abort == true) { return 0; }
118 if (namefile != "") {
119 nameMap = new NameAssignment(namefile);
121 }else{ nameMap= new NameAssignment(); }
123 string fileroot = getRootName(blastfile);
126 float previousDist = 0.00000;
127 float rndPreviousDist = 0.00000;
129 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
130 //must remember to delete those objects here since readBlast does not
131 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
134 list = new ListVector(nameMap->getListVector());
135 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
140 if (method == "furthest") { tag = "fn"; }
141 else if (method == "nearest") { tag = "nn"; }
145 openOutputFile(fileroot+ tag + ".list", listFile);
146 openOutputFile(fileroot+ tag + ".rabund", rabundFile);
147 openOutputFile(fileroot+ tag + ".sabund", sabundFile);
149 if (!hclusterWanted) {
150 //get distmatrix and overlap
151 SparseMatrix* distMatrix = read->getDistMatrix();
152 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
156 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix); }
157 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix); }
158 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix); }
159 cluster->setMapWanted(true);
161 //cluster using cluster classes
162 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
165 float dist = distMatrix->getSmallDist();
166 float rndDist = roundDist(dist, precision);
168 if(previousDist <= 0.0000 && dist != previousDist){
169 oldList.setLabel("unique");
172 else if(rndDist != rndPreviousDist){
174 map<string, int> seq2Bin = cluster->getSeqtoBin();
175 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
176 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
180 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
186 rndPreviousDist = rndDist;
190 if(previousDist <= 0.0000){
191 oldList.setLabel("unique");
194 else if(rndPreviousDist<cutoff){
196 map<string, int> seq2Bin = cluster->getSeqtoBin();
197 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
198 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
202 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
208 overlapMatrix.clear();
212 }else { //use hcluster to cluster
214 //get distmatrix and overlap
215 overlapFile = read->getOverlapFile();
216 distFile = read->getDistFile();
219 //sort the distance and overlap files
220 sortHclusterFiles(distFile, overlapFile);
223 hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
224 hcluster->setMapWanted(true);
226 vector<seqDist> seqs; seqs.resize(1); // to start loop
227 //ifstream inHcluster;
228 //openInputFile(distFile, inHcluster);
230 while (seqs.size() != 0){
232 seqs = hcluster->getSeqs();
234 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
236 if (seqs[i].seq1 != seqs[i].seq2) {
238 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
240 float rndDist = roundDist(seqs[i].dist, precision);
242 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
243 oldList.setLabel("unique");
246 else if((rndDist != rndPreviousDist)){
248 map<string, int> seq2Bin = hcluster->getSeqtoBin();
249 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
250 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
254 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
259 previousDist = seqs[i].dist;
260 rndPreviousDist = rndDist;
265 //inHcluster.close();
267 if(previousDist <= 0.0000){
268 oldList.setLabel("unique");
271 else if(rndPreviousDist<cutoff){
273 map<string, int> seq2Bin = hcluster->getSeqtoBin();
274 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
275 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
279 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
285 remove(distFile.c_str());
286 remove(overlapFile.c_str());
295 globaldata->setListFile(fileroot+ tag + ".list");
296 globaldata->setFormat("list");
298 mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); mothurOutEndLine();
302 catch(exception& e) {
303 errorOut(e, "MGClusterCommand", "execute");
307 //**********************************************************************************************************************
308 void MGClusterCommand::printData(ListVector* mergedList){
310 mergedList->print(listFile);
311 mergedList->getRAbundVector().print(rabundFile);
313 SAbundVector sabund = mergedList->getSAbundVector();
316 sabund.print(sabundFile);
318 catch(exception& e) {
319 errorOut(e, "MGClusterCommand", "printData");
323 //**********************************************************************************************************************
324 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
325 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
326 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
328 //create new listvector so you don't overwrite the clustering
329 ListVector* newList = new ListVector(oldList);
334 if (hclusterWanted) {
335 openInputFile(overlapFile, inOverlap);
336 if (inOverlap.eof()) { done = true; }
337 }else { if (overlapMatrix.size() == 0) { done = true; } }
343 if (!hclusterWanted) {
344 if (count < overlapMatrix.size()) { //do we have another node in the matrix
345 overlapNode = overlapMatrix[count];
349 if (!inOverlap.eof()) {
350 string firstName, secondName;
351 float overlapDistance;
352 inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
354 map<string,int>::iterator itA = nameMap->find(firstName);
355 map<string,int>::iterator itB = nameMap->find(secondName);
356 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
357 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
359 overlapNode.seq1 = itA->second;
360 overlapNode.seq2 = itB->second;
361 overlapNode.dist = overlapDistance;
362 }else { inOverlap.close(); break; }
365 if (overlapNode.dist < dist) {
366 //get names of seqs that overlap
367 string name1 = nameMap->get(overlapNode.seq1);
368 string name2 = nameMap->get(overlapNode.seq2);
370 //use binInfo to find out if they are already in the same bin
371 int binKeep = binInfo[name1];
372 int binRemove = binInfo[name2];
374 //if not merge bins and update binInfo
375 if(binKeep != binRemove) {
376 //save names in old bin
377 string names = list->get(binRemove);
379 //merge bins into name1s bin
380 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
381 newList->set(binRemove, "");
384 while (names.find_first_of(',') != -1) {
386 string name = names.substr(0,names.find_first_of(','));
387 //save name and bin number
388 binInfo[name] = binKeep;
389 names = names.substr(names.find_first_of(',')+1, names.length());
393 binInfo[names] = binKeep;
396 }else { done = true; }
403 catch(exception& e) {
404 errorOut(e, "MGClusterCommand", "mergeOPFs");
408 //**********************************************************************************************************************
409 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
412 string sortedDistFile = sortFile(unsortedDist);
413 remove(unsortedDist.c_str()); //delete unsorted file
414 distFile = sortedDistFile;
417 string sortedOverlapFile = sortFile(unsortedOverlap);
418 remove(unsortedOverlap.c_str()); //delete unsorted file
419 overlapFile = sortedOverlapFile;
421 catch(exception& e) {
422 errorOut(e, "MGClusterCommand", "sortHclusterFiles");
427 //**********************************************************************************************************************