5 * Created by westcott on 12/11/09.
6 * Copyright 2009 Schloss Lab. All rights reserved.
10 #include "mgclustercommand.h"
11 #include "readcluster.h"
13 //**********************************************************************************************************************
14 MGClusterCommand::MGClusterCommand(string option){
16 globaldata = GlobalData::getInstance();
19 //allow user to run help
20 if(option == "help") { help(); abort = true; }
23 //valid paramters for this command
24 string Array[] = {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge"};
25 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
27 OptionParser parser(option);
28 map<string, string> parameters = parser.getParameters();
30 ValidParameters validParameter;
32 //check to make sure all parameters are valid for command
33 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
34 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
37 //check for required parameters
38 blastfile = validParameter.validFile(parameters, "blast", true);
39 if (blastfile == "not open") { abort = true; }
40 else if (blastfile == "not found") { blastfile = ""; }
42 namefile = validParameter.validFile(parameters, "name", true);
43 if (namefile == "not open") { abort = true; }
44 else if (namefile == "not found") { namefile = ""; }
46 if ((blastfile == "")) { mothurOut("When executing a mgcluster command you must provide a blastfile."); mothurOutEndLine(); abort = true; }
48 //check for optional parameter and set defaults
50 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
51 precisionLength = temp.length();
52 convert(temp, precision);
54 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "0.70"; }
55 convert(temp, cutoff);
56 cutoff += (5 / (precision * 10.0));
58 method = validParameter.validFile(parameters, "method", false);
59 if (method == "not found") { method = "furthest"; }
61 if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
62 else { mothurOut("Not a valid clustering method. Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
64 temp = validParameter.validFile(parameters, "length", false); if (temp == "not found") { temp = "5"; }
65 convert(temp, length);
67 temp = validParameter.validFile(parameters, "penalty", false); if (temp == "not found") { temp = "0.10"; }
68 convert(temp, penalty);
70 temp = validParameter.validFile(parameters, "min", false); if (temp == "not found") { temp = "true"; }
71 minWanted = isTrue(temp);
73 temp = validParameter.validFile(parameters, "merge", false); if (temp == "not found") { temp = "true"; }
76 temp = validParameter.validFile(parameters, "hcluster", false); if (temp == "not found") { temp = "false"; }
77 hclusterWanted = isTrue(temp);
82 errorOut(e, "MGClusterCommand", "MGClusterCommand");
86 //**********************************************************************************************************************
88 void MGClusterCommand::help(){
90 mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
91 mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
92 mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
93 mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
94 mothurOut("The precision parameter's default value is 100. \n");
95 mothurOut("The acceptable mgcluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
96 mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
97 mothurOut("The length parameter is used to specify the minimum overlap required. The default is 5.\n");
98 mothurOut("The penalty parameter is used to adjust the error rate. The default is 0.10.\n");
99 mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster. By default merge is true, meaning you want to merge.\n");
100 mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering. This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
101 mothurOut("The mgcluster command should be in the following format: \n");
102 mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
103 mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
105 catch(exception& e) {
106 errorOut(e, "MGClusterCommand", "help");
110 //**********************************************************************************************************************
111 MGClusterCommand::~MGClusterCommand(){}
112 //**********************************************************************************************************************
113 int MGClusterCommand::execute(){
116 if (abort == true) { return 0; }
119 if (namefile != "") {
120 nameMap = new NameAssignment(namefile);
122 }else{ nameMap= new NameAssignment(); }
124 string fileroot = getRootName(blastfile);
127 float previousDist = 0.00000;
128 float rndPreviousDist = 0.00000;
130 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
131 //must remember to delete those objects here since readBlast does not
132 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
135 list = new ListVector(nameMap->getListVector());
136 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
141 if (!hclusterWanted) {
142 //get distmatrix and overlap
143 SparseMatrix* distMatrix = read->getDistMatrix();
144 overlapMatrix = read->getOverlapMatrix(); //already sorted by read
148 if (method == "furthest") { cluster = new CompleteLinkage(rabund, list, distMatrix); }
149 else if(method == "nearest"){ cluster = new SingleLinkage(rabund, list, distMatrix); }
150 else if(method == "average"){ cluster = new AverageLinkage(rabund, list, distMatrix); }
151 tag = cluster->getTag();
152 cluster->setMapWanted(true);
155 openOutputFile(fileroot+ tag + ".list", listFile);
156 openOutputFile(fileroot+ tag + ".rabund", rabundFile);
157 openOutputFile(fileroot+ tag + ".sabund", sabundFile);
159 //cluster using cluster classes
160 while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
163 float dist = distMatrix->getSmallDist();
164 float rndDist = roundDist(dist, precision);
166 if(previousDist <= 0.0000 && dist != previousDist){
167 oldList.setLabel("unique");
170 else if(rndDist != rndPreviousDist){
172 map<string, int> seq2Bin = cluster->getSeqtoBin();
173 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
174 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
178 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
184 rndPreviousDist = rndDist;
188 if(previousDist <= 0.0000){
189 oldList.setLabel("unique");
192 else if(rndPreviousDist<cutoff){
194 map<string, int> seq2Bin = cluster->getSeqtoBin();
195 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
196 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
200 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
206 overlapMatrix.clear();
210 }else { //use hcluster to cluster
212 //get distmatrix and overlap
213 overlapFile = read->getOverlapFile();
214 distFile = read->getDistFile();
217 //sort the distance and overlap files
218 sortHclusterFiles(distFile, overlapFile);
221 hcluster = new HCluster(rabund, list);
222 hcluster->setMapWanted(true);
225 openOutputFile(fileroot+ tag + ".list", listFile);
226 openOutputFile(fileroot+ tag + ".rabund", rabundFile);
227 openOutputFile(fileroot+ tag + ".sabund", sabundFile);
229 vector<DistNode> seqs; seqs.resize(1); // to start loop
230 exitedBreak = false; //lets you know if there is a distance stored in next
233 openInputFile(distFile, inHcluster);
235 while (seqs.size() != 0){
237 seqs = getSeqs(inHcluster);
238 random_shuffle(seqs.begin(), seqs.end());
240 for (int i = 0; i < seqs.size(); i++) { //-1 means skip me
242 if (seqs[i].seq1 != seqs[i].seq2) {
244 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
246 float rndDist = roundDist(seqs[i].dist, precision);
248 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
249 oldList.setLabel("unique");
252 else if((rndDist != rndPreviousDist)){
254 map<string, int> seq2Bin = hcluster->getSeqtoBin();
255 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
256 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
260 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
265 previousDist = seqs[i].dist;
266 rndPreviousDist = rndDist;
273 if(previousDist <= 0.0000){
274 oldList.setLabel("unique");
277 else if(rndPreviousDist<cutoff){
279 map<string, int> seq2Bin = hcluster->getSeqtoBin();
280 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
281 temp->setLabel(toString(rndPreviousDist, precisionLength-1));
285 oldList.setLabel(toString(rndPreviousDist, precisionLength-1));
291 remove(distFile.c_str());
292 remove(overlapFile.c_str());
301 globaldata->setListFile(fileroot+ tag + ".list");
302 globaldata->setFormat("list");
304 mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); mothurOutEndLine();
308 catch(exception& e) {
309 errorOut(e, "MGClusterCommand", "execute");
313 //**********************************************************************************************************************
314 void MGClusterCommand::printData(ListVector* mergedList){
316 mergedList->print(listFile);
317 mergedList->getRAbundVector().print(rabundFile);
319 SAbundVector sabund = mergedList->getSAbundVector();
322 sabund.print(sabundFile);
324 catch(exception& e) {
325 errorOut(e, "MGClusterCommand", "printData");
329 //**********************************************************************************************************************
330 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
331 //that are used to cluster by distance. this is done so that the overlapping data does not have more influenece than the distance data.
332 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
334 //create new listvector so you don't overwrite the clustering
335 ListVector* newList = new ListVector(oldList);
340 if (hclusterWanted) {
341 openInputFile(overlapFile, inOverlap);
342 if (inOverlap.eof()) { done = true; }
343 }else { if (overlapMatrix.size() == 0) { done = true; } }
348 DistNode overlapNode;
349 if (!hclusterWanted) {
350 if (count < overlapMatrix.size()) { //do we have another node in the matrix
351 overlapNode = overlapMatrix[count];
355 if (!inOverlap.eof()) {
356 string firstName, secondName;
357 float overlapDistance;
358 inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
360 map<string,int>::iterator itA = nameMap->find(firstName);
361 map<string,int>::iterator itB = nameMap->find(secondName);
362 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
363 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
365 overlapNode.seq1 = itA->second;
366 overlapNode.seq2 = itB->second;
367 overlapNode.dist = overlapDistance;
368 }else { inOverlap.close(); break; }
371 if (overlapNode.dist < dist) {
372 //get names of seqs that overlap
373 string name1 = nameMap->get(overlapNode.seq1);
374 string name2 = nameMap->get(overlapNode.seq2);
376 //use binInfo to find out if they are already in the same bin
377 int binKeep = binInfo[name1];
378 int binRemove = binInfo[name2];
380 //if not merge bins and update binInfo
381 if(binKeep != binRemove) {
382 //save names in old bin
383 string names = list->get(binRemove);
385 //merge bins into name1s bin
386 newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
387 newList->set(binRemove, "");
390 while (names.find_first_of(',') != -1) {
392 string name = names.substr(0,names.find_first_of(','));
393 //save name and bin number
394 binInfo[name] = binKeep;
395 names = names.substr(names.find_first_of(',')+1, names.length());
399 binInfo[names] = binKeep;
402 }else { done = true; }
409 catch(exception& e) {
410 errorOut(e, "MGClusterCommand", "mergeOPFs");
414 //**********************************************************************************************************************
415 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
418 ReadCluster* readCluster = new ReadCluster(unsortedDist, cutoff);
419 readCluster->setFormat("column");
420 readCluster->read(nameMap);
421 string sortedDistFile = readCluster->getOutputFile();
422 ListVector* extralist = readCluster->getListVector(); //we get this so we can delete it and not have a memory leak
424 remove(unsortedDist.c_str()); //delete unsorted file
425 distFile = sortedDistFile;
430 readCluster = new ReadCluster(unsortedOverlap, cutoff);
431 readCluster->setFormat("column");
432 readCluster->read(nameMap);
433 string sortedOverlapFile = readCluster->getOutputFile();
434 extralist = readCluster->getListVector(); //we get this so we can delete it and not have a memory leak
436 remove(unsortedOverlap.c_str()); //delete unsorted file
437 overlapFile = sortedOverlapFile;
441 catch(exception& e) {
442 errorOut(e, "MGClusterCommand", "sortHclusterFiles");
446 //**********************************************************************************************************************
447 vector<DistNode> MGClusterCommand::getSeqs(ifstream& filehandle){
449 string firstName, secondName;
450 float distance, prevDistance;
451 vector<DistNode> sameSeqs;
454 //if you are not at the beginning of the file
456 sameSeqs.push_back(next);
457 prevDistance = next.dist;
462 while (!filehandle.eof()) {
464 filehandle >> firstName >> secondName >> distance; gobble(filehandle);
467 if (prevDistance == -1) { prevDistance = distance; }
469 map<string,int>::iterator itA = nameMap->find(firstName);
470 map<string,int>::iterator itB = nameMap->find(secondName);
471 if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1); }
472 if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1); }
475 if (distance > cutoff) { break; }
477 if (distance != -1) { //-1 means skip me
479 //are the distances the same
480 if (distance == prevDistance) { //save in vector
481 DistNode temp(itA->second, itB->second, distance);
482 sameSeqs.push_back(temp);
485 next.seq1 = itA->second;
486 next.seq2 = itB->second;
487 next.dist = distance;
496 catch(exception& e) {
497 errorOut(e, "MGClusterCommand", "getSeqs");
501 //**********************************************************************************************************************