]> git.donarmstrong.com Git - mothur.git/blob - mgclustercommand.cpp
added warning about average neighbor merges around cutoff. fixed little bugs.
[mothur.git] / mgclustercommand.cpp
1 /*
2  *  mgclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/11/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "mgclustercommand.h"
11
12 //**********************************************************************************************************************
13 MGClusterCommand::MGClusterCommand(string option){
14         try {
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 
18                 //allow user to run help
19                 if(option == "help") { help(); abort = true; }
20                 
21                 else {
22                         //valid paramters for this command
23                         string Array[] =  {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge"};
24                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25                         
26                         OptionParser parser(option);
27                         map<string, string> parameters = parser.getParameters();
28                         
29                         ValidParameters validParameter;
30                 
31                         //check to make sure all parameters are valid for command
32                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
33                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
34                         }
35                         
36                         //check for required parameters
37                         blastfile = validParameter.validFile(parameters, "blast", true);
38                         if (blastfile == "not open") { abort = true; }  
39                         else if (blastfile == "not found") { blastfile = ""; }
40                         
41                         namefile = validParameter.validFile(parameters, "name", true);
42                         if (namefile == "not open") { abort = true; }   
43                         else if (namefile == "not found") { namefile = ""; }
44                         
45                         if ((blastfile == "")) { mothurOut("When executing a mgcluster command you must provide a blastfile."); mothurOutEndLine(); abort = true; }
46                         
47                         //check for optional parameter and set defaults
48                         string temp;
49                         temp = validParameter.validFile(parameters, "precision", false);                if (temp == "not found") { temp = "100"; }
50                         precisionLength = temp.length();
51                         convert(temp, precision); 
52                         
53                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "0.70"; }
54                         convert(temp, cutoff); 
55                         cutoff += (5 / (precision * 10.0));
56                         
57                         method = validParameter.validFile(parameters, "method", false);
58                         if (method == "not found") { method = "furthest"; }
59                         
60                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
61                         else { mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
62
63                         temp = validParameter.validFile(parameters, "length", false);                   if (temp == "not found") { temp = "5"; }
64                         convert(temp, length); 
65                         
66                         temp = validParameter.validFile(parameters, "penalty", false);                  if (temp == "not found") { temp = "0.10"; }
67                         convert(temp, penalty); 
68                         
69                         temp = validParameter.validFile(parameters, "min", false);                              if (temp == "not found") { temp = "true"; }
70                         minWanted = isTrue(temp); 
71                         
72                         temp = validParameter.validFile(parameters, "merge", false);                    if (temp == "not found") { temp = "true"; }
73                         merge = isTrue(temp); 
74                         
75                         temp = validParameter.validFile(parameters, "hcluster", false);                 if (temp == "not found") { temp = "false"; }
76                         hclusterWanted = isTrue(temp); 
77                 }
78
79         }
80         catch(exception& e) {
81                 errorOut(e, "MGClusterCommand", "MGClusterCommand");
82                 exit(1);
83         }
84 }
85 //**********************************************************************************************************************
86
87 void MGClusterCommand::help(){
88         try {
89                 mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
90                 mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
91                 mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
92                 mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
93                 mothurOut("The precision parameter's default value is 100. \n");
94                 mothurOut("The acceptable mgcluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");       
95                 mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
96                 mothurOut("The length parameter is used to specify the minimum overlap required.  The default is 5.\n");
97                 mothurOut("The penalty parameter is used to adjust the error rate.  The default is 0.10.\n");
98                 mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster.  By default merge is true, meaning you want to merge.\n");
99                 mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering.  This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
100                 mothurOut("The mgcluster command should be in the following format: \n");
101                 mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
102                 mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
103         }
104         catch(exception& e) {
105                 errorOut(e, "MGClusterCommand", "help");
106                 exit(1);
107         }
108 }
109 //**********************************************************************************************************************
110 MGClusterCommand::~MGClusterCommand(){}
111 //**********************************************************************************************************************
112 int MGClusterCommand::execute(){
113         try {
114                 
115                 if (abort == true) {    return 0;       }
116                 
117                 //read names file
118                 if (namefile != "") {
119                         nameMap = new NameAssignment(namefile);
120                         nameMap->readMap();
121                 }else{ nameMap= new NameAssignment(); }
122                 
123                 string fileroot = getRootName(blastfile);
124                 string tag = "";
125                 time_t start;
126                 float previousDist = 0.00000;
127                 float rndPreviousDist = 0.00000;
128                 
129                 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
130                 //must remember to delete those objects here since readBlast does not
131                 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
132                 read->read(nameMap);
133                 
134                 list = new ListVector(nameMap->getListVector());
135                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
136                 
137                 start = time(NULL);
138                 oldList = *list;
139                 
140                 if (method == "furthest")               { tag = "fn";  }
141                 else if (method == "nearest")   { tag = "nn";  }
142                 else                                                    { tag = "an";  }
143                 
144                 //open output files
145                 openOutputFile(fileroot+ tag + ".list",  listFile);
146                 openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
147                 openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
148                 
149                 if (!hclusterWanted) {
150                         //get distmatrix and overlap
151                         SparseMatrix* distMatrix = read->getDistMatrix();
152                         overlapMatrix = read->getOverlapMatrix(); //already sorted by read 
153                         delete read;
154                 
155                         //create cluster
156                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff); }
157                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix, cutoff); }
158                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix, cutoff); }
159                         cluster->setMapWanted(true);
160                         
161                         //cluster using cluster classes
162                         while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
163                                 
164                                 cluster->update();
165                                 float dist = distMatrix->getSmallDist();
166                                 float rndDist = roundDist(dist, precision);
167                                 
168                                 if(previousDist <= 0.0000 && dist != previousDist){
169                                         oldList.setLabel("unique");
170                                         printData(&oldList);
171                                 }
172                                 else if(rndDist != rndPreviousDist){
173                                         if (merge) {
174                                                 map<string, int> seq2Bin = cluster->getSeqtoBin();
175                                                 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
176                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
177                                                 printData(temp);
178                                                 delete temp;
179                                         }else{
180                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
181                                                 printData(&oldList);
182                                         }
183                                 }
184                                 
185                                 previousDist = dist;
186                                 rndPreviousDist = rndDist;
187                                 oldList = *list;
188                         }
189                         
190                         if(previousDist <= 0.0000){
191                                 oldList.setLabel("unique");
192                                 printData(&oldList);
193                         }
194                         else if(rndPreviousDist<cutoff){
195                                 if (merge) {
196                                         map<string, int> seq2Bin = cluster->getSeqtoBin();
197                                         ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
198                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
199                                         printData(temp);
200                                         delete temp;
201                                 }else{
202                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
203                                         printData(&oldList);
204                                 }
205                         }
206                         
207                         //free memory
208                         overlapMatrix.clear();
209                         delete distMatrix;
210                         delete cluster;
211                         
212                 }else { //use hcluster to cluster
213                         tag = "fn";
214                         //get distmatrix and overlap
215                         overlapFile = read->getOverlapFile();
216                         distFile = read->getDistFile(); 
217                         delete read;
218                 
219                         //sort the distance and overlap files
220                         sortHclusterFiles(distFile, overlapFile);
221                 
222                         //create cluster
223                         hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
224                         hcluster->setMapWanted(true);
225                         
226                         vector<seqDist> seqs; seqs.resize(1); // to start loop
227                         //ifstream inHcluster;
228                         //openInputFile(distFile, inHcluster);
229
230                         while (seqs.size() != 0){
231                 
232                                 seqs = hcluster->getSeqs();
233                                 
234                                 for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
235                                         
236                                         if (seqs[i].seq1 != seqs[i].seq2) {
237                 
238                                                 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
239         
240                                                 float rndDist = roundDist(seqs[i].dist, precision);
241                                                                                                 
242                                                 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
243                                                         oldList.setLabel("unique");
244                                                         printData(&oldList);
245                                                 }
246                                                 else if((rndDist != rndPreviousDist)){
247                                                         if (merge) {
248                                                                 map<string, int> seq2Bin = hcluster->getSeqtoBin();
249                                                                 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
250                                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
251                                                                 printData(temp);
252                                                                 delete temp;
253                                                         }else{
254                                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
255                                                                 printData(&oldList);
256                                                         }
257                                                 }
258                                                 
259                                                 previousDist = seqs[i].dist;
260                                                 rndPreviousDist = rndDist;
261                                                 oldList = *list;
262                                         }
263                                 }
264                         }
265                         //inHcluster.close();
266                         
267                         if(previousDist <= 0.0000){
268                                 oldList.setLabel("unique");
269                                 printData(&oldList);
270                         }
271                         else if(rndPreviousDist<cutoff){
272                                 if (merge) {
273                                         map<string, int> seq2Bin = hcluster->getSeqtoBin();
274                                         ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
275                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
276                                         printData(temp);
277                                         delete temp;
278                                 }else{
279                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
280                                         printData(&oldList);
281                                 }
282                         }
283                         
284                         delete hcluster;
285                         remove(distFile.c_str());
286                         remove(overlapFile.c_str());
287                 }
288                 
289                 delete list; 
290                 delete rabund;
291                 listFile.close();
292                 sabundFile.close();
293                 rabundFile.close();
294         
295                 globaldata->setListFile(fileroot+ tag + ".list");
296                 globaldata->setFormat("list");
297                         
298                 mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); mothurOutEndLine();
299                         
300                 return 0;
301         }
302         catch(exception& e) {
303                 errorOut(e, "MGClusterCommand", "execute");
304                 exit(1);
305         }
306 }
307 //**********************************************************************************************************************
308 void MGClusterCommand::printData(ListVector* mergedList){
309         try {
310                 mergedList->print(listFile);
311                 mergedList->getRAbundVector().print(rabundFile);
312                 
313                 SAbundVector sabund = mergedList->getSAbundVector();
314
315                 sabund.print(cout);
316                 sabund.print(sabundFile);
317         }
318         catch(exception& e) {
319                 errorOut(e, "MGClusterCommand", "printData");
320                 exit(1);
321         }
322 }
323 //**********************************************************************************************************************
324 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
325 //that are used to cluster by distance.  this is done so that the overlapping data does not have more influenece than the distance data.
326 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
327         try {
328                 //create new listvector so you don't overwrite the clustering
329                 ListVector* newList = new ListVector(oldList);
330                 bool done = false;
331                 ifstream inOverlap;
332                 int count = 0;
333                 
334                 if (hclusterWanted) {  
335                         openInputFile(overlapFile, inOverlap);  
336                         if (inOverlap.eof()) {  done = true;  }
337                 }else { if (overlapMatrix.size() == 0)  {  done = true;  } } 
338                 
339                 while (!done) {
340                         
341                         //get next overlap
342                         seqDist overlapNode;
343                         if (!hclusterWanted) {  
344                                 if (count < overlapMatrix.size()) { //do we have another node in the matrix
345                                         overlapNode = overlapMatrix[count];
346                                         count++;
347                                 }else { break; }
348                         }else { 
349                                 if (!inOverlap.eof()) {
350                                         string firstName, secondName;
351                                         float overlapDistance;
352                                         inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
353                                         
354                                         map<string,int>::iterator itA = nameMap->find(firstName);
355                                         map<string,int>::iterator itB = nameMap->find(secondName);
356                                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
357                                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
358                                         
359                                         overlapNode.seq1 = itA->second;
360                                         overlapNode.seq2 = itB->second;
361                                         overlapNode.dist = overlapDistance;
362                                 }else { inOverlap.close(); break; }
363                         } 
364                 
365                         if (overlapNode.dist < dist) {
366                                 //get names of seqs that overlap
367                                 string name1 = nameMap->get(overlapNode.seq1);
368                                 string name2 = nameMap->get(overlapNode.seq2);
369                                 
370                                 //use binInfo to find out if they are already in the same bin
371                                 int binKeep = binInfo[name1];
372                                 int binRemove = binInfo[name2];
373                                 
374                                 //if not merge bins and update binInfo
375                                 if(binKeep != binRemove) {
376                                         //save names in old bin
377                                         string names = list->get(binRemove);
378                                         
379                                         //merge bins into name1s bin
380                                         newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
381                                         newList->set(binRemove, "");    
382                                         
383                                         //update binInfo
384                                         while (names.find_first_of(',') != -1) { 
385                                                 //get name from bin
386                                                 string name = names.substr(0,names.find_first_of(','));
387                                                 //save name and bin number
388                                                 binInfo[name] = binKeep;
389                                                 names = names.substr(names.find_first_of(',')+1, names.length());
390                                         }
391                                         
392                                         //get last name
393                                         binInfo[names] = binKeep;
394                                 }
395                                 
396                         }else { done = true; }
397                 }
398                 
399                 //return listvector
400                 return newList;
401                                 
402         }
403         catch(exception& e) {
404                 errorOut(e, "MGClusterCommand", "mergeOPFs");
405                 exit(1);
406         }
407 }
408 //**********************************************************************************************************************
409 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
410         try {
411                 //sort distFile
412                 string sortedDistFile = sortFile(unsortedDist);
413                 remove(unsortedDist.c_str());  //delete unsorted file
414                 distFile = sortedDistFile;
415                 
416                 //sort overlap file
417                 string sortedOverlapFile = sortFile(unsortedOverlap);
418                 remove(unsortedOverlap.c_str());  //delete unsorted file
419                 overlapFile = sortedOverlapFile;
420         }
421         catch(exception& e) {
422                 errorOut(e, "MGClusterCommand", "sortHclusterFiles");
423                 exit(1);
424         }
425 }
426
427 //**********************************************************************************************************************
428
429
430
431
432