]> git.donarmstrong.com Git - mothur.git/blob - mgclustercommand.cpp
fixed some bugs and added mgcluster command
[mothur.git] / mgclustercommand.cpp
1 /*
2  *  mgclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/11/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "mgclustercommand.h"
11 #include "readcluster.h"
12
13 //**********************************************************************************************************************
14 MGClusterCommand::MGClusterCommand(string option){
15         try {
16                 globaldata = GlobalData::getInstance();
17                 abort = false;
18                 
19                 //allow user to run help
20                 if(option == "help") { help(); abort = true; }
21                 
22                 else {
23                         //valid paramters for this command
24                         string Array[] =  {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge"};
25                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
26                         
27                         OptionParser parser(option);
28                         map<string, string> parameters = parser.getParameters();
29                         
30                         ValidParameters validParameter;
31                 
32                         //check to make sure all parameters are valid for command
33                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
34                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
35                         }
36                         
37                         //check for required parameters
38                         blastfile = validParameter.validFile(parameters, "blast", true);
39                         if (blastfile == "not open") { abort = true; }  
40                         else if (blastfile == "not found") { blastfile = ""; }
41                         
42                         namefile = validParameter.validFile(parameters, "name", true);
43                         if (namefile == "not open") { abort = true; }   
44                         else if (namefile == "not found") { namefile = ""; }
45                         
46                         if ((blastfile == "")) { mothurOut("When executing a mgcluster command you must provide a blastfile."); mothurOutEndLine(); abort = true; }
47                         
48                         //check for optional parameter and set defaults
49                         string temp;
50                         temp = validParameter.validFile(parameters, "precision", false);                if (temp == "not found") { temp = "100"; }
51                         precisionLength = temp.length();
52                         convert(temp, precision); 
53                         
54                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "0.70"; }
55                         convert(temp, cutoff); 
56                         cutoff += (5 / (precision * 10.0));
57                         
58                         method = validParameter.validFile(parameters, "method", false);
59                         if (method == "not found") { method = "furthest"; }
60                         
61                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
62                         else { mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
63
64                         temp = validParameter.validFile(parameters, "length", false);                   if (temp == "not found") { temp = "5"; }
65                         convert(temp, length); 
66                         
67                         temp = validParameter.validFile(parameters, "penalty", false);                  if (temp == "not found") { temp = "0.10"; }
68                         convert(temp, penalty); 
69                         
70                         temp = validParameter.validFile(parameters, "min", false);                              if (temp == "not found") { temp = "true"; }
71                         minWanted = isTrue(temp); 
72                         
73                         temp = validParameter.validFile(parameters, "merge", false);                    if (temp == "not found") { temp = "true"; }
74                         merge = isTrue(temp); 
75                         
76                         temp = validParameter.validFile(parameters, "hcluster", false);                 if (temp == "not found") { temp = "false"; }
77                         hclusterWanted = isTrue(temp); 
78                 }
79
80         }
81         catch(exception& e) {
82                 errorOut(e, "MGClusterCommand", "MGClusterCommand");
83                 exit(1);
84         }
85 }
86 //**********************************************************************************************************************
87
88 void MGClusterCommand::help(){
89         try {
90                 mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
91                 mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
92                 mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
93                 mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
94                 mothurOut("The precision parameter's default value is 100. \n");
95                 mothurOut("The acceptable mgcluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");       
96                 mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
97                 mothurOut("The length parameter is used to specify the minimum overlap required.  The default is 5.\n");
98                 mothurOut("The penalty parameter is used to adjust the error rate.  The default is 0.10.\n");
99                 mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster.  By default merge is true, meaning you want to merge.\n");
100                 mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering.  This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
101                 mothurOut("The mgcluster command should be in the following format: \n");
102                 mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
103                 mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
104         }
105         catch(exception& e) {
106                 errorOut(e, "MGClusterCommand", "help");
107                 exit(1);
108         }
109 }
110 //**********************************************************************************************************************
111 MGClusterCommand::~MGClusterCommand(){}
112 //**********************************************************************************************************************
113 int MGClusterCommand::execute(){
114         try {
115                 
116                 if (abort == true) {    return 0;       }
117                 
118                 //read names file
119                 if (namefile != "") {
120                         nameMap = new NameAssignment(namefile);
121                         nameMap->readMap();
122                 }else{ nameMap= new NameAssignment(); }
123                 
124                 string fileroot = getRootName(blastfile);
125                 string tag = "";
126                 time_t start;
127                 float previousDist = 0.00000;
128                 float rndPreviousDist = 0.00000;
129                 
130                 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
131                 //must remember to delete those objects here since readBlast does not
132                 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
133                 read->read(nameMap);
134                 
135                 list = new ListVector(nameMap->getListVector());
136                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
137                 
138                 start = time(NULL);
139                 oldList = *list;
140                 
141                 if (!hclusterWanted) {
142                         //get distmatrix and overlap
143                         SparseMatrix* distMatrix = read->getDistMatrix();
144                         overlapMatrix = read->getOverlapMatrix(); //already sorted by read 
145                         delete read;
146                 
147                         //create cluster
148                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix); }
149                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix); }
150                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix); }
151                         tag = cluster->getTag();
152                         cluster->setMapWanted(true);
153                         
154                         //open output files
155                         openOutputFile(fileroot+ tag + ".list",  listFile);
156                         openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
157                         openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
158                         
159                         //cluster using cluster classes
160                         while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
161                                 
162                                 cluster->update();
163                                 float dist = distMatrix->getSmallDist();
164                                 float rndDist = roundDist(dist, precision);
165                                 
166                                 if(previousDist <= 0.0000 && dist != previousDist){
167                                         oldList.setLabel("unique");
168                                         printData(&oldList);
169                                 }
170                                 else if(rndDist != rndPreviousDist){
171                                         if (merge) {
172                                                 map<string, int> seq2Bin = cluster->getSeqtoBin();
173                                                 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
174                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
175                                                 printData(temp);
176                                                 delete temp;
177                                         }else{
178                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
179                                                 printData(&oldList);
180                                         }
181                                 }
182                                 
183                                 previousDist = dist;
184                                 rndPreviousDist = rndDist;
185                                 oldList = *list;
186                         }
187                         
188                         if(previousDist <= 0.0000){
189                                 oldList.setLabel("unique");
190                                 printData(&oldList);
191                         }
192                         else if(rndPreviousDist<cutoff){
193                                 if (merge) {
194                                         map<string, int> seq2Bin = cluster->getSeqtoBin();
195                                         ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
196                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
197                                         printData(temp);
198                                         delete temp;
199                                 }else{
200                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
201                                         printData(&oldList);
202                                 }
203                         }
204                         
205                         //free memory
206                         overlapMatrix.clear();
207                         delete distMatrix;
208                         delete cluster;
209                         
210                 }else { //use hcluster to cluster
211                         tag = "fn";
212                         //get distmatrix and overlap
213                         overlapFile = read->getOverlapFile();
214                         distFile = read->getDistFile(); 
215                         delete read;
216                 
217                         //sort the distance and overlap files
218                         sortHclusterFiles(distFile, overlapFile);
219                 
220                         //create cluster
221                         hcluster = new HCluster(rabund, list);
222                         hcluster->setMapWanted(true);
223                         
224                         //open output files
225                         openOutputFile(fileroot+ tag + ".list",  listFile);
226                         openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
227                         openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
228                         
229                         vector<DistNode> seqs; seqs.resize(1); // to start loop
230                         exitedBreak = false;  //lets you know if there is a distance stored in next
231                         
232                         ifstream inHcluster;
233                         openInputFile(distFile, inHcluster);
234
235                         while (seqs.size() != 0){
236                 
237                                 seqs = getSeqs(inHcluster);
238                                 random_shuffle(seqs.begin(), seqs.end());
239                                 
240                                 for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
241                                         
242                                         if (seqs[i].seq1 != seqs[i].seq2) {
243                 
244                                                 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
245         
246                                                 float rndDist = roundDist(seqs[i].dist, precision);
247                                                                                                 
248                                                 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
249                                                         oldList.setLabel("unique");
250                                                         printData(&oldList);
251                                                 }
252                                                 else if((rndDist != rndPreviousDist)){
253                                                         if (merge) {
254                                                                 map<string, int> seq2Bin = hcluster->getSeqtoBin();
255                                                                 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
256                                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
257                                                                 printData(temp);
258                                                                 delete temp;
259                                                         }else{
260                                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
261                                                                 printData(&oldList);
262                                                         }
263                                                 }
264                                                 
265                                                 previousDist = seqs[i].dist;
266                                                 rndPreviousDist = rndDist;
267                                                 oldList = *list;
268                                         }
269                                 }
270                         }
271                         inHcluster.close();
272                         
273                         if(previousDist <= 0.0000){
274                                 oldList.setLabel("unique");
275                                 printData(&oldList);
276                         }
277                         else if(rndPreviousDist<cutoff){
278                                 if (merge) {
279                                         map<string, int> seq2Bin = hcluster->getSeqtoBin();
280                                         ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
281                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
282                                         printData(temp);
283                                         delete temp;
284                                 }else{
285                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
286                                         printData(&oldList);
287                                 }
288                         }
289                         
290                         delete hcluster;
291                         remove(distFile.c_str());
292                         remove(overlapFile.c_str());
293                 }
294                 
295                 delete list; 
296                 delete rabund;
297                 listFile.close();
298                 sabundFile.close();
299                 rabundFile.close();
300         
301                 globaldata->setListFile(fileroot+ tag + ".list");
302                 globaldata->setFormat("list");
303                         
304                 mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); mothurOutEndLine();
305                         
306                 return 0;
307         }
308         catch(exception& e) {
309                 errorOut(e, "MGClusterCommand", "execute");
310                 exit(1);
311         }
312 }
313 //**********************************************************************************************************************
314 void MGClusterCommand::printData(ListVector* mergedList){
315         try {
316                 mergedList->print(listFile);
317                 mergedList->getRAbundVector().print(rabundFile);
318                 
319                 SAbundVector sabund = mergedList->getSAbundVector();
320
321                 sabund.print(cout);
322                 sabund.print(sabundFile);
323         }
324         catch(exception& e) {
325                 errorOut(e, "MGClusterCommand", "printData");
326                 exit(1);
327         }
328 }
329 //**********************************************************************************************************************
330 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
331 //that are used to cluster by distance.  this is done so that the overlapping data does not have more influenece than the distance data.
332 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
333         try {
334                 //create new listvector so you don't overwrite the clustering
335                 ListVector* newList = new ListVector(oldList);
336                 bool done = false;
337                 ifstream inOverlap;
338                 int count = 0;
339                 
340                 if (hclusterWanted) {  
341                         openInputFile(overlapFile, inOverlap);  
342                         if (inOverlap.eof()) {  done = true;  }
343                 }else { if (overlapMatrix.size() == 0)  {  done = true;  } } 
344                 
345                 while (!done) {
346                         
347                         //get next overlap
348                         DistNode overlapNode;
349                         if (!hclusterWanted) {  
350                                 if (count < overlapMatrix.size()) { //do we have another node in the matrix
351                                         overlapNode = overlapMatrix[count];
352                                         count++;
353                                 }else { break; }
354                         }else { 
355                                 if (!inOverlap.eof()) {
356                                         string firstName, secondName;
357                                         float overlapDistance;
358                                         inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
359                                         
360                                         map<string,int>::iterator itA = nameMap->find(firstName);
361                                         map<string,int>::iterator itB = nameMap->find(secondName);
362                                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
363                                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
364                                         
365                                         overlapNode.seq1 = itA->second;
366                                         overlapNode.seq2 = itB->second;
367                                         overlapNode.dist = overlapDistance;
368                                 }else { inOverlap.close(); break; }
369                         } 
370                 
371                         if (overlapNode.dist < dist) {
372                                 //get names of seqs that overlap
373                                 string name1 = nameMap->get(overlapNode.seq1);
374                                 string name2 = nameMap->get(overlapNode.seq2);
375                                 
376                                 //use binInfo to find out if they are already in the same bin
377                                 int binKeep = binInfo[name1];
378                                 int binRemove = binInfo[name2];
379                                 
380                                 //if not merge bins and update binInfo
381                                 if(binKeep != binRemove) {
382                                         //save names in old bin
383                                         string names = list->get(binRemove);
384                                         
385                                         //merge bins into name1s bin
386                                         newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
387                                         newList->set(binRemove, "");    
388                                         
389                                         //update binInfo
390                                         while (names.find_first_of(',') != -1) { 
391                                                 //get name from bin
392                                                 string name = names.substr(0,names.find_first_of(','));
393                                                 //save name and bin number
394                                                 binInfo[name] = binKeep;
395                                                 names = names.substr(names.find_first_of(',')+1, names.length());
396                                         }
397                                         
398                                         //get last name
399                                         binInfo[names] = binKeep;
400                                 }
401                                 
402                         }else { done = true; }
403                 }
404                 
405                 //return listvector
406                 return newList;
407                                 
408         }
409         catch(exception& e) {
410                 errorOut(e, "MGClusterCommand", "mergeOPFs");
411                 exit(1);
412         }
413 }
414 //**********************************************************************************************************************
415 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
416         try {
417                 //sort distFile
418                 ReadCluster* readCluster = new ReadCluster(unsortedDist, cutoff);       
419                 readCluster->setFormat("column");
420                 readCluster->read(nameMap);
421                 string sortedDistFile = readCluster->getOutputFile();
422                 ListVector* extralist = readCluster->getListVector();  //we get this so we can delete it and not have a memory leak
423                 
424                 remove(unsortedDist.c_str());  //delete unsorted file
425                 distFile = sortedDistFile;
426                 delete extralist;
427                 delete readCluster;
428                 
429                 //sort overlap file
430                 readCluster = new ReadCluster(unsortedOverlap, cutoff);         
431                 readCluster->setFormat("column");
432                 readCluster->read(nameMap);
433                 string sortedOverlapFile = readCluster->getOutputFile();
434                 extralist = readCluster->getListVector();  //we get this so we can delete it and not have a memory leak
435                 
436                 remove(unsortedOverlap.c_str());  //delete unsorted file
437                 overlapFile = sortedOverlapFile;
438                 delete extralist;
439                 delete readCluster;
440         }
441         catch(exception& e) {
442                 errorOut(e, "MGClusterCommand", "sortHclusterFiles");
443                 exit(1);
444         }
445 }
446 //**********************************************************************************************************************
447 vector<DistNode> MGClusterCommand::getSeqs(ifstream& filehandle){
448         try {
449                 string firstName, secondName;
450                 float distance, prevDistance;
451                 vector<DistNode> sameSeqs;
452                 prevDistance = -1;
453         
454                 //if you are not at the beginning of the file
455                 if (exitedBreak) { 
456                         sameSeqs.push_back(next);
457                         prevDistance = next.dist;
458                         exitedBreak = false;
459                 }
460         
461                 //get entry
462                 while (!filehandle.eof()) {
463                         
464                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle); 
465         
466                         //save first one
467                         if (prevDistance == -1) { prevDistance = distance; }
468                         
469                         map<string,int>::iterator itA = nameMap->find(firstName);
470                         map<string,int>::iterator itB = nameMap->find(secondName);
471                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
472                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
473                 
474                         //using cutoff
475                         if (distance > cutoff) { break; }
476                 
477                         if (distance != -1) { //-1 means skip me
478                         
479                                 //are the distances the same
480                                 if (distance == prevDistance) { //save in vector
481                                         DistNode temp(itA->second, itB->second, distance);
482                                         sameSeqs.push_back(temp);
483                                         exitedBreak = false;
484                                 }else{ 
485                                         next.seq1 = itA->second;
486                                         next.seq2 = itB->second;
487                                         next.dist = distance;
488                                         exitedBreak = true;
489                                         break;
490                                 }
491                         }
492                 }
493                 
494                 return sameSeqs;
495         }
496         catch(exception& e) {
497                 errorOut(e, "MGClusterCommand", "getSeqs");
498                 exit(1);
499         }
500 }
501 //**********************************************************************************************************************
502
503
504
505
506