]> git.donarmstrong.com Git - mothur.git/blob - mgclustercommand.cpp
created mothurOut class to handle logfiles
[mothur.git] / mgclustercommand.cpp
1 /*
2  *  mgclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/11/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "mgclustercommand.h"
11
12 //**********************************************************************************************************************
13 MGClusterCommand::MGClusterCommand(string option) {
14         try {
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 
18                 //allow user to run help
19                 if(option == "help") { help(); abort = true; }
20                 
21                 else {
22                         //valid paramters for this command
23                         string Array[] =  {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
24                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25                         
26                         OptionParser parser(option);
27                         map<string, string> parameters = parser.getParameters();
28                         
29                         ValidParameters validParameter;
30                         map<string,string>::iterator it;
31                 
32                         //check to make sure all parameters are valid for command
33                         for (it = parameters.begin(); it != parameters.end(); it++) { 
34                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
35                         }
36                         
37                         //if the user changes the input directory command factory will send this info to us in the output parameter 
38                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
39                         if (inputDir == "not found"){   inputDir = "";          }
40                         else {
41                                 string path;
42                                 it = parameters.find("blast");
43                                 //user has given a template file
44                                 if(it != parameters.end()){ 
45                                         path = hasPath(it->second);
46                                         //if the user has not given a path then, add inputdir. else leave path alone.
47                                         if (path == "") {       parameters["blast"] = inputDir + it->second;            }
48                                 }
49                                 
50                                 it = parameters.find("name");
51                                 //user has given a template file
52                                 if(it != parameters.end()){ 
53                                         path = hasPath(it->second);
54                                         //if the user has not given a path then, add inputdir. else leave path alone.
55                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
56                                 }
57                         }
58
59                         
60                         //check for required parameters
61                         blastfile = validParameter.validFile(parameters, "blast", true);
62                         if (blastfile == "not open") { abort = true; }  
63                         else if (blastfile == "not found") { blastfile = ""; }
64                         
65                         //if the user changes the output directory command factory will send this info to us in the output parameter 
66                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
67                                 outputDir = ""; 
68                                 outputDir += hasPath(blastfile); //if user entered a file with a path then preserve it  
69                         }
70                         
71                         namefile = validParameter.validFile(parameters, "name", true);
72                         if (namefile == "not open") { abort = true; }   
73                         else if (namefile == "not found") { namefile = ""; }
74                         
75                         if ((blastfile == "")) { m->mothurOut("When executing a mgcluster command you must provide a blastfile."); m->mothurOutEndLine(); abort = true; }
76                         
77                         //check for optional parameter and set defaults
78                         string temp;
79                         temp = validParameter.validFile(parameters, "precision", false);                if (temp == "not found") { temp = "100"; }
80                         precisionLength = temp.length();
81                         convert(temp, precision); 
82                         
83                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "0.70"; }
84                         convert(temp, cutoff); 
85                         cutoff += (5 / (precision * 10.0));
86                         
87                         method = validParameter.validFile(parameters, "method", false);
88                         if (method == "not found") { method = "furthest"; }
89                         
90                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
91                         else { m->mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); m->mothurOutEndLine(); abort = true; }
92
93                         temp = validParameter.validFile(parameters, "length", false);                   if (temp == "not found") { temp = "5"; }
94                         convert(temp, length); 
95                         
96                         temp = validParameter.validFile(parameters, "penalty", false);                  if (temp == "not found") { temp = "0.10"; }
97                         convert(temp, penalty); 
98                         
99                         temp = validParameter.validFile(parameters, "min", false);                              if (temp == "not found") { temp = "true"; }
100                         minWanted = isTrue(temp); 
101                         
102                         temp = validParameter.validFile(parameters, "merge", false);                    if (temp == "not found") { temp = "true"; }
103                         merge = isTrue(temp); 
104                         
105                         temp = validParameter.validFile(parameters, "hcluster", false);                 if (temp == "not found") { temp = "false"; }
106                         hclusterWanted = isTrue(temp); 
107                 }
108
109         }
110         catch(exception& e) {
111                 m->errorOut(e, "MGClusterCommand", "MGClusterCommand");
112                 exit(1);
113         }
114 }
115 //**********************************************************************************************************************
116
117 void MGClusterCommand::help(){
118         try {
119                 m->mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
120                 m->mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
121                 m->mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
122                 m->mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
123                 m->mothurOut("The precision parameter's default value is 100. \n");
124                 m->mothurOut("The acceptable mgcluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");    
125                 m->mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
126                 m->mothurOut("The length parameter is used to specify the minimum overlap required.  The default is 5.\n");
127                 m->mothurOut("The penalty parameter is used to adjust the error rate.  The default is 0.10.\n");
128                 m->mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster.  By default merge is true, meaning you want to merge.\n");
129                 m->mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering.  This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
130                 m->mothurOut("The mgcluster command should be in the following format: \n");
131                 m->mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
132                 m->mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
133         }
134         catch(exception& e) {
135                 m->errorOut(e, "MGClusterCommand", "help");
136                 exit(1);
137         }
138 }
139 //**********************************************************************************************************************
140 MGClusterCommand::~MGClusterCommand(){}
141 //**********************************************************************************************************************
142 int MGClusterCommand::execute(){
143         try {
144                 
145                 if (abort == true) {    return 0;       }
146                 
147                 //read names file
148                 if (namefile != "") {
149                         nameMap = new NameAssignment(namefile);
150                         nameMap->readMap();
151                 }else{ nameMap= new NameAssignment(); }
152                 
153                 string fileroot = outputDir + getRootName(getSimpleName(blastfile));
154                 string tag = "";
155                 time_t start;
156                 float previousDist = 0.00000;
157                 float rndPreviousDist = 0.00000;
158                 
159                 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
160                 //must remember to delete those objects here since readBlast does not
161                 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
162                 read->read(nameMap);
163                 
164                 list = new ListVector(nameMap->getListVector());
165                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
166                 
167                 start = time(NULL);
168                 oldList = *list;
169                 
170                 if (method == "furthest")               { tag = "fn";  }
171                 else if (method == "nearest")   { tag = "nn";  }
172                 else                                                    { tag = "an";  }
173                 
174                 //open output files
175                 openOutputFile(fileroot+ tag + ".list",  listFile);
176                 openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
177                 openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
178                 
179                 if (!hclusterWanted) {
180                         //get distmatrix and overlap
181                         SparseMatrix* distMatrix = read->getDistMatrix();
182                         overlapMatrix = read->getOverlapMatrix(); //already sorted by read 
183                         delete read;
184                 
185                         //create cluster
186                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff, method); }
187                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix, cutoff, method); }
188                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix, cutoff, method); }
189                         cluster->setMapWanted(true);
190                         
191                         //cluster using cluster classes
192                         while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
193                                 
194                                 cluster->update(cutoff);
195                                 float dist = distMatrix->getSmallDist();
196                                 float rndDist = roundDist(dist, precision);
197                                 
198                                 if(previousDist <= 0.0000 && dist != previousDist){
199                                         oldList.setLabel("unique");
200                                         printData(&oldList);
201                                 }
202                                 else if(rndDist != rndPreviousDist){
203                                         if (merge) {
204                                                 map<string, int> seq2Bin = cluster->getSeqtoBin();
205                                                 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
206                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
207                                                 printData(temp);
208                                                 delete temp;
209                                         }else{
210                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
211                                                 printData(&oldList);
212                                         }
213                                 }
214                                 
215                                 previousDist = dist;
216                                 rndPreviousDist = rndDist;
217                                 oldList = *list;
218                         }
219                         
220                         if(previousDist <= 0.0000){
221                                 oldList.setLabel("unique");
222                                 printData(&oldList);
223                         }
224                         else if(rndPreviousDist<cutoff){
225                                 if (merge) {
226                                         map<string, int> seq2Bin = cluster->getSeqtoBin();
227                                         ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
228                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
229                                         printData(temp);
230                                         delete temp;
231                                 }else{
232                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
233                                         printData(&oldList);
234                                 }
235                         }
236                         
237                         //free memory
238                         overlapMatrix.clear();
239                         delete distMatrix;
240                         delete cluster;
241                         
242                 }else { //use hcluster to cluster
243                         //get distmatrix and overlap
244                         overlapFile = read->getOverlapFile();
245                         distFile = read->getDistFile(); 
246                         delete read;
247                 
248                         //sort the distance and overlap files
249                         sortHclusterFiles(distFile, overlapFile);
250                 
251                         //create cluster
252                         hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
253                         hcluster->setMapWanted(true);
254                         
255                         vector<seqDist> seqs; seqs.resize(1); // to start loop
256                         //ifstream inHcluster;
257                         //openInputFile(distFile, inHcluster);
258
259                         while (seqs.size() != 0){
260                 
261                                 seqs = hcluster->getSeqs();
262                                 
263                                 for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
264                                         
265                                         if (seqs[i].seq1 != seqs[i].seq2) {
266                 
267                                                 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
268         
269                                                 float rndDist = roundDist(seqs[i].dist, precision);
270                                                                                                 
271                                                 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
272                                                         oldList.setLabel("unique");
273                                                         printData(&oldList);
274                                                 }
275                                                 else if((rndDist != rndPreviousDist)){
276                                                         if (merge) {
277                                                                 map<string, int> seq2Bin = hcluster->getSeqtoBin();
278                                                                 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
279                                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
280                                                                 printData(temp);
281                                                                 delete temp;
282                                                         }else{
283                                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
284                                                                 printData(&oldList);
285                                                         }
286                                                 }
287                                                 
288                                                 previousDist = seqs[i].dist;
289                                                 rndPreviousDist = rndDist;
290                                                 oldList = *list;
291                                         }
292                                 }
293                         }
294                         //inHcluster.close();
295                         
296                         if(previousDist <= 0.0000){
297                                 oldList.setLabel("unique");
298                                 printData(&oldList);
299                         }
300                         else if(rndPreviousDist<cutoff){
301                                 if (merge) {
302                                         map<string, int> seq2Bin = hcluster->getSeqtoBin();
303                                         ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
304                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
305                                         printData(temp);
306                                         delete temp;
307                                 }else{
308                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
309                                         printData(&oldList);
310                                 }
311                         }
312                         
313                         delete hcluster;
314                         remove(distFile.c_str());
315                         remove(overlapFile.c_str());
316                 }
317                 
318                 delete list; 
319                 delete rabund;
320                 listFile.close();
321                 sabundFile.close();
322                 rabundFile.close();
323         
324                 globaldata->setListFile(fileroot+ tag + ".list");
325                 globaldata->setFormat("list");
326                 
327                 m->mothurOutEndLine();
328                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
329                 m->mothurOut(fileroot+ tag + ".list"); m->mothurOutEndLine();   
330                 m->mothurOut(fileroot+ tag + ".rabund"); m->mothurOutEndLine(); 
331                 m->mothurOut(fileroot+ tag + ".sabund"); m->mothurOutEndLine(); 
332                 m->mothurOutEndLine();
333                 
334                 m->mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); m->mothurOutEndLine();
335                         
336                 return 0;
337         }
338         catch(exception& e) {
339                 m->errorOut(e, "MGClusterCommand", "execute");
340                 exit(1);
341         }
342 }
343 //**********************************************************************************************************************
344 void MGClusterCommand::printData(ListVector* mergedList){
345         try {
346                 mergedList->print(listFile);
347                 mergedList->getRAbundVector().print(rabundFile);
348                 
349                 SAbundVector sabund = mergedList->getSAbundVector();
350
351                 sabund.print(cout);
352                 sabund.print(sabundFile);
353         }
354         catch(exception& e) {
355                 m->errorOut(e, "MGClusterCommand", "printData");
356                 exit(1);
357         }
358 }
359 //**********************************************************************************************************************
360 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
361 //that are used to cluster by distance.  this is done so that the overlapping data does not have more influenece than the distance data.
362 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
363         try {
364                 //create new listvector so you don't overwrite the clustering
365                 ListVector* newList = new ListVector(oldList);
366                 bool done = false;
367                 ifstream inOverlap;
368                 int count = 0;
369                 
370                 if (hclusterWanted) {  
371                         openInputFile(overlapFile, inOverlap);  
372                         if (inOverlap.eof()) {  done = true;  }
373                 }else { if (overlapMatrix.size() == 0)  {  done = true;  } } 
374                 
375                 while (!done) {
376                         
377                         //get next overlap
378                         seqDist overlapNode;
379                         if (!hclusterWanted) {  
380                                 if (count < overlapMatrix.size()) { //do we have another node in the matrix
381                                         overlapNode = overlapMatrix[count];
382                                         count++;
383                                 }else { break; }
384                         }else { 
385                                 if (!inOverlap.eof()) {
386                                         string firstName, secondName;
387                                         float overlapDistance;
388                                         inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
389                                         
390                                         map<string,int>::iterator itA = nameMap->find(firstName);
391                                         map<string,int>::iterator itB = nameMap->find(secondName);
392                                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
393                                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
394                                         
395                                         overlapNode.seq1 = itA->second;
396                                         overlapNode.seq2 = itB->second;
397                                         overlapNode.dist = overlapDistance;
398                                 }else { inOverlap.close(); break; }
399                         } 
400                 
401                         if (overlapNode.dist < dist) {
402                                 //get names of seqs that overlap
403                                 string name1 = nameMap->get(overlapNode.seq1);
404                                 string name2 = nameMap->get(overlapNode.seq2);
405                                 
406                                 //use binInfo to find out if they are already in the same bin
407                                 int binKeep = binInfo[name1];
408                                 int binRemove = binInfo[name2];
409                                 
410                                 //if not merge bins and update binInfo
411                                 if(binKeep != binRemove) {
412                                         //save names in old bin
413                                         string names = list->get(binRemove);
414                                         
415                                         //merge bins into name1s bin
416                                         newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
417                                         newList->set(binRemove, "");    
418                                         
419                                         //update binInfo
420                                         while (names.find_first_of(',') != -1) { 
421                                                 //get name from bin
422                                                 string name = names.substr(0,names.find_first_of(','));
423                                                 //save name and bin number
424                                                 binInfo[name] = binKeep;
425                                                 names = names.substr(names.find_first_of(',')+1, names.length());
426                                         }
427                                         
428                                         //get last name
429                                         binInfo[names] = binKeep;
430                                 }
431                                 
432                         }else { done = true; }
433                 }
434                 
435                 //return listvector
436                 return newList;
437                                 
438         }
439         catch(exception& e) {
440                 m->errorOut(e, "MGClusterCommand", "mergeOPFs");
441                 exit(1);
442         }
443 }
444 //**********************************************************************************************************************
445 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
446         try {
447                 //sort distFile
448                 string sortedDistFile = sortFile(unsortedDist);
449                 remove(unsortedDist.c_str());  //delete unsorted file
450                 distFile = sortedDistFile;
451                 
452                 //sort overlap file
453                 string sortedOverlapFile = sortFile(unsortedOverlap);
454                 remove(unsortedOverlap.c_str());  //delete unsorted file
455                 overlapFile = sortedOverlapFile;
456         }
457         catch(exception& e) {
458                 m->errorOut(e, "MGClusterCommand", "sortHclusterFiles");
459                 exit(1);
460         }
461 }
462
463 //**********************************************************************************************************************
464
465
466
467
468