]> git.donarmstrong.com Git - mothur.git/blob - mgclustercommand.cpp
added set.dir command and modified commands to redirect input and output, removed...
[mothur.git] / mgclustercommand.cpp
1 /*
2  *  mgclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/11/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "mgclustercommand.h"
11
12 //**********************************************************************************************************************
13 MGClusterCommand::MGClusterCommand(string option){
14         try {
15                 globaldata = GlobalData::getInstance();
16                 abort = false;
17                 
18                 //allow user to run help
19                 if(option == "help") { help(); abort = true; }
20                 
21                 else {
22                         //valid paramters for this command
23                         string Array[] =  {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge","outputdir","inputdir"};
24                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
25                         
26                         OptionParser parser(option);
27                         map<string, string> parameters = parser.getParameters();
28                         
29                         ValidParameters validParameter;
30                         map<string,string>::iterator it;
31                 
32                         //check to make sure all parameters are valid for command
33                         for (it = parameters.begin(); it != parameters.end(); it++) { 
34                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
35                         }
36                         
37                         //if the user changes the input directory command factory will send this info to us in the output parameter 
38                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
39                         if (inputDir == "not found"){   inputDir = "";          }
40                         else {
41                                 string path;
42                                 it = parameters.find("blast");
43                                 //user has given a template file
44                                 if(it != parameters.end()){ 
45                                         path = hasPath(it->second);
46                                         //if the user has not given a path then, add inputdir. else leave path alone.
47                                         if (path == "") {       parameters["blast"] = inputDir + it->second;            }
48                                 }
49                                 
50                                 it = parameters.find("name");
51                                 //user has given a template file
52                                 if(it != parameters.end()){ 
53                                         path = hasPath(it->second);
54                                         //if the user has not given a path then, add inputdir. else leave path alone.
55                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
56                                 }
57                         }
58
59                         
60                         //check for required parameters
61                         blastfile = validParameter.validFile(parameters, "blast", true);
62                         if (blastfile == "not open") { abort = true; }  
63                         else if (blastfile == "not found") { blastfile = ""; }
64                         
65                         //if the user changes the output directory command factory will send this info to us in the output parameter 
66                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
67                                 outputDir = ""; 
68                                 outputDir += hasPath(blastfile); //if user entered a file with a path then preserve it  
69                         }
70                         
71                         namefile = validParameter.validFile(parameters, "name", true);
72                         if (namefile == "not open") { abort = true; }   
73                         else if (namefile == "not found") { namefile = ""; }
74                         
75                         if ((blastfile == "")) { mothurOut("When executing a mgcluster command you must provide a blastfile."); mothurOutEndLine(); abort = true; }
76                         
77                         //check for optional parameter and set defaults
78                         string temp;
79                         temp = validParameter.validFile(parameters, "precision", false);                if (temp == "not found") { temp = "100"; }
80                         precisionLength = temp.length();
81                         convert(temp, precision); 
82                         
83                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "0.70"; }
84                         convert(temp, cutoff); 
85                         cutoff += (5 / (precision * 10.0));
86                         
87                         method = validParameter.validFile(parameters, "method", false);
88                         if (method == "not found") { method = "furthest"; }
89                         
90                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
91                         else { mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
92
93                         temp = validParameter.validFile(parameters, "length", false);                   if (temp == "not found") { temp = "5"; }
94                         convert(temp, length); 
95                         
96                         temp = validParameter.validFile(parameters, "penalty", false);                  if (temp == "not found") { temp = "0.10"; }
97                         convert(temp, penalty); 
98                         
99                         temp = validParameter.validFile(parameters, "min", false);                              if (temp == "not found") { temp = "true"; }
100                         minWanted = isTrue(temp); 
101                         
102                         temp = validParameter.validFile(parameters, "merge", false);                    if (temp == "not found") { temp = "true"; }
103                         merge = isTrue(temp); 
104                         
105                         temp = validParameter.validFile(parameters, "hcluster", false);                 if (temp == "not found") { temp = "false"; }
106                         hclusterWanted = isTrue(temp); 
107                 }
108
109         }
110         catch(exception& e) {
111                 errorOut(e, "MGClusterCommand", "MGClusterCommand");
112                 exit(1);
113         }
114 }
115 //**********************************************************************************************************************
116
117 void MGClusterCommand::help(){
118         try {
119                 mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
120                 mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
121                 mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
122                 mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
123                 mothurOut("The precision parameter's default value is 100. \n");
124                 mothurOut("The acceptable mgcluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");       
125                 mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
126                 mothurOut("The length parameter is used to specify the minimum overlap required.  The default is 5.\n");
127                 mothurOut("The penalty parameter is used to adjust the error rate.  The default is 0.10.\n");
128                 mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster.  By default merge is true, meaning you want to merge.\n");
129                 mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering.  This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
130                 mothurOut("The mgcluster command should be in the following format: \n");
131                 mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
132                 mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
133         }
134         catch(exception& e) {
135                 errorOut(e, "MGClusterCommand", "help");
136                 exit(1);
137         }
138 }
139 //**********************************************************************************************************************
140 MGClusterCommand::~MGClusterCommand(){}
141 //**********************************************************************************************************************
142 int MGClusterCommand::execute(){
143         try {
144                 
145                 if (abort == true) {    return 0;       }
146                 
147                 //read names file
148                 if (namefile != "") {
149                         nameMap = new NameAssignment(namefile);
150                         nameMap->readMap();
151                 }else{ nameMap= new NameAssignment(); }
152                 
153                 string fileroot = outputDir + getRootName(getSimpleName(blastfile));
154                 string tag = "";
155                 time_t start;
156                 float previousDist = 0.00000;
157                 float rndPreviousDist = 0.00000;
158                 
159                 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
160                 //must remember to delete those objects here since readBlast does not
161                 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
162                 read->read(nameMap);
163                 
164                 list = new ListVector(nameMap->getListVector());
165                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
166                 
167                 start = time(NULL);
168                 oldList = *list;
169                 
170                 if (method == "furthest")               { tag = "fn";  }
171                 else if (method == "nearest")   { tag = "nn";  }
172                 else                                                    { tag = "an";  }
173                 
174                 //open output files
175                 openOutputFile(fileroot+ tag + ".list",  listFile);
176                 openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
177                 openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
178                 
179                 if (!hclusterWanted) {
180                         //get distmatrix and overlap
181                         SparseMatrix* distMatrix = read->getDistMatrix();
182                         overlapMatrix = read->getOverlapMatrix(); //already sorted by read 
183                         delete read;
184                 
185                         //create cluster
186                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix, cutoff); }
187                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix, cutoff); }
188                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix, cutoff); }
189                         cluster->setMapWanted(true);
190                         
191                         //cluster using cluster classes
192                         while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
193                                 
194                                 cluster->update();
195                                 float dist = distMatrix->getSmallDist();
196                                 float rndDist = roundDist(dist, precision);
197                                 
198                                 if(previousDist <= 0.0000 && dist != previousDist){
199                                         oldList.setLabel("unique");
200                                         printData(&oldList);
201                                 }
202                                 else if(rndDist != rndPreviousDist){
203                                         if (merge) {
204                                                 map<string, int> seq2Bin = cluster->getSeqtoBin();
205                                                 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
206                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
207                                                 printData(temp);
208                                                 delete temp;
209                                         }else{
210                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
211                                                 printData(&oldList);
212                                         }
213                                 }
214                                 
215                                 previousDist = dist;
216                                 rndPreviousDist = rndDist;
217                                 oldList = *list;
218                         }
219                         
220                         if(previousDist <= 0.0000){
221                                 oldList.setLabel("unique");
222                                 printData(&oldList);
223                         }
224                         else if(rndPreviousDist<cutoff){
225                                 if (merge) {
226                                         map<string, int> seq2Bin = cluster->getSeqtoBin();
227                                         ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
228                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
229                                         printData(temp);
230                                         delete temp;
231                                 }else{
232                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
233                                         printData(&oldList);
234                                 }
235                         }
236                         
237                         //free memory
238                         overlapMatrix.clear();
239                         delete distMatrix;
240                         delete cluster;
241                         
242                 }else { //use hcluster to cluster
243                         //get distmatrix and overlap
244                         overlapFile = read->getOverlapFile();
245                         distFile = read->getDistFile(); 
246                         delete read;
247                 
248                         //sort the distance and overlap files
249                         sortHclusterFiles(distFile, overlapFile);
250                 
251                         //create cluster
252                         hcluster = new HCluster(rabund, list, method, distFile, nameMap, cutoff);
253                         hcluster->setMapWanted(true);
254                         
255                         vector<seqDist> seqs; seqs.resize(1); // to start loop
256                         //ifstream inHcluster;
257                         //openInputFile(distFile, inHcluster);
258
259                         while (seqs.size() != 0){
260                 
261                                 seqs = hcluster->getSeqs();
262                                 
263                                 for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
264                                         
265                                         if (seqs[i].seq1 != seqs[i].seq2) {
266                 
267                                                 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
268         
269                                                 float rndDist = roundDist(seqs[i].dist, precision);
270                                                                                                 
271                                                 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
272                                                         oldList.setLabel("unique");
273                                                         printData(&oldList);
274                                                 }
275                                                 else if((rndDist != rndPreviousDist)){
276                                                         if (merge) {
277                                                                 map<string, int> seq2Bin = hcluster->getSeqtoBin();
278                                                                 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
279                                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
280                                                                 printData(temp);
281                                                                 delete temp;
282                                                         }else{
283                                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
284                                                                 printData(&oldList);
285                                                         }
286                                                 }
287                                                 
288                                                 previousDist = seqs[i].dist;
289                                                 rndPreviousDist = rndDist;
290                                                 oldList = *list;
291                                         }
292                                 }
293                         }
294                         //inHcluster.close();
295                         
296                         if(previousDist <= 0.0000){
297                                 oldList.setLabel("unique");
298                                 printData(&oldList);
299                         }
300                         else if(rndPreviousDist<cutoff){
301                                 if (merge) {
302                                         map<string, int> seq2Bin = hcluster->getSeqtoBin();
303                                         ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
304                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
305                                         printData(temp);
306                                         delete temp;
307                                 }else{
308                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
309                                         printData(&oldList);
310                                 }
311                         }
312                         
313                         delete hcluster;
314                         remove(distFile.c_str());
315                         remove(overlapFile.c_str());
316                 }
317                 
318                 delete list; 
319                 delete rabund;
320                 listFile.close();
321                 sabundFile.close();
322                 rabundFile.close();
323         
324                 globaldata->setListFile(fileroot+ tag + ".list");
325                 globaldata->setFormat("list");
326                         
327                 mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); mothurOutEndLine();
328                         
329                 return 0;
330         }
331         catch(exception& e) {
332                 errorOut(e, "MGClusterCommand", "execute");
333                 exit(1);
334         }
335 }
336 //**********************************************************************************************************************
337 void MGClusterCommand::printData(ListVector* mergedList){
338         try {
339                 mergedList->print(listFile);
340                 mergedList->getRAbundVector().print(rabundFile);
341                 
342                 SAbundVector sabund = mergedList->getSAbundVector();
343
344                 sabund.print(cout);
345                 sabund.print(sabundFile);
346         }
347         catch(exception& e) {
348                 errorOut(e, "MGClusterCommand", "printData");
349                 exit(1);
350         }
351 }
352 //**********************************************************************************************************************
353 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
354 //that are used to cluster by distance.  this is done so that the overlapping data does not have more influenece than the distance data.
355 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
356         try {
357                 //create new listvector so you don't overwrite the clustering
358                 ListVector* newList = new ListVector(oldList);
359                 bool done = false;
360                 ifstream inOverlap;
361                 int count = 0;
362                 
363                 if (hclusterWanted) {  
364                         openInputFile(overlapFile, inOverlap);  
365                         if (inOverlap.eof()) {  done = true;  }
366                 }else { if (overlapMatrix.size() == 0)  {  done = true;  } } 
367                 
368                 while (!done) {
369                         
370                         //get next overlap
371                         seqDist overlapNode;
372                         if (!hclusterWanted) {  
373                                 if (count < overlapMatrix.size()) { //do we have another node in the matrix
374                                         overlapNode = overlapMatrix[count];
375                                         count++;
376                                 }else { break; }
377                         }else { 
378                                 if (!inOverlap.eof()) {
379                                         string firstName, secondName;
380                                         float overlapDistance;
381                                         inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
382                                         
383                                         map<string,int>::iterator itA = nameMap->find(firstName);
384                                         map<string,int>::iterator itB = nameMap->find(secondName);
385                                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
386                                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
387                                         
388                                         overlapNode.seq1 = itA->second;
389                                         overlapNode.seq2 = itB->second;
390                                         overlapNode.dist = overlapDistance;
391                                 }else { inOverlap.close(); break; }
392                         } 
393                 
394                         if (overlapNode.dist < dist) {
395                                 //get names of seqs that overlap
396                                 string name1 = nameMap->get(overlapNode.seq1);
397                                 string name2 = nameMap->get(overlapNode.seq2);
398                                 
399                                 //use binInfo to find out if they are already in the same bin
400                                 int binKeep = binInfo[name1];
401                                 int binRemove = binInfo[name2];
402                                 
403                                 //if not merge bins and update binInfo
404                                 if(binKeep != binRemove) {
405                                         //save names in old bin
406                                         string names = list->get(binRemove);
407                                         
408                                         //merge bins into name1s bin
409                                         newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
410                                         newList->set(binRemove, "");    
411                                         
412                                         //update binInfo
413                                         while (names.find_first_of(',') != -1) { 
414                                                 //get name from bin
415                                                 string name = names.substr(0,names.find_first_of(','));
416                                                 //save name and bin number
417                                                 binInfo[name] = binKeep;
418                                                 names = names.substr(names.find_first_of(',')+1, names.length());
419                                         }
420                                         
421                                         //get last name
422                                         binInfo[names] = binKeep;
423                                 }
424                                 
425                         }else { done = true; }
426                 }
427                 
428                 //return listvector
429                 return newList;
430                                 
431         }
432         catch(exception& e) {
433                 errorOut(e, "MGClusterCommand", "mergeOPFs");
434                 exit(1);
435         }
436 }
437 //**********************************************************************************************************************
438 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
439         try {
440                 //sort distFile
441                 string sortedDistFile = sortFile(unsortedDist);
442                 remove(unsortedDist.c_str());  //delete unsorted file
443                 distFile = sortedDistFile;
444                 
445                 //sort overlap file
446                 string sortedOverlapFile = sortFile(unsortedOverlap);
447                 remove(unsortedOverlap.c_str());  //delete unsorted file
448                 overlapFile = sortedOverlapFile;
449         }
450         catch(exception& e) {
451                 errorOut(e, "MGClusterCommand", "sortHclusterFiles");
452                 exit(1);
453         }
454 }
455
456 //**********************************************************************************************************************
457
458
459
460
461