mgclustercommand.cpp

   1 /*
   2  *  mgclustercommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 12/11/09.
   6  *  Copyright 2009 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "mgclustercommand.h"
  11 #include "readcluster.h"
  12
  13 //**********************************************************************************************************************
  14 MGClusterCommand::MGClusterCommand(string option){
  15         try {
  16                 globaldata = GlobalData::getInstance();
  17                 abort = false;
  18
  19                 //allow user to run help
  20                 if(option == "help") { help(); abort = true; }
  21
  22                 else {
  23                         //valid paramters for this command
  24                         string Array[] =  {"blast", "method", "name", "cutoff", "precision", "length", "min", "penalty", "hcluster","merge"};
  25                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
  26
  27                         OptionParser parser(option);
  28                         map<string, string> parameters = parser.getParameters();
  29
  30                         ValidParameters validParameter;
  31
  32                         //check to make sure all parameters are valid for command
  33                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
  34                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
  35                         }
  36
  37                         //check for required parameters
  38                         blastfile = validParameter.validFile(parameters, "blast", true);
  39                         if (blastfile == "not open") { abort = true; }
  40                         else if (blastfile == "not found") { blastfile = ""; }
  41
  42                         namefile = validParameter.validFile(parameters, "name", true);
  43                         if (namefile == "not open") { abort = true; }
  44                         else if (namefile == "not found") { namefile = ""; }
  45
  46                         if ((blastfile == "")) { mothurOut("When executing a mgcluster command you must provide a blastfile."); mothurOutEndLine(); abort = true; }
  47
  48                         //check for optional parameter and set defaults
  49                         string temp;
  50                         temp = validParameter.validFile(parameters, "precision", false);                if (temp == "not found") { temp = "100"; }
  51                         precisionLength = temp.length();
  52                         convert(temp, precision);
  53
  54                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "0.70"; }
  55                         convert(temp, cutoff);
  56                         cutoff += (5 / (precision * 10.0));
  57
  58                         method = validParameter.validFile(parameters, "method", false);
  59                         if (method == "not found") { method = "furthest"; }
  60
  61                         if ((method == "furthest") || (method == "nearest") || (method == "average")) { }
  62                         else { mothurOut("Not a valid clustering method.  Valid clustering algorithms are furthest, nearest or average."); mothurOutEndLine(); abort = true; }
  63
  64                         temp = validParameter.validFile(parameters, "length", false);                   if (temp == "not found") { temp = "5"; }
  65                         convert(temp, length);
  66
  67                         temp = validParameter.validFile(parameters, "penalty", false);                  if (temp == "not found") { temp = "0.10"; }
  68                         convert(temp, penalty);
  69
  70                         temp = validParameter.validFile(parameters, "min", false);                              if (temp == "not found") { temp = "true"; }
  71                         minWanted = isTrue(temp);
  72
  73                         temp = validParameter.validFile(parameters, "merge", false);                    if (temp == "not found") { temp = "true"; }
  74                         merge = isTrue(temp);
  75
  76                         temp = validParameter.validFile(parameters, "hcluster", false);                 if (temp == "not found") { temp = "false"; }
  77                         hclusterWanted = isTrue(temp);
  78                 }
  79
  80         }
  81         catch(exception& e) {
  82                 errorOut(e, "MGClusterCommand", "MGClusterCommand");
  83                 exit(1);
  84         }
  85 }
  86 //**********************************************************************************************************************
  87
  88 void MGClusterCommand::help(){
  89         try {
  90                 mothurOut("The mgcluster command parameter options are blast, name, cutoff, precision, method, merge, min, length, penalty and hcluster. The blast parameter is required.\n");
  91                 mothurOut("The mgcluster command reads a blast and name file and clusters the sequences into OPF units similiar to the OTUs.\n");
  92                 mothurOut("This command outputs a .list, .rabund and .sabund file that can be used with mothur other commands to estimate richness.\n");
  93                 mothurOut("The cutoff parameter is used to specify the maximum distance you would like to cluster to. The default is 0.70.\n");
  94                 mothurOut("The precision parameter's default value is 100. \n");
  95                 mothurOut("The acceptable mgcluster methods are furthest, nearest and average.  If no method is provided then furthest is assumed.\n\n");
  96                 mothurOut("The min parameter allows you to specify is you want the minimum or maximum blast score ratio used in calculating the distance. The default is true, meaning you want the minimum.\n");
  97                 mothurOut("The length parameter is used to specify the minimum overlap required.  The default is 5.\n");
  98                 mothurOut("The penalty parameter is used to adjust the error rate.  The default is 0.10.\n");
  99                 mothurOut("The merge parameter allows you to shut off merging based on overlaps and just cluster.  By default merge is true, meaning you want to merge.\n");
 100                 mothurOut("The hcluster parameter allows you to use the hcluster algorithm when clustering.  This may be neccessary if your file is too large to fit into RAM. The default is false.\n");
 101                 mothurOut("The mgcluster command should be in the following format: \n");
 102                 mothurOut("mgcluster(blast=yourBlastfile, name=yourNameFile, cutoff=yourCutOff).\n");
 103                 mothurOut("Note: No spaces between parameter labels (i.e. balst), '=' and parameters (i.e.yourBlastfile).\n\n");
 104         }
 105         catch(exception& e) {
 106                 errorOut(e, "MGClusterCommand", "help");
 107                 exit(1);
 108         }
 109 }
 110 //**********************************************************************************************************************
 111 MGClusterCommand::~MGClusterCommand(){}
 112 //**********************************************************************************************************************
 113 int MGClusterCommand::execute(){
 114         try {
 115
 116                 if (abort == true) {    return 0;       }
 117
 118                 //read names file
 119                 if (namefile != "") {
 120                         nameMap = new NameAssignment(namefile);
 121                         nameMap->readMap();
 122                 }else{ nameMap= new NameAssignment(); }
 123
 124                 string fileroot = getRootName(blastfile);
 125                 string tag = "";
 126                 time_t start;
 127                 float previousDist = 0.00000;
 128                 float rndPreviousDist = 0.00000;
 129
 130                 //read blastfile - creates sparsematrices for the distances and overlaps as well as a listvector
 131                 //must remember to delete those objects here since readBlast does not
 132                 read = new ReadBlast(blastfile, cutoff, penalty, length, minWanted, hclusterWanted);
 133                 read->read(nameMap);
 134
 135                 list = new ListVector(nameMap->getListVector());
 136                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
 137
 138                 start = time(NULL);
 139                 oldList = *list;
 140
 141                 if (!hclusterWanted) {
 142                         //get distmatrix and overlap
 143                         SparseMatrix* distMatrix = read->getDistMatrix();
 144                         overlapMatrix = read->getOverlapMatrix(); //already sorted by read
 145                         delete read;
 146
 147                         //create cluster
 148                         if (method == "furthest")       {       cluster = new CompleteLinkage(rabund, list, distMatrix); }
 149                         else if(method == "nearest"){   cluster = new SingleLinkage(rabund, list, distMatrix); }
 150                         else if(method == "average"){   cluster = new AverageLinkage(rabund, list, distMatrix); }
 151                         tag = cluster->getTag();
 152                         cluster->setMapWanted(true);
 153
 154                         //open output files
 155                         openOutputFile(fileroot+ tag + ".list",  listFile);
 156                         openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
 157                         openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
 158
 159                         //cluster using cluster classes
 160                         while (distMatrix->getSmallDist() < cutoff && distMatrix->getNNodes() > 0){
 161
 162                                 cluster->update();
 163                                 float dist = distMatrix->getSmallDist();
 164                                 float rndDist = roundDist(dist, precision);
 165
 166                                 if(previousDist <= 0.0000 && dist != previousDist){
 167                                         oldList.setLabel("unique");
 168                                         printData(&oldList);
 169                                 }
 170                                 else if(rndDist != rndPreviousDist){
 171                                         if (merge) {
 172                                                 map<string, int> seq2Bin = cluster->getSeqtoBin();
 173                                                 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
 174                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
 175                                                 printData(temp);
 176                                                 delete temp;
 177                                         }else{
 178                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
 179                                                 printData(&oldList);
 180                                         }
 181                                 }
 182
 183                                 previousDist = dist;
 184                                 rndPreviousDist = rndDist;
 185                                 oldList = *list;
 186                         }
 187
 188                         if(previousDist <= 0.0000){
 189                                 oldList.setLabel("unique");
 190                                 printData(&oldList);
 191                         }
 192                         else if(rndPreviousDist<cutoff){
 193                                 if (merge) {
 194                                         map<string, int> seq2Bin = cluster->getSeqtoBin();
 195                                         ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
 196                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
 197                                         printData(temp);
 198                                         delete temp;
 199                                 }else{
 200                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
 201                                         printData(&oldList);
 202                                 }
 203                         }
 204
 205                         //free memory
 206                         overlapMatrix.clear();
 207                         delete distMatrix;
 208                         delete cluster;
 209
 210                 }else { //use hcluster to cluster
 211                         tag = "fn";
 212                         //get distmatrix and overlap
 213                         overlapFile = read->getOverlapFile();
 214                         distFile = read->getDistFile();
 215                         delete read;
 216
 217                         //sort the distance and overlap files
 218                         sortHclusterFiles(distFile, overlapFile);
 219
 220                         //create cluster
 221                         hcluster = new HCluster(rabund, list);
 222                         hcluster->setMapWanted(true);
 223
 224                         //open output files
 225                         openOutputFile(fileroot+ tag + ".list",  listFile);
 226                         openOutputFile(fileroot+ tag + ".rabund",  rabundFile);
 227                         openOutputFile(fileroot+ tag + ".sabund",  sabundFile);
 228
 229                         vector<DistNode> seqs; seqs.resize(1); // to start loop
 230                         exitedBreak = false;  //lets you know if there is a distance stored in next
 231
 232                         ifstream inHcluster;
 233                         openInputFile(distFile, inHcluster);
 234
 235                         while (seqs.size() != 0){
 236
 237                                 seqs = getSeqs(inHcluster);
 238                                 random_shuffle(seqs.begin(), seqs.end());
 239
 240                                 for (int i = 0; i < seqs.size(); i++) {  //-1 means skip me
 241
 242                                         if (seqs[i].seq1 != seqs[i].seq2) {
 243
 244                                                 hcluster->update(seqs[i].seq1, seqs[i].seq2, seqs[i].dist);
 245
 246                                                 float rndDist = roundDist(seqs[i].dist, precision);
 247
 248                                                 if((previousDist <= 0.0000) && (seqs[i].dist != previousDist)){
 249                                                         oldList.setLabel("unique");
 250                                                         printData(&oldList);
 251                                                 }
 252                                                 else if((rndDist != rndPreviousDist)){
 253                                                         if (merge) {
 254                                                                 map<string, int> seq2Bin = hcluster->getSeqtoBin();
 255                                                                 ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
 256                                                                 temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
 257                                                                 printData(temp);
 258                                                                 delete temp;
 259                                                         }else{
 260                                                                 oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
 261                                                                 printData(&oldList);
 262                                                         }
 263                                                 }
 264
 265                                                 previousDist = seqs[i].dist;
 266                                                 rndPreviousDist = rndDist;
 267                                                 oldList = *list;
 268                                         }
 269                                 }
 270                         }
 271                         inHcluster.close();
 272
 273                         if(previousDist <= 0.0000){
 274                                 oldList.setLabel("unique");
 275                                 printData(&oldList);
 276                         }
 277                         else if(rndPreviousDist<cutoff){
 278                                 if (merge) {
 279                                         map<string, int> seq2Bin = hcluster->getSeqtoBin();
 280                                         ListVector* temp = mergeOPFs(seq2Bin, rndPreviousDist);
 281                                         temp->setLabel(toString(rndPreviousDist,  precisionLength-1));
 282                                         printData(temp);
 283                                         delete temp;
 284                                 }else{
 285                                         oldList.setLabel(toString(rndPreviousDist,  precisionLength-1));
 286                                         printData(&oldList);
 287                                 }
 288                         }
 289
 290                         delete hcluster;
 291                         remove(distFile.c_str());
 292                         remove(overlapFile.c_str());
 293                 }
 294
 295                 delete list;
 296                 delete rabund;
 297                 listFile.close();
 298                 sabundFile.close();
 299                 rabundFile.close();
 300
 301                 globaldata->setListFile(fileroot+ tag + ".list");
 302                 globaldata->setFormat("list");
 303
 304                 mothurOut("It took " + toString(time(NULL) - start) + " seconds to cluster."); mothurOutEndLine();
 305
 306                 return 0;
 307         }
 308         catch(exception& e) {
 309                 errorOut(e, "MGClusterCommand", "execute");
 310                 exit(1);
 311         }
 312 }
 313 //**********************************************************************************************************************
 314 void MGClusterCommand::printData(ListVector* mergedList){
 315         try {
 316                 mergedList->print(listFile);
 317                 mergedList->getRAbundVector().print(rabundFile);
 318
 319                 SAbundVector sabund = mergedList->getSAbundVector();
 320
 321                 sabund.print(cout);
 322                 sabund.print(sabundFile);
 323         }
 324         catch(exception& e) {
 325                 errorOut(e, "MGClusterCommand", "printData");
 326                 exit(1);
 327         }
 328 }
 329 //**********************************************************************************************************************
 330 //this merging is just at the reporting level, after this info is printed to the file it is gone and does not effect the datastructures
 331 //that are used to cluster by distance.  this is done so that the overlapping data does not have more influenece than the distance data.
 332 ListVector* MGClusterCommand::mergeOPFs(map<string, int> binInfo, float dist){
 333         try {
 334                 //create new listvector so you don't overwrite the clustering
 335                 ListVector* newList = new ListVector(oldList);
 336                 bool done = false;
 337                 ifstream inOverlap;
 338                 int count = 0;
 339
 340                 if (hclusterWanted) {
 341                         openInputFile(overlapFile, inOverlap);
 342                         if (inOverlap.eof()) {  done = true;  }
 343                 }else { if (overlapMatrix.size() == 0)  {  done = true;  } }
 344
 345                 while (!done) {
 346
 347                         //get next overlap
 348                         DistNode overlapNode;
 349                         if (!hclusterWanted) {
 350                                 if (count < overlapMatrix.size()) { //do we have another node in the matrix
 351                                         overlapNode = overlapMatrix[count];
 352                                         count++;
 353                                 }else { break; }
 354                         }else {
 355                                 if (!inOverlap.eof()) {
 356                                         string firstName, secondName;
 357                                         float overlapDistance;
 358                                         inOverlap >> firstName >> secondName >> overlapDistance; gobble(inOverlap);
 359
 360                                         map<string,int>::iterator itA = nameMap->find(firstName);
 361                                         map<string,int>::iterator itB = nameMap->find(secondName);
 362                                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
 363                                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
 364
 365                                         overlapNode.seq1 = itA->second;
 366                                         overlapNode.seq2 = itB->second;
 367                                         overlapNode.dist = overlapDistance;
 368                                 }else { inOverlap.close(); break; }
 369                         }
 370
 371                         if (overlapNode.dist < dist) {
 372                                 //get names of seqs that overlap
 373                                 string name1 = nameMap->get(overlapNode.seq1);
 374                                 string name2 = nameMap->get(overlapNode.seq2);
 375
 376                                 //use binInfo to find out if they are already in the same bin
 377                                 int binKeep = binInfo[name1];
 378                                 int binRemove = binInfo[name2];
 379
 380                                 //if not merge bins and update binInfo
 381                                 if(binKeep != binRemove) {
 382                                         //save names in old bin
 383                                         string names = list->get(binRemove);
 384
 385                                         //merge bins into name1s bin
 386                                         newList->set(binKeep, newList->get(binRemove)+','+newList->get(binKeep));
 387                                         newList->set(binRemove, "");
 388
 389                                         //update binInfo
 390                                         while (names.find_first_of(',') != -1) {
 391                                                 //get name from bin
 392                                                 string name = names.substr(0,names.find_first_of(','));
 393                                                 //save name and bin number
 394                                                 binInfo[name] = binKeep;
 395                                                 names = names.substr(names.find_first_of(',')+1, names.length());
 396                                         }
 397
 398                                         //get last name
 399                                         binInfo[names] = binKeep;
 400                                 }
 401
 402                         }else { done = true; }
 403                 }
 404
 405                 //return listvector
 406                 return newList;
 407
 408         }
 409         catch(exception& e) {
 410                 errorOut(e, "MGClusterCommand", "mergeOPFs");
 411                 exit(1);
 412         }
 413 }
 414 //**********************************************************************************************************************
 415 void MGClusterCommand::sortHclusterFiles(string unsortedDist, string unsortedOverlap) {
 416         try {
 417                 //sort distFile
 418                 ReadCluster* readCluster = new ReadCluster(unsortedDist, cutoff);
 419                 readCluster->setFormat("column");
 420                 readCluster->read(nameMap);
 421                 string sortedDistFile = readCluster->getOutputFile();
 422                 ListVector* extralist = readCluster->getListVector();  //we get this so we can delete it and not have a memory leak
 423
 424                 remove(unsortedDist.c_str());  //delete unsorted file
 425                 distFile = sortedDistFile;
 426                 delete extralist;
 427                 delete readCluster;
 428
 429                 //sort overlap file
 430                 readCluster = new ReadCluster(unsortedOverlap, cutoff);
 431                 readCluster->setFormat("column");
 432                 readCluster->read(nameMap);
 433                 string sortedOverlapFile = readCluster->getOutputFile();
 434                 extralist = readCluster->getListVector();  //we get this so we can delete it and not have a memory leak
 435
 436                 remove(unsortedOverlap.c_str());  //delete unsorted file
 437                 overlapFile = sortedOverlapFile;
 438                 delete extralist;
 439                 delete readCluster;
 440         }
 441         catch(exception& e) {
 442                 errorOut(e, "MGClusterCommand", "sortHclusterFiles");
 443                 exit(1);
 444         }
 445 }
 446 //**********************************************************************************************************************
 447 vector<DistNode> MGClusterCommand::getSeqs(ifstream& filehandle){
 448         try {
 449                 string firstName, secondName;
 450                 float distance, prevDistance;
 451                 vector<DistNode> sameSeqs;
 452                 prevDistance = -1;
 453
 454                 //if you are not at the beginning of the file
 455                 if (exitedBreak) {
 456                         sameSeqs.push_back(next);
 457                         prevDistance = next.dist;
 458                         exitedBreak = false;
 459                 }
 460
 461                 //get entry
 462                 while (!filehandle.eof()) {
 463
 464                         filehandle >> firstName >> secondName >> distance;    gobble(filehandle);
 465
 466                         //save first one
 467                         if (prevDistance == -1) { prevDistance = distance; }
 468
 469                         map<string,int>::iterator itA = nameMap->find(firstName);
 470                         map<string,int>::iterator itB = nameMap->find(secondName);
 471                         if(itA == nameMap->end()){  cerr << "AAError: Sequence '" << firstName << "' was not found in the names file, please correct\n"; exit(1);  }
 472                         if(itB == nameMap->end()){  cerr << "ABError: Sequence '" << secondName << "' was not found in the names file, please correct\n"; exit(1);  }
 473
 474                         //using cutoff
 475                         if (distance > cutoff) { break; }
 476
 477                         if (distance != -1) { //-1 means skip me
 478
 479                                 //are the distances the same
 480                                 if (distance == prevDistance) { //save in vector
 481                                         DistNode temp(itA->second, itB->second, distance);
 482                                         sameSeqs.push_back(temp);
 483                                         exitedBreak = false;
 484                                 }else{
 485                                         next.seq1 = itA->second;
 486                                         next.seq2 = itB->second;
 487                                         next.dist = distance;
 488                                         exitedBreak = true;
 489                                         break;
 490                                 }
 491                         }
 492                 }
 493
 494                 return sameSeqs;
 495         }
 496         catch(exception& e) {
 497                 errorOut(e, "MGClusterCommand", "getSeqs");
 498                 exit(1);
 499         }
 500 }
 501 //**********************************************************************************************************************
 502
 503
 504
 505
 506