preclustercommand.cpp

   1 /*
   2  *  preclustercommand.cpp
   3  *  Mothur
   4  *
   5  *  Created by westcott on 12/21/09.
   6  *  Copyright 2009 Schloss Lab. All rights reserved.
   7  *
   8  */
   9
  10 #include "preclustercommand.h"
  11 #include "deconvolutecommand.h"
  12
  13 //**********************************************************************************************************************
  14 vector<string> PreClusterCommand::setParameters(){
  15         try {
  16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta-name",false,true,true); parameters.push_back(pfasta);
  17                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
  18         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
  19                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
  20                 CommandParameter pdiffs("diffs", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pdiffs);
  21                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
  22         CommandParameter ptopdown("topdown", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(ptopdown);
  23
  24                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
  25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
  26
  27                 vector<string> myArray;
  28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
  29                 return myArray;
  30         }
  31         catch(exception& e) {
  32                 m->errorOut(e, "PreClusterCommand", "setParameters");
  33                 exit(1);
  34         }
  35 }
  36 //**********************************************************************************************************************
  37 string PreClusterCommand::getHelpString(){
  38         try {
  39                 string helpString = "";
  40                 helpString += "The pre.cluster command groups sequences that are within a given number of base mismatches.\n";
  41                 helpString += "The pre.cluster command outputs a new fasta and name file.\n";
  42                 helpString += "The pre.cluster command parameters are fasta, name, group, count, topdown, processors and diffs. The fasta parameter is required. \n";
  43                 helpString += "The name parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
  44                 helpString += "The group parameter allows you to provide a group file so you can cluster by group. \n";
  45         helpString += "The count parameter allows you to provide a count file so you can cluster by group. \n";
  46                 helpString += "The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n";
  47         helpString += "The topdown parameter allows you to specify whether to cluster from largest abundance to smallest or smallest to largest.  Default=T, meaning largest to smallest.\n";
  48                 helpString += "The pre.cluster command should be in the following format: \n";
  49                 helpString += "pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n";
  50                 helpString += "Example pre.cluster(fasta=amazon.fasta, diffs=2).\n";
  51                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
  52                 return helpString;
  53         }
  54         catch(exception& e) {
  55                 m->errorOut(e, "PreClusterCommand", "getHelpString");
  56                 exit(1);
  57         }
  58 }
  59 //**********************************************************************************************************************
  60 string PreClusterCommand::getOutputPattern(string type) {
  61     try {
  62         string pattern = "";
  63
  64         if (type == "fasta") {  pattern = "[filename],precluster,[extension]"; }
  65         else if (type == "name") {  pattern = "[filename],precluster.names"; }
  66         else if (type == "count") {  pattern = "[filename],precluster.count_table"; }
  67         else if (type == "map") {  pattern =  "[filename],precluster.map"; }
  68         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
  69
  70         return pattern;
  71     }
  72     catch(exception& e) {
  73         m->errorOut(e, "PreClusterCommand", "getOutputPattern");
  74         exit(1);
  75     }
  76 }
  77 //**********************************************************************************************************************
  78 PreClusterCommand::PreClusterCommand(){
  79         try {
  80                 abort = true; calledHelp = true;
  81                 setParameters();
  82                 vector<string> tempOutNames;
  83                 outputTypes["fasta"] = tempOutNames;
  84                 outputTypes["name"] = tempOutNames;
  85         outputTypes["count"] = tempOutNames;
  86                 outputTypes["map"] = tempOutNames;
  87         }
  88         catch(exception& e) {
  89                 m->errorOut(e, "PreClusterCommand", "PreClusterCommand");
  90                 exit(1);
  91         }
  92 }
  93 //**********************************************************************************************************************
  94
  95 PreClusterCommand::PreClusterCommand(string option) {
  96         try {
  97                 abort = false; calledHelp = false;
  98
  99                 //allow user to run help
 100                 if(option == "help") { help(); abort = true; calledHelp = true; }
 101                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
 102
 103                 else {
 104                         vector<string> myArray = setParameters();
 105
 106                         OptionParser parser(option);
 107                         map<string, string> parameters = parser.getParameters();
 108
 109                         ValidParameters validParameter;
 110                         map<string, string>::iterator it;
 111
 112                         //check to make sure all parameters are valid for command
 113                         for (map<string, string>::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) {
 114                                 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
 115                         }
 116
 117                         //initialize outputTypes
 118                         vector<string> tempOutNames;
 119                         outputTypes["fasta"] = tempOutNames;
 120                         outputTypes["name"] = tempOutNames;
 121                         outputTypes["map"] = tempOutNames;
 122             outputTypes["count"] = tempOutNames;
 123
 124                         //if the user changes the input directory command factory will send this info to us in the output parameter
 125                         string inputDir = validParameter.validFile(parameters, "inputdir", false);
 126                         if (inputDir == "not found"){   inputDir = "";          }
 127                         else {
 128                                 string path;
 129                                 it = parameters.find("fasta");
 130                                 //user has given a template file
 131                                 if(it != parameters.end()){
 132                                         path = m->hasPath(it->second);
 133                                         //if the user has not given a path then, add inputdir. else leave path alone.
 134                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
 135                                 }
 136
 137                                 it = parameters.find("name");
 138                                 //user has given a template file
 139                                 if(it != parameters.end()){
 140                                         path = m->hasPath(it->second);
 141                                         //if the user has not given a path then, add inputdir. else leave path alone.
 142                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
 143                                 }
 144
 145                                 it = parameters.find("group");
 146                                 //user has given a template file
 147                                 if(it != parameters.end()){
 148                                         path = m->hasPath(it->second);
 149                                         //if the user has not given a path then, add inputdir. else leave path alone.
 150                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
 151                                 }
 152
 153                 it = parameters.find("count");
 154                                 //user has given a template file
 155                                 if(it != parameters.end()){
 156                                         path = m->hasPath(it->second);
 157                                         //if the user has not given a path then, add inputdir. else leave path alone.
 158                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
 159                                 }
 160                         }
 161
 162                         //check for required parameters
 163                         fastafile = validParameter.validFile(parameters, "fasta", true);
 164                         if (fastafile == "not found") {
 165                                 fastafile = m->getFastaFile();
 166                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
 167                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
 168                         }
 169                         else if (fastafile == "not open") { abort = true; }
 170                         else { m->setFastaFile(fastafile); }
 171
 172                         //if the user changes the output directory command factory will send this info to us in the output parameter
 173                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){
 174                                 outputDir = "";
 175                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it
 176                         }
 177
 178                         //check for optional parameter and set defaults
 179                         // ...at some point should added some additional type checking...
 180                         namefile = validParameter.validFile(parameters, "name", true);
 181                         if (namefile == "not found") { namefile =  "";  }
 182                         else if (namefile == "not open") { namefile = ""; abort = true; }
 183                         else {  m->setNameFile(namefile); }
 184
 185                         groupfile = validParameter.validFile(parameters, "group", true);
 186                         if (groupfile == "not found") { groupfile =  "";  bygroup = false; }
 187                         else if (groupfile == "not open") { abort = true; groupfile =  ""; }
 188                         else {   m->setGroupFile(groupfile); bygroup = true;  }
 189
 190             countfile = validParameter.validFile(parameters, "count", true);
 191                         if (countfile == "not found") { countfile =  "";   }
 192                         else if (countfile == "not open") { abort = true; countfile =  ""; }
 193                         else {
 194                 m->setCountTableFile(countfile);
 195                 ct.readTable(countfile, true, false);
 196                 if (ct.hasGroupInfo()) { bygroup = true; }
 197                 else { bygroup = false;  }
 198             }
 199
 200             if ((namefile != "") && (countfile != "")) {
 201                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
 202             }
 203
 204             if ((groupfile != "") && (countfile != "")) {
 205                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
 206             }
 207
 208
 209                         string temp     = validParameter.validFile(parameters, "diffs", false);         if(temp == "not found"){        temp = "1"; }
 210                         m->mothurConvert(temp, diffs);
 211
 212                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
 213                         m->setProcessors(temp);
 214                         m->mothurConvert(temp, processors);
 215
 216             temp = validParameter.validFile(parameters, "topdown", false);              if(temp == "not found"){  temp = "T"; }
 217                         topdown = m->isTrue(temp);
 218
 219             if (countfile == "") {
 220                 if (namefile == "") {
 221                     vector<string> files; files.push_back(fastafile);
 222                     parser.getNameFile(files);
 223                 }
 224             }
 225                 }
 226
 227         }
 228         catch(exception& e) {
 229                 m->errorOut(e, "PreClusterCommand", "PreClusterCommand");
 230                 exit(1);
 231         }
 232 }
 233 //**********************************************************************************************************************
 234
 235 int PreClusterCommand::execute(){
 236         try {
 237
 238                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
 239
 240                 int start = time(NULL);
 241
 242                 string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
 243         map<string, string> variables;
 244         variables["[filename]"] = fileroot;
 245                 string newNamesFile = getOutputFileName("name",variables);
 246         string newCountFile = getOutputFileName("count",variables);
 247                 string newMapFile = getOutputFileName("map",variables); //add group name if by group
 248         variables["[extension]"] = m->getExtension(fastafile);
 249                 string newFastaFile = getOutputFileName("fasta", variables);
 250                 outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile);
 251                 if (countfile == "") { outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile); }
 252                 else { outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile); }
 253
 254                 if (bygroup) {
 255                         //clear out old files
 256                         ofstream outFasta; m->openOutputFile(newFastaFile, outFasta); outFasta.close();
 257                         ofstream outNames; m->openOutputFile(newNamesFile, outNames);  outNames.close();
 258                         newMapFile = fileroot + "precluster.";
 259
 260                         //parse fasta and name file by group
 261             vector<string> groups;
 262                         if (countfile != "") {
 263                 cparser = new SequenceCountParser(countfile, fastafile);
 264                 groups = cparser->getNamesOfGroups();
 265             }else {
 266                 if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile);      }
 267                 else                            { parser = new SequenceParser(groupfile, fastafile);                    }
 268                 groups = parser->getNamesOfGroups();
 269                         }
 270
 271                         if(processors == 1)     {       driverGroups(newFastaFile, newNamesFile, newMapFile, 0, groups.size(), groups); }
 272                         else                            {       createProcessesGroups(newFastaFile, newNamesFile, newMapFile, groups);                  }
 273
 274                         if (countfile != "") {
 275                 mergeGroupCounts(newCountFile, newNamesFile, newFastaFile);
 276                 delete cparser;
 277             }else {
 278                 delete parser;
 279                 //run unique.seqs for deconvolute results
 280                 string inputString = "fasta=" + newFastaFile;
 281                 if (namefile != "") { inputString += ", name=" + newNamesFile; }
 282                 m->mothurOutEndLine();
 283                 m->mothurOut("/******************************************/"); m->mothurOutEndLine();
 284                 m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine();
 285                 m->mothurCalling = true;
 286
 287                 Command* uniqueCommand = new DeconvoluteCommand(inputString);
 288                 uniqueCommand->execute();
 289
 290                 map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
 291
 292                 delete uniqueCommand;
 293                 m->mothurCalling = false;
 294                 m->mothurOut("/******************************************/"); m->mothurOutEndLine();
 295
 296                 m->renameFile(filenames["fasta"][0], newFastaFile);
 297                 m->renameFile(filenames["name"][0], newNamesFile);
 298                         }
 299             if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {    m->mothurRemove(outputNames[i]);        }        return 0; }
 300                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run pre.cluster."); m->mothurOutEndLine();
 301
 302                 }else {
 303             if (processors != 1) { m->mothurOut("When using running without group information mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
 304                         if (namefile != "") { readNameFile(); }
 305
 306                         //reads fasta file and return number of seqs
 307                         int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
 308
 309                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
 310
 311                         if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0;  }
 312                         if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0;  }
 313
 314                         int count = process(newMapFile);
 315                         outputNames.push_back(newMapFile); outputTypes["map"].push_back(newMapFile);
 316
 317                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
 318
 319                         m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
 320                         m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
 321                         if (countfile != "") { newNamesFile = newCountFile; }
 322             printData(newFastaFile, newNamesFile, "");
 323
 324                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
 325                 }
 326
 327                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
 328
 329                 m->mothurOutEndLine();
 330                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
 331                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
 332                 m->mothurOutEndLine();
 333
 334                 //set fasta file as new current fastafile
 335                 string current = "";
 336                 itTypes = outputTypes.find("fasta");
 337                 if (itTypes != outputTypes.end()) {
 338                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
 339                 }
 340
 341                 itTypes = outputTypes.find("name");
 342                 if (itTypes != outputTypes.end()) {
 343                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
 344                 }
 345
 346         itTypes = outputTypes.find("count");
 347                 if (itTypes != outputTypes.end()) {
 348                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
 349                 }
 350
 351                 return 0;
 352
 353         }
 354         catch(exception& e) {
 355                 m->errorOut(e, "PreClusterCommand", "execute");
 356                 exit(1);
 357         }
 358 }
 359 /**************************************************************************************************/
 360 int PreClusterCommand::createProcessesGroups(string newFName, string newNName, string newMFile, vector<string> groups) {
 361         try {
 362
 363                 vector<int> processIDS;
 364                 int process = 1;
 365                 int num = 0;
 366                 bool recalc = false;
 367
 368                 //sanity check
 369                 if (groups.size() < processors) { processors = groups.size(); }
 370
 371                 //divide the groups between the processors
 372                 vector<linePair> lines;
 373                 int remainingPairs = groups.size();
 374         int startIndex = 0;
 375         for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
 376             int numPairs = remainingPairs; //case for last processor
 377             if (remainingProcessors != 1) { numPairs = ceil(remainingPairs / remainingProcessors); }
 378             lines.push_back(linePair(startIndex, (startIndex+numPairs))); //startIndex, endIndex
 379             startIndex = startIndex + numPairs;
 380             remainingPairs = remainingPairs - numPairs;
 381         }
 382
 383 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
 384
 385                 //loop through and create all the processes you want
 386                 while (process != processors) {
 387                         pid_t pid = fork();
 388
 389                         if (pid > 0) {
 390                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 391                                 process++;
 392                         }else if (pid == 0){
 393                 outputNames.clear();
 394                                 num = driverGroups(newFName + m->mothurGetpid(process) + ".temp", newNName + m->mothurGetpid(process) + ".temp", newMFile, lines[process].start, lines[process].end, groups);
 395
 396                 string tempFile = m->mothurGetpid(process) + ".outputNames.temp";
 397                 ofstream outTemp;
 398                 m->openOutputFile(tempFile, outTemp);
 399
 400                 outTemp << outputNames.size();
 401                 for (int i = 0; i < outputNames.size(); i++) { outTemp << outputNames[i] << endl; }
 402                 outTemp.close();
 403
 404                                 exit(0);
 405                         }else {
 406                 m->mothurOut("[ERROR]: unable to spawn the number of processes you requested, reducing number to " + toString(process) + "\n"); processors = process;
 407                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
 408                 recalc = true;
 409                                 break;
 410                         }
 411                 }
 412
 413         if (recalc) {
 414             lines.clear();
 415             num = 0;
 416             processIDS.resize(0);
 417             process = 1;
 418
 419             int remainingPairs = groups.size();
 420             int startIndex = 0;
 421             for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
 422                 int numPairs = remainingPairs; //case for last processor
 423                 if (remainingProcessors != 1) { numPairs = ceil(remainingPairs / remainingProcessors); }
 424                 lines.push_back(linePair(startIndex, (startIndex+numPairs))); //startIndex, endIndex
 425                 startIndex = startIndex + numPairs;
 426                 remainingPairs = remainingPairs - numPairs;
 427             }
 428
 429             while (process != processors) {
 430                 pid_t pid = fork();
 431
 432                 if (pid > 0) {
 433                     processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
 434                     process++;
 435                 }else if (pid == 0){
 436                     outputNames.clear();
 437                     num = driverGroups(newFName + m->mothurGetpid(process) + ".temp", newNName + m->mothurGetpid(process) + ".temp", newMFile, lines[process].start, lines[process].end, groups);
 438
 439                     string tempFile = m->mothurGetpid(process) + ".outputNames.temp";
 440                     ofstream outTemp;
 441                     m->openOutputFile(tempFile, outTemp);
 442
 443                     outTemp << outputNames.size();
 444                     for (int i = 0; i < outputNames.size(); i++) { outTemp << outputNames[i] << endl; }
 445                     outTemp.close();
 446
 447                     exit(0);
 448                 }else {
 449                     m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
 450                     for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
 451                     exit(0);
 452                 }
 453             }
 454         }
 455
 456
 457                 //do my part
 458                 num = driverGroups(newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
 459
 460                 //force parent to wait until all the processes are done
 461                 for (int i=0;i<processIDS.size();i++) {
 462                         int temp = processIDS[i];
 463                         wait(&temp);
 464                 }
 465
 466         for (int i = 0; i < processIDS.size(); i++) {
 467             string tempFile = toString(processIDS[i]) +  ".outputNames.temp";
 468             ifstream intemp;
 469             m->openInputFile(tempFile, intemp);
 470
 471             int num;
 472             intemp >> num;
 473             for (int k = 0; k < num; k++) {
 474                 string name = "";
 475                 intemp >> name; m->gobble(intemp);
 476
 477                 outputNames.push_back(name); outputTypes["map"].push_back(name);
 478             }
 479             intemp.close();
 480             m->mothurRemove(tempFile);
 481         }
 482 #else
 483
 484                 //////////////////////////////////////////////////////////////////////////////////////////////////////
 485                 //Windows version shared memory, so be careful when passing variables through the preClusterData struct.
 486                 //Above fork() will clone, so memory is separate, but that's not the case with windows,
 487                 //////////////////////////////////////////////////////////////////////////////////////////////////////
 488
 489                 vector<preClusterData*> pDataArray;
 490                 DWORD   dwThreadIdArray[processors-1];
 491                 HANDLE  hThreadArray[processors-1];
 492
 493                 //Create processor worker threads.
 494                 for( int i=1; i<processors; i++ ){
 495                         // Allocate memory for thread data.
 496                         string extension = toString(i) + ".temp";
 497
 498                         preClusterData* tempPreCluster = new preClusterData(fastafile, namefile, groupfile, countfile, (newFName+extension), (newNName+extension), newMFile, groups, m, lines[i].start, lines[i].end, diffs, topdown, i);
 499                         pDataArray.push_back(tempPreCluster);
 500                         processIDS.push_back(i);
 501
 502                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
 503                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
 504                         hThreadArray[i-1] = CreateThread(NULL, 0, MyPreclusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);
 505                 }
 506
 507
 508                 //using the main process as a worker saves time and memory
 509                 num = driverGroups(newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
 510
 511                 //Wait until all threads have terminated.
 512                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
 513
 514                 //Close all thread handles and free memory allocations.
 515                 for(int i=0; i < pDataArray.size(); i++){
 516             if (pDataArray[i]->count != (pDataArray[i]->end-pDataArray[i]->start)) {
 517                 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end-pDataArray[i]->start) + " groups assigned to it, quitting. \n"); m->control_pressed = true;
 518             }
 519                         for (int j = 0; j < pDataArray[i]->mapFileNames.size(); j++) {
 520                                 outputNames.push_back(pDataArray[i]->mapFileNames[j]); outputTypes["map"].push_back(pDataArray[i]->mapFileNames[j]);
 521                         }
 522                         CloseHandle(hThreadArray[i]);
 523                         delete pDataArray[i];
 524                 }
 525
 526 #endif
 527
 528                 //append output files
 529                 for(int i=0;i<processIDS.size();i++){
 530                         //newFName = m->getFullPathName(".\\" + newFName);
 531                         //newNName = m->getFullPathName(".\\" + newNName);
 532
 533                         m->appendFiles((newFName + toString(processIDS[i]) + ".temp"), newFName);
 534                         m->mothurRemove((newFName + toString(processIDS[i]) + ".temp"));
 535
 536                         m->appendFiles((newNName + toString(processIDS[i]) + ".temp"), newNName);
 537                         m->mothurRemove((newNName + toString(processIDS[i]) + ".temp"));
 538                 }
 539
 540                 return num;
 541
 542         }
 543         catch(exception& e) {
 544                 m->errorOut(e, "PreClusterCommand", "createProcessesGroups");
 545                 exit(1);
 546         }
 547 }
 548 /**************************************************************************************************/
 549 int PreClusterCommand::driverGroups(string newFFile, string newNFile, string newMFile, int start, int end, vector<string> groups){
 550         try {
 551
 552                 int numSeqs = 0;
 553
 554                 //precluster each group
 555                 for (int i = start; i < end; i++) {
 556
 557                         start = time(NULL);
 558
 559                         if (m->control_pressed) {  return 0; }
 560
 561                         m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine();
 562
 563                         map<string, string> thisNameMap;
 564             vector<Sequence> thisSeqs;
 565                         if (groupfile != "") {
 566                 thisSeqs = parser->getSeqs(groups[i]);
 567             }else if (countfile != "") {
 568                 thisSeqs = cparser->getSeqs(groups[i]);
 569             }
 570                         if (namefile != "") {  thisNameMap = parser->getNameMap(groups[i]); }
 571
 572                         //fill alignSeqs with this groups info.
 573                         numSeqs = loadSeqs(thisNameMap, thisSeqs, groups[i]);
 574
 575                         if (m->control_pressed) {   return 0; }
 576
 577                         if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); m->control_pressed = true; return 0;  }
 578
 579                         int count= process(newMFile+groups[i]+".map");
 580                         outputNames.push_back(newMFile+groups[i]+".map"); outputTypes["map"].push_back(newMFile+groups[i]+".map");
 581
 582                         if (m->control_pressed) {  return 0; }
 583
 584                         m->mothurOut("Total number of sequences before pre.cluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
 585                         m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
 586                         printData(newFFile, newNFile, groups[i]);
 587
 588                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine();
 589
 590                 }
 591
 592                 return numSeqs;
 593         }
 594         catch(exception& e) {
 595                 m->errorOut(e, "PreClusterCommand", "driverGroups");
 596                 exit(1);
 597         }
 598 }
 599 /**************************************************************************************************/
 600 int PreClusterCommand::process(string newMapFile){
 601         try {
 602                 ofstream out;
 603                 m->openOutputFile(newMapFile, out);
 604
 605                 //sort seqs by number of identical seqs
 606         if (topdown) { sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityTopDown);  }
 607         else {  sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityDownTop);  }
 608
 609                 int count = 0;
 610                 int numSeqs = alignSeqs.size();
 611
 612         if (topdown) {
 613             //think about running through twice...
 614             for (int i = 0; i < numSeqs; i++) {
 615
 616                 if (alignSeqs[i].active) {  //this sequence has not been merged yet
 617
 618                     string chunk = alignSeqs[i].seq.getName() + "\t" + toString(alignSeqs[i].numIdentical) + "\t" + toString(0) + "\t" + alignSeqs[i].seq.getAligned() + "\n";
 619
 620                     //try to merge it with all smaller seqs
 621                     for (int j = i+1; j < numSeqs; j++) {
 622
 623                         if (m->control_pressed) { out.close(); return 0; }
 624
 625                         if (alignSeqs[j].active) {  //this sequence has not been merged yet
 626                             //are you within "diff" bases
 627                             int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
 628
 629                             if (mismatch <= diffs) {
 630                                 //merge
 631                                 alignSeqs[i].names += ',' + alignSeqs[j].names;
 632                                 alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
 633
 634                                 chunk += alignSeqs[j].seq.getName() + "\t" + toString(alignSeqs[j].numIdentical) + "\t" + toString(mismatch) + "\t" + alignSeqs[j].seq.getAligned() + "\n";
 635
 636                                 alignSeqs[j].active = 0;
 637                                 alignSeqs[j].numIdentical = 0;
 638                                 count++;
 639                             }
 640                         }//end if j active
 641                     }//end for loop j
 642
 643                     //remove from active list
 644                     alignSeqs[i].active = 0;
 645
 646                     out << "ideal_seq_" << (i+1) << '\t' << alignSeqs[i].numIdentical << endl << chunk << endl;;
 647
 648                 }//end if active i
 649                 if(i % 100 == 0)        { m->mothurOutJustToScreen(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)+"\n");       }
 650             }
 651         }else {
 652             map<int, string> mapFile;
 653             map<int, int> originalCount;
 654             map<int, int>::iterator itCount;
 655             for (int i = 0; i < numSeqs; i++) { mapFile[i] = ""; originalCount[i] = alignSeqs[i].numIdentical; }
 656
 657             //think about running through twice...
 658             for (int i = 0; i < numSeqs; i++) {
 659
 660                 //try to merge it into larger seqs
 661                 for (int j = i+1; j < numSeqs; j++) {
 662
 663                     if (m->control_pressed) { out.close(); return 0; }
 664
 665                     if (originalCount[j] > originalCount[i]) {  //this sequence is more abundant than I am
 666                         //are you within "diff" bases
 667                         int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
 668
 669                         if (mismatch <= diffs) {
 670                             //merge
 671                             alignSeqs[j].names += ',' + alignSeqs[i].names;
 672                             alignSeqs[j].numIdentical += alignSeqs[i].numIdentical;
 673
 674                             mapFile[j] = alignSeqs[i].seq.getName() + "\t" + toString(alignSeqs[i].numIdentical) + "\t" + toString(mismatch) + "\t" + alignSeqs[i].seq.getAligned() + "\n" + mapFile[i];
 675                             alignSeqs[i].numIdentical = 0;
 676                             originalCount.erase(i);
 677                             mapFile[i] = "";
 678                             count++;
 679                             j+=numSeqs; //exit search, we merged this one in.
 680                         }
 681                     }//end abundance check
 682                 }//end for loop j
 683
 684                 if(i % 100 == 0)        { m->mothurOutJustToScreen(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)+"\n");       }
 685             }
 686
 687             for (int i = 0; i < numSeqs; i++) {
 688                 if (alignSeqs[i].numIdentical != 0) {
 689                     out << "ideal_seq_" << (i+1) << '\t' << alignSeqs[i].numIdentical << endl  << alignSeqs[i].seq.getName() + "\t" + toString(alignSeqs[i].numIdentical) + "\t" + toString(0) + "\t" + alignSeqs[i].seq.getAligned() + "\n" << mapFile[i] << endl;
 690                 }
 691             }
 692
 693         }
 694                 out.close();
 695
 696                 if(numSeqs % 100 != 0)  { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine();   }
 697
 698                 return count;
 699
 700         }
 701         catch(exception& e) {
 702                 m->errorOut(e, "PreClusterCommand", "process");
 703                 exit(1);
 704         }
 705 }
 706 /**************************************************************************************************/
 707 int PreClusterCommand::readFASTA(){
 708         try {
 709                 //ifstream inNames;
 710                 ifstream inFasta;
 711
 712                 m->openInputFile(fastafile, inFasta);
 713                 set<int> lengths;
 714
 715                 while (!inFasta.eof()) {
 716
 717                         if (m->control_pressed) { inFasta.close(); return 0; }
 718
 719                         Sequence seq(inFasta);  m->gobble(inFasta);
 720
 721                         if (seq.getName() != "") {  //can get "" if commented line is at end of fasta file
 722                                 if (namefile != "") {
 723                                         itSize = sizes.find(seq.getName());
 724
 725                                         if (itSize == sizes.end()) { m->mothurOut(seq.getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); exit(1); }
 726                                         else{
 727                                                 seqPNode tempNode(itSize->second, seq, names[seq.getName()]);
 728                                                 alignSeqs.push_back(tempNode);
 729                                                 lengths.insert(seq.getAligned().length());
 730                                         }
 731                                 }else { //no names file, you are identical to yourself
 732                     int numRep = 1;
 733                     if (countfile != "") { numRep = ct.getNumSeqs(seq.getName()); }
 734                                         seqPNode tempNode(numRep, seq, seq.getName());
 735                                         alignSeqs.push_back(tempNode);
 736                                         lengths.insert(seq.getAligned().length());
 737                                 }
 738                         }
 739                 }
 740                 inFasta.close();
 741
 742         if (lengths.size() > 1) { m->control_pressed = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
 743         else if (lengths.size() == 1) { length = *(lengths.begin()); }
 744
 745                 return alignSeqs.size();
 746         }
 747
 748         catch(exception& e) {
 749                 m->errorOut(e, "PreClusterCommand", "readFASTA");
 750                 exit(1);
 751         }
 752 }
 753 /**************************************************************************************************/
 754 int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>& thisSeqs, string group){
 755         try {
 756                 set<int> lengths;
 757                 alignSeqs.clear();
 758                 map<string, string>::iterator it;
 759                 bool error = false;
 760         map<string, int> thisCount;
 761         if (countfile != "") { thisCount = cparser->getCountTable(group);  }
 762
 763                 for (int i = 0; i < thisSeqs.size(); i++) {
 764
 765                         if (m->control_pressed) { return 0; }
 766
 767                         if (namefile != "") {
 768                                 it = thisName.find(thisSeqs[i].getName());
 769
 770                                 //should never be true since parser checks for this
 771                                 if (it == thisName.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); error = true; }
 772                                 else{
 773                                         //get number of reps
 774                                         int numReps = 1;
 775                                         for(int j=0;j<(it->second).length();j++){
 776                                                 if((it->second)[j] == ','){     numReps++;      }
 777                                         }
 778
 779                                         seqPNode tempNode(numReps, thisSeqs[i], it->second);
 780                                         alignSeqs.push_back(tempNode);
 781                     lengths.insert(thisSeqs[i].getAligned().length());
 782                                 }
 783                         }else { //no names file, you are identical to yourself
 784                 int numRep = 1;
 785                 if (countfile != "") {
 786                     map<string, int>::iterator it2 = thisCount.find(thisSeqs[i].getName());
 787
 788                     //should never be true since parser checks for this
 789                     if (it2 == thisCount.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); error = true; }
 790                     else { numRep = it2->second;  }
 791                 }
 792                                 seqPNode tempNode(numRep, thisSeqs[i], thisSeqs[i].getName());
 793                                 alignSeqs.push_back(tempNode);
 794                                 lengths.insert(thisSeqs[i].getAligned().length());
 795                         }
 796                 }
 797
 798         if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
 799         else if (lengths.size() == 1) { length = *(lengths.begin()); }
 800
 801                 //sanity check
 802                 if (error) { m->control_pressed = true; }
 803
 804                 thisSeqs.clear();
 805
 806                 return alignSeqs.size();
 807         }
 808
 809         catch(exception& e) {
 810                 m->errorOut(e, "PreClusterCommand", "loadSeqs");
 811                 exit(1);
 812         }
 813 }
 814
 815 /**************************************************************************************************/
 816
 817 int PreClusterCommand::calcMisMatches(string seq1, string seq2){
 818         try {
 819                 int numBad = 0;
 820
 821                 for (int i = 0; i < seq1.length(); i++) {
 822                         //do they match
 823                         if (seq1[i] != seq2[i]) { numBad++; }
 824                         if (numBad > diffs) { return length;  } //to far to cluster
 825                 }
 826
 827                 return numBad;
 828         }
 829         catch(exception& e) {
 830                 m->errorOut(e, "PreClusterCommand", "calcMisMatches");
 831                 exit(1);
 832         }
 833 }
 834 /**************************************************************************************************/
 835
 836 int PreClusterCommand::mergeGroupCounts(string newcount, string newname, string newfasta){
 837         try {
 838                 ifstream inNames;
 839         m->openInputFile(newname, inNames);
 840
 841         string group, first, second;
 842         set<string> uniqueNames;
 843         while (!inNames.eof()) {
 844             if (m->control_pressed) { break; }
 845             inNames >> group; m->gobble(inNames);
 846             inNames >> first; m->gobble(inNames);
 847             inNames >> second; m->gobble(inNames);
 848
 849             vector<string> names;
 850             m->splitAtComma(second, names);
 851
 852             uniqueNames.insert(first);
 853
 854             int total = ct.getGroupCount(first, group);
 855             for (int i = 1; i < names.size(); i++) {
 856                 total += ct.getGroupCount(names[i], group);
 857                 ct.setAbund(names[i], group, 0);
 858             }
 859             ct.setAbund(first, group, total);
 860         }
 861         inNames.close();
 862
 863         vector<string> namesOfSeqs = ct.getNamesOfSeqs();
 864         for (int i = 0; i < namesOfSeqs.size(); i++) {
 865             if (ct.getNumSeqs(namesOfSeqs[i]) == 0) {
 866                 ct.remove(namesOfSeqs[i]);
 867             }
 868         }
 869
 870         ct.printTable(newcount);
 871         m->mothurRemove(newname);
 872
 873         if (bygroup) { //if by group, must remove the duplicate seqs that are named the same
 874             ifstream in;
 875             m->openInputFile(newfasta, in);
 876
 877             ofstream out;
 878             m->openOutputFile(newfasta+"temp", out);
 879
 880             int count = 0;
 881             set<string> already;
 882             while(!in.eof()) {
 883                 if (m->control_pressed) { break; }
 884
 885                 Sequence seq(in); m->gobble(in);
 886
 887                 if (seq.getName() != "") {
 888                     count++;
 889                     if (already.count(seq.getName()) == 0) {
 890                         seq.printSequence(out);
 891                         already.insert(seq.getName());
 892                     }
 893                 }
 894             }
 895             in.close();
 896             out.close();
 897             m->mothurRemove(newfasta);
 898             m->renameFile(newfasta+"temp", newfasta);
 899         }
 900                         return 0;
 901
 902         }
 903         catch(exception& e) {
 904                 m->errorOut(e, "PreClusterCommand", "mergeGroupCounts");
 905                 exit(1);
 906         }
 907 }
 908
 909 /**************************************************************************************************/
 910
 911 void PreClusterCommand::printData(string newfasta, string newname, string group){
 912         try {
 913                 ofstream outFasta;
 914                 ofstream outNames;
 915
 916                 if (bygroup) {
 917                         m->openOutputFileAppend(newfasta, outFasta);
 918                         m->openOutputFileAppend(newname, outNames);
 919                 }else {
 920                         m->openOutputFile(newfasta, outFasta);
 921                         m->openOutputFile(newname, outNames);
 922                 }
 923
 924         if ((countfile != "") && (group == ""))  { outNames << "Representative_Sequence\ttotal\n";  }
 925                 for (int i = 0; i < alignSeqs.size(); i++) {
 926                         if (alignSeqs[i].numIdentical != 0) {
 927                                 alignSeqs[i].seq.printSequence(outFasta);
 928                                 if (countfile != "") {
 929                     if (group != "") {  outNames << group << '\t' << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; }
 930                     else {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].numIdentical << endl;  }
 931                 }else {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;  }
 932                         }
 933                 }
 934
 935                 outFasta.close();
 936                 outNames.close();
 937
 938         }
 939         catch(exception& e) {
 940                 m->errorOut(e, "PreClusterCommand", "printData");
 941                 exit(1);
 942         }
 943 }
 944 /**************************************************************************************************/
 945
 946 void PreClusterCommand::readNameFile(){
 947         try {
 948                 ifstream in;
 949                 m->openInputFile(namefile, in);
 950                 string firstCol, secondCol;
 951
 952                 while (!in.eof()) {
 953                         in >> firstCol >> secondCol; m->gobble(in);
 954
 955             m->checkName(firstCol);
 956             m->checkName(secondCol);
 957             int size = m->getNumNames(secondCol);
 958
 959                         names[firstCol] = secondCol;
 960             sizes[firstCol] = size;
 961                 }
 962                 in.close();
 963         }
 964         catch(exception& e) {
 965                 m->errorOut(e, "PreClusterCommand", "readNameFile");
 966                 exit(1);
 967         }
 968 }
 969
 970 /**************************************************************************************************/
 971
 972