]> git.donarmstrong.com Git - mothur.git/blob - preclustercommand.cpp
pre.cluster unable to spawn necessary processes adjustment.
[mothur.git] / preclustercommand.cpp
1 /*
2  *  preclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/21/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "preclustercommand.h"
11 #include "deconvolutecommand.h"
12
13 //**********************************************************************************************************************
14 vector<string> PreClusterCommand::setParameters(){      
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta-name",false,true,true); parameters.push_back(pfasta);
17                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
18         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
19                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
20                 CommandParameter pdiffs("diffs", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pdiffs);
21                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
22         CommandParameter ptopdown("topdown", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(ptopdown);
23
24                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "PreClusterCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string PreClusterCommand::getHelpString(){      
38         try {
39                 string helpString = "";
40                 helpString += "The pre.cluster command groups sequences that are within a given number of base mismatches.\n";
41                 helpString += "The pre.cluster command outputs a new fasta and name file.\n";
42                 helpString += "The pre.cluster command parameters are fasta, name, group, count, topdown, processors and diffs. The fasta parameter is required. \n";
43                 helpString += "The name parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
44                 helpString += "The group parameter allows you to provide a group file so you can cluster by group. \n";
45         helpString += "The count parameter allows you to provide a count file so you can cluster by group. \n";
46                 helpString += "The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n";
47         helpString += "The topdown parameter allows you to specify whether to cluster from largest abundance to smallest or smallest to largest.  Default=T, meaning largest to smallest.\n";
48                 helpString += "The pre.cluster command should be in the following format: \n";
49                 helpString += "pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n";
50                 helpString += "Example pre.cluster(fasta=amazon.fasta, diffs=2).\n";
51                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
52                 return helpString;
53         }
54         catch(exception& e) {
55                 m->errorOut(e, "PreClusterCommand", "getHelpString");
56                 exit(1);
57         }
58 }
59 //**********************************************************************************************************************
60 string PreClusterCommand::getOutputPattern(string type) {
61     try {
62         string pattern = "";
63         
64         if (type == "fasta") {  pattern = "[filename],precluster,[extension]"; } 
65         else if (type == "name") {  pattern = "[filename],precluster.names"; } 
66         else if (type == "count") {  pattern = "[filename],precluster.count_table"; }
67         else if (type == "map") {  pattern =  "[filename],precluster.map"; }
68         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
69         
70         return pattern;
71     }
72     catch(exception& e) {
73         m->errorOut(e, "PreClusterCommand", "getOutputPattern");
74         exit(1);
75     }
76 }
77 //**********************************************************************************************************************
78 PreClusterCommand::PreClusterCommand(){ 
79         try {
80                 abort = true; calledHelp = true; 
81                 setParameters();
82                 vector<string> tempOutNames;
83                 outputTypes["fasta"] = tempOutNames;
84                 outputTypes["name"] = tempOutNames;
85         outputTypes["count"] = tempOutNames;
86                 outputTypes["map"] = tempOutNames;
87         }
88         catch(exception& e) {
89                 m->errorOut(e, "PreClusterCommand", "PreClusterCommand");
90                 exit(1);
91         }
92 }
93 //**********************************************************************************************************************
94
95 PreClusterCommand::PreClusterCommand(string option) {
96         try {
97                 abort = false; calledHelp = false;   
98                 
99                 //allow user to run help
100                 if(option == "help") { help(); abort = true; calledHelp = true; }
101                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
102                 
103                 else {
104                         vector<string> myArray = setParameters();
105                         
106                         OptionParser parser(option);
107                         map<string, string> parameters = parser.getParameters();
108                         
109                         ValidParameters validParameter;
110                         map<string, string>::iterator it;
111                 
112                         //check to make sure all parameters are valid for command
113                         for (map<string, string>::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
114                                 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
115                         }
116                         
117                         //initialize outputTypes
118                         vector<string> tempOutNames;
119                         outputTypes["fasta"] = tempOutNames;
120                         outputTypes["name"] = tempOutNames;
121                         outputTypes["map"] = tempOutNames;
122             outputTypes["count"] = tempOutNames;
123                 
124                         //if the user changes the input directory command factory will send this info to us in the output parameter 
125                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
126                         if (inputDir == "not found"){   inputDir = "";          }
127                         else {
128                                 string path;
129                                 it = parameters.find("fasta");
130                                 //user has given a template file
131                                 if(it != parameters.end()){ 
132                                         path = m->hasPath(it->second);
133                                         //if the user has not given a path then, add inputdir. else leave path alone.
134                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
135                                 }
136                                 
137                                 it = parameters.find("name");
138                                 //user has given a template file
139                                 if(it != parameters.end()){ 
140                                         path = m->hasPath(it->second);
141                                         //if the user has not given a path then, add inputdir. else leave path alone.
142                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
143                                 }
144                                 
145                                 it = parameters.find("group");
146                                 //user has given a template file
147                                 if(it != parameters.end()){ 
148                                         path = m->hasPath(it->second);
149                                         //if the user has not given a path then, add inputdir. else leave path alone.
150                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
151                                 }
152                 
153                 it = parameters.find("count");
154                                 //user has given a template file
155                                 if(it != parameters.end()){ 
156                                         path = m->hasPath(it->second);
157                                         //if the user has not given a path then, add inputdir. else leave path alone.
158                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
159                                 }
160                         }
161
162                         //check for required parameters
163                         fastafile = validParameter.validFile(parameters, "fasta", true);
164                         if (fastafile == "not found") {                                 
165                                 fastafile = m->getFastaFile(); 
166                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
167                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
168                         }
169                         else if (fastafile == "not open") { abort = true; }     
170                         else { m->setFastaFile(fastafile); }
171                         
172                         //if the user changes the output directory command factory will send this info to us in the output parameter 
173                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
174                                 outputDir = ""; 
175                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
176                         }
177
178                         //check for optional parameter and set defaults
179                         // ...at some point should added some additional type checking...
180                         namefile = validParameter.validFile(parameters, "name", true);
181                         if (namefile == "not found") { namefile =  "";  }
182                         else if (namefile == "not open") { namefile = ""; abort = true; }       
183                         else {  m->setNameFile(namefile); }
184                         
185                         groupfile = validParameter.validFile(parameters, "group", true);
186                         if (groupfile == "not found") { groupfile =  "";  bygroup = false; }
187                         else if (groupfile == "not open") { abort = true; groupfile =  ""; }    
188                         else {   m->setGroupFile(groupfile); bygroup = true;  }
189             
190             countfile = validParameter.validFile(parameters, "count", true);
191                         if (countfile == "not found") { countfile =  "";   }
192                         else if (countfile == "not open") { abort = true; countfile =  ""; }    
193                         else {   
194                 m->setCountTableFile(countfile); 
195                 ct.readTable(countfile, true, false);
196                 if (ct.hasGroupInfo()) { bygroup = true; }
197                 else { bygroup = false;  }
198             }
199             
200             if ((namefile != "") && (countfile != "")) {
201                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
202             }
203             
204             if ((groupfile != "") && (countfile != "")) {
205                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
206             }
207
208                         
209                         string temp     = validParameter.validFile(parameters, "diffs", false);         if(temp == "not found"){        temp = "1"; }
210                         m->mothurConvert(temp, diffs); 
211                         
212                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
213                         m->setProcessors(temp);
214                         m->mothurConvert(temp, processors);
215                         
216             temp = validParameter.validFile(parameters, "topdown", false);              if(temp == "not found"){  temp = "T"; }
217                         topdown = m->isTrue(temp);
218             
219             if (countfile == "") {
220                 if (namefile == "") {
221                     vector<string> files; files.push_back(fastafile);
222                     parser.getNameFile(files);
223                 }
224             }
225                 }
226                                 
227         }
228         catch(exception& e) {
229                 m->errorOut(e, "PreClusterCommand", "PreClusterCommand");
230                 exit(1);
231         }
232 }
233 //**********************************************************************************************************************
234
235 int PreClusterCommand::execute(){
236         try {
237                 
238                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
239                 
240                 int start = time(NULL);
241                 
242                 string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
243         map<string, string> variables; 
244         variables["[filename]"] = fileroot;
245                 string newNamesFile = getOutputFileName("name",variables);
246         string newCountFile = getOutputFileName("count",variables);
247                 string newMapFile = getOutputFileName("map",variables); //add group name if by group
248         variables["[extension]"] = m->getExtension(fastafile);
249                 string newFastaFile = getOutputFileName("fasta", variables);
250                 outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile);
251                 if (countfile == "") { outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile); }
252                 else { outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile); }
253                 
254                 if (bygroup) {
255                         //clear out old files
256                         ofstream outFasta; m->openOutputFile(newFastaFile, outFasta); outFasta.close();
257                         ofstream outNames; m->openOutputFile(newNamesFile, outNames);  outNames.close();
258                         newMapFile = fileroot + "precluster.";
259                         
260                         //parse fasta and name file by group
261             vector<string> groups;
262                         if (countfile != "") {
263                 cparser = new SequenceCountParser(countfile, fastafile);
264                 groups = cparser->getNamesOfGroups();
265             }else {
266                 if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile);      }
267                 else                            { parser = new SequenceParser(groupfile, fastafile);                    }
268                 groups = parser->getNamesOfGroups();
269                         }
270             
271                         if(processors == 1)     {       driverGroups(newFastaFile, newNamesFile, newMapFile, 0, groups.size(), groups); }
272                         else                            {       createProcessesGroups(newFastaFile, newNamesFile, newMapFile, groups);                  }
273                         
274                         if (countfile != "") { 
275                 mergeGroupCounts(newCountFile, newNamesFile, newFastaFile);
276                 delete cparser; 
277             }else {  
278                 delete parser; 
279                 //run unique.seqs for deconvolute results
280                 string inputString = "fasta=" + newFastaFile;
281                 if (namefile != "") { inputString += ", name=" + newNamesFile; }
282                 m->mothurOutEndLine(); 
283                 m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
284                 m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); 
285                 m->mothurCalling = true;
286                 
287                 Command* uniqueCommand = new DeconvoluteCommand(inputString);
288                 uniqueCommand->execute();
289                 
290                 map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
291                 
292                 delete uniqueCommand;
293                 m->mothurCalling = false;
294                 m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
295                 
296                 m->renameFile(filenames["fasta"][0], newFastaFile);
297                 m->renameFile(filenames["name"][0], newNamesFile); 
298                         }
299             if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {    m->mothurRemove(outputNames[i]);        }        return 0; }
300                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run pre.cluster."); m->mothurOutEndLine(); 
301                                 
302                 }else {
303             if (processors != 1) { m->mothurOut("When using running without group information mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
304                         if (namefile != "") { readNameFile(); }
305                 
306                         //reads fasta file and return number of seqs
307                         int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
308                 
309                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
310         
311                         if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0;  }
312                         if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0;  }
313                         
314                         int count = process(newMapFile);
315                         outputNames.push_back(newMapFile); outputTypes["map"].push_back(newMapFile);
316                         
317                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }   
318                         
319                         m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
320                         m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
321                         if (countfile != "") { newNamesFile = newCountFile; }
322             printData(newFastaFile, newNamesFile, "");
323                                 
324                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
325                 }
326                                 
327                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
328                 
329                 m->mothurOutEndLine();
330                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
331                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }               
332                 m->mothurOutEndLine();
333                 
334                 //set fasta file as new current fastafile
335                 string current = "";
336                 itTypes = outputTypes.find("fasta");
337                 if (itTypes != outputTypes.end()) {
338                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
339                 }
340                 
341                 itTypes = outputTypes.find("name");
342                 if (itTypes != outputTypes.end()) {
343                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
344                 }
345         
346         itTypes = outputTypes.find("count");
347                 if (itTypes != outputTypes.end()) {
348                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
349                 }
350                 
351                 return 0;
352                 
353         }
354         catch(exception& e) {
355                 m->errorOut(e, "PreClusterCommand", "execute");
356                 exit(1);
357         }
358 }
359 /**************************************************************************************************/
360 int PreClusterCommand::createProcessesGroups(string newFName, string newNName, string newMFile, vector<string> groups) {
361         try {
362                 
363                 vector<int> processIDS;
364                 int process = 1;
365                 int num = 0;
366                 bool recalc = false;
367         
368                 //sanity check
369                 if (groups.size() < processors) { processors = groups.size(); }
370                 
371                 //divide the groups between the processors
372                 vector<linePair> lines;
373                 int remainingPairs = groups.size();
374         int startIndex = 0;
375         for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
376             int numPairs = remainingPairs; //case for last processor
377             if (remainingProcessors != 1) { numPairs = ceil(remainingPairs / remainingProcessors); }
378             lines.push_back(linePair(startIndex, (startIndex+numPairs))); //startIndex, endIndex
379             startIndex = startIndex + numPairs;
380             remainingPairs = remainingPairs - numPairs;
381         }
382                 
383 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
384                 
385                 //loop through and create all the processes you want
386                 while (process != processors) {
387                         pid_t pid = fork();
388                         
389                         if (pid > 0) {
390                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
391                                 process++;
392                         }else if (pid == 0){
393                 outputNames.clear();
394                                 num = driverGroups(newFName + m->mothurGetpid(process) + ".temp", newNName + m->mothurGetpid(process) + ".temp", newMFile, lines[process].start, lines[process].end, groups);
395                 
396                 string tempFile = m->mothurGetpid(process) + ".outputNames.temp";
397                 ofstream outTemp;
398                 m->openOutputFile(tempFile, outTemp);
399                 
400                 outTemp << outputNames.size();
401                 for (int i = 0; i < outputNames.size(); i++) { outTemp << outputNames[i] << endl; }
402                 outTemp.close();
403                 
404                                 exit(0);
405                         }else {
406                 m->mothurOut("[ERROR]: unable to spawn the number of processes you requested, reducing number to " + toString(process) + "\n"); processors = process;
407                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
408                 recalc = true;
409                                 break;
410                         }
411                 }
412         
413         if (recalc) {
414             lines.clear();
415             num = 0;
416             processIDS.resize(0);
417             process = 1;
418             
419             int remainingPairs = groups.size();
420             int startIndex = 0;
421             for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
422                 int numPairs = remainingPairs; //case for last processor
423                 if (remainingProcessors != 1) { numPairs = ceil(remainingPairs / remainingProcessors); }
424                 lines.push_back(linePair(startIndex, (startIndex+numPairs))); //startIndex, endIndex
425                 startIndex = startIndex + numPairs;
426                 remainingPairs = remainingPairs - numPairs;
427             }
428             
429             while (process != processors) {
430                 pid_t pid = fork();
431                 
432                 if (pid > 0) {
433                     processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
434                     process++;
435                 }else if (pid == 0){
436                     outputNames.clear();
437                     num = driverGroups(newFName + m->mothurGetpid(process) + ".temp", newNName + m->mothurGetpid(process) + ".temp", newMFile, lines[process].start, lines[process].end, groups);
438                     
439                     string tempFile = m->mothurGetpid(process) + ".outputNames.temp";
440                     ofstream outTemp;
441                     m->openOutputFile(tempFile, outTemp);
442                     
443                     outTemp << outputNames.size();
444                     for (int i = 0; i < outputNames.size(); i++) { outTemp << outputNames[i] << endl; }
445                     outTemp.close();
446                     
447                     exit(0);
448                 }else {
449                     m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
450                     for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
451                     exit(0);
452                 }
453             }
454         }
455
456                 
457                 //do my part
458                 num = driverGroups(newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
459                 
460                 //force parent to wait until all the processes are done
461                 for (int i=0;i<processIDS.size();i++) { 
462                         int temp = processIDS[i];
463                         wait(&temp);
464                 }
465         
466         for (int i = 0; i < processIDS.size(); i++) {
467             string tempFile = toString(processIDS[i]) +  ".outputNames.temp";
468             ifstream intemp;
469             m->openInputFile(tempFile, intemp);
470             
471             int num;
472             intemp >> num;
473             for (int k = 0; k < num; k++) {
474                 string name = "";
475                 intemp >> name; m->gobble(intemp);
476                 
477                 outputNames.push_back(name); outputTypes["map"].push_back(name);
478             }
479             intemp.close();
480             m->mothurRemove(tempFile);
481         }
482 #else
483                 
484                 //////////////////////////////////////////////////////////////////////////////////////////////////////
485                 //Windows version shared memory, so be careful when passing variables through the preClusterData struct. 
486                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
487                 //////////////////////////////////////////////////////////////////////////////////////////////////////
488                 
489                 vector<preClusterData*> pDataArray; 
490                 DWORD   dwThreadIdArray[processors-1];
491                 HANDLE  hThreadArray[processors-1]; 
492                 
493                 //Create processor worker threads.
494                 for( int i=1; i<processors; i++ ){
495                         // Allocate memory for thread data.
496                         string extension = toString(i) + ".temp";
497                         
498                         preClusterData* tempPreCluster = new preClusterData(fastafile, namefile, groupfile, countfile, (newFName+extension), (newNName+extension), newMFile, groups, m, lines[i].start, lines[i].end, diffs, topdown, i);
499                         pDataArray.push_back(tempPreCluster);
500                         processIDS.push_back(i);
501                         
502                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
503                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
504                         hThreadArray[i-1] = CreateThread(NULL, 0, MyPreclusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);   
505                 }
506                 
507                                 
508                 //using the main process as a worker saves time and memory
509                 num = driverGroups(newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
510                 
511                 //Wait until all threads have terminated.
512                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
513                 
514                 //Close all thread handles and free memory allocations.
515                 for(int i=0; i < pDataArray.size(); i++){
516             if (pDataArray[i]->count != (pDataArray[i]->end-pDataArray[i]->start)) {
517                 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end-pDataArray[i]->start) + " groups assigned to it, quitting. \n"); m->control_pressed = true; 
518             }
519                         for (int j = 0; j < pDataArray[i]->mapFileNames.size(); j++) {
520                                 outputNames.push_back(pDataArray[i]->mapFileNames[j]); outputTypes["map"].push_back(pDataArray[i]->mapFileNames[j]); 
521                         }
522                         CloseHandle(hThreadArray[i]);
523                         delete pDataArray[i];
524                 }
525                 
526 #endif          
527                 
528                 //append output files
529                 for(int i=0;i<processIDS.size();i++){
530                         //newFName = m->getFullPathName(".\\" + newFName);
531                         //newNName = m->getFullPathName(".\\" + newNName);
532                         
533                         m->appendFiles((newFName + toString(processIDS[i]) + ".temp"), newFName);
534                         m->mothurRemove((newFName + toString(processIDS[i]) + ".temp"));
535                         
536                         m->appendFiles((newNName + toString(processIDS[i]) + ".temp"), newNName);
537                         m->mothurRemove((newNName + toString(processIDS[i]) + ".temp"));
538                 }
539                 
540                 return num;     
541                 
542         }
543         catch(exception& e) {
544                 m->errorOut(e, "PreClusterCommand", "createProcessesGroups");
545                 exit(1);
546         }
547 }
548 /**************************************************************************************************/
549 int PreClusterCommand::driverGroups(string newFFile, string newNFile, string newMFile, int start, int end, vector<string> groups){
550         try {
551                 
552                 int numSeqs = 0;
553                 
554                 //precluster each group
555                 for (int i = start; i < end; i++) {
556                         
557                         start = time(NULL);
558                         
559                         if (m->control_pressed) {  return 0; }
560                         
561                         m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine();
562                         
563                         map<string, string> thisNameMap;
564             vector<Sequence> thisSeqs;
565                         if (groupfile != "") { 
566                 thisSeqs = parser->getSeqs(groups[i]);
567             }else if (countfile != "") {
568                 thisSeqs = cparser->getSeqs(groups[i]);
569             }
570                         if (namefile != "") {  thisNameMap = parser->getNameMap(groups[i]); }
571             
572                         //fill alignSeqs with this groups info.
573                         numSeqs = loadSeqs(thisNameMap, thisSeqs, groups[i]);
574                         
575                         if (m->control_pressed) {   return 0; }
576                         
577                         if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); m->control_pressed = true; return 0;  }
578                         
579                         int count= process(newMFile+groups[i]+".map");
580                         outputNames.push_back(newMFile+groups[i]+".map"); outputTypes["map"].push_back(newMFile+groups[i]+".map");
581                         
582                         if (m->control_pressed) {  return 0; }
583                         
584                         m->mothurOut("Total number of sequences before pre.cluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
585                         m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
586                         printData(newFFile, newNFile, groups[i]);
587                         
588                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
589                         
590                 }
591                 
592                 return numSeqs;
593         }
594         catch(exception& e) {
595                 m->errorOut(e, "PreClusterCommand", "driverGroups");
596                 exit(1);
597         }
598 }
599 /**************************************************************************************************/
600 int PreClusterCommand::process(string newMapFile){
601         try {
602                 ofstream out;
603                 m->openOutputFile(newMapFile, out);
604                 
605                 //sort seqs by number of identical seqs
606         if (topdown) { sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityTopDown);  }
607         else {  sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityDownTop);  }
608                 
609                 int count = 0;
610                 int numSeqs = alignSeqs.size();
611                 
612         if (topdown) {
613             //think about running through twice...
614             for (int i = 0; i < numSeqs; i++) {
615                 
616                 if (alignSeqs[i].active) {  //this sequence has not been merged yet
617                     
618                     string chunk = alignSeqs[i].seq.getName() + "\t" + toString(alignSeqs[i].numIdentical) + "\t" + toString(0) + "\t" + alignSeqs[i].seq.getAligned() + "\n";
619                     
620                     //try to merge it with all smaller seqs
621                     for (int j = i+1; j < numSeqs; j++) {
622                         
623                         if (m->control_pressed) { out.close(); return 0; }
624                         
625                         if (alignSeqs[j].active) {  //this sequence has not been merged yet
626                             //are you within "diff" bases
627                             int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
628                             
629                             if (mismatch <= diffs) {
630                                 //merge
631                                 alignSeqs[i].names += ',' + alignSeqs[j].names;
632                                 alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
633                                 
634                                 chunk += alignSeqs[j].seq.getName() + "\t" + toString(alignSeqs[j].numIdentical) + "\t" + toString(mismatch) + "\t" + alignSeqs[j].seq.getAligned() + "\n";
635                                 
636                                 alignSeqs[j].active = 0;
637                                 alignSeqs[j].numIdentical = 0;
638                                 count++;
639                             }
640                         }//end if j active
641                     }//end for loop j
642                     
643                     //remove from active list 
644                     alignSeqs[i].active = 0;
645                     
646                     out << "ideal_seq_" << (i+1) << '\t' << alignSeqs[i].numIdentical << endl << chunk << endl;;
647                     
648                 }//end if active i
649                 if(i % 100 == 0)        { m->mothurOutJustToScreen(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)+"\n");       }
650             }
651         }else {
652             map<int, string> mapFile;
653             map<int, int> originalCount;
654             map<int, int>::iterator itCount;
655             for (int i = 0; i < numSeqs; i++) { mapFile[i] = ""; originalCount[i] = alignSeqs[i].numIdentical; }
656             
657             //think about running through twice...
658             for (int i = 0; i < numSeqs; i++) {
659                 
660                 //try to merge it into larger seqs
661                 for (int j = i+1; j < numSeqs; j++) {
662                     
663                     if (m->control_pressed) { out.close(); return 0; }
664                     
665                     if (originalCount[j] > originalCount[i]) {  //this sequence is more abundant than I am
666                         //are you within "diff" bases
667                         int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
668                         
669                         if (mismatch <= diffs) {
670                             //merge
671                             alignSeqs[j].names += ',' + alignSeqs[i].names;
672                             alignSeqs[j].numIdentical += alignSeqs[i].numIdentical;
673                             
674                             mapFile[j] = alignSeqs[i].seq.getName() + "\t" + toString(alignSeqs[i].numIdentical) + "\t" + toString(mismatch) + "\t" + alignSeqs[i].seq.getAligned() + "\n" + mapFile[i];
675                             alignSeqs[i].numIdentical = 0;
676                             originalCount.erase(i);
677                             mapFile[i] = "";
678                             count++;
679                             j+=numSeqs; //exit search, we merged this one in.
680                         }
681                     }//end abundance check
682                 }//end for loop j
683                 
684                 if(i % 100 == 0)        { m->mothurOutJustToScreen(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)+"\n");       }
685             }
686             
687             for (int i = 0; i < numSeqs; i++) {
688                 if (alignSeqs[i].numIdentical != 0) {
689                     out << "ideal_seq_" << (i+1) << '\t' << alignSeqs[i].numIdentical << endl  << alignSeqs[i].seq.getName() + "\t" + toString(alignSeqs[i].numIdentical) + "\t" + toString(0) + "\t" + alignSeqs[i].seq.getAligned() + "\n" << mapFile[i] << endl;
690                 }
691             }
692             
693         }
694                 out.close();
695                 
696                 if(numSeqs % 100 != 0)  { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine();   }       
697                 
698                 return count;
699                 
700         }
701         catch(exception& e) {
702                 m->errorOut(e, "PreClusterCommand", "process");
703                 exit(1);
704         }
705 }
706 /**************************************************************************************************/
707 int PreClusterCommand::readFASTA(){
708         try {
709                 //ifstream inNames;
710                 ifstream inFasta;
711                 
712                 m->openInputFile(fastafile, inFasta);
713                 set<int> lengths;
714                 
715                 while (!inFasta.eof()) {
716                         
717                         if (m->control_pressed) { inFasta.close(); return 0; }
718                                                 
719                         Sequence seq(inFasta);  m->gobble(inFasta);
720                         
721                         if (seq.getName() != "") {  //can get "" if commented line is at end of fasta file
722                                 if (namefile != "") {
723                                         itSize = sizes.find(seq.getName());
724                                         
725                                         if (itSize == sizes.end()) { m->mothurOut(seq.getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); exit(1); }
726                                         else{
727                                                 seqPNode tempNode(itSize->second, seq, names[seq.getName()]);
728                                                 alignSeqs.push_back(tempNode);
729                                                 lengths.insert(seq.getAligned().length());
730                                         }       
731                                 }else { //no names file, you are identical to yourself 
732                     int numRep = 1;
733                     if (countfile != "") { numRep = ct.getNumSeqs(seq.getName()); }
734                                         seqPNode tempNode(numRep, seq, seq.getName());
735                                         alignSeqs.push_back(tempNode);
736                                         lengths.insert(seq.getAligned().length());
737                                 }
738                         }
739                 }
740                 inFasta.close();
741         
742         if (lengths.size() > 1) { m->control_pressed = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
743         else if (lengths.size() == 1) { length = *(lengths.begin()); }
744         
745                 return alignSeqs.size();
746         }
747         
748         catch(exception& e) {
749                 m->errorOut(e, "PreClusterCommand", "readFASTA");
750                 exit(1);
751         }
752 }
753 /**************************************************************************************************/
754 int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>& thisSeqs, string group){
755         try {
756                 set<int> lengths;
757                 alignSeqs.clear();
758                 map<string, string>::iterator it;
759                 bool error = false;
760         map<string, int> thisCount;
761         if (countfile != "") { thisCount = cparser->getCountTable(group);  }
762                 
763                 for (int i = 0; i < thisSeqs.size(); i++) {
764                         
765                         if (m->control_pressed) { return 0; }
766                                                 
767                         if (namefile != "") {
768                                 it = thisName.find(thisSeqs[i].getName());
769                                 
770                                 //should never be true since parser checks for this
771                                 if (it == thisName.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); error = true; }
772                                 else{
773                                         //get number of reps
774                                         int numReps = 1;
775                                         for(int j=0;j<(it->second).length();j++){
776                                                 if((it->second)[j] == ','){     numReps++;      }
777                                         }
778                                         
779                                         seqPNode tempNode(numReps, thisSeqs[i], it->second);
780                                         alignSeqs.push_back(tempNode);
781                     lengths.insert(thisSeqs[i].getAligned().length());
782                                 }       
783                         }else { //no names file, you are identical to yourself 
784                 int numRep = 1;
785                 if (countfile != "") { 
786                     map<string, int>::iterator it2 = thisCount.find(thisSeqs[i].getName());
787                     
788                     //should never be true since parser checks for this
789                     if (it2 == thisCount.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); error = true; }
790                     else { numRep = it2->second;  }
791                 }
792                                 seqPNode tempNode(numRep, thisSeqs[i], thisSeqs[i].getName());
793                                 alignSeqs.push_back(tempNode);
794                                 lengths.insert(thisSeqs[i].getAligned().length());
795                         }
796                 }
797     
798         if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
799         else if (lengths.size() == 1) { length = *(lengths.begin()); }
800         
801                 //sanity check
802                 if (error) { m->control_pressed = true; }
803                 
804                 thisSeqs.clear();
805                 
806                 return alignSeqs.size();
807         }
808         
809         catch(exception& e) {
810                 m->errorOut(e, "PreClusterCommand", "loadSeqs");
811                 exit(1);
812         }
813 }
814                                 
815 /**************************************************************************************************/
816
817 int PreClusterCommand::calcMisMatches(string seq1, string seq2){
818         try {
819                 int numBad = 0;
820                 
821                 for (int i = 0; i < seq1.length(); i++) {
822                         //do they match
823                         if (seq1[i] != seq2[i]) { numBad++; }
824                         if (numBad > diffs) { return length;  } //to far to cluster
825                 }
826                 
827                 return numBad;
828         }
829         catch(exception& e) {
830                 m->errorOut(e, "PreClusterCommand", "calcMisMatches");
831                 exit(1);
832         }
833 }
834 /**************************************************************************************************/
835
836 int PreClusterCommand::mergeGroupCounts(string newcount, string newname, string newfasta){
837         try {
838                 ifstream inNames;
839         m->openInputFile(newname, inNames);
840         
841         string group, first, second;
842         set<string> uniqueNames;
843         while (!inNames.eof()) {
844             if (m->control_pressed) { break; }
845             inNames >> group; m->gobble(inNames);
846             inNames >> first; m->gobble(inNames);
847             inNames >> second; m->gobble(inNames);
848             
849             vector<string> names;
850             m->splitAtComma(second, names);
851             
852             uniqueNames.insert(first);
853             
854             int total = ct.getGroupCount(first, group);
855             for (int i = 1; i < names.size(); i++) {
856                 total += ct.getGroupCount(names[i], group);
857                 ct.setAbund(names[i], group, 0);
858             }
859             ct.setAbund(first, group, total);
860         }
861         inNames.close();
862         
863         vector<string> namesOfSeqs = ct.getNamesOfSeqs();
864         for (int i = 0; i < namesOfSeqs.size(); i++) {
865             if (ct.getNumSeqs(namesOfSeqs[i]) == 0) {
866                 ct.remove(namesOfSeqs[i]);
867             }
868         }
869         
870         ct.printTable(newcount); 
871         m->mothurRemove(newname);
872         
873         if (bygroup) { //if by group, must remove the duplicate seqs that are named the same
874             ifstream in;
875             m->openInputFile(newfasta, in);
876             
877             ofstream out;
878             m->openOutputFile(newfasta+"temp", out);
879             
880             int count = 0;
881             set<string> already;
882             while(!in.eof()) {
883                 if (m->control_pressed) { break; }
884                 
885                 Sequence seq(in); m->gobble(in);
886                 
887                 if (seq.getName() != "") {
888                     count++;
889                     if (already.count(seq.getName()) == 0) {
890                         seq.printSequence(out);
891                         already.insert(seq.getName());
892                     }
893                 }
894             }
895             in.close();
896             out.close();
897             m->mothurRemove(newfasta);
898             m->renameFile(newfasta+"temp", newfasta);
899         }
900                         return 0;
901                 
902         }
903         catch(exception& e) {
904                 m->errorOut(e, "PreClusterCommand", "mergeGroupCounts");
905                 exit(1);
906         }
907 }
908
909 /**************************************************************************************************/
910
911 void PreClusterCommand::printData(string newfasta, string newname, string group){
912         try {
913                 ofstream outFasta;
914                 ofstream outNames;
915                 
916                 if (bygroup) {
917                         m->openOutputFileAppend(newfasta, outFasta);
918                         m->openOutputFileAppend(newname, outNames);
919                 }else {
920                         m->openOutputFile(newfasta, outFasta);
921                         m->openOutputFile(newname, outNames);
922                 }
923                 
924         if ((countfile != "") && (group == ""))  { outNames << "Representative_Sequence\ttotal\n";  }
925                 for (int i = 0; i < alignSeqs.size(); i++) {
926                         if (alignSeqs[i].numIdentical != 0) {
927                                 alignSeqs[i].seq.printSequence(outFasta); 
928                                 if (countfile != "") {  
929                     if (group != "") {  outNames << group << '\t' << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; }
930                     else {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].numIdentical << endl;  }
931                 }else {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;  }
932                         }
933                 }
934                 
935                 outFasta.close();
936                 outNames.close();
937                 
938         }
939         catch(exception& e) {
940                 m->errorOut(e, "PreClusterCommand", "printData");
941                 exit(1);
942         }
943 }
944 /**************************************************************************************************/
945
946 void PreClusterCommand::readNameFile(){
947         try {
948                 ifstream in;
949                 m->openInputFile(namefile, in);
950                 string firstCol, secondCol;
951                                 
952                 while (!in.eof()) {
953                         in >> firstCol >> secondCol; m->gobble(in);
954             
955             m->checkName(firstCol);
956             m->checkName(secondCol);
957             int size = m->getNumNames(secondCol);
958             
959                         names[firstCol] = secondCol;
960             sizes[firstCol] = size;
961                 }
962                 in.close();
963         }
964         catch(exception& e) {
965                 m->errorOut(e, "PreClusterCommand", "readNameFile");
966                 exit(1);
967         }
968 }
969
970 /**************************************************************************************************/
971
972