]> git.donarmstrong.com Git - mothur.git/blob - preclustercommand.cpp
fixes while testing 1.33.0
[mothur.git] / preclustercommand.cpp
1 /*
2  *  preclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/21/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "preclustercommand.h"
11 #include "deconvolutecommand.h"
12
13 //**********************************************************************************************************************
14 vector<string> PreClusterCommand::setParameters(){      
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta-name",false,true,true); parameters.push_back(pfasta);
17                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
18         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
19                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
20                 CommandParameter pdiffs("diffs", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pdiffs);
21                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
22         CommandParameter ptopdown("topdown", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(ptopdown);
23
24                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "PreClusterCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string PreClusterCommand::getHelpString(){      
38         try {
39                 string helpString = "";
40                 helpString += "The pre.cluster command groups sequences that are within a given number of base mismatches.\n";
41                 helpString += "The pre.cluster command outputs a new fasta and name file.\n";
42                 helpString += "The pre.cluster command parameters are fasta, name, group, count, topdown, processors and diffs. The fasta parameter is required. \n";
43                 helpString += "The name parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
44                 helpString += "The group parameter allows you to provide a group file so you can cluster by group. \n";
45         helpString += "The count parameter allows you to provide a count file so you can cluster by group. \n";
46                 helpString += "The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n";
47         helpString += "The topdown parameter allows you to specify whether to cluster from largest abundance to smallest or smallest to largest.  Default=T, meaning largest to smallest.\n";
48                 helpString += "The pre.cluster command should be in the following format: \n";
49                 helpString += "pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n";
50                 helpString += "Example pre.cluster(fasta=amazon.fasta, diffs=2).\n";
51                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
52                 return helpString;
53         }
54         catch(exception& e) {
55                 m->errorOut(e, "PreClusterCommand", "getHelpString");
56                 exit(1);
57         }
58 }
59 //**********************************************************************************************************************
60 string PreClusterCommand::getOutputPattern(string type) {
61     try {
62         string pattern = "";
63         
64         if (type == "fasta") {  pattern = "[filename],precluster,[extension]"; } 
65         else if (type == "name") {  pattern = "[filename],precluster.names"; } 
66         else if (type == "count") {  pattern = "[filename],precluster.count_table"; }
67         else if (type == "map") {  pattern =  "[filename],precluster.map"; }
68         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
69         
70         return pattern;
71     }
72     catch(exception& e) {
73         m->errorOut(e, "PreClusterCommand", "getOutputPattern");
74         exit(1);
75     }
76 }
77 //**********************************************************************************************************************
78 PreClusterCommand::PreClusterCommand(){ 
79         try {
80                 abort = true; calledHelp = true; 
81                 setParameters();
82                 vector<string> tempOutNames;
83                 outputTypes["fasta"] = tempOutNames;
84                 outputTypes["name"] = tempOutNames;
85         outputTypes["count"] = tempOutNames;
86                 outputTypes["map"] = tempOutNames;
87         }
88         catch(exception& e) {
89                 m->errorOut(e, "PreClusterCommand", "PreClusterCommand");
90                 exit(1);
91         }
92 }
93 //**********************************************************************************************************************
94
95 PreClusterCommand::PreClusterCommand(string option) {
96         try {
97                 abort = false; calledHelp = false;   
98                 
99                 //allow user to run help
100                 if(option == "help") { help(); abort = true; calledHelp = true; }
101                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
102                 
103                 else {
104                         vector<string> myArray = setParameters();
105                         
106                         OptionParser parser(option);
107                         map<string, string> parameters = parser.getParameters();
108                         
109                         ValidParameters validParameter;
110                         map<string, string>::iterator it;
111                 
112                         //check to make sure all parameters are valid for command
113                         for (map<string, string>::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
114                                 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
115                         }
116                         
117                         //initialize outputTypes
118                         vector<string> tempOutNames;
119                         outputTypes["fasta"] = tempOutNames;
120                         outputTypes["name"] = tempOutNames;
121                         outputTypes["map"] = tempOutNames;
122             outputTypes["count"] = tempOutNames;
123                 
124                         //if the user changes the input directory command factory will send this info to us in the output parameter 
125                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
126                         if (inputDir == "not found"){   inputDir = "";          }
127                         else {
128                                 string path;
129                                 it = parameters.find("fasta");
130                                 //user has given a template file
131                                 if(it != parameters.end()){ 
132                                         path = m->hasPath(it->second);
133                                         //if the user has not given a path then, add inputdir. else leave path alone.
134                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
135                                 }
136                                 
137                                 it = parameters.find("name");
138                                 //user has given a template file
139                                 if(it != parameters.end()){ 
140                                         path = m->hasPath(it->second);
141                                         //if the user has not given a path then, add inputdir. else leave path alone.
142                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
143                                 }
144                                 
145                                 it = parameters.find("group");
146                                 //user has given a template file
147                                 if(it != parameters.end()){ 
148                                         path = m->hasPath(it->second);
149                                         //if the user has not given a path then, add inputdir. else leave path alone.
150                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
151                                 }
152                 
153                 it = parameters.find("count");
154                                 //user has given a template file
155                                 if(it != parameters.end()){ 
156                                         path = m->hasPath(it->second);
157                                         //if the user has not given a path then, add inputdir. else leave path alone.
158                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
159                                 }
160                         }
161
162                         //check for required parameters
163                         fastafile = validParameter.validFile(parameters, "fasta", true);
164                         if (fastafile == "not found") {                                 
165                                 fastafile = m->getFastaFile(); 
166                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
167                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
168                         }
169                         else if (fastafile == "not open") { abort = true; }     
170                         else { m->setFastaFile(fastafile); }
171                         
172                         //if the user changes the output directory command factory will send this info to us in the output parameter 
173                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
174                                 outputDir = ""; 
175                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
176                         }
177
178                         //check for optional parameter and set defaults
179                         // ...at some point should added some additional type checking...
180                         namefile = validParameter.validFile(parameters, "name", true);
181                         if (namefile == "not found") { namefile =  "";  }
182                         else if (namefile == "not open") { namefile = ""; abort = true; }       
183                         else {  m->setNameFile(namefile); }
184                         
185                         groupfile = validParameter.validFile(parameters, "group", true);
186                         if (groupfile == "not found") { groupfile =  "";  bygroup = false; }
187                         else if (groupfile == "not open") { abort = true; groupfile =  ""; }    
188                         else {   m->setGroupFile(groupfile); bygroup = true;  }
189             
190             countfile = validParameter.validFile(parameters, "count", true);
191                         if (countfile == "not found") { countfile =  "";   }
192                         else if (countfile == "not open") { abort = true; countfile =  ""; }    
193                         else {   
194                 m->setCountTableFile(countfile); 
195                 ct.readTable(countfile, true, false);
196                 if (ct.hasGroupInfo()) { bygroup = true; }
197                 else { bygroup = false;  }
198             }
199             
200             if ((namefile != "") && (countfile != "")) {
201                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
202             }
203             
204             if ((groupfile != "") && (countfile != "")) {
205                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
206             }
207
208                         
209                         string temp     = validParameter.validFile(parameters, "diffs", false);         if(temp == "not found"){        temp = "1"; }
210                         m->mothurConvert(temp, diffs); 
211                         
212                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
213                         m->setProcessors(temp);
214                         m->mothurConvert(temp, processors);
215                         
216             temp = validParameter.validFile(parameters, "topdown", false);              if(temp == "not found"){  temp = "T"; }
217                         topdown = m->isTrue(temp);
218             
219             if (countfile == "") {
220                 if (namefile == "") {
221                     vector<string> files; files.push_back(fastafile);
222                     parser.getNameFile(files);
223                 }
224             }
225                 }
226                                 
227         }
228         catch(exception& e) {
229                 m->errorOut(e, "PreClusterCommand", "PreClusterCommand");
230                 exit(1);
231         }
232 }
233 //**********************************************************************************************************************
234
235 int PreClusterCommand::execute(){
236         try {
237                 
238                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
239                 
240                 int start = time(NULL);
241                 
242                 string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
243         map<string, string> variables; 
244         variables["[filename]"] = fileroot;
245                 string newNamesFile = getOutputFileName("name",variables);
246         string newCountFile = getOutputFileName("count",variables);
247                 string newMapFile = getOutputFileName("map",variables); //add group name if by group
248         variables["[extension]"] = m->getExtension(fastafile);
249                 string newFastaFile = getOutputFileName("fasta", variables);
250                 outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile);
251                 if (countfile == "") { outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile); }
252                 else { outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile); }
253                 
254                 if (bygroup) {
255                         //clear out old files
256                         ofstream outFasta; m->openOutputFile(newFastaFile, outFasta); outFasta.close();
257                         ofstream outNames; m->openOutputFile(newNamesFile, outNames);  outNames.close();
258                         newMapFile = fileroot + "precluster.";
259                         
260                         //parse fasta and name file by group
261             vector<string> groups;
262                         if (countfile != "") {
263                 cparser = new SequenceCountParser(countfile, fastafile);
264                 groups = cparser->getNamesOfGroups();
265             }else {
266                 if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile);      }
267                 else                            { parser = new SequenceParser(groupfile, fastafile);                    }
268                 groups = parser->getNamesOfGroups();
269                         }
270             
271                         if(processors == 1)     {       driverGroups(newFastaFile, newNamesFile, newMapFile, 0, groups.size(), groups); }
272                         else                            {       createProcessesGroups(newFastaFile, newNamesFile, newMapFile, groups);                  }
273                         
274                         if (countfile != "") { 
275                 mergeGroupCounts(newCountFile, newNamesFile, newFastaFile);
276                 delete cparser; 
277             }else {  
278                 delete parser; 
279                 //run unique.seqs for deconvolute results
280                 string inputString = "fasta=" + newFastaFile;
281                 if (namefile != "") { inputString += ", name=" + newNamesFile; }
282                 m->mothurOutEndLine(); 
283                 m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
284                 m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); 
285                 m->mothurCalling = true;
286                 
287                 Command* uniqueCommand = new DeconvoluteCommand(inputString);
288                 uniqueCommand->execute();
289                 
290                 map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
291                 
292                 delete uniqueCommand;
293                 m->mothurCalling = false;
294                 m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
295                 
296                 m->renameFile(filenames["fasta"][0], newFastaFile);
297                 m->renameFile(filenames["name"][0], newNamesFile); 
298                         }
299             if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {    m->mothurRemove(outputNames[i]);        }        return 0; }
300                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run pre.cluster."); m->mothurOutEndLine(); 
301                                 
302                 }else {
303             if (processors != 1) { m->mothurOut("When using running without group information mothur can only use 1 processor, continuing."); m->mothurOutEndLine(); processors = 1; }
304                         if (namefile != "") { readNameFile(); }
305                 
306                         //reads fasta file and return number of seqs
307                         int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
308                 
309                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
310         
311                         if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0;  }
312                         if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0;  }
313                         
314                         int count = process(newMapFile);
315                         outputNames.push_back(newMapFile); outputTypes["map"].push_back(newMapFile);
316                         
317                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }   
318                         
319                         m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
320                         m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
321                         if (countfile != "") { newNamesFile = newCountFile; }
322             printData(newFastaFile, newNamesFile, "");
323                                 
324                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
325                 }
326                                 
327                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
328                 
329                 m->mothurOutEndLine();
330                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
331                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }               
332                 m->mothurOutEndLine();
333                 
334                 //set fasta file as new current fastafile
335                 string current = "";
336                 itTypes = outputTypes.find("fasta");
337                 if (itTypes != outputTypes.end()) {
338                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
339                 }
340                 
341                 itTypes = outputTypes.find("name");
342                 if (itTypes != outputTypes.end()) {
343                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
344                 }
345         
346         itTypes = outputTypes.find("count");
347                 if (itTypes != outputTypes.end()) {
348                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
349                 }
350                 
351                 return 0;
352                 
353         }
354         catch(exception& e) {
355                 m->errorOut(e, "PreClusterCommand", "execute");
356                 exit(1);
357         }
358 }
359 /**************************************************************************************************/
360 int PreClusterCommand::createProcessesGroups(string newFName, string newNName, string newMFile, vector<string> groups) {
361         try {
362                 
363                 vector<int> processIDS;
364                 int process = 1;
365                 int num = 0;
366                 
367                 //sanity check
368                 if (groups.size() < processors) { processors = groups.size(); }
369                 
370                 //divide the groups between the processors
371                 vector<linePair> lines;
372                 int remainingPairs = groups.size();
373         int startIndex = 0;
374         for (int remainingProcessors = processors; remainingProcessors > 0; remainingProcessors--) {
375             int numPairs = remainingPairs; //case for last processor
376             if (remainingProcessors != 1) { numPairs = ceil(remainingPairs / remainingProcessors); }
377             lines.push_back(linePair(startIndex, (startIndex+numPairs))); //startIndex, endIndex
378             startIndex = startIndex + numPairs;
379             remainingPairs = remainingPairs - numPairs;
380         }
381                 
382 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
383                 
384                 //loop through and create all the processes you want
385                 while (process != processors) {
386                         int pid = fork();
387                         
388                         if (pid > 0) {
389                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
390                                 process++;
391                         }else if (pid == 0){
392                 outputNames.clear();
393                                 num = driverGroups(newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMFile, lines[process].start, lines[process].end, groups);
394                 
395                 string tempFile = toString(getpid()) + ".outputNames.temp";
396                 ofstream outTemp;
397                 m->openOutputFile(tempFile, outTemp);
398                 
399                 outTemp << outputNames.size();
400                 for (int i = 0; i < outputNames.size(); i++) { outTemp << outputNames[i] << endl; }
401                 outTemp.close();
402                 
403                                 exit(0);
404                         }else { 
405                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
406                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
407                                 exit(0);
408                         }
409                 }
410                 
411                 //do my part
412                 num = driverGroups(newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
413                 
414                 //force parent to wait until all the processes are done
415                 for (int i=0;i<processIDS.size();i++) { 
416                         int temp = processIDS[i];
417                         wait(&temp);
418                 }
419         
420         for (int i = 0; i < processIDS.size(); i++) {
421             string tempFile = toString(processIDS[i]) +  ".outputNames.temp";
422             ifstream intemp;
423             m->openInputFile(tempFile, intemp);
424             
425             int num;
426             intemp >> num;
427             for (int k = 0; k < num; k++) {
428                 string name = "";
429                 intemp >> name; m->gobble(intemp);
430                 
431                 outputNames.push_back(name); outputTypes["map"].push_back(name);
432             }
433             intemp.close();
434             m->mothurRemove(tempFile);
435         }
436 #else
437                 
438                 //////////////////////////////////////////////////////////////////////////////////////////////////////
439                 //Windows version shared memory, so be careful when passing variables through the preClusterData struct. 
440                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
441                 //////////////////////////////////////////////////////////////////////////////////////////////////////
442                 
443                 vector<preClusterData*> pDataArray; 
444                 DWORD   dwThreadIdArray[processors-1];
445                 HANDLE  hThreadArray[processors-1]; 
446                 
447                 //Create processor worker threads.
448                 for( int i=1; i<processors; i++ ){
449                         // Allocate memory for thread data.
450                         string extension = toString(i) + ".temp";
451                         
452                         preClusterData* tempPreCluster = new preClusterData(fastafile, namefile, groupfile, countfile, (newFName+extension), (newNName+extension), newMFile, groups, m, lines[i].start, lines[i].end, diffs, topdown, i);
453                         pDataArray.push_back(tempPreCluster);
454                         processIDS.push_back(i);
455                         
456                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
457                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
458                         hThreadArray[i-1] = CreateThread(NULL, 0, MyPreclusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);   
459                 }
460                 
461                                 
462                 //using the main process as a worker saves time and memory
463                 num = driverGroups(newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
464                 
465                 //Wait until all threads have terminated.
466                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
467                 
468                 //Close all thread handles and free memory allocations.
469                 for(int i=0; i < pDataArray.size(); i++){
470             if (pDataArray[i]->count != (pDataArray[i]->end-pDataArray[i]->start)) {
471                 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end-pDataArray[i]->start) + " groups assigned to it, quitting. \n"); m->control_pressed = true; 
472             }
473                         for (int j = 0; j < pDataArray[i]->mapFileNames.size(); j++) {
474                                 outputNames.push_back(pDataArray[i]->mapFileNames[j]); outputTypes["map"].push_back(pDataArray[i]->mapFileNames[j]); 
475                         }
476                         CloseHandle(hThreadArray[i]);
477                         delete pDataArray[i];
478                 }
479                 
480 #endif          
481                 
482                 //append output files
483                 for(int i=0;i<processIDS.size();i++){
484                         //newFName = m->getFullPathName(".\\" + newFName);
485                         //newNName = m->getFullPathName(".\\" + newNName);
486                         
487                         m->appendFiles((newFName + toString(processIDS[i]) + ".temp"), newFName);
488                         m->mothurRemove((newFName + toString(processIDS[i]) + ".temp"));
489                         
490                         m->appendFiles((newNName + toString(processIDS[i]) + ".temp"), newNName);
491                         m->mothurRemove((newNName + toString(processIDS[i]) + ".temp"));
492                 }
493                 
494                 return num;     
495                 
496         }
497         catch(exception& e) {
498                 m->errorOut(e, "PreClusterCommand", "createProcessesGroups");
499                 exit(1);
500         }
501 }
502 /**************************************************************************************************/
503 int PreClusterCommand::driverGroups(string newFFile, string newNFile, string newMFile, int start, int end, vector<string> groups){
504         try {
505                 
506                 int numSeqs = 0;
507                 
508                 //precluster each group
509                 for (int i = start; i < end; i++) {
510                         
511                         start = time(NULL);
512                         
513                         if (m->control_pressed) {  return 0; }
514                         
515                         m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine();
516                         
517                         map<string, string> thisNameMap;
518             vector<Sequence> thisSeqs;
519                         if (groupfile != "") { 
520                 thisSeqs = parser->getSeqs(groups[i]);
521             }else if (countfile != "") {
522                 thisSeqs = cparser->getSeqs(groups[i]);
523             }
524                         if (namefile != "") {  thisNameMap = parser->getNameMap(groups[i]); }
525             
526                         //fill alignSeqs with this groups info.
527                         numSeqs = loadSeqs(thisNameMap, thisSeqs, groups[i]);
528                         
529                         if (m->control_pressed) {   return 0; }
530                         
531                         if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); m->control_pressed = true; return 0;  }
532                         
533                         int count= process(newMFile+groups[i]+".map");
534                         outputNames.push_back(newMFile+groups[i]+".map"); outputTypes["map"].push_back(newMFile+groups[i]+".map");
535                         
536                         if (m->control_pressed) {  return 0; }
537                         
538                         m->mothurOut("Total number of sequences before pre.cluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
539                         m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
540                         printData(newFFile, newNFile, groups[i]);
541                         
542                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
543                         
544                 }
545                 
546                 return numSeqs;
547         }
548         catch(exception& e) {
549                 m->errorOut(e, "PreClusterCommand", "driverGroups");
550                 exit(1);
551         }
552 }
553 /**************************************************************************************************/
554 int PreClusterCommand::process(string newMapFile){
555         try {
556                 ofstream out;
557                 m->openOutputFile(newMapFile, out);
558                 
559                 //sort seqs by number of identical seqs
560         if (topdown) { sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityTopDown);  }
561         else {  sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityDownTop);  }
562                 
563                 int count = 0;
564                 int numSeqs = alignSeqs.size();
565                 
566         if (topdown) {
567             //think about running through twice...
568             for (int i = 0; i < numSeqs; i++) {
569                 
570                 if (alignSeqs[i].active) {  //this sequence has not been merged yet
571                     
572                     string chunk = alignSeqs[i].seq.getName() + "\t" + toString(alignSeqs[i].numIdentical) + "\t" + toString(0) + "\t" + alignSeqs[i].seq.getAligned() + "\n";
573                     
574                     //try to merge it with all smaller seqs
575                     for (int j = i+1; j < numSeqs; j++) {
576                         
577                         if (m->control_pressed) { out.close(); return 0; }
578                         
579                         if (alignSeqs[j].active) {  //this sequence has not been merged yet
580                             //are you within "diff" bases
581                             int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
582                             
583                             if (mismatch <= diffs) {
584                                 //merge
585                                 alignSeqs[i].names += ',' + alignSeqs[j].names;
586                                 alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
587                                 
588                                 chunk += alignSeqs[j].seq.getName() + "\t" + toString(alignSeqs[j].numIdentical) + "\t" + toString(mismatch) + "\t" + alignSeqs[j].seq.getAligned() + "\n";
589                                 
590                                 alignSeqs[j].active = 0;
591                                 alignSeqs[j].numIdentical = 0;
592                                 count++;
593                             }
594                         }//end if j active
595                     }//end for loop j
596                     
597                     //remove from active list 
598                     alignSeqs[i].active = 0;
599                     
600                     out << "ideal_seq_" << (i+1) << '\t' << alignSeqs[i].numIdentical << endl << chunk << endl;;
601                     
602                 }//end if active i
603                 if(i % 100 == 0)        { m->mothurOutJustToScreen(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)+"\n");       }
604             }
605         }else {
606             map<int, string> mapFile;
607             map<int, int> originalCount;
608             map<int, int>::iterator itCount;
609             for (int i = 0; i < numSeqs; i++) { mapFile[i] = ""; originalCount[i] = alignSeqs[i].numIdentical; }
610             
611             //think about running through twice...
612             for (int i = 0; i < numSeqs; i++) {
613                 
614                 //try to merge it into larger seqs
615                 for (int j = i+1; j < numSeqs; j++) {
616                     
617                     if (m->control_pressed) { out.close(); return 0; }
618                     
619                     if (originalCount[j] > originalCount[i]) {  //this sequence is more abundant than I am
620                         //are you within "diff" bases
621                         int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
622                         
623                         if (mismatch <= diffs) {
624                             //merge
625                             alignSeqs[j].names += ',' + alignSeqs[i].names;
626                             alignSeqs[j].numIdentical += alignSeqs[i].numIdentical;
627                             
628                             mapFile[j] = alignSeqs[i].seq.getName() + "\t" + toString(alignSeqs[i].numIdentical) + "\t" + toString(mismatch) + "\t" + alignSeqs[i].seq.getAligned() + "\n" + mapFile[i];
629                             alignSeqs[i].numIdentical = 0;
630                             originalCount.erase(i);
631                             mapFile[i] = "";
632                             count++;
633                             j+=numSeqs; //exit search, we merged this one in.
634                         }
635                     }//end abundance check
636                 }//end for loop j
637                 
638                 if(i % 100 == 0)        { m->mothurOutJustToScreen(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)+"\n");       }
639             }
640             
641             for (int i = 0; i < numSeqs; i++) {
642                 if (alignSeqs[i].numIdentical != 0) {
643                     out << "ideal_seq_" << (i+1) << '\t' << alignSeqs[i].numIdentical << endl  << alignSeqs[i].seq.getName() + "\t" + toString(alignSeqs[i].numIdentical) + "\t" + toString(0) + "\t" + alignSeqs[i].seq.getAligned() + "\n" << mapFile[i] << endl;
644                 }
645             }
646             
647         }
648                 out.close();
649                 
650                 if(numSeqs % 100 != 0)  { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine();   }       
651                 
652                 return count;
653                 
654         }
655         catch(exception& e) {
656                 m->errorOut(e, "PreClusterCommand", "process");
657                 exit(1);
658         }
659 }
660 /**************************************************************************************************/
661 int PreClusterCommand::readFASTA(){
662         try {
663                 //ifstream inNames;
664                 ifstream inFasta;
665                 
666                 m->openInputFile(fastafile, inFasta);
667                 set<int> lengths;
668                 
669                 while (!inFasta.eof()) {
670                         
671                         if (m->control_pressed) { inFasta.close(); return 0; }
672                                                 
673                         Sequence seq(inFasta);  m->gobble(inFasta);
674                         
675                         if (seq.getName() != "") {  //can get "" if commented line is at end of fasta file
676                                 if (namefile != "") {
677                                         itSize = sizes.find(seq.getName());
678                                         
679                                         if (itSize == sizes.end()) { m->mothurOut(seq.getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); exit(1); }
680                                         else{
681                                                 seqPNode tempNode(itSize->second, seq, names[seq.getName()]);
682                                                 alignSeqs.push_back(tempNode);
683                                                 lengths.insert(seq.getAligned().length());
684                                         }       
685                                 }else { //no names file, you are identical to yourself 
686                     int numRep = 1;
687                     if (countfile != "") { numRep = ct.getNumSeqs(seq.getName()); }
688                                         seqPNode tempNode(numRep, seq, seq.getName());
689                                         alignSeqs.push_back(tempNode);
690                                         lengths.insert(seq.getAligned().length());
691                                 }
692                         }
693                 }
694                 inFasta.close();
695         
696         if (lengths.size() > 1) { m->control_pressed = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
697         else if (lengths.size() == 1) { length = *(lengths.begin()); }
698         
699                 return alignSeqs.size();
700         }
701         
702         catch(exception& e) {
703                 m->errorOut(e, "PreClusterCommand", "readFASTA");
704                 exit(1);
705         }
706 }
707 /**************************************************************************************************/
708 int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>& thisSeqs, string group){
709         try {
710                 set<int> lengths;
711                 alignSeqs.clear();
712                 map<string, string>::iterator it;
713                 bool error = false;
714         map<string, int> thisCount;
715         if (countfile != "") { thisCount = cparser->getCountTable(group);  }
716                 
717                 for (int i = 0; i < thisSeqs.size(); i++) {
718                         
719                         if (m->control_pressed) { return 0; }
720                                                 
721                         if (namefile != "") {
722                                 it = thisName.find(thisSeqs[i].getName());
723                                 
724                                 //should never be true since parser checks for this
725                                 if (it == thisName.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); error = true; }
726                                 else{
727                                         //get number of reps
728                                         int numReps = 1;
729                                         for(int j=0;j<(it->second).length();j++){
730                                                 if((it->second)[j] == ','){     numReps++;      }
731                                         }
732                                         
733                                         seqPNode tempNode(numReps, thisSeqs[i], it->second);
734                                         alignSeqs.push_back(tempNode);
735                     lengths.insert(thisSeqs[i].getAligned().length());
736                                 }       
737                         }else { //no names file, you are identical to yourself 
738                 int numRep = 1;
739                 if (countfile != "") { 
740                     map<string, int>::iterator it2 = thisCount.find(thisSeqs[i].getName());
741                     
742                     //should never be true since parser checks for this
743                     if (it2 == thisCount.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); error = true; }
744                     else { numRep = it2->second;  }
745                 }
746                                 seqPNode tempNode(numRep, thisSeqs[i], thisSeqs[i].getName());
747                                 alignSeqs.push_back(tempNode);
748                                 lengths.insert(thisSeqs[i].getAligned().length());
749                         }
750                 }
751     
752         if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
753         else if (lengths.size() == 1) { length = *(lengths.begin()); }
754         
755                 //sanity check
756                 if (error) { m->control_pressed = true; }
757                 
758                 thisSeqs.clear();
759                 
760                 return alignSeqs.size();
761         }
762         
763         catch(exception& e) {
764                 m->errorOut(e, "PreClusterCommand", "loadSeqs");
765                 exit(1);
766         }
767 }
768                                 
769 /**************************************************************************************************/
770
771 int PreClusterCommand::calcMisMatches(string seq1, string seq2){
772         try {
773                 int numBad = 0;
774                 
775                 for (int i = 0; i < seq1.length(); i++) {
776                         //do they match
777                         if (seq1[i] != seq2[i]) { numBad++; }
778                         if (numBad > diffs) { return length;  } //to far to cluster
779                 }
780                 
781                 return numBad;
782         }
783         catch(exception& e) {
784                 m->errorOut(e, "PreClusterCommand", "calcMisMatches");
785                 exit(1);
786         }
787 }
788 /**************************************************************************************************/
789
790 int PreClusterCommand::mergeGroupCounts(string newcount, string newname, string newfasta){
791         try {
792                 ifstream inNames;
793         m->openInputFile(newname, inNames);
794         
795         string group, first, second;
796         set<string> uniqueNames;
797         while (!inNames.eof()) {
798             if (m->control_pressed) { break; }
799             inNames >> group; m->gobble(inNames);
800             inNames >> first; m->gobble(inNames);
801             inNames >> second; m->gobble(inNames);
802             
803             vector<string> names;
804             m->splitAtComma(second, names);
805             
806             uniqueNames.insert(first);
807             
808             int total = ct.getGroupCount(first, group);
809             for (int i = 1; i < names.size(); i++) {
810                 total += ct.getGroupCount(names[i], group);
811                 ct.setAbund(names[i], group, 0);
812             }
813             ct.setAbund(first, group, total);
814         }
815         inNames.close();
816         
817         vector<string> namesOfSeqs = ct.getNamesOfSeqs();
818         for (int i = 0; i < namesOfSeqs.size(); i++) {
819             if (ct.getNumSeqs(namesOfSeqs[i]) == 0) {
820                 ct.remove(namesOfSeqs[i]);
821             }
822         }
823         
824         ct.printTable(newcount); 
825         m->mothurRemove(newname);
826         
827         if (bygroup) { //if by group, must remove the duplicate seqs that are named the same
828             ifstream in;
829             m->openInputFile(newfasta, in);
830             
831             ofstream out;
832             m->openOutputFile(newfasta+"temp", out);
833             
834             int count = 0;
835             set<string> already;
836             while(!in.eof()) {
837                 if (m->control_pressed) { break; }
838                 
839                 Sequence seq(in); m->gobble(in);
840                 
841                 if (seq.getName() != "") {
842                     count++;
843                     if (already.count(seq.getName()) == 0) {
844                         seq.printSequence(out);
845                         already.insert(seq.getName());
846                     }
847                 }
848             }
849             in.close();
850             out.close();
851             m->mothurRemove(newfasta);
852             m->renameFile(newfasta+"temp", newfasta);
853         }
854                         return 0;
855                 
856         }
857         catch(exception& e) {
858                 m->errorOut(e, "PreClusterCommand", "mergeGroupCounts");
859                 exit(1);
860         }
861 }
862
863 /**************************************************************************************************/
864
865 void PreClusterCommand::printData(string newfasta, string newname, string group){
866         try {
867                 ofstream outFasta;
868                 ofstream outNames;
869                 
870                 if (bygroup) {
871                         m->openOutputFileAppend(newfasta, outFasta);
872                         m->openOutputFileAppend(newname, outNames);
873                 }else {
874                         m->openOutputFile(newfasta, outFasta);
875                         m->openOutputFile(newname, outNames);
876                 }
877                 
878         if ((countfile != "") && (group == ""))  { outNames << "Representative_Sequence\ttotal\n";  }
879                 for (int i = 0; i < alignSeqs.size(); i++) {
880                         if (alignSeqs[i].numIdentical != 0) {
881                                 alignSeqs[i].seq.printSequence(outFasta); 
882                                 if (countfile != "") {  
883                     if (group != "") {  outNames << group << '\t' << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; }
884                     else {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].numIdentical << endl;  }
885                 }else {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;  }
886                         }
887                 }
888                 
889                 outFasta.close();
890                 outNames.close();
891                 
892         }
893         catch(exception& e) {
894                 m->errorOut(e, "PreClusterCommand", "printData");
895                 exit(1);
896         }
897 }
898 /**************************************************************************************************/
899
900 void PreClusterCommand::readNameFile(){
901         try {
902                 ifstream in;
903                 m->openInputFile(namefile, in);
904                 string firstCol, secondCol;
905                                 
906                 while (!in.eof()) {
907                         in >> firstCol >> secondCol; m->gobble(in);
908             
909             m->checkName(firstCol);
910             m->checkName(secondCol);
911             int size = m->getNumNames(secondCol);
912             
913                         names[firstCol] = secondCol;
914             sizes[firstCol] = size;
915                 }
916                 in.close();
917         }
918         catch(exception& e) {
919                 m->errorOut(e, "PreClusterCommand", "readNameFile");
920                 exit(1);
921         }
922 }
923
924 /**************************************************************************************************/
925
926