]> git.donarmstrong.com Git - mothur.git/blob - preclustercommand.cpp
added count file to get.oturep, pre.cluster, screen.seqs, tree.shared. made remove...
[mothur.git] / preclustercommand.cpp
1 /*
2  *  preclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/21/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "preclustercommand.h"
11 #include "deconvolutecommand.h"
12
13 //**********************************************************************************************************************
14 vector<string> PreClusterCommand::setParameters(){      
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17                 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
18         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
19                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
20                 CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pdiffs);
21                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
22                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
23                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
24                 
25                 vector<string> myArray;
26                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
27                 return myArray;
28         }
29         catch(exception& e) {
30                 m->errorOut(e, "PreClusterCommand", "setParameters");
31                 exit(1);
32         }
33 }
34 //**********************************************************************************************************************
35 string PreClusterCommand::getHelpString(){      
36         try {
37                 string helpString = "";
38                 helpString += "The pre.cluster command groups sequences that are within a given number of base mismatches.\n";
39                 helpString += "The pre.cluster command outputs a new fasta and name file.\n";
40                 helpString += "The pre.cluster command parameters are fasta, name, group, count, processors and diffs. The fasta parameter is required. \n";
41                 helpString += "The name parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
42                 helpString += "The group parameter allows you to provide a group file so you can cluster by group. \n";
43         helpString += "The count parameter allows you to provide a count file so you can cluster by group. \n";
44                 helpString += "The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n";
45                 helpString += "The pre.cluster command should be in the following format: \n";
46                 helpString += "pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n";
47                 helpString += "Example pre.cluster(fasta=amazon.fasta, diffs=2).\n";
48                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
49                 return helpString;
50         }
51         catch(exception& e) {
52                 m->errorOut(e, "PreClusterCommand", "getHelpString");
53                 exit(1);
54         }
55 }
56 //**********************************************************************************************************************
57 string PreClusterCommand::getOutputFileNameTag(string type, string inputName=""){       
58         try {
59         string outputFileName = "";
60                 map<string, vector<string> >::iterator it;
61         
62         //is this a type this command creates
63         it = outputTypes.find(type);
64         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
65         else {
66             if (type == "fasta") {  outputFileName =  "precluster" + m->getExtension(inputName); }
67             else if (type == "name") {  outputFileName =  "precluster.names"; }
68             else if (type == "count") {  outputFileName =  "precluster.count_table"; }
69             else if (type == "map") {  outputFileName =  "precluster.map"; }
70             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
71         }
72         return outputFileName;
73         }
74         catch(exception& e) {
75                 m->errorOut(e, "PreClusterCommand", "getOutputFileNameTag");
76                 exit(1);
77         }
78 }
79 //**********************************************************************************************************************
80 PreClusterCommand::PreClusterCommand(){ 
81         try {
82                 abort = true; calledHelp = true; 
83                 setParameters();
84                 vector<string> tempOutNames;
85                 outputTypes["fasta"] = tempOutNames;
86                 outputTypes["name"] = tempOutNames;
87         outputTypes["count"] = tempOutNames;
88                 outputTypes["map"] = tempOutNames;
89         }
90         catch(exception& e) {
91                 m->errorOut(e, "PreClusterCommand", "PreClusterCommand");
92                 exit(1);
93         }
94 }
95 //**********************************************************************************************************************
96
97 PreClusterCommand::PreClusterCommand(string option) {
98         try {
99                 abort = false; calledHelp = false;   
100                 
101                 //allow user to run help
102                 if(option == "help") { help(); abort = true; calledHelp = true; }
103                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
104                 
105                 else {
106                         vector<string> myArray = setParameters();
107                         
108                         OptionParser parser(option);
109                         map<string, string> parameters = parser.getParameters();
110                         
111                         ValidParameters validParameter;
112                         map<string, string>::iterator it;
113                 
114                         //check to make sure all parameters are valid for command
115                         for (map<string, string>::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
116                                 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
117                         }
118                         
119                         //initialize outputTypes
120                         vector<string> tempOutNames;
121                         outputTypes["fasta"] = tempOutNames;
122                         outputTypes["name"] = tempOutNames;
123                         outputTypes["map"] = tempOutNames;
124             outputTypes["count"] = tempOutNames;
125                 
126                         //if the user changes the input directory command factory will send this info to us in the output parameter 
127                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
128                         if (inputDir == "not found"){   inputDir = "";          }
129                         else {
130                                 string path;
131                                 it = parameters.find("fasta");
132                                 //user has given a template file
133                                 if(it != parameters.end()){ 
134                                         path = m->hasPath(it->second);
135                                         //if the user has not given a path then, add inputdir. else leave path alone.
136                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
137                                 }
138                                 
139                                 it = parameters.find("name");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
145                                 }
146                                 
147                                 it = parameters.find("group");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
153                                 }
154                 
155                 it = parameters.find("count");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
161                                 }
162                         }
163
164                         //check for required parameters
165                         fastafile = validParameter.validFile(parameters, "fasta", true);
166                         if (fastafile == "not found") {                                 
167                                 fastafile = m->getFastaFile(); 
168                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
169                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
170                         }
171                         else if (fastafile == "not open") { abort = true; }     
172                         else { m->setFastaFile(fastafile); }
173                         
174                         //if the user changes the output directory command factory will send this info to us in the output parameter 
175                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
176                                 outputDir = ""; 
177                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
178                         }
179
180                         //check for optional parameter and set defaults
181                         // ...at some point should added some additional type checking...
182                         namefile = validParameter.validFile(parameters, "name", true);
183                         if (namefile == "not found") { namefile =  "";  }
184                         else if (namefile == "not open") { namefile = ""; abort = true; }       
185                         else {  m->setNameFile(namefile); }
186                         
187                         groupfile = validParameter.validFile(parameters, "group", true);
188                         if (groupfile == "not found") { groupfile =  "";  bygroup = false; }
189                         else if (groupfile == "not open") { abort = true; groupfile =  ""; }    
190                         else {   m->setGroupFile(groupfile); bygroup = true;  }
191             
192             countfile = validParameter.validFile(parameters, "count", true);
193                         if (countfile == "not found") { countfile =  "";   }
194                         else if (countfile == "not open") { abort = true; countfile =  ""; }    
195                         else {   
196                 m->setCountTableFile(countfile); 
197                 ct.readTable(countfile);
198                 if (ct.hasGroupInfo()) { bygroup = true; }
199                 else { bygroup = false;  }
200             }
201             
202             if ((namefile != "") && (countfile != "")) {
203                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
204             }
205             
206             if ((groupfile != "") && (countfile != "")) {
207                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
208             }
209
210                         
211                         string temp     = validParameter.validFile(parameters, "diffs", false);         if(temp == "not found"){        temp = "1"; }
212                         m->mothurConvert(temp, diffs); 
213                         
214                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
215                         m->setProcessors(temp);
216                         m->mothurConvert(temp, processors);
217                         
218             if (countfile == "") {
219                 if (namefile == "") {
220                     vector<string> files; files.push_back(fastafile);
221                     parser.getNameFile(files);
222                 }
223             }
224                 }
225                                 
226         }
227         catch(exception& e) {
228                 m->errorOut(e, "PreClusterCommand", "PreClusterCommand");
229                 exit(1);
230         }
231 }
232 //**********************************************************************************************************************
233
234 int PreClusterCommand::execute(){
235         try {
236                 
237                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
238                 
239                 int start = time(NULL);
240                 
241                 string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
242                 string newFastaFile = fileroot + getOutputFileNameTag("fasta", fastafile);
243                 string newNamesFile = fileroot + getOutputFileNameTag("name");
244         string newCountFile = fileroot + getOutputFileNameTag("count");
245                 string newMapFile = fileroot + getOutputFileNameTag("map"); //add group name if by group
246                 outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile);
247                 if (countfile == "") { outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile); }
248                 else { outputNames.push_back(newCountFile); outputTypes["count"].push_back(newCountFile); }
249                 
250                 if (bygroup) {
251                         //clear out old files
252                         ofstream outFasta; m->openOutputFile(newFastaFile, outFasta); outFasta.close();
253                         ofstream outNames; m->openOutputFile(newNamesFile, outNames);  outNames.close();
254                         newMapFile = fileroot + "precluster.";
255                         
256                         //parse fasta and name file by group
257             vector<string> groups;
258                         if (countfile != "") {
259                 cparser = new SequenceCountParser(countfile, fastafile);
260                 groups = cparser->getNamesOfGroups();
261             }else {
262                 if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile);      }
263                 else                            { parser = new SequenceParser(groupfile, fastafile);                    }
264                 groups = parser->getNamesOfGroups();
265                         }
266             
267                         if(processors == 1)     {       driverGroups(newFastaFile, newNamesFile, newMapFile, 0, groups.size(), groups); }
268                         else                            {       createProcessesGroups(newFastaFile, newNamesFile, newMapFile, groups);                  }
269                         
270                         if (countfile != "") { 
271                 mergeGroupCounts(newCountFile, newNamesFile, newFastaFile);
272                 delete cparser; 
273             }else {  
274                 delete parser; 
275                 //run unique.seqs for deconvolute results
276                 string inputString = "fasta=" + newFastaFile;
277                 if (namefile != "") { inputString += ", name=" + newNamesFile; }
278                 m->mothurOutEndLine(); 
279                 m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
280                 m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); 
281                 m->mothurCalling = true;
282                 
283                 Command* uniqueCommand = new DeconvoluteCommand(inputString);
284                 uniqueCommand->execute();
285                 
286                 map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
287                 
288                 delete uniqueCommand;
289                 m->mothurCalling = false;
290                 m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
291                 
292                 m->renameFile(filenames["fasta"][0], newFastaFile);
293                 m->renameFile(filenames["name"][0], newNamesFile); 
294                         }
295             if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {    m->mothurRemove(outputNames[i]);        }        return 0; }
296                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run pre.cluster."); m->mothurOutEndLine(); 
297                                 
298                 }else {
299                         if (namefile != "") { readNameFile(); }
300                 
301                         //reads fasta file and return number of seqs
302                         int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
303                 
304                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
305         
306                         if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0;  }
307                         if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0;  }
308                         
309                         int count = process(newMapFile);
310                         outputNames.push_back(newMapFile); outputTypes["map"].push_back(newMapFile);
311                         
312                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }   
313                         
314                         m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
315                         m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
316                         if (countfile != "") { newNamesFile = newCountFile; }
317             printData(newFastaFile, newNamesFile, "");
318                                 
319                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
320                 }
321                                 
322                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
323                 
324                 m->mothurOutEndLine();
325                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
326                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }               
327                 m->mothurOutEndLine();
328                 
329                 //set fasta file as new current fastafile
330                 string current = "";
331                 itTypes = outputTypes.find("fasta");
332                 if (itTypes != outputTypes.end()) {
333                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
334                 }
335                 
336                 itTypes = outputTypes.find("name");
337                 if (itTypes != outputTypes.end()) {
338                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
339                 }
340         
341         itTypes = outputTypes.find("count");
342                 if (itTypes != outputTypes.end()) {
343                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
344                 }
345                 
346                 return 0;
347                 
348         }
349         catch(exception& e) {
350                 m->errorOut(e, "PreClusterCommand", "execute");
351                 exit(1);
352         }
353 }
354 /**************************************************************************************************/
355 int PreClusterCommand::createProcessesGroups(string newFName, string newNName, string newMFile, vector<string> groups) {
356         try {
357                 
358                 vector<int> processIDS;
359                 int process = 1;
360                 int num = 0;
361                 
362                 //sanity check
363                 if (groups.size() < processors) { processors = groups.size(); }
364                 
365                 //divide the groups between the processors
366                 vector<linePair> lines;
367                 int numGroupsPerProcessor = groups.size() / processors;
368                 for (int i = 0; i < processors; i++) {
369                         int startIndex =  i * numGroupsPerProcessor;
370                         int endIndex = (i+1) * numGroupsPerProcessor;
371                         if(i == (processors - 1)){      endIndex = groups.size();       }
372                         lines.push_back(linePair(startIndex, endIndex));
373                 }
374                 
375 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
376                 
377                 //loop through and create all the processes you want
378                 while (process != processors) {
379                         int pid = fork();
380                         
381                         if (pid > 0) {
382                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
383                                 process++;
384                         }else if (pid == 0){
385                 outputNames.clear();
386                                 num = driverGroups(newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMFile, lines[process].start, lines[process].end, groups);
387                 
388                 string tempFile = toString(getpid()) + ".outputNames.temp";
389                 ofstream outTemp;
390                 m->openOutputFile(tempFile, outTemp);
391                 
392                 outTemp << outputNames.size();
393                 for (int i = 0; i < outputNames.size(); i++) { outTemp << outputNames[i] << endl; }
394                 outTemp.close();
395                 
396                                 exit(0);
397                         }else { 
398                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
399                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
400                                 exit(0);
401                         }
402                 }
403                 
404                 //do my part
405                 num = driverGroups(newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
406                 
407                 //force parent to wait until all the processes are done
408                 for (int i=0;i<processIDS.size();i++) { 
409                         int temp = processIDS[i];
410                         wait(&temp);
411                 }
412         
413         for (int i = 0; i < processIDS.size(); i++) {
414             string tempFile = toString(processIDS[i]) +  ".outputNames.temp";
415             ifstream intemp;
416             m->openInputFile(tempFile, intemp);
417             
418             int num;
419             intemp >> num;
420             for (int k = 0; k < num; k++) {
421                 string name = "";
422                 intemp >> name; m->gobble(intemp);
423                 
424                 outputNames.push_back(name); outputTypes["map"].push_back(name);
425             }
426             intemp.close();
427             m->mothurRemove(tempFile);
428         }
429 #else
430                 
431                 //////////////////////////////////////////////////////////////////////////////////////////////////////
432                 //Windows version shared memory, so be careful when passing variables through the preClusterData struct. 
433                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
434                 //////////////////////////////////////////////////////////////////////////////////////////////////////
435                 
436                 vector<preClusterData*> pDataArray; 
437                 DWORD   dwThreadIdArray[processors-1];
438                 HANDLE  hThreadArray[processors-1]; 
439                 
440                 //Create processor worker threads.
441                 for( int i=1; i<processors; i++ ){
442                         // Allocate memory for thread data.
443                         string extension = toString(i) + ".temp";
444                         
445                         preClusterData* tempPreCluster = new preClusterData(fastafile, namefile, groupfile, countfile, (newFName+extension), (newNName+extension), newMFile, groups, m, lines[i].start, lines[i].end, diffs, i);
446                         pDataArray.push_back(tempPreCluster);
447                         processIDS.push_back(i);
448                         
449                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
450                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
451                         hThreadArray[i-1] = CreateThread(NULL, 0, MyPreclusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);   
452                 }
453                 
454                                 
455                 //using the main process as a worker saves time and memory
456                 num = driverGroups(newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
457                 
458                 //Wait until all threads have terminated.
459                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
460                 
461                 //Close all thread handles and free memory allocations.
462                 for(int i=0; i < pDataArray.size(); i++){
463                         for (int j = 0; j < pDataArray[i]->mapFileNames.size(); j++) {
464                                 outputNames.push_back(pDataArray[i]->mapFileNames[j]); outputTypes["map"].push_back(pDataArray[i]->mapFileNames[j]); 
465                         }
466                         CloseHandle(hThreadArray[i]);
467                         delete pDataArray[i];
468                 }
469                 
470 #endif          
471                 
472                 //append output files
473                 for(int i=0;i<processIDS.size();i++){
474                         //newFName = m->getFullPathName(".\\" + newFName);
475                         //newNName = m->getFullPathName(".\\" + newNName);
476                         
477                         m->appendFiles((newFName + toString(processIDS[i]) + ".temp"), newFName);
478                         m->mothurRemove((newFName + toString(processIDS[i]) + ".temp"));
479                         
480                         m->appendFiles((newNName + toString(processIDS[i]) + ".temp"), newNName);
481                         m->mothurRemove((newNName + toString(processIDS[i]) + ".temp"));
482                 }
483                 
484                 return num;     
485                 
486         }
487         catch(exception& e) {
488                 m->errorOut(e, "PreClusterCommand", "createProcessesGroups");
489                 exit(1);
490         }
491 }
492 /**************************************************************************************************/
493 int PreClusterCommand::driverGroups(string newFFile, string newNFile, string newMFile, int start, int end, vector<string> groups){
494         try {
495                 
496                 int numSeqs = 0;
497                 
498                 //precluster each group
499                 for (int i = start; i < end; i++) {
500                         
501                         start = time(NULL);
502                         
503                         if (m->control_pressed) {  return 0; }
504                         
505                         m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine();
506                         
507                         map<string, string> thisNameMap;
508             vector<Sequence> thisSeqs;
509                         if (groupfile != "") { 
510                 thisSeqs = parser->getSeqs(groups[i]);
511             }else if (countfile != "") {
512                 thisSeqs = cparser->getSeqs(groups[i]);
513             }
514                         if (namefile != "") {  thisNameMap = parser->getNameMap(groups[i]); }
515             
516                         //fill alignSeqs with this groups info.
517                         numSeqs = loadSeqs(thisNameMap, thisSeqs, groups[i]);
518                         
519                         if (m->control_pressed) {   return 0; }
520                         
521                         if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); m->control_pressed = true; return 0;  }
522                         
523                         int count= process(newMFile+groups[i]+".map");
524                         outputNames.push_back(newMFile+groups[i]+".map"); outputTypes["map"].push_back(newMFile+groups[i]+".map");
525                         
526                         if (m->control_pressed) {  return 0; }
527                         
528                         m->mothurOut("Total number of sequences before pre.cluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
529                         m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
530                         printData(newFFile, newNFile, groups[i]);
531                         
532                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
533                         
534                 }
535                 
536                 return numSeqs;
537         }
538         catch(exception& e) {
539                 m->errorOut(e, "PreClusterCommand", "driverGroups");
540                 exit(1);
541         }
542 }
543 /**************************************************************************************************/
544 int PreClusterCommand::process(string newMapFile){
545         try {
546                 ofstream out;
547                 m->openOutputFile(newMapFile, out);
548                 
549                 //sort seqs by number of identical seqs
550                 sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
551                 
552                 int count = 0;
553                 int numSeqs = alignSeqs.size();
554                 
555                 //think about running through twice...
556                 for (int i = 0; i < numSeqs; i++) {
557                         
558                         //are you active
559                         //                      itActive = active.find(alignSeqs[i].seq.getName());
560                         
561                         if (alignSeqs[i].active) {  //this sequence has not been merged yet
562                                 
563                                 string chunk = alignSeqs[i].seq.getName() + "\t" + toString(alignSeqs[i].numIdentical) + "\t" + toString(0) + "\t" + alignSeqs[i].seq.getAligned() + "\n";
564                                 
565                                 //try to merge it with all smaller seqs
566                                 for (int j = i+1; j < numSeqs; j++) {
567                                         
568                                         if (m->control_pressed) { out.close(); return 0; }
569                                         
570                                         if (alignSeqs[j].active) {  //this sequence has not been merged yet
571                                                 //are you within "diff" bases
572                                                 int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
573                                                 
574                                                 if (mismatch <= diffs) {
575                                                         //merge
576                                                         alignSeqs[i].names += ',' + alignSeqs[j].names;
577                                                         alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
578                                                         
579                                                         chunk += alignSeqs[j].seq.getName() + "\t" + toString(alignSeqs[j].numIdentical) + "\t" + toString(mismatch) + "\t" + alignSeqs[j].seq.getAligned() + "\n";
580                                                         
581                                                         alignSeqs[j].active = 0;
582                                                         alignSeqs[j].numIdentical = 0;
583                                                         count++;
584                                                 }
585                                         }//end if j active
586                                 }//end for loop j
587                                 
588                                 //remove from active list 
589                                 alignSeqs[i].active = 0;
590                                 
591                                 out << "ideal_seq_" << (i+1) << '\t' << alignSeqs[i].numIdentical << endl << chunk << endl;;
592                                 
593                         }//end if active i
594                         if(i % 100 == 0)        { m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); }
595                 }
596                 out.close();
597                 
598                 if(numSeqs % 100 != 0)  { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine();   }       
599                 
600                 return count;
601                 
602         }
603         catch(exception& e) {
604                 m->errorOut(e, "PreClusterCommand", "process");
605                 exit(1);
606         }
607 }
608 /**************************************************************************************************/
609 int PreClusterCommand::readFASTA(){
610         try {
611                 //ifstream inNames;
612                 ifstream inFasta;
613                 
614                 m->openInputFile(fastafile, inFasta);
615                 set<int> lengths;
616                 
617                 while (!inFasta.eof()) {
618                         
619                         if (m->control_pressed) { inFasta.close(); return 0; }
620                                                 
621                         Sequence seq(inFasta);  m->gobble(inFasta);
622                         
623                         if (seq.getName() != "") {  //can get "" if commented line is at end of fasta file
624                                 if (namefile != "") {
625                                         itSize = sizes.find(seq.getName());
626                                         
627                                         if (itSize == sizes.end()) { m->mothurOut(seq.getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); exit(1); }
628                                         else{
629                                                 seqPNode tempNode(itSize->second, seq, names[seq.getName()]);
630                                                 alignSeqs.push_back(tempNode);
631                                                 lengths.insert(seq.getAligned().length());
632                                         }       
633                                 }else { //no names file, you are identical to yourself 
634                     int numRep = 1;
635                     if (countfile != "") { numRep = ct.getNumSeqs(seq.getName()); }
636                                         seqPNode tempNode(numRep, seq, seq.getName());
637                                         alignSeqs.push_back(tempNode);
638                                         lengths.insert(seq.getAligned().length());
639                                 }
640                         }
641                 }
642                 inFasta.close();
643         
644         if (lengths.size() > 1) { m->control_pressed = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
645         else if (lengths.size() == 1) { length = *(lengths.begin()); }
646         
647                 return alignSeqs.size();
648         }
649         
650         catch(exception& e) {
651                 m->errorOut(e, "PreClusterCommand", "readFASTA");
652                 exit(1);
653         }
654 }
655 /**************************************************************************************************/
656 int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>& thisSeqs, string group){
657         try {
658                 set<int> lengths;
659                 alignSeqs.clear();
660                 map<string, string>::iterator it;
661                 bool error = false;
662         map<string, int> thisCount;
663         if (countfile != "") { thisCount = cparser->getCountTable(group);  }
664                 
665                 for (int i = 0; i < thisSeqs.size(); i++) {
666                         
667                         if (m->control_pressed) { return 0; }
668                                                 
669                         if (namefile != "") {
670                                 it = thisName.find(thisSeqs[i].getName());
671                                 
672                                 //should never be true since parser checks for this
673                                 if (it == thisName.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); error = true; }
674                                 else{
675                                         //get number of reps
676                                         int numReps = 1;
677                                         for(int j=0;j<(it->second).length();j++){
678                                                 if((it->second)[j] == ','){     numReps++;      }
679                                         }
680                                         
681                                         seqPNode tempNode(numReps, thisSeqs[i], it->second);
682                                         alignSeqs.push_back(tempNode);
683                     lengths.insert(thisSeqs[i].getAligned().length());
684                                 }       
685                         }else { //no names file, you are identical to yourself 
686                 int numRep = 1;
687                 if (countfile != "") { 
688                     map<string, int>::iterator it2 = thisCount.find(thisSeqs[i].getName());
689                     
690                     //should never be true since parser checks for this
691                     if (it2 == thisCount.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your count file, please correct."); m->mothurOutEndLine(); error = true; }
692                     else { numRep = it2->second;  }
693                 }
694                                 seqPNode tempNode(numRep, thisSeqs[i], thisSeqs[i].getName());
695                                 alignSeqs.push_back(tempNode);
696                                 lengths.insert(thisSeqs[i].getAligned().length());
697                         }
698                 }
699     
700         if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
701         else if (lengths.size() == 1) { length = *(lengths.begin()); }
702         
703                 //sanity check
704                 if (error) { m->control_pressed = true; }
705                 
706                 thisSeqs.clear();
707                 
708                 return alignSeqs.size();
709         }
710         
711         catch(exception& e) {
712                 m->errorOut(e, "PreClusterCommand", "loadSeqs");
713                 exit(1);
714         }
715 }
716                                 
717 /**************************************************************************************************/
718
719 int PreClusterCommand::calcMisMatches(string seq1, string seq2){
720         try {
721                 int numBad = 0;
722                 
723                 for (int i = 0; i < seq1.length(); i++) {
724                         //do they match
725                         if (seq1[i] != seq2[i]) { numBad++; }
726                         if (numBad > diffs) { return length;  } //to far to cluster
727                 }
728                 
729                 return numBad;
730         }
731         catch(exception& e) {
732                 m->errorOut(e, "PreClusterCommand", "calcMisMatches");
733                 exit(1);
734         }
735 }
736 /**************************************************************************************************/
737
738 int PreClusterCommand::mergeGroupCounts(string newcount, string newname, string newfasta){
739         try {
740                 ifstream inNames;
741         m->openInputFile(newname, inNames);
742         
743         string group, first, second;
744         set<string> uniqueNames;
745         while (!inNames.eof()) {
746             if (m->control_pressed) { break; }
747             inNames >> group; m->gobble(inNames);
748             inNames >> first; m->gobble(inNames);
749             inNames >> second; m->gobble(inNames);
750             
751             vector<string> names;
752             m->splitAtComma(second, names);
753             
754             uniqueNames.insert(first);
755             
756             int total = ct.getGroupCount(first, group);
757             for (int i = 1; i < names.size(); i++) {
758                 total += ct.getGroupCount(names[i], group);
759                 ct.setAbund(names[i], group, 0);
760             }
761             ct.setAbund(first, group, total);
762         }
763         inNames.close();
764         
765         vector<string> namesOfSeqs = ct.getNamesOfSeqs();
766         for (int i = 0; i < namesOfSeqs.size(); i++) {
767             if (ct.getNumSeqs(namesOfSeqs[i]) == 0) {
768                 ct.remove(namesOfSeqs[i]);
769             }
770         }
771         
772         ct.printTable(newcount); 
773         m->mothurRemove(newname);
774         
775         if (bygroup) { //if by group, must remove the duplicate seqs that are named the same
776             ifstream in;
777             m->openInputFile(newfasta, in);
778             
779             ofstream out;
780             m->openOutputFile(newfasta+"temp", out);
781             
782             int count = 0;
783             set<string> already;
784             while(!in.eof()) {
785                 if (m->control_pressed) { break; }
786                 
787                 Sequence seq(in); m->gobble(in);
788                 
789                 if (seq.getName() != "") {
790                     count++;
791                     if (already.count(seq.getName()) == 0) {
792                         seq.printSequence(out);
793                         already.insert(seq.getName());
794                     }
795                 }
796             }
797             in.close();
798             out.close();
799             m->mothurRemove(newfasta);
800             m->renameFile(newfasta+"temp", newfasta);
801         }
802                         return 0;
803                 
804         }
805         catch(exception& e) {
806                 m->errorOut(e, "PreClusterCommand", "mergeGroupCounts");
807                 exit(1);
808         }
809 }
810
811 /**************************************************************************************************/
812
813 void PreClusterCommand::printData(string newfasta, string newname, string group){
814         try {
815                 ofstream outFasta;
816                 ofstream outNames;
817                 
818                 if (bygroup) {
819                         m->openOutputFileAppend(newfasta, outFasta);
820                         m->openOutputFileAppend(newname, outNames);
821                 }else {
822                         m->openOutputFile(newfasta, outFasta);
823                         m->openOutputFile(newname, outNames);
824                 }
825                 
826         if ((countfile != "") && (group == ""))  { outNames << "Representative_Sequence\ttotal\n";  }
827                 for (int i = 0; i < alignSeqs.size(); i++) {
828                         if (alignSeqs[i].numIdentical != 0) {
829                                 alignSeqs[i].seq.printSequence(outFasta); 
830                                 if (countfile != "") {  
831                     if (group != "") {  outNames << group << '\t' << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl; }
832                     else {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].numIdentical << endl;  }
833                 }else {  outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;  }
834                         }
835                 }
836                 
837                 outFasta.close();
838                 outNames.close();
839                 
840         }
841         catch(exception& e) {
842                 m->errorOut(e, "PreClusterCommand", "printData");
843                 exit(1);
844         }
845 }
846 /**************************************************************************************************/
847
848 void PreClusterCommand::readNameFile(){
849         try {
850                 ifstream in;
851                 m->openInputFile(namefile, in);
852                 string firstCol, secondCol;
853                                 
854                 while (!in.eof()) {
855                         in >> firstCol >> secondCol; m->gobble(in);
856                         names[firstCol] = secondCol;
857                         int size = 1;
858
859                         for(int i=0;i<secondCol.size();i++){
860                                 if(secondCol[i] == ','){        size++; }
861                         }
862                         sizes[firstCol] = size;
863                 }
864                 in.close();
865         }
866         catch(exception& e) {
867                 m->errorOut(e, "PreClusterCommand", "readNameFile");
868                 exit(1);
869         }
870 }
871
872 /**************************************************************************************************/
873
874