]> git.donarmstrong.com Git - mothur.git/blob - preclustercommand.cpp
added load.logfile command. changed summary.single output for subsample=t.
[mothur.git] / preclustercommand.cpp
1 /*
2  *  preclustercommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 12/21/09.
6  *  Copyright 2009 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "preclustercommand.h"
11 #include "deconvolutecommand.h"
12
13 //**********************************************************************************************************************
14 vector<string> PreClusterCommand::setParameters(){      
15         try {
16                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
17                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
18                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
19                 CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(pdiffs);
20                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
21                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
22                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
23                 
24                 vector<string> myArray;
25                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
26                 return myArray;
27         }
28         catch(exception& e) {
29                 m->errorOut(e, "PreClusterCommand", "setParameters");
30                 exit(1);
31         }
32 }
33 //**********************************************************************************************************************
34 string PreClusterCommand::getHelpString(){      
35         try {
36                 string helpString = "";
37                 helpString += "The pre.cluster command groups sequences that are within a given number of base mismatches.\n";
38                 helpString += "The pre.cluster command outputs a new fasta and name file.\n";
39                 helpString += "The pre.cluster command parameters are fasta, names and diffs. The fasta parameter is required. \n";
40                 helpString += "The names parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
41                 helpString += "The group parameter allows you to provide a group file so you can cluster by group. \n";
42                 helpString += "The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n";
43                 helpString += "The pre.cluster command should be in the following format: \n";
44                 helpString += "pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n";
45                 helpString += "Example pre.cluster(fasta=amazon.fasta, diffs=2).\n";
46                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
47                 return helpString;
48         }
49         catch(exception& e) {
50                 m->errorOut(e, "PreClusterCommand", "getHelpString");
51                 exit(1);
52         }
53 }
54 //**********************************************************************************************************************
55 string PreClusterCommand::getOutputFileNameTag(string type, string inputName=""){       
56         try {
57         string outputFileName = "";
58                 map<string, vector<string> >::iterator it;
59         
60         //is this a type this command creates
61         it = outputTypes.find(type);
62         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
63         else {
64             if (type == "fasta") {  outputFileName =  "precluster" + m->getExtension(inputName); }
65             else if (type == "name") {  outputFileName =  "precluster.names"; }
66             else if (type == "map") {  outputFileName =  "precluster.map"; }
67             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
68         }
69         return outputFileName;
70         }
71         catch(exception& e) {
72                 m->errorOut(e, "PreClusterCommand", "getOutputFileNameTag");
73                 exit(1);
74         }
75 }
76 //**********************************************************************************************************************
77 PreClusterCommand::PreClusterCommand(){ 
78         try {
79                 abort = true; calledHelp = true; 
80                 setParameters();
81                 vector<string> tempOutNames;
82                 outputTypes["fasta"] = tempOutNames;
83                 outputTypes["name"] = tempOutNames;
84                 outputTypes["map"] = tempOutNames;
85         }
86         catch(exception& e) {
87                 m->errorOut(e, "PreClusterCommand", "PreClusterCommand");
88                 exit(1);
89         }
90 }
91 //**********************************************************************************************************************
92
93 PreClusterCommand::PreClusterCommand(string option) {
94         try {
95                 abort = false; calledHelp = false;   
96                 
97                 //allow user to run help
98                 if(option == "help") { help(); abort = true; calledHelp = true; }
99                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
100                 
101                 else {
102                         vector<string> myArray = setParameters();
103                         
104                         OptionParser parser(option);
105                         map<string, string> parameters = parser.getParameters();
106                         
107                         ValidParameters validParameter;
108                         map<string, string>::iterator it;
109                 
110                         //check to make sure all parameters are valid for command
111                         for (map<string, string>::iterator it2 = parameters.begin(); it2 != parameters.end(); it2++) { 
112                                 if (validParameter.isValidParameter(it2->first, myArray, it2->second) != true) {  abort = true;  }
113                         }
114                         
115                         //initialize outputTypes
116                         vector<string> tempOutNames;
117                         outputTypes["fasta"] = tempOutNames;
118                         outputTypes["name"] = tempOutNames;
119                         outputTypes["map"] = tempOutNames;
120                 
121                         //if the user changes the input directory command factory will send this info to us in the output parameter 
122                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
123                         if (inputDir == "not found"){   inputDir = "";          }
124                         else {
125                                 string path;
126                                 it = parameters.find("fasta");
127                                 //user has given a template file
128                                 if(it != parameters.end()){ 
129                                         path = m->hasPath(it->second);
130                                         //if the user has not given a path then, add inputdir. else leave path alone.
131                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
132                                 }
133                                 
134                                 it = parameters.find("name");
135                                 //user has given a template file
136                                 if(it != parameters.end()){ 
137                                         path = m->hasPath(it->second);
138                                         //if the user has not given a path then, add inputdir. else leave path alone.
139                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
140                                 }
141                                 
142                                 it = parameters.find("group");
143                                 //user has given a template file
144                                 if(it != parameters.end()){ 
145                                         path = m->hasPath(it->second);
146                                         //if the user has not given a path then, add inputdir. else leave path alone.
147                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
148                                 }
149                         }
150
151                         //check for required parameters
152                         fastafile = validParameter.validFile(parameters, "fasta", true);
153                         if (fastafile == "not found") {                                 
154                                 fastafile = m->getFastaFile(); 
155                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
156                                 else {  m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
157                         }
158                         else if (fastafile == "not open") { abort = true; }     
159                         else { m->setFastaFile(fastafile); }
160                         
161                         //if the user changes the output directory command factory will send this info to us in the output parameter 
162                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
163                                 outputDir = ""; 
164                                 outputDir += m->hasPath(fastafile); //if user entered a file with a path then preserve it       
165                         }
166
167                         //check for optional parameter and set defaults
168                         // ...at some point should added some additional type checking...
169                         namefile = validParameter.validFile(parameters, "name", true);
170                         if (namefile == "not found") { namefile =  "";  }
171                         else if (namefile == "not open") { namefile = ""; abort = true; }       
172                         else {  m->setNameFile(namefile); }
173                         
174                         groupfile = validParameter.validFile(parameters, "group", true);
175                         if (groupfile == "not found") { groupfile =  "";  bygroup = false; }
176                         else if (groupfile == "not open") { abort = true; groupfile =  ""; }    
177                         else {   m->setGroupFile(groupfile); bygroup = true;  }
178                         
179                         string temp     = validParameter.validFile(parameters, "diffs", false);         if(temp == "not found"){        temp = "1"; }
180                         m->mothurConvert(temp, diffs); 
181                         
182                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
183                         m->setProcessors(temp);
184                         m->mothurConvert(temp, processors);
185                         
186                         if (namefile == "") {
187                                 vector<string> files; files.push_back(fastafile);
188                                 parser.getNameFile(files);
189                         }
190                 }
191                                 
192         }
193         catch(exception& e) {
194                 m->errorOut(e, "PreClusterCommand", "PreClusterCommand");
195                 exit(1);
196         }
197 }
198 //**********************************************************************************************************************
199
200 int PreClusterCommand::execute(){
201         try {
202                 
203                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
204                 
205                 int start = time(NULL);
206                 
207                 string fileroot = outputDir + m->getRootName(m->getSimpleName(fastafile));
208                 string newFastaFile = fileroot + getOutputFileNameTag("fasta", fastafile);
209                 string newNamesFile = fileroot + getOutputFileNameTag("name");
210                 string newMapFile = fileroot + getOutputFileNameTag("map"); //add group name if by group
211                 outputNames.push_back(newFastaFile); outputTypes["fasta"].push_back(newFastaFile);
212                 outputNames.push_back(newNamesFile); outputTypes["name"].push_back(newNamesFile);
213                 
214                 
215                 if (bygroup) {
216                         //clear out old files
217                         ofstream outFasta; m->openOutputFile(newFastaFile, outFasta); outFasta.close();
218                         ofstream outNames; m->openOutputFile(newNamesFile, outNames);  outNames.close();
219                         newMapFile = fileroot + "precluster.";
220                         
221                         //parse fasta and name file by group
222                         SequenceParser* parser;
223                         if (namefile != "") { parser = new SequenceParser(groupfile, fastafile, namefile);      }
224                         else                            { parser = new SequenceParser(groupfile, fastafile);                    }
225                         
226                         vector<string> groups = parser->getNamesOfGroups();
227                         
228                         if(processors == 1)     {       driverGroups(parser, newFastaFile, newNamesFile, newMapFile, 0, groups.size(), groups); }
229                         else                            {       createProcessesGroups(parser, newFastaFile, newNamesFile, newMapFile, groups);                  }
230                         
231                         delete parser;
232                         
233                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        }        return 0; }
234                         
235                         //run unique.seqs for deconvolute results
236                         string inputString = "fasta=" + newFastaFile;
237                         if (namefile != "") { inputString += ", name=" + newNamesFile; }
238                         m->mothurOutEndLine(); 
239                         m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
240                         m->mothurOut("Running command: unique.seqs(" + inputString + ")"); m->mothurOutEndLine(); 
241                         m->mothurCalling = true;
242             
243                         Command* uniqueCommand = new DeconvoluteCommand(inputString);
244                         uniqueCommand->execute();
245                         
246                         map<string, vector<string> > filenames = uniqueCommand->getOutputFiles();
247                         
248                         delete uniqueCommand;
249                         m->mothurCalling = false;
250                         m->mothurOut("/******************************************/"); m->mothurOutEndLine(); 
251                         
252                         m->renameFile(filenames["fasta"][0], newFastaFile);
253             m->renameFile(filenames["name"][0], newNamesFile);
254                         
255                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run pre.cluster."); m->mothurOutEndLine(); 
256                                 
257                 }else {
258                         if (namefile != "") { readNameFile(); }
259                 
260                         //reads fasta file and return number of seqs
261                         int numSeqs = readFASTA(); //fills alignSeqs and makes all seqs active
262                 
263                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
264         
265                         if (numSeqs == 0) { m->mothurOut("Error reading fasta file...please correct."); m->mothurOutEndLine(); return 0;  }
266                         if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); return 0;  }
267                         
268                         int count = process(newMapFile);
269                         outputNames.push_back(newMapFile); outputTypes["map"].push_back(newMapFile);
270                         
271                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }   
272                         
273                         m->mothurOut("Total number of sequences before precluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
274                         m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
275                         printData(newFastaFile, newNamesFile);
276                         
277                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
278                 }
279                                 
280                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);        } return 0; }
281                 
282                 m->mothurOutEndLine();
283                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
284                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }               
285                 m->mothurOutEndLine();
286                 
287                 //set fasta file as new current fastafile
288                 string current = "";
289                 itTypes = outputTypes.find("fasta");
290                 if (itTypes != outputTypes.end()) {
291                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
292                 }
293                 
294                 itTypes = outputTypes.find("name");
295                 if (itTypes != outputTypes.end()) {
296                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
297                 }
298                 
299                 return 0;
300                 
301         }
302         catch(exception& e) {
303                 m->errorOut(e, "PreClusterCommand", "execute");
304                 exit(1);
305         }
306 }
307 /**************************************************************************************************/
308 int PreClusterCommand::createProcessesGroups(SequenceParser* parser, string newFName, string newNName, string newMFile, vector<string> groups) {
309         try {
310                 
311                 vector<int> processIDS;
312                 int process = 1;
313                 int num = 0;
314                 
315                 //sanity check
316                 if (groups.size() < processors) { processors = groups.size(); }
317                 
318                 //divide the groups between the processors
319                 vector<linePair> lines;
320                 int numGroupsPerProcessor = groups.size() / processors;
321                 for (int i = 0; i < processors; i++) {
322                         int startIndex =  i * numGroupsPerProcessor;
323                         int endIndex = (i+1) * numGroupsPerProcessor;
324                         if(i == (processors - 1)){      endIndex = groups.size();       }
325                         lines.push_back(linePair(startIndex, endIndex));
326                 }
327                 
328 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
329                 
330                 //loop through and create all the processes you want
331                 while (process != processors) {
332                         int pid = fork();
333                         
334                         if (pid > 0) {
335                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
336                                 process++;
337                         }else if (pid == 0){
338                 outputNames.clear();
339                                 num = driverGroups(parser, newFName + toString(getpid()) + ".temp", newNName + toString(getpid()) + ".temp", newMFile, lines[process].start, lines[process].end, groups);
340                 
341                 string tempFile = toString(getpid()) + ".outputNames.temp";
342                 ofstream outTemp;
343                 m->openOutputFile(tempFile, outTemp);
344                 
345                 outTemp << outputNames.size();
346                 for (int i = 0; i < outputNames.size(); i++) { outTemp << outputNames[i] << endl; }
347                 outTemp.close();
348                 
349                                 exit(0);
350                         }else { 
351                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
352                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
353                                 exit(0);
354                         }
355                 }
356                 
357                 //do my part
358                 num = driverGroups(parser, newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
359                 
360                 //force parent to wait until all the processes are done
361                 for (int i=0;i<processIDS.size();i++) { 
362                         int temp = processIDS[i];
363                         wait(&temp);
364                 }
365         
366         for (int i = 0; i < processIDS.size(); i++) {
367             string tempFile = toString(processIDS[i]) +  ".outputNames.temp";
368             ifstream intemp;
369             m->openInputFile(tempFile, intemp);
370             
371             int num;
372             intemp >> num;
373             for (int k = 0; k < num; k++) {
374                 string name = "";
375                 intemp >> name; m->gobble(intemp);
376                 
377                 outputNames.push_back(name); outputTypes["map"].push_back(name);
378             }
379             intemp.close();
380             m->mothurRemove(tempFile);
381         }
382 #else
383                 
384                 //////////////////////////////////////////////////////////////////////////////////////////////////////
385                 //Windows version shared memory, so be careful when passing variables through the preClusterData struct. 
386                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
387                 //////////////////////////////////////////////////////////////////////////////////////////////////////
388                 
389                 vector<preClusterData*> pDataArray; 
390                 DWORD   dwThreadIdArray[processors-1];
391                 HANDLE  hThreadArray[processors-1]; 
392                 
393                 //Create processor worker threads.
394                 for( int i=1; i<processors; i++ ){
395                         // Allocate memory for thread data.
396                         string extension = toString(i) + ".temp";
397                         
398                         preClusterData* tempPreCluster = new preClusterData(fastafile, namefile, groupfile, (newFName+extension), (newNName+extension), newMFile, groups, m, lines[i].start, lines[i].end, diffs, i);
399                         pDataArray.push_back(tempPreCluster);
400                         processIDS.push_back(i);
401                         
402                         //MySeqSumThreadFunction is in header. It must be global or static to work with the threads.
403                         //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
404                         hThreadArray[i-1] = CreateThread(NULL, 0, MyPreclusterThreadFunction, pDataArray[i-1], 0, &dwThreadIdArray[i-1]);   
405                 }
406                 
407                                 
408                 //using the main process as a worker saves time and memory
409                 num = driverGroups(parser, newFName, newNName, newMFile, lines[0].start, lines[0].end, groups);
410                 
411                 //Wait until all threads have terminated.
412                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
413                 
414                 //Close all thread handles and free memory allocations.
415                 for(int i=0; i < pDataArray.size(); i++){
416                         for (int j = 0; j < pDataArray[i]->mapFileNames.size(); j++) {
417                                 outputNames.push_back(pDataArray[i]->mapFileNames[j]); outputTypes["map"].push_back(pDataArray[i]->mapFileNames[j]); 
418                         }
419                         CloseHandle(hThreadArray[i]);
420                         delete pDataArray[i];
421                 }
422                 
423 #endif          
424                 
425                 //append output files
426                 for(int i=0;i<processIDS.size();i++){
427                         //newFName = m->getFullPathName(".\\" + newFName);
428                         //newNName = m->getFullPathName(".\\" + newNName);
429                         
430                         m->appendFiles((newFName + toString(processIDS[i]) + ".temp"), newFName);
431                         m->mothurRemove((newFName + toString(processIDS[i]) + ".temp"));
432                         
433                         m->appendFiles((newNName + toString(processIDS[i]) + ".temp"), newNName);
434                         m->mothurRemove((newNName + toString(processIDS[i]) + ".temp"));
435                 }
436                 
437                 return num;     
438                 
439         }
440         catch(exception& e) {
441                 m->errorOut(e, "PreClusterCommand", "createProcessesGroups");
442                 exit(1);
443         }
444 }
445 /**************************************************************************************************/
446 int PreClusterCommand::driverGroups(SequenceParser* parser, string newFFile, string newNFile, string newMFile, int start, int end, vector<string> groups){
447         try {
448                 
449                 int numSeqs = 0;
450                 
451                 //precluster each group
452                 for (int i = start; i < end; i++) {
453                         
454                         start = time(NULL);
455                         
456                         if (m->control_pressed) {  return 0; }
457                         
458                         m->mothurOutEndLine(); m->mothurOut("Processing group " + groups[i] + ":"); m->mothurOutEndLine();
459                         
460                         map<string, string> thisNameMap;
461                         if (namefile != "") { thisNameMap = parser->getNameMap(groups[i]); }
462                         vector<Sequence> thisSeqs = parser->getSeqs(groups[i]);
463                         
464                         //fill alignSeqs with this groups info.
465                         numSeqs = loadSeqs(thisNameMap, thisSeqs);
466                         
467                         if (m->control_pressed) {   return 0; }
468                         
469                         if (diffs > length) { m->mothurOut("Error: diffs is greater than your sequence length."); m->mothurOutEndLine(); m->control_pressed = true; return 0;  }
470                         
471                         int count = process(newMFile+groups[i]+".map");
472                         outputNames.push_back(newMFile+groups[i]+".map"); outputTypes["map"].push_back(newMFile+groups[i]+".map");
473                         
474                         if (m->control_pressed) {  return 0; }
475                         
476                         m->mothurOut("Total number of sequences before pre.cluster was " + toString(alignSeqs.size()) + "."); m->mothurOutEndLine();
477                         m->mothurOut("pre.cluster removed " + toString(count) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); 
478                         printData(newFFile, newNFile);
479                         
480                         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to cluster " + toString(numSeqs) + " sequences."); m->mothurOutEndLine(); 
481                         
482                 }
483                 
484                 return numSeqs;
485         }
486         catch(exception& e) {
487                 m->errorOut(e, "PreClusterCommand", "driverGroups");
488                 exit(1);
489         }
490 }
491 /**************************************************************************************************/
492 int PreClusterCommand::process(string newMapFile){
493         try {
494                 ofstream out;
495                 m->openOutputFile(newMapFile, out);
496                 
497                 //sort seqs by number of identical seqs
498                 sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
499                 
500                 int count = 0;
501                 int numSeqs = alignSeqs.size();
502                 
503                 //think about running through twice...
504                 for (int i = 0; i < numSeqs; i++) {
505                         
506                         //are you active
507                         //                      itActive = active.find(alignSeqs[i].seq.getName());
508                         
509                         if (alignSeqs[i].active) {  //this sequence has not been merged yet
510                                 
511                                 string chunk = alignSeqs[i].seq.getName() + "\t" + toString(alignSeqs[i].numIdentical) + "\t" + toString(0) + "\t" + alignSeqs[i].seq.getAligned() + "\n";
512                                 
513                                 //try to merge it with all smaller seqs
514                                 for (int j = i+1; j < numSeqs; j++) {
515                                         
516                                         if (m->control_pressed) { out.close(); return 0; }
517                                         
518                                         if (alignSeqs[j].active) {  //this sequence has not been merged yet
519                                                 //are you within "diff" bases
520                                                 int mismatch = calcMisMatches(alignSeqs[i].seq.getAligned(), alignSeqs[j].seq.getAligned());
521                                                 
522                                                 if (mismatch <= diffs) {
523                                                         //merge
524                                                         alignSeqs[i].names += ',' + alignSeqs[j].names;
525                                                         alignSeqs[i].numIdentical += alignSeqs[j].numIdentical;
526                                                         
527                                                         chunk += alignSeqs[j].seq.getName() + "\t" + toString(alignSeqs[j].numIdentical) + "\t" + toString(mismatch) + "\t" + alignSeqs[j].seq.getAligned() + "\n";
528                                                         
529                                                         alignSeqs[j].active = 0;
530                                                         alignSeqs[j].numIdentical = 0;
531                                                         count++;
532                                                 }
533                                         }//end if j active
534                                 }//end for loop j
535                                 
536                                 //remove from active list 
537                                 alignSeqs[i].active = 0;
538                                 
539                                 out << "ideal_seq_" << (i+1) << '\t' << alignSeqs[i].numIdentical << endl << chunk << endl;;
540                                 
541                         }//end if active i
542                         if(i % 100 == 0)        { m->mothurOut(toString(i) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine(); }
543                 }
544                 out.close();
545                 
546                 if(numSeqs % 100 != 0)  { m->mothurOut(toString(numSeqs) + "\t" + toString(numSeqs - count) + "\t" + toString(count)); m->mothurOutEndLine();   }       
547                 
548                 return count;
549                 
550         }
551         catch(exception& e) {
552                 m->errorOut(e, "PreClusterCommand", "process");
553                 exit(1);
554         }
555 }
556 /**************************************************************************************************/
557 int PreClusterCommand::readFASTA(){
558         try {
559                 //ifstream inNames;
560                 ifstream inFasta;
561                 
562                 //m->openInputFile(namefile, inNames);
563                 m->openInputFile(fastafile, inFasta);
564                 
565                 //string firstCol, secondCol, nameString;
566                 set<int> lengths;
567                 
568                 while (!inFasta.eof()) {
569                         
570                         if (m->control_pressed) { inFasta.close(); return 0; }
571                         
572                         //inNames >> firstCol >> secondCol;
573                         //nameString = secondCol;
574                         
575                         //m->gobble(inNames);
576                         //int size = 1;
577                         //while (secondCol.find_first_of(',') != -1) { 
578                         //      size++;
579                         //      secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length());
580                         //}
581                         
582                         Sequence seq(inFasta);  m->gobble(inFasta);
583                         
584                         if (seq.getName() != "") {  //can get "" if commented line is at end of fasta file
585                                 if (namefile != "") {
586                                         itSize = sizes.find(seq.getName());
587                                         
588                                         if (itSize == sizes.end()) { m->mothurOut(seq.getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); exit(1); }
589                                         else{
590                                                 seqPNode tempNode(itSize->second, seq, names[seq.getName()]);
591                                                 alignSeqs.push_back(tempNode);
592                                                 lengths.insert(seq.getAligned().length());
593                                         }       
594                                 }else { //no names file, you are identical to yourself 
595                                         seqPNode tempNode(1, seq, seq.getName());
596                                         alignSeqs.push_back(tempNode);
597                                         lengths.insert(seq.getAligned().length());
598                                 }
599                         }
600                 }
601                 inFasta.close();
602                 //inNames.close();
603         
604         if (lengths.size() > 1) { m->control_pressed = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
605         else if (lengths.size() == 1) { length = *(lengths.begin()); }
606         
607                 return alignSeqs.size();
608         }
609         
610         catch(exception& e) {
611                 m->errorOut(e, "PreClusterCommand", "readFASTA");
612                 exit(1);
613         }
614 }
615 /**************************************************************************************************/
616 int PreClusterCommand::loadSeqs(map<string, string>& thisName, vector<Sequence>& thisSeqs){
617         try {
618                 set<int> lengths;
619                 alignSeqs.clear();
620                 map<string, string>::iterator it;
621                 bool error = false;
622                         
623                 for (int i = 0; i < thisSeqs.size(); i++) {
624                         
625                         if (m->control_pressed) { return 0; }
626                                                 
627                         if (namefile != "") {
628                                 it = thisName.find(thisSeqs[i].getName());
629                                 
630                                 //should never be true since parser checks for this
631                                 if (it == thisName.end()) { m->mothurOut(thisSeqs[i].getName() + " is not in your names file, please correct."); m->mothurOutEndLine(); error = true; }
632                                 else{
633                                         //get number of reps
634                                         int numReps = 1;
635                                         for(int j=0;j<(it->second).length();j++){
636                                                 if((it->second)[j] == ','){     numReps++;      }
637                                         }
638                                         
639                                         seqPNode tempNode(numReps, thisSeqs[i], it->second);
640                                         alignSeqs.push_back(tempNode);
641                     lengths.insert(thisSeqs[i].getAligned().length());
642                                 }       
643                         }else { //no names file, you are identical to yourself 
644                                 seqPNode tempNode(1, thisSeqs[i], thisSeqs[i].getName());
645                                 alignSeqs.push_back(tempNode);
646                                 lengths.insert(thisSeqs[i].getAligned().length());
647                         }
648                 }
649                 
650         if (lengths.size() > 1) { error = true; m->mothurOut("[ERROR]: your sequences are not all the same length. pre.cluster requires sequences to be aligned."); m->mothurOutEndLine(); }
651         else if (lengths.size() == 1) { length = *(lengths.begin()); }
652         
653                 //sanity check
654                 if (error) { m->control_pressed = true; }
655                 
656                 thisSeqs.clear();
657                 
658                 return alignSeqs.size();
659         }
660         
661         catch(exception& e) {
662                 m->errorOut(e, "PreClusterCommand", "loadSeqs");
663                 exit(1);
664         }
665 }
666                                 
667 /**************************************************************************************************/
668
669 int PreClusterCommand::calcMisMatches(string seq1, string seq2){
670         try {
671                 int numBad = 0;
672                 
673                 for (int i = 0; i < seq1.length(); i++) {
674                         //do they match
675                         if (seq1[i] != seq2[i]) { numBad++; }
676                         if (numBad > diffs) { return length;  } //to far to cluster
677                 }
678                 
679                 return numBad;
680         }
681         catch(exception& e) {
682                 m->errorOut(e, "PreClusterCommand", "calcMisMatches");
683                 exit(1);
684         }
685 }
686
687 /**************************************************************************************************/
688
689 void PreClusterCommand::printData(string newfasta, string newname){
690         try {
691                 ofstream outFasta;
692                 ofstream outNames;
693                 
694                 if (bygroup) {
695                         m->openOutputFileAppend(newfasta, outFasta);
696                         m->openOutputFileAppend(newname, outNames);
697                 }else {
698                         m->openOutputFile(newfasta, outFasta);
699                         m->openOutputFile(newname, outNames);
700                 }
701                 
702                 for (int i = 0; i < alignSeqs.size(); i++) {
703                         if (alignSeqs[i].numIdentical != 0) {
704                                 alignSeqs[i].seq.printSequence(outFasta); 
705                                 outNames << alignSeqs[i].seq.getName() << '\t' << alignSeqs[i].names << endl;
706                         }
707                 }
708                 
709                 outFasta.close();
710                 outNames.close();
711                 
712         }
713         catch(exception& e) {
714                 m->errorOut(e, "PreClusterCommand", "printData");
715                 exit(1);
716         }
717 }
718 /**************************************************************************************************/
719
720 void PreClusterCommand::readNameFile(){
721         try {
722                 ifstream in;
723                 m->openInputFile(namefile, in);
724                 string firstCol, secondCol;
725                                 
726                 while (!in.eof()) {
727                         in >> firstCol >> secondCol; m->gobble(in);
728                         names[firstCol] = secondCol;
729                         int size = 1;
730
731                         for(int i=0;i<secondCol.size();i++){
732                                 if(secondCol[i] == ','){        size++; }
733                         }
734                         sizes[firstCol] = size;
735                 }
736                 in.close();
737         }
738         catch(exception& e) {
739                 m->errorOut(e, "PreClusterCommand", "readNameFile");
740                 exit(1);
741         }
742 }
743
744 /**************************************************************************************************/
745
746