]> git.donarmstrong.com Git - mothur.git/blob - countseqscommand.cpp
added shared file to make.table
[mothur.git] / countseqscommand.cpp
1 /*
2  *  countseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 6/1/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "countseqscommand.h"
11 #include "sharedutilities.h"
12 #include "counttable.h"
13 #include "inputdata.h"
14
15 //**********************************************************************************************************************
16 vector<string> CountSeqsCommand::setParameters(){       
17         try {
18         CommandParameter pshared("shared", "InputTypes", "", "", "NameSHared-sharedGroup", "NameSHared", "none","count",false,false,true); parameters.push_back(pshared);
19                 CommandParameter pname("name", "InputTypes", "", "", "NameSHared", "NameSHared", "none","count",false,false,true); parameters.push_back(pname);
20                 CommandParameter pgroup("group", "InputTypes", "", "", "sharedGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
21         CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
22         CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
23                 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
24                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "CountSeqsCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string CountSeqsCommand::getHelpString(){       
38         try {
39                 string helpString = "";
40                 helpString += "The count.seqs aka. make.table command reads a name or shared file and outputs a .count_table file.  You may also provide a group with the names file to get the counts broken down by group.\n";
41                 helpString += "The groups parameter allows you to indicate which groups you want to include in the counts, by default all groups in your groupfile are used.\n";
42         helpString += "The large parameter indicates the name and group files are too large to fit in RAM.\n";
43                 helpString += "When you use the groups parameter and a sequence does not represent any sequences from the groups you specify it is not included in the .count.summary file.\n";
44         helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
45                 helpString += "The count.seqs command should be in the following format: count.seqs(name=yourNameFile).\n";
46                 helpString += "Example count.seqs(name=amazon.names) or make.table(name=amazon.names).\n";
47                 helpString += "Note: No spaces between parameter labels (i.e. name), '=' and parameters (i.e.yourNameFile).\n";
48                 return helpString;
49         }
50         catch(exception& e) {
51                 m->errorOut(e, "CountSeqsCommand", "getHelpString");
52                 exit(1);
53         }
54 }
55 //**********************************************************************************************************************
56 string CountSeqsCommand::getOutputPattern(string type) {
57     try {
58         string pattern = "";
59         if (type == "count") {  pattern = "[filename],count_table-[filename],[distance],count_table"; }
60         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
61         
62         return pattern;
63     }
64     catch(exception& e) {
65         m->errorOut(e, "CountSeqsCommand", "getOutputPattern");
66         exit(1);
67     }
68 }
69 //**********************************************************************************************************************
70 CountSeqsCommand::CountSeqsCommand(){   
71         try {
72                 abort = true; calledHelp = true; 
73                 setParameters();
74                 vector<string> tempOutNames;
75                 outputTypes["count"] = tempOutNames;
76         }
77         catch(exception& e) {
78                 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
79                 exit(1);
80         }
81 }
82 //**********************************************************************************************************************
83
84 CountSeqsCommand::CountSeqsCommand(string option)  {
85         try {
86                 abort = false; calledHelp = false;
87         allLines = 1;
88                 
89                 //allow user to run help
90                 if(option == "help") { help(); abort = true; calledHelp = true; }
91                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
92                 else {
93                         vector<string> myArray = setParameters();
94                         
95                         OptionParser parser(option);
96                         map<string,string> parameters = parser.getParameters();
97                         
98                         ValidParameters validParameter;
99                         map<string,string>::iterator it;
100                         
101                         //check to make sure all parameters are valid for command
102                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
103                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
104                         }
105                         
106                         //initialize outputTypes
107                         vector<string> tempOutNames;
108                         outputTypes["count"] = tempOutNames;
109                         
110                         
111                         //if the user changes the input directory command factory will send this info to us in the output parameter 
112                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
113                         if (inputDir == "not found"){   inputDir = "";          }
114                         else {
115                                 string path;
116                                 it = parameters.find("name");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
122                                 }
123                                 
124                                 it = parameters.find("group");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
130                                 }
131                 
132                 it = parameters.find("shared");
133                                 //user has given a template file
134                                 if(it != parameters.end()){
135                                         path = m->hasPath(it->second);
136                                         //if the user has not given a path then, add inputdir. else leave path alone.
137                                         if (path == "") {       parameters["shared"] = inputDir + it->second;           }
138                                 }
139                         }
140                         
141                         //check for required parameters
142                         namefile = validParameter.validFile(parameters, "name", true);
143                         if (namefile == "not open") { namefile = ""; abort = true; }
144                         else if (namefile == "not found"){      namefile = ""; }
145             else { m->setNameFile(namefile); }
146             
147             sharedfile = validParameter.validFile(parameters, "shared", true);
148                         if (sharedfile == "not open") { sharedfile = ""; abort = true; }
149                         else if (sharedfile == "not found"){    sharedfile = ""; }
150             else { m->setSharedFile(sharedfile); }
151             
152                         groupfile = validParameter.validFile(parameters, "group", true);
153                         if (groupfile == "not open") { abort = true; }
154                         else if (groupfile == "not found") {  groupfile = "";  }        
155                         else { m->setGroupFile(groupfile); }
156             
157             if ((namefile == "") && (sharedfile == "")) {
158                 namefile = m->getNameFile();
159                                 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
160                                 else {
161                     sharedfile = m->getSharedFile();
162                     if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
163                     else {
164                         m->mothurOut("You have no current namefile or sharedfile and the name or shared parameter is required."); m->mothurOutEndLine(); abort = true;
165                     }
166                 }
167                         }
168
169                         groups = validParameter.validFile(parameters, "groups", false);                 
170                         if (groups == "not found") { groups = "all"; }
171                         m->splitAtDash(groups, Groups);
172             m->setGroups(Groups);
173             
174             string temp = validParameter.validFile(parameters, "large", false);         if (temp == "not found") {      temp = "F";     }
175                         large = m->isTrue(temp);
176             
177             temp = validParameter.validFile(parameters, "processors", false);   if (temp == "not found"){       temp = m->getProcessors();      }
178                         m->setProcessors(temp);
179                         m->mothurConvert(temp, processors);
180                         
181                         //if the user changes the output directory command factory will send this info to us in the output parameter 
182                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
183
184                 }
185                 
186         }
187         catch(exception& e) {
188                 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
189                 exit(1);
190         }
191 }
192 //**********************************************************************************************************************
193
194 int CountSeqsCommand::execute(){
195         try {
196                 
197                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
198                 
199         map<string, string> variables;
200
201         if (namefile != "") {
202             int total = 0;
203             int start = time(NULL);
204             if (outputDir == "") { outputDir = m->hasPath(namefile); }
205             variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile));
206             string outputFileName = getOutputFileName("count", variables);
207             
208             if (!large) { total = processSmall(outputFileName); }
209             else { total = processLarge(outputFileName);  }
210             
211             if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
212             
213             m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create a table for " + toString(total) + " sequences.");
214             m->mothurOutEndLine(); m->mothurOutEndLine();
215             
216             m->mothurOutEndLine();
217             m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine();
218  
219         }else {
220             if (outputDir == "") { outputDir = m->hasPath(sharedfile); }
221             variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
222             
223             InputData input(sharedfile, "sharedfile");
224             vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
225             string lastLabel = lookup[0]->getLabel();
226             
227             //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
228             set<string> processedLabels;
229             set<string> userLabels = labels;
230             
231             //as long as you are not at the end of the file or done wih the lines you want
232             while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
233                 
234                 if (m->control_pressed) { for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  } for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } return 0; }
235                 
236                 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
237                     
238                     m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
239                     
240                     processShared(lookup, variables);
241                     
242                     processedLabels.insert(lookup[0]->getLabel());
243                     userLabels.erase(lookup[0]->getLabel());
244                 }
245                 
246                 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
247                     string saveLabel = lookup[0]->getLabel();
248                     
249                     for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
250                     lookup = input.getSharedRAbundVectors(lastLabel);
251                     m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
252                     
253                     processShared(lookup, variables);
254                     
255                     processedLabels.insert(lookup[0]->getLabel());
256                     userLabels.erase(lookup[0]->getLabel());
257                     
258                     //restore real lastlabel to save below
259                     lookup[0]->setLabel(saveLabel);
260                 }
261                 
262                 lastLabel = lookup[0]->getLabel();
263                 //prevent memory leak
264                 for (int i = 0; i < lookup.size(); i++) {  delete lookup[i]; lookup[i] = NULL; }
265                 
266                 if (m->control_pressed) { return 0; }
267                 
268                 //get next line to process
269                 lookup = input.getSharedRAbundVectors();
270             }
271             
272             if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {    m->mothurRemove(outputNames[i]); }  return 0; }
273             
274             //output error messages about any remaining user labels
275             set<string>::iterator it;
276             bool needToRun = false;
277             for (it = userLabels.begin(); it != userLabels.end(); it++) {
278                 m->mothurOut("Your file does not include the label " + *it);
279                 if (processedLabels.count(lastLabel) != 1) {
280                     m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
281                     needToRun = true;
282                 }else {
283                     m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
284                 }
285             }
286             
287             //run last label if you need to
288             if (needToRun == true)  {
289                 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
290                 lookup = input.getSharedRAbundVectors(lastLabel);
291                 
292                 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
293                 
294                 processShared(lookup, variables);
295                 
296                 for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
297             }
298             
299         }
300         
301         //set rabund file as new current rabundfile
302                 itTypes = outputTypes.find("count");
303                 if (itTypes != outputTypes.end()) {
304                         if ((itTypes->second).size() != 0) { string current = (itTypes->second)[0]; m->setCountTableFile(current); }
305                 }
306         
307         m->mothurOutEndLine();
308                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
309                 for(int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();      }
310                 m->mothurOutEndLine();
311         
312                 return 0;               
313         }
314         
315         catch(exception& e) {
316                 m->errorOut(e, "CountSeqsCommand", "execute");
317                 exit(1);
318         }
319 }
320 //**********************************************************************************************************************
321
322 int CountSeqsCommand::processShared(vector<SharedRAbundVector*>& lookup, map<string, string> variables){
323     try {
324         variables["[distance]"] = lookup[0]->getLabel();
325         string outputFileName = getOutputFileName("count", variables);
326         outputNames.push_back(outputFileName); outputTypes["count"].push_back(outputFileName);
327         
328         ofstream out;
329         m->openOutputFile(outputFileName, out);
330         
331         out << "OTU_Label\ttotal\t";
332         for (int i = 0; i < lookup.size(); i++) { out << lookup[i]->getGroup() << '\t'; } out << endl;
333         
334         for (int j = 0; j < lookup[0]->getNumBins(); j++) {
335             if (m->control_pressed) { break; }
336             
337             int total = 0;
338             string output = "";
339             for (int i = 0; i < lookup.size(); i++) {
340                 total += lookup[i]->getAbundance(j);
341                 output += toString(lookup[i]->getAbundance(j)) + '\t';
342             }
343             out << m->currentSharedBinLabels[j] << '\t' << total << '\t' << output << endl;
344         }
345         out.close();
346         
347         return 0;
348     }
349     catch(exception& e) {
350         m->errorOut(e, "CountSeqsCommand", "processShared");
351         exit(1);
352     }
353 }
354 //**********************************************************************************************************************
355
356 int CountSeqsCommand::processSmall(string outputFileName){
357         try {
358         ofstream out;
359         m->openOutputFile(outputFileName, out); outputTypes["count"].push_back(outputFileName);
360         outputNames.push_back(outputFileName); outputTypes["count"].push_back(outputFileName);
361                 out << "Representative_Sequence\ttotal\t";
362         
363         GroupMap* groupMap;
364                 if (groupfile != "") { 
365                         groupMap = new GroupMap(groupfile); groupMap->readMap(); 
366                         
367                         //make sure groups are valid. takes care of user setting groupNames that are invalid or setting groups=all
368                         SharedUtil* util = new SharedUtil();
369                         vector<string> nameGroups = groupMap->getNamesOfGroups();
370                         util->setGroups(Groups, nameGroups);
371                         delete util;
372                         
373                         //sort groupNames so that the group title match the counts below, this is needed because the map object automatically sorts
374                         sort(Groups.begin(), Groups.end());
375                         
376                         //print groupNames
377                         for (int i = 0; i < Groups.size(); i++) {
378                                 out << Groups[i] << '\t';
379                         }
380                 }
381                 out << endl;
382         out.close();
383         
384         int total = createProcesses(groupMap, outputFileName);
385         
386         if (groupfile != "") { delete groupMap; }
387         
388         return total;
389     }
390         catch(exception& e) {
391                 m->errorOut(e, "CountSeqsCommand", "processSmall");
392                 exit(1);
393         }
394 }
395 /**************************************************************************************************/
396 int CountSeqsCommand::createProcesses(GroupMap*& groupMap, string outputFileName) {
397         try {
398                 
399                 vector<int> processIDS;
400                 int process = 0;
401         vector<unsigned long long> positions;
402         vector<linePair> lines;
403         int numSeqs = 0;
404         
405 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
406                 positions = m->divideFilePerLine(namefile, processors);
407                 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
408 #else
409                 if(processors == 1){ lines.push_back(linePair(0, 1000));  }
410         else {
411             int numSeqs = 0;
412             positions = m->setFilePosEachLine(namefile, numSeqs);
413             if (positions.size() < processors) { processors = positions.size(); }
414             
415             //figure out how many sequences you have to process
416             int numSeqsPerProcessor = numSeqs / processors;
417             for (int i = 0; i < processors; i++) {
418                 int startIndex =  i * numSeqsPerProcessor;
419                 if(i == (processors - 1)){      numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;        }
420                 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
421             }
422         }
423 #endif
424
425                         
426 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
427                 
428                 //loop through and create all the processes you want
429                 while (process != processors-1) {
430                         int pid = fork();
431                         
432                         if (pid > 0) {
433                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
434                                 process++;
435                         }else if (pid == 0){
436                 string filename = toString(getpid()) + ".temp";
437                                 numSeqs = driver(lines[process].start, lines[process].end, filename, groupMap);
438                 
439                 string tempFile = toString(getpid()) + ".num.temp";
440                 ofstream outTemp;
441                 m->openOutputFile(tempFile, outTemp);
442                 
443                 outTemp << numSeqs << endl;
444                 outTemp.close();
445                 
446                                 exit(0);
447                         }else {
448                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
449                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
450                                 exit(0);
451                         }
452                 }
453                 
454                 string filename = toString(getpid()) + ".temp";
455         numSeqs = driver(lines[processors-1].start, lines[processors-1].end, filename, groupMap);
456         
457                 //force parent to wait until all the processes are done
458                 for (int i=0;i<processIDS.size();i++) {
459                         int temp = processIDS[i];
460                         wait(&temp);
461                 }
462         
463         for (int i = 0; i < processIDS.size(); i++) {
464             string tempFile = toString(processIDS[i]) +  ".num.temp";
465             ifstream intemp;
466             m->openInputFile(tempFile, intemp);
467             
468             int num;
469             intemp >> num; intemp.close();
470             numSeqs += num;
471             m->mothurRemove(tempFile);
472         }
473 #else           
474                 vector<countData*> pDataArray;
475                 DWORD   dwThreadIdArray[processors-1];
476                 HANDLE  hThreadArray[processors-1];
477         vector<GroupMap*> copies;
478                 
479                 //Create processor worker threads.
480                 for( int i=0; i<processors-1; i++ ){
481                         string filename = toString(i) + ".temp";
482             
483             GroupMap* copyGroup = new GroupMap();
484             copyGroup->getCopy(groupMap);
485             copies.push_back(copyGroup);
486             vector<string> cGroups = Groups;
487            
488                         countData* temp = new countData(filename, copyGroup, m, lines[i].start, lines[i].end, groupfile, namefile, cGroups);
489                         pDataArray.push_back(temp);
490                         processIDS.push_back(i);
491                         
492                         hThreadArray[i] = CreateThread(NULL, 0, MyCountThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
493                 }
494                 
495                 string filename = toString(processors-1) + ".temp";
496         numSeqs = driver(lines[processors-1].start, lines[processors-1].end, filename, groupMap);
497                         
498                 //Wait until all threads have terminated.
499                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
500                 
501                 //Close all thread handles and free memory allocations.
502                 for(int i=0; i < pDataArray.size(); i++){
503             numSeqs += pDataArray[i]->total;
504             delete copies[i];
505                         CloseHandle(hThreadArray[i]);
506                         delete pDataArray[i];
507                 }
508 #endif
509                 
510                 //append output files
511                 for(int i=0;i<processIDS.size();i++){
512                         m->appendFiles((toString(processIDS[i]) + ".temp"), outputFileName);
513                         m->mothurRemove((toString(processIDS[i]) + ".temp"));
514                 }
515         m->appendFiles(filename, outputFileName);
516         m->mothurRemove(filename);
517
518         
519         //sanity check
520         if (groupfile != "") {
521             if (numSeqs != groupMap->getNumSeqs()) {
522                 m->mothurOut("[ERROR]: processes reported processing " + toString(numSeqs) + " sequences, but group file indicates you have " + toString(groupMap->getNumSeqs()) + " sequences.");
523                 if (processors == 1) { m->mothurOut(" Could you have a file mismatch?\n"); }
524                 else { m->mothurOut(" Either you have a file mismatch or a process failed to complete the task assigned to it.\n"); m->control_pressed = true; }
525             }
526                 }
527                 return numSeqs;
528         }
529         catch(exception& e) {
530                 m->errorOut(e, "CountSeqsCommand", "createProcesses");
531                 exit(1);
532         }
533 }
534 /**************************************************************************************************/
535 int CountSeqsCommand::driver(unsigned long long start, unsigned long long end, string outputFileName, GroupMap*& groupMap) {
536         try {
537         
538         ofstream out;
539         m->openOutputFile(outputFileName, out);
540         
541         ifstream in;
542                 m->openInputFile(namefile, in);
543                 in.seekg(start);
544         
545                 bool done = false;
546         int total = 0;
547                 while (!done) {
548                         if (m->control_pressed) { break; }
549                         
550                         string firstCol, secondCol;
551                         in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in);
552             //cout << firstCol << '\t' << secondCol << endl;
553             m->checkName(firstCol);
554             m->checkName(secondCol);
555                         //cout << firstCol << '\t' << secondCol << endl;
556             
557                         vector<string> names;
558                         m->splitAtChar(secondCol, names, ',');
559                         
560                         if (groupfile != "") {
561                                 //set to 0
562                                 map<string, int> groupCounts;
563                                 int total = 0;
564                                 for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
565                                 
566                                 //get counts for each of the users groups
567                                 for (int i = 0; i < names.size(); i++) {
568                                         string group = groupMap->getGroup(names[i]);
569                                         
570                                         if (group == "not found") { m->mothurOut("[ERROR]: " + names[i] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); }
571                                         else {
572                                                 map<string, int>::iterator it = groupCounts.find(group);
573                                                 
574                                                 //if not found, then this sequence is not from a group we care about
575                                                 if (it != groupCounts.end()) {
576                                                         it->second++;
577                                                         total++;
578                                                 }
579                                         }
580                                 }
581                                 
582                                 if (total != 0) {
583                                         out << firstCol << '\t' << total << '\t';
584                                         for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
585                                                 out << it->second << '\t';
586                                         }
587                                         out << endl;
588                                 }
589                         }else {
590                                 out << firstCol << '\t' << names.size() << endl;
591                         }
592                         
593                         total += names.size();
594             
595 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
596             unsigned long long pos = in.tellg();
597             if ((pos == -1) || (pos >= end)) { break; }
598 #else
599             if (in.eof()) { break; }
600 #endif
601
602                 }
603                 in.close();
604         out.close();
605         
606         return total;
607
608     }
609         catch(exception& e) {
610                 m->errorOut(e, "CountSeqsCommand", "driver");
611                 exit(1);
612         }
613 }
614 //**********************************************************************************************************************
615
616 int CountSeqsCommand::processLarge(string outputFileName){
617         try {
618         set<string> namesOfGroups;
619         map<string, int> initial;
620         for (set<string>::iterator it = namesOfGroups.begin(); it != namesOfGroups.end(); it++) { initial[(*it)] = 0;  }
621         ofstream out;
622         m->openOutputFile(outputFileName, out); 
623         outputNames.push_back(outputFileName); outputTypes["count"].push_back(outputFileName);
624                 out << "Representative_Sequence\ttotal\t";
625         if (groupfile == "") { out << endl; }
626         
627         map<string, unsigned long long> namesToIndex;
628         string outfile = m->getRootName(groupfile) + "sorted.groups.temp";
629         string outName = m->getRootName(namefile) + "sorted.name.temp";
630         map<int, string> indexToName;
631         map<int, string> indexToGroup;
632         if (groupfile != "") { 
633             time_t estart = time(NULL);
634             //convert name file to redundant -> unique.  set unique name equal to index so we can use vectors, save name for later.
635             string newNameFile = m->getRootName(namefile) + ".name.temp";
636             string newGroupFile = m->getRootName(groupfile) + ".group.temp";
637             indexToName = processNameFile(newNameFile);
638             indexToGroup = getGroupNames(newGroupFile, namesOfGroups);
639             
640             //sort file by first column so the names of sequences will be easier to find
641             //use the unix sort 
642             #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
643                 string command = "sort -n " + newGroupFile + " -o " + outfile;
644                 system(command.c_str());
645                 command = "sort -n " + newNameFile + " -o " + outName;
646                 system(command.c_str());
647             #else //sort using windows sort
648                 string command = "sort " + newGroupFile + " /O " + outfile;
649                 system(command.c_str());
650                 command = "sort " + newNameFile + " /O " + outName;
651                 system(command.c_str());
652             #endif
653             m->mothurRemove(newNameFile);
654             m->mothurRemove(newGroupFile);
655             
656             m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to sort and index the group and name files. "); m->mothurOutEndLine();
657         }else { outName = namefile; }
658          
659         time_t estart = time(NULL);
660         //open input file
661                 ifstream in;
662                 m->openInputFile(outName, in);
663         
664         //open input file
665                 ifstream in2;
666                 
667                 int total = 0;
668         vector< vector<int> > nameMapCount;
669         if (groupfile != "") {
670             m->openInputFile(outfile, in2);
671             nameMapCount.resize(indexToName.size());
672             for (int i = 0; i < nameMapCount.size(); i++) {
673                 nameMapCount[i].resize(indexToGroup.size(), 0);
674             }
675         }
676         
677                 while (!in.eof()) {
678                         if (m->control_pressed) { break; }
679                         
680                         string firstCol;
681                         in >> firstCol;  m->gobble(in);
682                         
683                         if (groupfile != "") {
684                 int uniqueIndex;
685                 in >> uniqueIndex; m->gobble(in);
686                 
687                 string name; int groupIndex;
688                 in2 >> name >> groupIndex; m->gobble(in2);
689                 
690                 if (name != firstCol) { m->mothurOut("[ERROR]: found " + name + " in your groupfile, but " + firstCol + " was in your namefile, please correct.\n"); m->control_pressed = true; }
691                 
692                 nameMapCount[uniqueIndex][groupIndex]++;
693                 total++;
694             }else { 
695                 string secondCol;
696                 in >> secondCol; m->gobble(in);
697                 int num = m->getNumNames(secondCol);
698                 out << firstCol << '\t' << num << endl;
699                 total += num;
700             }
701                 }
702                 in.close();
703         
704         if (groupfile != "") {
705             m->mothurRemove(outfile);
706             m->mothurRemove(outName);
707             in2.close();
708             for (map<int, string>::iterator it = indexToGroup.begin(); it != indexToGroup.end(); it++) { out << it->second << '\t';  }
709             out << endl;
710             for (int i = 0; i < nameMapCount.size(); i++) {
711                 string totalsLine = "";
712                 int seqTotal = 0;
713                 for (int j = 0; j < nameMapCount[i].size(); j++) {
714                     seqTotal += nameMapCount[i][j];
715                     totalsLine += toString(nameMapCount[i][j]) + '\t';
716                 }
717                 out << indexToName[i] << '\t' << seqTotal << '\t' << totalsLine << endl;
718             }
719         }
720         
721         out.close();
722         
723         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to create the count table file. "); m->mothurOutEndLine();
724         
725         return total;
726     }
727         catch(exception& e) {
728                 m->errorOut(e, "CountSeqsCommand", "processLarge");
729                 exit(1);
730         }
731 }
732 /**************************************************************************************************/
733 map<int, string> CountSeqsCommand::processNameFile(string name) {
734         try {
735         map<int, string> indexToNames;
736         
737         ofstream out;
738         m->openOutputFile(name, out);
739         
740         //open input file
741                 ifstream in;
742                 m->openInputFile(namefile, in);
743         
744         string rest = "";
745         char buffer[4096];
746         bool pairDone = false;
747         bool columnOne = true;
748         string firstCol, secondCol;
749         int count = 0;
750         
751                 while (!in.eof()) {
752                         if (m->control_pressed) { break; }
753                         
754             in.read(buffer, 4096);
755             vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
756             
757             for (int i = 0; i < pieces.size(); i++) {
758                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
759                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
760                 
761                 if (pairDone) { 
762                     m->checkName(firstCol);
763                     m->checkName(secondCol);
764                     //parse names into vector
765                     vector<string> theseNames;
766                     m->splitAtComma(secondCol, theseNames);
767                     for (int i = 0; i < theseNames.size(); i++) {  out << theseNames[i] << '\t' << count << endl;  }
768                     indexToNames[count] = firstCol;
769                     pairDone = false; 
770                     count++;
771                 }
772             }
773                 }
774                 in.close();
775        
776                 
777         if (rest != "") {
778             vector<string> pieces = m->splitWhiteSpace(rest);
779             
780             for (int i = 0; i < pieces.size(); i++) {
781                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
782                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
783                 
784                 if (pairDone) { 
785                     m->checkName(firstCol);
786                     m->checkName(secondCol);
787                     //parse names into vector
788                     vector<string> theseNames;
789                     m->splitAtComma(secondCol, theseNames);
790                     for (int i = 0; i < theseNames.size(); i++) {  out << theseNames[i] << '\t' << count << endl;  }
791                     indexToNames[count] = firstCol;
792                     pairDone = false; 
793                     count++;
794                 }
795             }
796
797         }
798         out.close();
799         
800         return indexToNames;
801     }
802         catch(exception& e) {
803                 m->errorOut(e, "CountSeqsCommand", "processNameFile");
804                 exit(1);
805         }
806 }
807 /**************************************************************************************************/
808 map<int, string> CountSeqsCommand::getGroupNames(string filename, set<string>& namesOfGroups) {
809         try {
810         map<int, string> indexToGroups;
811         map<string, int> groupIndex;
812         map<string, int>::iterator it;
813         
814         ofstream out;
815         m->openOutputFile(filename, out);
816         
817         //open input file
818                 ifstream in;
819                 m->openInputFile(groupfile, in);
820         
821         string rest = "";
822         char buffer[4096];
823         bool pairDone = false;
824         bool columnOne = true;
825         string firstCol, secondCol;
826         int count = 0;
827         
828                 while (!in.eof()) {
829                         if (m->control_pressed) { break; }
830                         
831             in.read(buffer, 4096);
832             vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
833             
834             for (int i = 0; i < pieces.size(); i++) {
835                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
836                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
837                 
838                 if (pairDone) { 
839                     m->checkName(firstCol);
840                     it = groupIndex.find(secondCol);
841                     if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
842                         groupIndex[secondCol] = count;
843                         count++;
844                     }
845                     out << firstCol << '\t' << groupIndex[secondCol] << endl; 
846                     namesOfGroups.insert(secondCol);
847                     pairDone = false; 
848                 }
849             }
850                 }
851                 in.close();
852         
853         
854         if (rest != "") {
855             vector<string> pieces = m->splitWhiteSpace(rest);
856             
857             for (int i = 0; i < pieces.size(); i++) {
858                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
859                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
860                 
861                 if (pairDone) { 
862                     m->checkName(firstCol);
863                     it = groupIndex.find(secondCol);
864                     if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
865                         groupIndex[secondCol] = count;
866                         count++;
867                     }
868                     out << firstCol << '\t' << groupIndex[secondCol] << endl; 
869                     namesOfGroups.insert(secondCol);
870                     pairDone = false; 
871                 }
872             }
873         }
874         out.close();
875                 
876         for (it = groupIndex.begin(); it != groupIndex.end(); it++) {  indexToGroups[it->second] = it->first;  }
877         
878         return indexToGroups;
879         }
880         catch(exception& e) {
881                 m->errorOut(e, "CountSeqsCommand", "getGroupNames");
882                 exit(1);
883         }
884 }
885 //**********************************************************************************************************************
886
887
888