]> git.donarmstrong.com Git - mothur.git/blob - countseqscommand.cpp
added adjustDots function to pcr.seqs, keeps sequences aligned in case where primers...
[mothur.git] / countseqscommand.cpp
1 /*
2  *  countseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 6/1/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "countseqscommand.h"
11 #include "sharedutilities.h"
12 #include "counttable.h"
13
14 //**********************************************************************************************************************
15 vector<string> CountSeqsCommand::setParameters(){       
16         try {
17                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none","count",false,true,true); parameters.push_back(pname);
18                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(pgroup);
19         CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
20         CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
21                 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
22                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
23                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
24                 
25                 vector<string> myArray;
26                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
27                 return myArray;
28         }
29         catch(exception& e) {
30                 m->errorOut(e, "CountSeqsCommand", "setParameters");
31                 exit(1);
32         }
33 }
34 //**********************************************************************************************************************
35 string CountSeqsCommand::getHelpString(){       
36         try {
37                 string helpString = "";
38                 helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count_table file.  You may also provide a group file to get the counts broken down by group.\n";
39                 helpString += "The groups parameter allows you to indicate which groups you want to include in the counts, by default all groups in your groupfile are used.\n";
40         helpString += "The large parameter indicates the name and group files are too large to fit in RAM.\n";
41                 helpString += "When you use the groups parameter and a sequence does not represent any sequences from the groups you specify it is not included in the .count.summary file.\n";
42         helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
43                 helpString += "The count.seqs command should be in the following format: count.seqs(name=yourNameFile).\n";
44                 helpString += "Example count.seqs(name=amazon.names) or make.table(name=amazon.names).\n";
45                 helpString += "Note: No spaces between parameter labels (i.e. name), '=' and parameters (i.e.yourNameFile).\n";
46                 return helpString;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "CountSeqsCommand", "getHelpString");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string CountSeqsCommand::getOutputPattern(string type) {
55     try {
56         string pattern = "";
57         
58         if (type == "count") {  pattern = "[filename],count_table"; }
59         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
60         
61         return pattern;
62     }
63     catch(exception& e) {
64         m->errorOut(e, "CountSeqsCommand", "getOutputPattern");
65         exit(1);
66     }
67 }
68 //**********************************************************************************************************************
69 CountSeqsCommand::CountSeqsCommand(){   
70         try {
71                 abort = true; calledHelp = true; 
72                 setParameters();
73                 vector<string> tempOutNames;
74                 outputTypes["count"] = tempOutNames;
75         }
76         catch(exception& e) {
77                 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
78                 exit(1);
79         }
80 }
81 //**********************************************************************************************************************
82
83 CountSeqsCommand::CountSeqsCommand(string option)  {
84         try {
85                 abort = false; calledHelp = false;   
86                 
87                 //allow user to run help
88                 if(option == "help") { help(); abort = true; calledHelp = true; }
89                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
90                 else {
91                         vector<string> myArray = setParameters();
92                         
93                         OptionParser parser(option);
94                         map<string,string> parameters = parser.getParameters();
95                         
96                         ValidParameters validParameter;
97                         map<string,string>::iterator it;
98                         
99                         //check to make sure all parameters are valid for command
100                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
101                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
102                         }
103                         
104                         //initialize outputTypes
105                         vector<string> tempOutNames;
106                         outputTypes["count"] = tempOutNames;
107                         
108                         
109                         //if the user changes the input directory command factory will send this info to us in the output parameter 
110                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
111                         if (inputDir == "not found"){   inputDir = "";          }
112                         else {
113                                 string path;
114                                 it = parameters.find("name");
115                                 //user has given a template file
116                                 if(it != parameters.end()){ 
117                                         path = m->hasPath(it->second);
118                                         //if the user has not given a path then, add inputdir. else leave path alone.
119                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
120                                 }
121                                 
122                                 it = parameters.find("group");
123                                 //user has given a template file
124                                 if(it != parameters.end()){ 
125                                         path = m->hasPath(it->second);
126                                         //if the user has not given a path then, add inputdir. else leave path alone.
127                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
128                                 }
129                         }
130                         
131                         //check for required parameters
132                         namefile = validParameter.validFile(parameters, "name", true);
133                         if (namefile == "not open") { namefile = ""; abort = true; }
134                         else if (namefile == "not found"){                                      
135                                 namefile = m->getNameFile(); 
136                                 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
137                                 else {  m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }
138                         }else { m->setNameFile(namefile); }
139                         
140                         groupfile = validParameter.validFile(parameters, "group", true);
141                         if (groupfile == "not open") { abort = true; }
142                         else if (groupfile == "not found") {  groupfile = "";  }        
143                         else { m->setGroupFile(groupfile); }
144                         
145                         groups = validParameter.validFile(parameters, "groups", false);                 
146                         if (groups == "not found") { groups = "all"; }
147                         m->splitAtDash(groups, Groups);
148             
149             string temp = validParameter.validFile(parameters, "large", false);         if (temp == "not found") {      temp = "F";     }
150                         large = m->isTrue(temp);
151             
152             temp = validParameter.validFile(parameters, "processors", false);   if (temp == "not found"){       temp = m->getProcessors();      }
153                         m->setProcessors(temp);
154                         m->mothurConvert(temp, processors);
155                         
156                         //if the user changes the output directory command factory will send this info to us in the output parameter 
157                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(namefile);               }
158
159                 }
160                 
161         }
162         catch(exception& e) {
163                 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
164                 exit(1);
165         }
166 }
167 //**********************************************************************************************************************
168
169 int CountSeqsCommand::execute(){
170         try {
171                 
172                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
173                 
174         map<string, string> variables; 
175         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile));
176                 string outputFileName = getOutputFileName("count", variables);
177                 
178         int total = 0;
179         int start = time(NULL);
180         if (!large) { total = processSmall(outputFileName); }
181         else { total = processLarge(outputFileName);  }
182         
183                 if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
184                 
185         m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create a table for " + toString(total) + " sequences.");
186         m->mothurOutEndLine();
187         m->mothurOutEndLine();
188         
189         //set rabund file as new current rabundfile
190                 itTypes = outputTypes.find("count");
191                 if (itTypes != outputTypes.end()) {
192                         if ((itTypes->second).size() != 0) { string current = (itTypes->second)[0]; m->setCountTableFile(current); }
193                 }
194         
195         m->mothurOutEndLine();
196                 m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine();
197                 m->mothurOutEndLine();
198                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
199                 m->mothurOut(outputFileName); m->mothurOutEndLine();    
200                 m->mothurOutEndLine();
201                 
202                 return 0;               
203         }
204         
205         catch(exception& e) {
206                 m->errorOut(e, "CountSeqsCommand", "execute");
207                 exit(1);
208         }
209 }
210 //**********************************************************************************************************************
211
212 int CountSeqsCommand::processSmall(string outputFileName){
213         try {
214         ofstream out;
215         m->openOutputFile(outputFileName, out); outputTypes["count"].push_back(outputFileName);
216         outputNames.push_back(outputFileName); outputTypes["count"].push_back(outputFileName);
217                 out << "Representative_Sequence\ttotal\t";
218         
219         GroupMap* groupMap;
220                 if (groupfile != "") { 
221                         groupMap = new GroupMap(groupfile); groupMap->readMap(); 
222                         
223                         //make sure groups are valid. takes care of user setting groupNames that are invalid or setting groups=all
224                         SharedUtil* util = new SharedUtil();
225                         vector<string> nameGroups = groupMap->getNamesOfGroups();
226                         util->setGroups(Groups, nameGroups);
227                         delete util;
228                         
229                         //sort groupNames so that the group title match the counts below, this is needed because the map object automatically sorts
230                         sort(Groups.begin(), Groups.end());
231                         
232                         //print groupNames
233                         for (int i = 0; i < Groups.size(); i++) {
234                                 out << Groups[i] << '\t';
235                         }
236                 }
237                 out << endl;
238         out.close();
239         
240         int total = createProcesses(groupMap, outputFileName);
241         
242         if (groupfile != "") { delete groupMap; }
243         
244         return total;
245     }
246         catch(exception& e) {
247                 m->errorOut(e, "CountSeqsCommand", "processSmall");
248                 exit(1);
249         }
250 }
251 /**************************************************************************************************/
252 int CountSeqsCommand::createProcesses(GroupMap*& groupMap, string outputFileName) {
253         try {
254                 
255                 vector<int> processIDS;
256                 int process = 0;
257         vector<unsigned long long> positions;
258         vector<linePair> lines;
259         int numSeqs = 0;
260         
261 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
262                 positions = m->divideFilePerLine(namefile, processors);
263                 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
264 #else
265                 if(processors == 1){ lines.push_back(linePair(0, 1000));  }
266         else {
267             int numSeqs = 0;
268             positions = m->setFilePosEachLine(namefile, numSeqs);
269             if (positions.size() < processors) { processors = positions.size(); }
270             
271             //figure out how many sequences you have to process
272             int numSeqsPerProcessor = numSeqs / processors;
273             for (int i = 0; i < processors; i++) {
274                 int startIndex =  i * numSeqsPerProcessor;
275                 if(i == (processors - 1)){      numSeqsPerProcessor = numSeqs - i * numSeqsPerProcessor;        }
276                 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
277             }
278         }
279 #endif
280
281                         
282 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
283                 
284                 //loop through and create all the processes you want
285                 while (process != processors-1) {
286                         int pid = fork();
287                         
288                         if (pid > 0) {
289                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
290                                 process++;
291                         }else if (pid == 0){
292                 string filename = toString(getpid()) + ".temp";
293                                 numSeqs = driver(lines[process].start, lines[process].end, filename, groupMap);
294                 
295                 string tempFile = toString(getpid()) + ".num.temp";
296                 ofstream outTemp;
297                 m->openOutputFile(tempFile, outTemp);
298                 
299                 outTemp << numSeqs << endl;
300                 outTemp.close();
301                 
302                                 exit(0);
303                         }else {
304                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
305                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
306                                 exit(0);
307                         }
308                 }
309                 
310                 string filename = toString(getpid()) + ".temp";
311         numSeqs = driver(lines[processors-1].start, lines[processors-1].end, filename, groupMap);
312         
313                 //force parent to wait until all the processes are done
314                 for (int i=0;i<processIDS.size();i++) {
315                         int temp = processIDS[i];
316                         wait(&temp);
317                 }
318         
319         for (int i = 0; i < processIDS.size(); i++) {
320             string tempFile = toString(processIDS[i]) +  ".num.temp";
321             ifstream intemp;
322             m->openInputFile(tempFile, intemp);
323             
324             int num;
325             intemp >> num; intemp.close();
326             numSeqs += num;
327             m->mothurRemove(tempFile);
328         }
329 #else           
330                 vector<countData*> pDataArray;
331                 DWORD   dwThreadIdArray[processors-1];
332                 HANDLE  hThreadArray[processors-1];
333         vector<GroupMap*> copies;
334                 
335                 //Create processor worker threads.
336                 for( int i=0; i<processors-1; i++ ){
337                         string filename = toString(i) + ".temp";
338             
339             GroupMap* copyGroup = new GroupMap();
340             copyGroup->getCopy(groupMap);
341             copies.push_back(copyGroup);
342             vector<string> cGroups = Groups;
343            
344                         countData* temp = new countData(filename, copyGroup, m, lines[i].start, lines[i].end, groupfile, namefile, cGroups);
345                         pDataArray.push_back(temp);
346                         processIDS.push_back(i);
347                         
348                         hThreadArray[i] = CreateThread(NULL, 0, MyCountThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
349                 }
350                 
351                 string filename = toString(processors-1) + ".temp";
352         numSeqs = driver(lines[processors-1].start, lines[processors-1].end, filename, groupMap);
353                         
354                 //Wait until all threads have terminated.
355                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
356                 
357                 //Close all thread handles and free memory allocations.
358                 for(int i=0; i < pDataArray.size(); i++){
359             numSeqs += pDataArray[i]->total;
360             delete copies[i];
361                         CloseHandle(hThreadArray[i]);
362                         delete pDataArray[i];
363                 }
364 #endif
365                 
366                 //append output files
367                 for(int i=0;i<processIDS.size();i++){
368                         m->appendFiles((toString(processIDS[i]) + ".temp"), outputFileName);
369                         m->mothurRemove((toString(processIDS[i]) + ".temp"));
370                 }
371         m->appendFiles(filename, outputFileName);
372         m->mothurRemove(filename);
373
374         
375         //sanity check
376         if (numSeqs != groupMap->getNumSeqs()) {
377             m->mothurOut("[ERROR]: processes reported processing " + toString(numSeqs) + " sequences, but group file indicates you have " + toString(groupMap->getNumSeqs()) + " sequences.");
378             if (processors == 1) { m->mothurOut(" Could you have a file mismatch?\n"); }
379             else { m->mothurOut(" Either you have a file mismatch or a process failed to complete the task assigned to it.\n"); m->control_pressed = true; }
380         }
381                 
382                 return numSeqs;
383         }
384         catch(exception& e) {
385                 m->errorOut(e, "CountSeqsCommand", "createProcesses");
386                 exit(1);
387         }
388 }
389 /**************************************************************************************************/
390 int CountSeqsCommand::driver(unsigned long long start, unsigned long long end, string outputFileName, GroupMap*& groupMap) {
391         try {
392         
393         ofstream out;
394         m->openOutputFile(outputFileName, out);
395         
396         ifstream in;
397                 m->openInputFile(namefile, in);
398                 in.seekg(start);
399         
400                 bool done = false;
401         int total = 0;
402                 while (!done) {
403                         if (m->control_pressed) { break; }
404                         
405                         string firstCol, secondCol;
406                         in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in);
407             //cout << firstCol << '\t' << secondCol << endl;
408             m->checkName(firstCol);
409             m->checkName(secondCol);
410                         //cout << firstCol << '\t' << secondCol << endl;
411             
412                         vector<string> names;
413                         m->splitAtChar(secondCol, names, ',');
414                         
415                         if (groupfile != "") {
416                                 //set to 0
417                                 map<string, int> groupCounts;
418                                 int total = 0;
419                                 for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
420                                 
421                                 //get counts for each of the users groups
422                                 for (int i = 0; i < names.size(); i++) {
423                                         string group = groupMap->getGroup(names[i]);
424                                         
425                                         if (group == "not found") { m->mothurOut("[ERROR]: " + names[i] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); }
426                                         else {
427                                                 map<string, int>::iterator it = groupCounts.find(group);
428                                                 
429                                                 //if not found, then this sequence is not from a group we care about
430                                                 if (it != groupCounts.end()) {
431                                                         it->second++;
432                                                         total++;
433                                                 }
434                                         }
435                                 }
436                                 
437                                 if (total != 0) {
438                                         out << firstCol << '\t' << total << '\t';
439                                         for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
440                                                 out << it->second << '\t';
441                                         }
442                                         out << endl;
443                                 }
444                         }else {
445                                 out << firstCol << '\t' << names.size() << endl;
446                         }
447                         
448                         total += names.size();
449             
450 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
451             unsigned long long pos = in.tellg();
452             if ((pos == -1) || (pos >= end)) { break; }
453 #else
454             if (in.eof()) { break; }
455 #endif
456
457                 }
458                 in.close();
459         out.close();
460         
461         return total;
462
463     }
464         catch(exception& e) {
465                 m->errorOut(e, "CountSeqsCommand", "driver");
466                 exit(1);
467         }
468 }
469 //**********************************************************************************************************************
470
471 int CountSeqsCommand::processLarge(string outputFileName){
472         try {
473         set<string> namesOfGroups;
474         map<string, int> initial;
475         for (set<string>::iterator it = namesOfGroups.begin(); it != namesOfGroups.end(); it++) { initial[(*it)] = 0;  }
476         ofstream out;
477         m->openOutputFile(outputFileName, out); 
478         outputNames.push_back(outputFileName); outputTypes["count"].push_back(outputFileName);
479                 out << "Representative_Sequence\ttotal\t";
480         if (groupfile == "") { out << endl; }
481         
482         map<string, unsigned long long> namesToIndex;
483         string outfile = m->getRootName(groupfile) + "sorted.groups.temp";
484         string outName = m->getRootName(namefile) + "sorted.name.temp";
485         map<int, string> indexToName;
486         map<int, string> indexToGroup;
487         if (groupfile != "") { 
488             time_t estart = time(NULL);
489             //convert name file to redundant -> unique.  set unique name equal to index so we can use vectors, save name for later.
490             string newNameFile = m->getRootName(namefile) + ".name.temp";
491             string newGroupFile = m->getRootName(groupfile) + ".group.temp";
492             indexToName = processNameFile(newNameFile);
493             indexToGroup = getGroupNames(newGroupFile, namesOfGroups);
494             
495             //sort file by first column so the names of sequences will be easier to find
496             //use the unix sort 
497             #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
498                 string command = "sort -n " + newGroupFile + " -o " + outfile;
499                 system(command.c_str());
500                 command = "sort -n " + newNameFile + " -o " + outName;
501                 system(command.c_str());
502             #else //sort using windows sort
503                 string command = "sort " + newGroupFile + " /O " + outfile;
504                 system(command.c_str());
505                 command = "sort " + newNameFile + " /O " + outName;
506                 system(command.c_str());
507             #endif
508             m->mothurRemove(newNameFile);
509             m->mothurRemove(newGroupFile);
510             
511             m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to sort and index the group and name files. "); m->mothurOutEndLine();
512         }else { outName = namefile; }
513          
514         time_t estart = time(NULL);
515         //open input file
516                 ifstream in;
517                 m->openInputFile(outName, in);
518         
519         //open input file
520                 ifstream in2;
521                 
522                 int total = 0;
523         vector< vector<int> > nameMapCount;
524         if (groupfile != "") {
525             m->openInputFile(outfile, in2);
526             nameMapCount.resize(indexToName.size());
527             for (int i = 0; i < nameMapCount.size(); i++) {
528                 nameMapCount[i].resize(indexToGroup.size(), 0);
529             }
530         }
531         
532                 while (!in.eof()) {
533                         if (m->control_pressed) { break; }
534                         
535                         string firstCol;
536                         in >> firstCol;  m->gobble(in);
537                         
538                         if (groupfile != "") {
539                 int uniqueIndex;
540                 in >> uniqueIndex; m->gobble(in);
541                 
542                 string name; int groupIndex;
543                 in2 >> name >> groupIndex; m->gobble(in2);
544                 
545                 if (name != firstCol) { m->mothurOut("[ERROR]: found " + name + " in your groupfile, but " + firstCol + " was in your namefile, please correct.\n"); m->control_pressed = true; }
546                 
547                 nameMapCount[uniqueIndex][groupIndex]++;
548                 total++;
549             }else { 
550                 string secondCol;
551                 in >> secondCol; m->gobble(in);
552                 int num = m->getNumNames(secondCol);
553                 out << firstCol << '\t' << num << endl;
554                 total += num;
555             }
556                 }
557                 in.close();
558         
559         if (groupfile != "") {
560             m->mothurRemove(outfile);
561             m->mothurRemove(outName);
562             in2.close();
563             for (map<int, string>::iterator it = indexToGroup.begin(); it != indexToGroup.end(); it++) { out << it->second << '\t';  }
564             out << endl;
565             for (int i = 0; i < nameMapCount.size(); i++) {
566                 string totalsLine = "";
567                 int seqTotal = 0;
568                 for (int j = 0; j < nameMapCount[i].size(); j++) {
569                     seqTotal += nameMapCount[i][j];
570                     totalsLine += toString(nameMapCount[i][j]) + '\t';
571                 }
572                 out << indexToName[i] << '\t' << seqTotal << '\t' << totalsLine << endl;
573             }
574         }
575         
576         out.close();
577         
578         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to create the count table file. "); m->mothurOutEndLine();
579         
580         return total;
581     }
582         catch(exception& e) {
583                 m->errorOut(e, "CountSeqsCommand", "processLarge");
584                 exit(1);
585         }
586 }
587 /**************************************************************************************************/
588 map<int, string> CountSeqsCommand::processNameFile(string name) {
589         try {
590         map<int, string> indexToNames;
591         
592         ofstream out;
593         m->openOutputFile(name, out);
594         
595         //open input file
596                 ifstream in;
597                 m->openInputFile(namefile, in);
598         
599         string rest = "";
600         char buffer[4096];
601         bool pairDone = false;
602         bool columnOne = true;
603         string firstCol, secondCol;
604         int count = 0;
605         
606                 while (!in.eof()) {
607                         if (m->control_pressed) { break; }
608                         
609             in.read(buffer, 4096);
610             vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
611             
612             for (int i = 0; i < pieces.size(); i++) {
613                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
614                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
615                 
616                 if (pairDone) { 
617                     m->checkName(firstCol);
618                     m->checkName(secondCol);
619                     //parse names into vector
620                     vector<string> theseNames;
621                     m->splitAtComma(secondCol, theseNames);
622                     for (int i = 0; i < theseNames.size(); i++) {  out << theseNames[i] << '\t' << count << endl;  }
623                     indexToNames[count] = firstCol;
624                     pairDone = false; 
625                     count++;
626                 }
627             }
628                 }
629                 in.close();
630         out.close();
631                 
632         if (rest != "") {
633             vector<string> pieces = m->splitWhiteSpace(rest);
634             
635             for (int i = 0; i < pieces.size(); i++) {
636                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
637                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
638                 
639                 if (pairDone) { 
640                     m->checkName(firstCol);
641                     m->checkName(secondCol);
642                     //parse names into vector
643                     vector<string> theseNames;
644                     m->splitAtComma(secondCol, theseNames);
645                     for (int i = 0; i < theseNames.size(); i++) {  out << theseNames[i] << '\t' << count << endl;  }
646                     indexToNames[count] = firstCol;
647                     pairDone = false; 
648                     count++;
649                 }
650             }
651
652         }
653         
654         return indexToNames;
655     }
656         catch(exception& e) {
657                 m->errorOut(e, "CountSeqsCommand", "processNameFile");
658                 exit(1);
659         }
660 }
661 /**************************************************************************************************/
662 map<int, string> CountSeqsCommand::getGroupNames(string filename, set<string>& namesOfGroups) {
663         try {
664         map<int, string> indexToGroups;
665         map<string, int> groupIndex;
666         map<string, int>::iterator it;
667         
668         ofstream out;
669         m->openOutputFile(filename, out);
670         
671         //open input file
672                 ifstream in;
673                 m->openInputFile(groupfile, in);
674         
675         string rest = "";
676         char buffer[4096];
677         bool pairDone = false;
678         bool columnOne = true;
679         string firstCol, secondCol;
680         int count = 0;
681         
682                 while (!in.eof()) {
683                         if (m->control_pressed) { break; }
684                         
685             in.read(buffer, 4096);
686             vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
687             
688             for (int i = 0; i < pieces.size(); i++) {
689                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
690                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
691                 
692                 if (pairDone) { 
693                     m->checkName(firstCol);
694                     it = groupIndex.find(secondCol);
695                     if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
696                         groupIndex[secondCol] = count;
697                         count++;
698                     }
699                     out << firstCol << '\t' << groupIndex[secondCol] << endl; 
700                     namesOfGroups.insert(secondCol);
701                     pairDone = false; 
702                 }
703             }
704                 }
705                 in.close();
706         out.close();
707         
708         if (rest != "") {
709             vector<string> pieces = m->splitWhiteSpace(rest);
710             
711             for (int i = 0; i < pieces.size(); i++) {
712                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
713                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
714                 
715                 if (pairDone) { 
716                     m->checkName(firstCol);
717                     it = groupIndex.find(secondCol);
718                     if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
719                         groupIndex[secondCol] = count;
720                         count++;
721                     }
722                     out << firstCol << '\t' << groupIndex[secondCol] << endl; 
723                     namesOfGroups.insert(secondCol);
724                     pairDone = false; 
725                 }
726             }
727         }
728                 
729         for (it = groupIndex.begin(); it != groupIndex.end(); it++) {  indexToGroups[it->second] = it->first;  }
730         
731         return indexToGroups;
732         }
733         catch(exception& e) {
734                 m->errorOut(e, "CountSeqsCommand", "getGroupNames");
735                 exit(1);
736         }
737 }
738 //**********************************************************************************************************************
739
740
741