]> git.donarmstrong.com Git - mothur.git/blob - countseqscommand.cpp
added flag to count table read so that commands that don't require the group info...
[mothur.git] / countseqscommand.cpp
1 /*
2  *  countseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 6/1/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "countseqscommand.h"
11 #include "groupmap.h"
12 #include "sharedutilities.h"
13 #include "counttable.h"
14
15 //**********************************************************************************************************************
16 vector<string> CountSeqsCommand::setParameters(){       
17         try {
18                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none","count",false,true,true); parameters.push_back(pname);
19                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(pgroup);
20         CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
21         CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
22                 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
23                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
24                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
25                 
26                 vector<string> myArray;
27                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
28                 return myArray;
29         }
30         catch(exception& e) {
31                 m->errorOut(e, "CountSeqsCommand", "setParameters");
32                 exit(1);
33         }
34 }
35 //**********************************************************************************************************************
36 string CountSeqsCommand::getHelpString(){       
37         try {
38                 string helpString = "";
39                 helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count_table file.  You may also provide a group file to get the counts broken down by group.\n";
40                 helpString += "The groups parameter allows you to indicate which groups you want to include in the counts, by default all groups in your groupfile are used.\n";
41         helpString += "The large parameter indicates the name and group files are too large to fit in RAM.\n";
42                 helpString += "When you use the groups parameter and a sequence does not represent any sequences from the groups you specify it is not included in the .count.summary file.\n";
43         helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
44                 helpString += "The count.seqs command should be in the following format: count.seqs(name=yourNameFile).\n";
45                 helpString += "Example count.seqs(name=amazon.names) or make.table(name=amazon.names).\n";
46                 helpString += "Note: No spaces between parameter labels (i.e. name), '=' and parameters (i.e.yourNameFile).\n";
47                 return helpString;
48         }
49         catch(exception& e) {
50                 m->errorOut(e, "CountSeqsCommand", "getHelpString");
51                 exit(1);
52         }
53 }
54 //**********************************************************************************************************************
55 string CountSeqsCommand::getOutputPattern(string type) {
56     try {
57         string pattern = "";
58         
59         if (type == "count") {  pattern = "[filename],count_table"; }
60         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
61         
62         return pattern;
63     }
64     catch(exception& e) {
65         m->errorOut(e, "CountSeqsCommand", "getOutputPattern");
66         exit(1);
67     }
68 }
69 //**********************************************************************************************************************
70 CountSeqsCommand::CountSeqsCommand(){   
71         try {
72                 abort = true; calledHelp = true; 
73                 setParameters();
74                 vector<string> tempOutNames;
75                 outputTypes["count"] = tempOutNames;
76         }
77         catch(exception& e) {
78                 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
79                 exit(1);
80         }
81 }
82 //**********************************************************************************************************************
83
84 CountSeqsCommand::CountSeqsCommand(string option)  {
85         try {
86                 abort = false; calledHelp = false;   
87                 
88                 //allow user to run help
89                 if(option == "help") { help(); abort = true; calledHelp = true; }
90                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
91                 else {
92                         vector<string> myArray = setParameters();
93                         
94                         OptionParser parser(option);
95                         map<string,string> parameters = parser.getParameters();
96                         
97                         ValidParameters validParameter;
98                         map<string,string>::iterator it;
99                         
100                         //check to make sure all parameters are valid for command
101                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
102                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
103                         }
104                         
105                         //initialize outputTypes
106                         vector<string> tempOutNames;
107                         outputTypes["count"] = tempOutNames;
108                         
109                         
110                         //if the user changes the input directory command factory will send this info to us in the output parameter 
111                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
112                         if (inputDir == "not found"){   inputDir = "";          }
113                         else {
114                                 string path;
115                                 it = parameters.find("name");
116                                 //user has given a template file
117                                 if(it != parameters.end()){ 
118                                         path = m->hasPath(it->second);
119                                         //if the user has not given a path then, add inputdir. else leave path alone.
120                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
121                                 }
122                                 
123                                 it = parameters.find("group");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
129                                 }
130                         }
131                         
132                         //check for required parameters
133                         namefile = validParameter.validFile(parameters, "name", true);
134                         if (namefile == "not open") { namefile = ""; abort = true; }
135                         else if (namefile == "not found"){                                      
136                                 namefile = m->getNameFile(); 
137                                 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
138                                 else {  m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }
139                         }else { m->setNameFile(namefile); }
140                         
141                         groupfile = validParameter.validFile(parameters, "group", true);
142                         if (groupfile == "not open") { abort = true; }
143                         else if (groupfile == "not found") {  groupfile = "";  }        
144                         else { m->setGroupFile(groupfile); }
145                         
146                         groups = validParameter.validFile(parameters, "groups", false);                 
147                         if (groups == "not found") { groups = "all"; }
148                         m->splitAtDash(groups, Groups);
149             
150             string temp = validParameter.validFile(parameters, "large", false);         if (temp == "not found") {      temp = "F";     }
151                         large = m->isTrue(temp);
152                         
153                         //if the user changes the output directory command factory will send this info to us in the output parameter 
154                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(namefile);               }
155
156                 }
157                 
158         }
159         catch(exception& e) {
160                 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
161                 exit(1);
162         }
163 }
164 //**********************************************************************************************************************
165
166 int CountSeqsCommand::execute(){
167         try {
168                 
169                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
170                 
171         map<string, string> variables; 
172         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile));
173                 string outputFileName = getOutputFileName("count", variables);
174                 
175         int total = 0;
176         if (!large) { total = processSmall(outputFileName); }
177         else { total = processLarge(outputFileName);  }
178         
179                 if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
180                 
181         //set rabund file as new current rabundfile
182                 itTypes = outputTypes.find("count");
183                 if (itTypes != outputTypes.end()) {
184                         if ((itTypes->second).size() != 0) { string current = (itTypes->second)[0]; m->setCountTableFile(current); }
185                 }
186         
187         m->mothurOutEndLine();
188                 m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine();
189                 m->mothurOutEndLine();
190                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
191                 m->mothurOut(outputFileName); m->mothurOutEndLine();    
192                 m->mothurOutEndLine();
193                 
194                 return 0;               
195         }
196         
197         catch(exception& e) {
198                 m->errorOut(e, "CountSeqsCommand", "execute");
199                 exit(1);
200         }
201 }
202 //**********************************************************************************************************************
203
204 int CountSeqsCommand::processSmall(string outputFileName){
205         try {
206         ofstream out;
207         m->openOutputFile(outputFileName, out); outputTypes["count"].push_back(outputFileName);
208         outputNames.push_back(outputFileName); outputTypes["count"].push_back(outputFileName);
209                 out << "Representative_Sequence\ttotal\t";
210         
211         GroupMap* groupMap;
212                 if (groupfile != "") { 
213                         groupMap = new GroupMap(groupfile); groupMap->readMap(); 
214                         
215                         //make sure groups are valid. takes care of user setting groupNames that are invalid or setting groups=all
216                         SharedUtil* util = new SharedUtil();
217                         vector<string> nameGroups = groupMap->getNamesOfGroups();
218                         util->setGroups(Groups, nameGroups);
219                         delete util;
220                         
221                         //sort groupNames so that the group title match the counts below, this is needed because the map object automatically sorts
222                         sort(Groups.begin(), Groups.end());
223                         
224                         //print groupNames
225                         for (int i = 0; i < Groups.size(); i++) {
226                                 out << Groups[i] << '\t';
227                         }
228                 }
229                 out << endl;
230                 
231                 //open input file
232                 ifstream in;
233                 m->openInputFile(namefile, in);
234         
235                 int total = 0;
236                 while (!in.eof()) {
237                         if (m->control_pressed) { break; }
238                         
239                         string firstCol, secondCol;
240                         in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in);
241             //cout << firstCol << '\t' << secondCol << endl;
242             m->checkName(firstCol);
243             m->checkName(secondCol);
244                         //cout << firstCol << '\t' << secondCol << endl;
245            
246                         vector<string> names;
247                         m->splitAtChar(secondCol, names, ',');
248                         
249                         if (groupfile != "") {
250                                 //set to 0
251                                 map<string, int> groupCounts;
252                                 int total = 0;
253                                 for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
254                                 
255                                 //get counts for each of the users groups
256                                 for (int i = 0; i < names.size(); i++) {
257                                         string group = groupMap->getGroup(names[i]);
258                                         
259                                         if (group == "not found") { m->mothurOut("[ERROR]: " + names[i] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); }
260                                         else {
261                                                 map<string, int>::iterator it = groupCounts.find(group);
262                                                 
263                                                 //if not found, then this sequence is not from a group we care about
264                                                 if (it != groupCounts.end()) {
265                                                         it->second++;
266                                                         total++;
267                                                 }
268                                         }
269                                 }
270                                 
271                                 if (total != 0) {
272                                         out << firstCol << '\t' << total << '\t';
273                                         for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
274                                                 out << it->second << '\t';
275                                         }
276                                         out << endl;
277                                 }
278                         }else {
279                                 out << firstCol << '\t' << names.size() << endl;
280                         }
281                         
282                         total += names.size();
283                 }
284                 in.close();
285         out.close();
286                 
287                 if (groupfile != "") { delete groupMap; }
288
289         return total;
290     }
291         catch(exception& e) {
292                 m->errorOut(e, "CountSeqsCommand", "processSmall");
293                 exit(1);
294         }
295 }
296 //**********************************************************************************************************************
297
298 int CountSeqsCommand::processLarge(string outputFileName){
299         try {
300         set<string> namesOfGroups;
301         map<string, int> initial;
302         for (set<string>::iterator it = namesOfGroups.begin(); it != namesOfGroups.end(); it++) { initial[(*it)] = 0;  }
303         ofstream out;
304         m->openOutputFile(outputFileName, out); 
305         outputNames.push_back(outputFileName); outputTypes["count"].push_back(outputFileName);
306                 out << "Representative_Sequence\ttotal\t";
307         if (groupfile == "") { out << endl; }
308         
309         map<string, unsigned long long> namesToIndex;
310         string outfile = m->getRootName(groupfile) + "sorted.groups.temp";
311         string outName = m->getRootName(namefile) + "sorted.name.temp";
312         map<int, string> indexToName;
313         map<int, string> indexToGroup;
314         if (groupfile != "") { 
315             time_t estart = time(NULL);
316             //convert name file to redundant -> unique.  set unique name equal to index so we can use vectors, save name for later.
317             string newNameFile = m->getRootName(namefile) + ".name.temp";
318             string newGroupFile = m->getRootName(groupfile) + ".group.temp";
319             indexToName = processNameFile(newNameFile);
320             indexToGroup = getGroupNames(newGroupFile, namesOfGroups);
321             
322             //sort file by first column so the names of sequences will be easier to find
323             //use the unix sort 
324             #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
325                 string command = "sort -n " + newGroupFile + " -o " + outfile;
326                 system(command.c_str());
327                 command = "sort -n " + newNameFile + " -o " + outName;
328                 system(command.c_str());
329             #else //sort using windows sort
330                 string command = "sort " + newGroupFile + " /O " + outfile;
331                 system(command.c_str());
332                 command = "sort " + newNameFile + " /O " + outName;
333                 system(command.c_str());
334             #endif
335             m->mothurRemove(newNameFile);
336             m->mothurRemove(newGroupFile);
337             
338             m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to sort and index the group and name files. "); m->mothurOutEndLine();
339         }else { outName = namefile; }
340          
341         time_t estart = time(NULL);
342         //open input file
343                 ifstream in;
344                 m->openInputFile(outName, in);
345         
346         //open input file
347                 ifstream in2;
348                 
349                 int total = 0;
350         vector< vector<int> > nameMapCount;
351         if (groupfile != "") {
352             m->openInputFile(outfile, in2);
353             nameMapCount.resize(indexToName.size());
354             for (int i = 0; i < nameMapCount.size(); i++) {
355                 nameMapCount[i].resize(indexToGroup.size(), 0);
356             }
357         }
358         
359                 while (!in.eof()) {
360                         if (m->control_pressed) { break; }
361                         
362                         string firstCol;
363                         in >> firstCol;  m->gobble(in);
364                         
365                         if (groupfile != "") {
366                 int uniqueIndex;
367                 in >> uniqueIndex; m->gobble(in);
368                 
369                 string name; int groupIndex;
370                 in2 >> name >> groupIndex; m->gobble(in2);
371                 
372                 if (name != firstCol) { m->mothurOut("[ERROR]: found " + name + " in your groupfile, but " + firstCol + " was in your namefile, please correct.\n"); m->control_pressed = true; }
373                 
374                 nameMapCount[uniqueIndex][groupIndex]++;
375                 total++;
376             }else { 
377                 string secondCol;
378                 in >> secondCol; m->gobble(in);
379                 int num = m->getNumNames(secondCol);
380                 out << firstCol << '\t' << num << endl;
381                 total += num;
382             }
383                 }
384                 in.close();
385         
386         if (groupfile != "") {
387             m->mothurRemove(outfile);
388             m->mothurRemove(outName);
389             in2.close();
390             for (map<int, string>::iterator it = indexToGroup.begin(); it != indexToGroup.end(); it++) { out << it->second << '\t';  }
391             out << endl;
392             for (int i = 0; i < nameMapCount.size(); i++) {
393                 string totalsLine = "";
394                 int seqTotal = 0;
395                 for (int j = 0; j < nameMapCount[i].size(); j++) {
396                     seqTotal += nameMapCount[i][j];
397                     totalsLine += toString(nameMapCount[i][j]) + '\t';
398                 }
399                 out << indexToName[i] << '\t' << seqTotal << '\t' << totalsLine << endl;
400             }
401         }
402         
403         out.close();
404         
405         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to create the count table file. "); m->mothurOutEndLine();
406         
407         return total;
408     }
409         catch(exception& e) {
410                 m->errorOut(e, "CountSeqsCommand", "processLarge");
411                 exit(1);
412         }
413 }
414 /**************************************************************************************************/
415 map<int, string> CountSeqsCommand::processNameFile(string name) {
416         try {
417         map<int, string> indexToNames;
418         
419         ofstream out;
420         m->openOutputFile(name, out);
421         
422         //open input file
423                 ifstream in;
424                 m->openInputFile(namefile, in);
425         
426         string rest = "";
427         char buffer[4096];
428         bool pairDone = false;
429         bool columnOne = true;
430         string firstCol, secondCol;
431         int count = 0;
432         
433                 while (!in.eof()) {
434                         if (m->control_pressed) { break; }
435                         
436             in.read(buffer, 4096);
437             vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
438             
439             for (int i = 0; i < pieces.size(); i++) {
440                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
441                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
442                 
443                 if (pairDone) { 
444                     m->checkName(firstCol);
445                     m->checkName(secondCol);
446                     //parse names into vector
447                     vector<string> theseNames;
448                     m->splitAtComma(secondCol, theseNames);
449                     for (int i = 0; i < theseNames.size(); i++) {  out << theseNames[i] << '\t' << count << endl;  }
450                     indexToNames[count] = firstCol;
451                     pairDone = false; 
452                     count++;
453                 }
454             }
455                 }
456                 in.close();
457         out.close();
458                 
459         if (rest != "") {
460             vector<string> pieces = m->splitWhiteSpace(rest);
461             
462             for (int i = 0; i < pieces.size(); i++) {
463                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
464                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
465                 
466                 if (pairDone) { 
467                     m->checkName(firstCol);
468                     m->checkName(secondCol);
469                     //parse names into vector
470                     vector<string> theseNames;
471                     m->splitAtComma(secondCol, theseNames);
472                     for (int i = 0; i < theseNames.size(); i++) {  out << theseNames[i] << '\t' << count << endl;  }
473                     indexToNames[count] = firstCol;
474                     pairDone = false; 
475                     count++;
476                 }
477             }
478
479         }
480         
481         return indexToNames;
482     }
483         catch(exception& e) {
484                 m->errorOut(e, "CountSeqsCommand", "processNameFile");
485                 exit(1);
486         }
487 }
488 /**************************************************************************************************/
489 map<int, string> CountSeqsCommand::getGroupNames(string filename, set<string>& namesOfGroups) {
490         try {
491         map<int, string> indexToGroups;
492         map<string, int> groupIndex;
493         map<string, int>::iterator it;
494         
495         ofstream out;
496         m->openOutputFile(filename, out);
497         
498         //open input file
499                 ifstream in;
500                 m->openInputFile(groupfile, in);
501         
502         string rest = "";
503         char buffer[4096];
504         bool pairDone = false;
505         bool columnOne = true;
506         string firstCol, secondCol;
507         int count = 0;
508         
509                 while (!in.eof()) {
510                         if (m->control_pressed) { break; }
511                         
512             in.read(buffer, 4096);
513             vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
514             
515             for (int i = 0; i < pieces.size(); i++) {
516                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
517                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
518                 
519                 if (pairDone) { 
520                     m->checkName(firstCol);
521                     it = groupIndex.find(secondCol);
522                     if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
523                         groupIndex[secondCol] = count;
524                         count++;
525                     }
526                     out << firstCol << '\t' << groupIndex[secondCol] << endl; 
527                     namesOfGroups.insert(secondCol);
528                     pairDone = false; 
529                 }
530             }
531                 }
532                 in.close();
533         out.close();
534         
535         if (rest != "") {
536             vector<string> pieces = m->splitWhiteSpace(rest);
537             
538             for (int i = 0; i < pieces.size(); i++) {
539                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
540                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
541                 
542                 if (pairDone) { 
543                     m->checkName(firstCol);
544                     it = groupIndex.find(secondCol);
545                     if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
546                         groupIndex[secondCol] = count;
547                         count++;
548                     }
549                     out << firstCol << '\t' << groupIndex[secondCol] << endl; 
550                     namesOfGroups.insert(secondCol);
551                     pairDone = false; 
552                 }
553             }
554         }
555                 
556         for (it = groupIndex.begin(); it != groupIndex.end(); it++) {  indexToGroups[it->second] = it->first;  }
557         
558         return indexToGroups;
559         }
560         catch(exception& e) {
561                 m->errorOut(e, "CountSeqsCommand", "getGroupNames");
562                 exit(1);
563         }
564 }
565 //**********************************************************************************************************************
566
567
568