]> git.donarmstrong.com Git - mothur.git/blob - countseqscommand.cpp
made make.table alias to count.seqs command. added large parameter to count.seqs...
[mothur.git] / countseqscommand.cpp
1 /*
2  *  countseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 6/1/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "countseqscommand.h"
11 #include "groupmap.h"
12 #include "sharedutilities.h"
13
14 //**********************************************************************************************************************
15 vector<string> CountSeqsCommand::setParameters(){       
16         try {
17                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname);
18                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
19         CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
20                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
21                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
22                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
23                 
24                 vector<string> myArray;
25                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
26                 return myArray;
27         }
28         catch(exception& e) {
29                 m->errorOut(e, "CountSeqsCommand", "setParameters");
30                 exit(1);
31         }
32 }
33 //**********************************************************************************************************************
34 string CountSeqsCommand::getHelpString(){       
35         try {
36                 string helpString = "";
37                 helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count.table file.  You may also provide a group file to get the counts broken down by group.\n";
38                 helpString += "The groups parameter allows you to indicate which groups you want to include in the counts, by default all groups in your groupfile are used.\n";
39         helpString += "The large parameter indicates the name and group files are too large to fit in RAM.\n";
40                 helpString += "When you use the groups parameter and a sequence does not represent any sequences from the groups you specify it is not included in the .count.summary file.\n";
41                 helpString += "The count.seqs command should be in the following format: count.seqs(name=yourNameFile).\n";
42                 helpString += "Example count.seqs(name=amazon.names) or make.table(name=amazon.names).\n";
43                 helpString += "Note: No spaces between parameter labels (i.e. name), '=' and parameters (i.e.yourNameFile).\n";
44                 return helpString;
45         }
46         catch(exception& e) {
47                 m->errorOut(e, "CountSeqsCommand", "getHelpString");
48                 exit(1);
49         }
50 }
51 //**********************************************************************************************************************
52 string CountSeqsCommand::getOutputFileNameTag(string type, string inputName=""){        
53         try {
54         string outputFileName = "";
55                 map<string, vector<string> >::iterator it;
56         
57         //is this a type this command creates
58         it = outputTypes.find(type);
59         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
60         else {
61             if (type == "counttable") {  outputFileName =  "count.table"; }
62             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
63         }
64         return outputFileName;
65         }
66         catch(exception& e) {
67                 m->errorOut(e, "CountSeqsCommand", "getOutputFileNameTag");
68                 exit(1);
69         }
70 }
71 //**********************************************************************************************************************
72 CountSeqsCommand::CountSeqsCommand(){   
73         try {
74                 abort = true; calledHelp = true; 
75                 setParameters();
76                 vector<string> tempOutNames;
77                 outputTypes["counttable"] = tempOutNames;
78         }
79         catch(exception& e) {
80                 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
81                 exit(1);
82         }
83 }
84 //**********************************************************************************************************************
85
86 CountSeqsCommand::CountSeqsCommand(string option)  {
87         try {
88                 abort = false; calledHelp = false;   
89                 
90                 //allow user to run help
91                 if(option == "help") { help(); abort = true; calledHelp = true; }
92                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
93                 else {
94                         vector<string> myArray = setParameters();
95                         
96                         OptionParser parser(option);
97                         map<string,string> parameters = parser.getParameters();
98                         
99                         ValidParameters validParameter;
100                         map<string,string>::iterator it;
101                         
102                         //check to make sure all parameters are valid for command
103                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
104                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
105                         }
106                         
107                         //initialize outputTypes
108                         vector<string> tempOutNames;
109                         outputTypes["counttable"] = tempOutNames;
110                         
111                         
112                         //if the user changes the input directory command factory will send this info to us in the output parameter 
113                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
114                         if (inputDir == "not found"){   inputDir = "";          }
115                         else {
116                                 string path;
117                                 it = parameters.find("name");
118                                 //user has given a template file
119                                 if(it != parameters.end()){ 
120                                         path = m->hasPath(it->second);
121                                         //if the user has not given a path then, add inputdir. else leave path alone.
122                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
123                                 }
124                                 
125                                 it = parameters.find("group");
126                                 //user has given a template file
127                                 if(it != parameters.end()){ 
128                                         path = m->hasPath(it->second);
129                                         //if the user has not given a path then, add inputdir. else leave path alone.
130                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
131                                 }
132                         }
133                         
134                         //check for required parameters
135                         namefile = validParameter.validFile(parameters, "name", true);
136                         if (namefile == "not open") { namefile = ""; abort = true; }
137                         else if (namefile == "not found"){                                      
138                                 namefile = m->getNameFile(); 
139                                 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
140                                 else {  m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }
141                         }else { m->setNameFile(namefile); }
142                         
143                         groupfile = validParameter.validFile(parameters, "group", true);
144                         if (groupfile == "not open") { abort = true; }
145                         else if (groupfile == "not found") {  groupfile = "";  }        
146                         else { m->setGroupFile(groupfile); }
147                         
148                         groups = validParameter.validFile(parameters, "groups", false);                 
149                         if (groups == "not found") { groups = "all"; }
150                         m->splitAtDash(groups, Groups);
151             
152             string temp = validParameter.validFile(parameters, "large", false);         if (temp == "not found") {      temp = "F";     }
153                         large = m->isTrue(temp);
154                         
155                         //if the user changes the output directory command factory will send this info to us in the output parameter 
156                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(namefile);               }
157
158                 }
159                 
160         }
161         catch(exception& e) {
162                 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
163                 exit(1);
164         }
165 }
166 //**********************************************************************************************************************
167
168 int CountSeqsCommand::execute(){
169         try {
170                 
171                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
172                 
173                 string outputFileName = outputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("counttable");
174                 
175         int total = 0;
176         if (!large) { total = processSmall(outputFileName); }
177         else { total = processLarge(outputFileName);  }
178                                 
179                 if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
180                 
181         //set rabund file as new current rabundfile
182                 itTypes = outputTypes.find("counttable");
183                 if (itTypes != outputTypes.end()) {
184                         if ((itTypes->second).size() != 0) { string current = (itTypes->second)[0]; m->setCountTableFile(current); }
185                 }
186         
187         m->mothurOutEndLine();
188                 m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine();
189                 m->mothurOutEndLine();
190                 m->mothurOut("Output File Name: "); m->mothurOutEndLine();
191                 m->mothurOut(outputFileName); m->mothurOutEndLine();    
192                 m->mothurOutEndLine();
193                 
194                 return 0;               
195         }
196         
197         catch(exception& e) {
198                 m->errorOut(e, "CountSeqsCommand", "execute");
199                 exit(1);
200         }
201 }
202 //**********************************************************************************************************************
203
204 int CountSeqsCommand::processSmall(string outputFileName){
205         try {
206         ofstream out;
207         m->openOutputFile(outputFileName, out); outputTypes["counttable"].push_back(outputFileName);
208         outputNames.push_back(outputFileName); outputTypes["counttable"].push_back(outputFileName);
209                 out << "Representative_Sequence\ttotal\t";
210         
211         GroupMap* groupMap;
212                 if (groupfile != "") { 
213                         groupMap = new GroupMap(groupfile); groupMap->readMap(); 
214                         
215                         //make sure groups are valid. takes care of user setting groupNames that are invalid or setting groups=all
216                         SharedUtil* util = new SharedUtil();
217                         vector<string> nameGroups = groupMap->getNamesOfGroups();
218                         util->setGroups(Groups, nameGroups);
219                         delete util;
220                         
221                         //sort groupNames so that the group title match the counts below, this is needed because the map object automatically sorts
222                         sort(Groups.begin(), Groups.end());
223                         
224                         //print groupNames
225                         for (int i = 0; i < Groups.size(); i++) {
226                                 out << Groups[i] << '\t';
227                         }
228                 }
229                 out << endl;
230                 
231                 //open input file
232                 ifstream in;
233                 m->openInputFile(namefile, in);
234         
235                 int total = 0;
236                 while (!in.eof()) {
237                         if (m->control_pressed) { break; }
238                         
239                         string firstCol, secondCol;
240                         in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in);
241                         
242                         vector<string> names;
243                         m->splitAtChar(secondCol, names, ',');
244                         
245                         if (groupfile != "") {
246                                 //set to 0
247                                 map<string, int> groupCounts;
248                                 int total = 0;
249                                 for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
250                                 
251                                 //get counts for each of the users groups
252                                 for (int i = 0; i < names.size(); i++) {
253                                         string group = groupMap->getGroup(names[i]);
254                                         
255                                         if (group == "not found") { m->mothurOut("[ERROR]: " + names[i] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); }
256                                         else {
257                                                 map<string, int>::iterator it = groupCounts.find(group);
258                                                 
259                                                 //if not found, then this sequence is not from a group we care about
260                                                 if (it != groupCounts.end()) {
261                                                         it->second++;
262                                                         total++;
263                                                 }
264                                         }
265                                 }
266                                 
267                                 if (total != 0) {
268                                         out << firstCol << '\t' << total << '\t';
269                                         for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
270                                                 out << it->second << '\t';
271                                         }
272                                         out << endl;
273                                 }
274                         }else {
275                                 out << firstCol << '\t' << names.size() << endl;
276                         }
277                         
278                         total += names.size();
279                 }
280                 in.close();
281         out.close();
282                 
283                 if (groupfile != "") { delete groupMap; }
284
285         return total;
286     }
287         catch(exception& e) {
288                 m->errorOut(e, "CountSeqsCommand", "processSmall");
289                 exit(1);
290         }
291 }
292 //**********************************************************************************************************************
293
294 int CountSeqsCommand::processLarge(string outputFileName){
295         try {
296         set<string> namesOfGroups;
297         map<string, int> initial;
298         for (set<string>::iterator it = namesOfGroups.begin(); it != namesOfGroups.end(); it++) { initial[(*it)] = 0;  }
299         ofstream out;
300         m->openOutputFile(outputFileName, out); 
301         outputNames.push_back(outputFileName); outputTypes["counttable"].push_back(outputFileName);
302                 out << "Representative_Sequence\ttotal\t";
303         if (groupfile == "") { out << endl; }
304         
305         map<string, unsigned long long> namesToIndex;
306         string outfile = m->getRootName(groupfile) + "sorted.groups.temp";
307         string outName = m->getRootName(namefile) + "sorted.name.temp";
308         map<int, string> indexToName;
309         map<int, string> indexToGroup;
310         if (groupfile != "") { 
311             time_t estart = time(NULL);
312             //convert name file to redundant -> unique.  set unique name equal to index so we can use vectors, save name for later.
313             string newNameFile = m->getRootName(namefile) + ".name.temp";
314             string newGroupFile = m->getRootName(groupfile) + ".group.temp";
315             indexToName = processNameFile(newNameFile);
316             indexToGroup = getGroupNames(newGroupFile, namesOfGroups);
317             
318             //sort file by first column so the names of sequences will be easier to find
319             //use the unix sort 
320             #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
321                 string command = "sort -n " + newGroupFile + " -o " + outfile;
322                 system(command.c_str());
323                 command = "sort -n " + newNameFile + " -o " + outName;
324                 system(command.c_str());
325             #else //sort using windows sort
326                 string command = "sort " + newGroupFile + " /O " + outfile;
327                 system(command.c_str());
328                 command = "sort " + newNameFile + " /O " + outName;
329                 system(command.c_str());
330             #endif
331             m->mothurRemove(newNameFile);
332             m->mothurRemove(newGroupFile);
333             
334             m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to sort and index the group and name files. "); m->mothurOutEndLine();
335         }else { outName = namefile; }
336          
337         time_t estart = time(NULL);
338         //open input file
339                 ifstream in;
340                 m->openInputFile(outName, in);
341         
342         //open input file
343                 ifstream in2;
344                 
345                 int total = 0;
346         vector< vector<int> > nameMapCount;
347         if (groupfile != "") {
348             m->openInputFile(outfile, in2);
349             nameMapCount.resize(indexToName.size());
350             for (int i = 0; i < nameMapCount.size(); i++) {
351                 nameMapCount[i].resize(indexToGroup.size(), 0);
352             }
353         }
354         
355                 while (!in.eof()) {
356                         if (m->control_pressed) { break; }
357                         
358                         string firstCol;
359                         in >> firstCol;  m->gobble(in);
360                         
361                         if (groupfile != "") {
362                 int uniqueIndex;
363                 in >> uniqueIndex; m->gobble(in);
364                 
365                 string name; int groupIndex;
366                 in2 >> name >> groupIndex; m->gobble(in2);
367                 
368                 if (name != firstCol) { m->mothurOut("[ERROR]: found " + name + " in your groupfile, but " + firstCol + " was in your namefile, please correct.\n"); m->control_pressed = true; }
369                 
370                 nameMapCount[uniqueIndex][groupIndex]++;
371                 total++;
372             }else { 
373                 string secondCol;
374                 in >> secondCol; m->gobble(in);
375                 int num = m->getNumNames(secondCol);
376                 out << firstCol << '\t' << num << endl;
377                 total += num;
378             }
379                 }
380                 in.close();
381         
382         if (groupfile != "") {
383             m->mothurRemove(outfile);
384             m->mothurRemove(outName);
385             in2.close();
386             for (map<int, string>::iterator it = indexToGroup.begin(); it != indexToGroup.end(); it++) { out << it->second << '\t';  }
387             out << endl;
388             for (int i = 0; i < nameMapCount.size(); i++) {
389                 string totalsLine = "";
390                 int seqTotal = 0;
391                 for (int j = 0; j < nameMapCount[i].size(); j++) {
392                     seqTotal += nameMapCount[i][j];
393                     totalsLine += toString(nameMapCount[i][j]) + '\t';
394                 }
395                 out << indexToName[i] << '\t' << seqTotal << '\t' << totalsLine << endl;
396             }
397         }
398         
399         out.close();
400         
401         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to create the count table file. "); m->mothurOutEndLine();
402         
403         return total;
404     }
405         catch(exception& e) {
406                 m->errorOut(e, "CountSeqsCommand", "processLarge");
407                 exit(1);
408         }
409 }
410 /**************************************************************************************************/
411 map<int, string> CountSeqsCommand::processNameFile(string name) {
412         try {
413         map<int, string> indexToNames;
414         
415         ofstream out;
416         m->openOutputFile(name, out);
417         
418         //open input file
419                 ifstream in;
420                 m->openInputFile(namefile, in);
421         
422         string rest = "";
423         char buffer[4096];
424         bool pairDone = false;
425         bool columnOne = true;
426         string firstCol, secondCol;
427         int count = 0;
428         
429                 while (!in.eof()) {
430                         if (m->control_pressed) { break; }
431                         
432             in.read(buffer, 4096);
433             vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
434             
435             for (int i = 0; i < pieces.size(); i++) {
436                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
437                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
438                 
439                 if (pairDone) { 
440                     //parse names into vector
441                     vector<string> theseNames;
442                     m->splitAtComma(secondCol, theseNames);
443                     for (int i = 0; i < theseNames.size(); i++) {  out << theseNames[i] << '\t' << count << endl;  }
444                     indexToNames[count] = firstCol;
445                     pairDone = false; 
446                     count++;
447                 }
448             }
449                 }
450                 in.close();
451         out.close();
452                 
453         return indexToNames;
454     }
455         catch(exception& e) {
456                 m->errorOut(e, "CountSeqsCommand", "processNameFile");
457                 exit(1);
458         }
459 }
460 /**************************************************************************************************/
461 map<int, string> CountSeqsCommand::getGroupNames(string filename, set<string>& namesOfGroups) {
462         try {
463         map<int, string> indexToGroups;
464         map<string, int> groupIndex;
465         map<string, int>::iterator it;
466         
467         ofstream out;
468         m->openOutputFile(filename, out);
469         
470         //open input file
471                 ifstream in;
472                 m->openInputFile(groupfile, in);
473         
474         string rest = "";
475         char buffer[4096];
476         bool pairDone = false;
477         bool columnOne = true;
478         string firstCol, secondCol;
479         int count = 0;
480         
481                 while (!in.eof()) {
482                         if (m->control_pressed) { break; }
483                         
484             in.read(buffer, 4096);
485             vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
486             
487             for (int i = 0; i < pieces.size(); i++) {
488                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
489                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
490                 
491                 if (pairDone) { 
492                     it = groupIndex.find(secondCol);
493                     if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
494                         groupIndex[secondCol] = count;
495                         count++;
496                     }
497                     out << firstCol << '\t' << groupIndex[secondCol] << endl; 
498                     namesOfGroups.insert(secondCol);
499                     pairDone = false; 
500                 }
501             }
502                 }
503                 in.close();
504         out.close();
505                 
506         for (it = groupIndex.begin(); it != groupIndex.end(); it++) {  indexToGroups[it->second] = it->first;  }
507         
508         return indexToGroups;
509         }
510         catch(exception& e) {
511                 m->errorOut(e, "CountSeqsCommand", "getGroupNames");
512                 exit(1);
513         }
514 }
515 //**********************************************************************************************************************
516
517
518