]> git.donarmstrong.com Git - mothur.git/blob - countseqscommand.cpp
changed added group output to indicator command. a few changes to work with the guy
[mothur.git] / countseqscommand.cpp
1 /*
2  *  countseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 6/1/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "countseqscommand.h"
11 #include "groupmap.h"
12 #include "sharedutilities.h"
13 #include "counttable.h"
14
15 //**********************************************************************************************************************
16 vector<string> CountSeqsCommand::setParameters(){       
17         try {
18                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname);
19                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
20         CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
21                 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
22                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
23                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
24                 
25                 vector<string> myArray;
26                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
27                 return myArray;
28         }
29         catch(exception& e) {
30                 m->errorOut(e, "CountSeqsCommand", "setParameters");
31                 exit(1);
32         }
33 }
34 //**********************************************************************************************************************
35 string CountSeqsCommand::getHelpString(){       
36         try {
37                 string helpString = "";
38                 helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count_table file.  You may also provide a group file to get the counts broken down by group.\n";
39                 helpString += "The groups parameter allows you to indicate which groups you want to include in the counts, by default all groups in your groupfile are used.\n";
40         helpString += "The large parameter indicates the name and group files are too large to fit in RAM.\n";
41                 helpString += "When you use the groups parameter and a sequence does not represent any sequences from the groups you specify it is not included in the .count.summary file.\n";
42                 helpString += "The count.seqs command should be in the following format: count.seqs(name=yourNameFile).\n";
43                 helpString += "Example count.seqs(name=amazon.names) or make.table(name=amazon.names).\n";
44                 helpString += "Note: No spaces between parameter labels (i.e. name), '=' and parameters (i.e.yourNameFile).\n";
45                 return helpString;
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "CountSeqsCommand", "getHelpString");
49                 exit(1);
50         }
51 }
52 //**********************************************************************************************************************
53 string CountSeqsCommand::getOutputFileNameTag(string type, string inputName=""){        
54         try {
55         string outputFileName = "";
56                 map<string, vector<string> >::iterator it;
57         
58         //is this a type this command creates
59         it = outputTypes.find(type);
60         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
61         else {
62             if (type == "counttable") {  outputFileName =  "count_table"; }
63             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
64         }
65         return outputFileName;
66         }
67         catch(exception& e) {
68                 m->errorOut(e, "CountSeqsCommand", "getOutputFileNameTag");
69                 exit(1);
70         }
71 }
72 //**********************************************************************************************************************
73 CountSeqsCommand::CountSeqsCommand(){   
74         try {
75                 abort = true; calledHelp = true; 
76                 setParameters();
77                 vector<string> tempOutNames;
78                 outputTypes["counttable"] = tempOutNames;
79         }
80         catch(exception& e) {
81                 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
82                 exit(1);
83         }
84 }
85 //**********************************************************************************************************************
86
87 CountSeqsCommand::CountSeqsCommand(string option)  {
88         try {
89                 abort = false; calledHelp = false;   
90                 
91                 //allow user to run help
92                 if(option == "help") { help(); abort = true; calledHelp = true; }
93                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
94                 else {
95                         vector<string> myArray = setParameters();
96                         
97                         OptionParser parser(option);
98                         map<string,string> parameters = parser.getParameters();
99                         
100                         ValidParameters validParameter;
101                         map<string,string>::iterator it;
102                         
103                         //check to make sure all parameters are valid for command
104                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
105                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
106                         }
107                         
108                         //initialize outputTypes
109                         vector<string> tempOutNames;
110                         outputTypes["counttable"] = tempOutNames;
111                         
112                         
113                         //if the user changes the input directory command factory will send this info to us in the output parameter 
114                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
115                         if (inputDir == "not found"){   inputDir = "";          }
116                         else {
117                                 string path;
118                                 it = parameters.find("name");
119                                 //user has given a template file
120                                 if(it != parameters.end()){ 
121                                         path = m->hasPath(it->second);
122                                         //if the user has not given a path then, add inputdir. else leave path alone.
123                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
124                                 }
125                                 
126                                 it = parameters.find("group");
127                                 //user has given a template file
128                                 if(it != parameters.end()){ 
129                                         path = m->hasPath(it->second);
130                                         //if the user has not given a path then, add inputdir. else leave path alone.
131                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
132                                 }
133                         }
134                         
135                         //check for required parameters
136                         namefile = validParameter.validFile(parameters, "name", true);
137                         if (namefile == "not open") { namefile = ""; abort = true; }
138                         else if (namefile == "not found"){                                      
139                                 namefile = m->getNameFile(); 
140                                 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
141                                 else {  m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }
142                         }else { m->setNameFile(namefile); }
143                         
144                         groupfile = validParameter.validFile(parameters, "group", true);
145                         if (groupfile == "not open") { abort = true; }
146                         else if (groupfile == "not found") {  groupfile = "";  }        
147                         else { m->setGroupFile(groupfile); }
148                         
149                         groups = validParameter.validFile(parameters, "groups", false);                 
150                         if (groups == "not found") { groups = "all"; }
151                         m->splitAtDash(groups, Groups);
152             
153             string temp = validParameter.validFile(parameters, "large", false);         if (temp == "not found") {      temp = "F";     }
154                         large = m->isTrue(temp);
155                         
156                         //if the user changes the output directory command factory will send this info to us in the output parameter 
157                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(namefile);               }
158
159                 }
160                 
161         }
162         catch(exception& e) {
163                 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
164                 exit(1);
165         }
166 }
167 //**********************************************************************************************************************
168
169 int CountSeqsCommand::execute(){
170         try {
171                 
172                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
173                 
174                 string outputFileName = outputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("counttable");
175                 
176         int total = 0;
177         if (!large) { total = processSmall(outputFileName); }
178         else { total = processLarge(outputFileName);  }
179         
180                 if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
181                 
182         //set rabund file as new current rabundfile
183                 itTypes = outputTypes.find("counttable");
184                 if (itTypes != outputTypes.end()) {
185                         if ((itTypes->second).size() != 0) { string current = (itTypes->second)[0]; m->setCountTableFile(current); }
186                 }
187         
188         m->mothurOutEndLine();
189                 m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine();
190                 m->mothurOutEndLine();
191                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
192                 m->mothurOut(outputFileName); m->mothurOutEndLine();    
193                 m->mothurOutEndLine();
194                 
195                 return 0;               
196         }
197         
198         catch(exception& e) {
199                 m->errorOut(e, "CountSeqsCommand", "execute");
200                 exit(1);
201         }
202 }
203 //**********************************************************************************************************************
204
205 int CountSeqsCommand::processSmall(string outputFileName){
206         try {
207         ofstream out;
208         m->openOutputFile(outputFileName, out); outputTypes["counttable"].push_back(outputFileName);
209         outputNames.push_back(outputFileName); outputTypes["counttable"].push_back(outputFileName);
210                 out << "Representative_Sequence\ttotal\t";
211         
212         GroupMap* groupMap;
213                 if (groupfile != "") { 
214                         groupMap = new GroupMap(groupfile); groupMap->readMap(); 
215                         
216                         //make sure groups are valid. takes care of user setting groupNames that are invalid or setting groups=all
217                         SharedUtil* util = new SharedUtil();
218                         vector<string> nameGroups = groupMap->getNamesOfGroups();
219                         util->setGroups(Groups, nameGroups);
220                         delete util;
221                         
222                         //sort groupNames so that the group title match the counts below, this is needed because the map object automatically sorts
223                         sort(Groups.begin(), Groups.end());
224                         
225                         //print groupNames
226                         for (int i = 0; i < Groups.size(); i++) {
227                                 out << Groups[i] << '\t';
228                         }
229                 }
230                 out << endl;
231                 
232                 //open input file
233                 ifstream in;
234                 m->openInputFile(namefile, in);
235         
236                 int total = 0;
237                 while (!in.eof()) {
238                         if (m->control_pressed) { break; }
239                         
240                         string firstCol, secondCol;
241                         in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in);
242                         
243                         vector<string> names;
244                         m->splitAtChar(secondCol, names, ',');
245                         
246                         if (groupfile != "") {
247                                 //set to 0
248                                 map<string, int> groupCounts;
249                                 int total = 0;
250                                 for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
251                                 
252                                 //get counts for each of the users groups
253                                 for (int i = 0; i < names.size(); i++) {
254                                         string group = groupMap->getGroup(names[i]);
255                                         
256                                         if (group == "not found") { m->mothurOut("[ERROR]: " + names[i] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); }
257                                         else {
258                                                 map<string, int>::iterator it = groupCounts.find(group);
259                                                 
260                                                 //if not found, then this sequence is not from a group we care about
261                                                 if (it != groupCounts.end()) {
262                                                         it->second++;
263                                                         total++;
264                                                 }
265                                         }
266                                 }
267                                 
268                                 if (total != 0) {
269                                         out << firstCol << '\t' << total << '\t';
270                                         for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
271                                                 out << it->second << '\t';
272                                         }
273                                         out << endl;
274                                 }
275                         }else {
276                                 out << firstCol << '\t' << names.size() << endl;
277                         }
278                         
279                         total += names.size();
280                 }
281                 in.close();
282         out.close();
283                 
284                 if (groupfile != "") { delete groupMap; }
285
286         return total;
287     }
288         catch(exception& e) {
289                 m->errorOut(e, "CountSeqsCommand", "processSmall");
290                 exit(1);
291         }
292 }
293 //**********************************************************************************************************************
294
295 int CountSeqsCommand::processLarge(string outputFileName){
296         try {
297         set<string> namesOfGroups;
298         map<string, int> initial;
299         for (set<string>::iterator it = namesOfGroups.begin(); it != namesOfGroups.end(); it++) { initial[(*it)] = 0;  }
300         ofstream out;
301         m->openOutputFile(outputFileName, out); 
302         outputNames.push_back(outputFileName); outputTypes["counttable"].push_back(outputFileName);
303                 out << "Representative_Sequence\ttotal\t";
304         if (groupfile == "") { out << endl; }
305         
306         map<string, unsigned long long> namesToIndex;
307         string outfile = m->getRootName(groupfile) + "sorted.groups.temp";
308         string outName = m->getRootName(namefile) + "sorted.name.temp";
309         map<int, string> indexToName;
310         map<int, string> indexToGroup;
311         if (groupfile != "") { 
312             time_t estart = time(NULL);
313             //convert name file to redundant -> unique.  set unique name equal to index so we can use vectors, save name for later.
314             string newNameFile = m->getRootName(namefile) + ".name.temp";
315             string newGroupFile = m->getRootName(groupfile) + ".group.temp";
316             indexToName = processNameFile(newNameFile);
317             indexToGroup = getGroupNames(newGroupFile, namesOfGroups);
318             
319             //sort file by first column so the names of sequences will be easier to find
320             //use the unix sort 
321             #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
322                 string command = "sort -n " + newGroupFile + " -o " + outfile;
323                 system(command.c_str());
324                 command = "sort -n " + newNameFile + " -o " + outName;
325                 system(command.c_str());
326             #else //sort using windows sort
327                 string command = "sort " + newGroupFile + " /O " + outfile;
328                 system(command.c_str());
329                 command = "sort " + newNameFile + " /O " + outName;
330                 system(command.c_str());
331             #endif
332             m->mothurRemove(newNameFile);
333             m->mothurRemove(newGroupFile);
334             
335             m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to sort and index the group and name files. "); m->mothurOutEndLine();
336         }else { outName = namefile; }
337          
338         time_t estart = time(NULL);
339         //open input file
340                 ifstream in;
341                 m->openInputFile(outName, in);
342         
343         //open input file
344                 ifstream in2;
345                 
346                 int total = 0;
347         vector< vector<int> > nameMapCount;
348         if (groupfile != "") {
349             m->openInputFile(outfile, in2);
350             nameMapCount.resize(indexToName.size());
351             for (int i = 0; i < nameMapCount.size(); i++) {
352                 nameMapCount[i].resize(indexToGroup.size(), 0);
353             }
354         }
355         
356                 while (!in.eof()) {
357                         if (m->control_pressed) { break; }
358                         
359                         string firstCol;
360                         in >> firstCol;  m->gobble(in);
361                         
362                         if (groupfile != "") {
363                 int uniqueIndex;
364                 in >> uniqueIndex; m->gobble(in);
365                 
366                 string name; int groupIndex;
367                 in2 >> name >> groupIndex; m->gobble(in2);
368                 
369                 if (name != firstCol) { m->mothurOut("[ERROR]: found " + name + " in your groupfile, but " + firstCol + " was in your namefile, please correct.\n"); m->control_pressed = true; }
370                 
371                 nameMapCount[uniqueIndex][groupIndex]++;
372                 total++;
373             }else { 
374                 string secondCol;
375                 in >> secondCol; m->gobble(in);
376                 int num = m->getNumNames(secondCol);
377                 out << firstCol << '\t' << num << endl;
378                 total += num;
379             }
380                 }
381                 in.close();
382         
383         if (groupfile != "") {
384             m->mothurRemove(outfile);
385             m->mothurRemove(outName);
386             in2.close();
387             for (map<int, string>::iterator it = indexToGroup.begin(); it != indexToGroup.end(); it++) { out << it->second << '\t';  }
388             out << endl;
389             for (int i = 0; i < nameMapCount.size(); i++) {
390                 string totalsLine = "";
391                 int seqTotal = 0;
392                 for (int j = 0; j < nameMapCount[i].size(); j++) {
393                     seqTotal += nameMapCount[i][j];
394                     totalsLine += toString(nameMapCount[i][j]) + '\t';
395                 }
396                 out << indexToName[i] << '\t' << seqTotal << '\t' << totalsLine << endl;
397             }
398         }
399         
400         out.close();
401         
402         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to create the count table file. "); m->mothurOutEndLine();
403         
404         return total;
405     }
406         catch(exception& e) {
407                 m->errorOut(e, "CountSeqsCommand", "processLarge");
408                 exit(1);
409         }
410 }
411 /**************************************************************************************************/
412 map<int, string> CountSeqsCommand::processNameFile(string name) {
413         try {
414         map<int, string> indexToNames;
415         
416         ofstream out;
417         m->openOutputFile(name, out);
418         
419         //open input file
420                 ifstream in;
421                 m->openInputFile(namefile, in);
422         
423         string rest = "";
424         char buffer[4096];
425         bool pairDone = false;
426         bool columnOne = true;
427         string firstCol, secondCol;
428         int count = 0;
429         
430                 while (!in.eof()) {
431                         if (m->control_pressed) { break; }
432                         
433             in.read(buffer, 4096);
434             vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
435             
436             for (int i = 0; i < pieces.size(); i++) {
437                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
438                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
439                 
440                 if (pairDone) { 
441                     //parse names into vector
442                     vector<string> theseNames;
443                     m->splitAtComma(secondCol, theseNames);
444                     for (int i = 0; i < theseNames.size(); i++) {  out << theseNames[i] << '\t' << count << endl;  }
445                     indexToNames[count] = firstCol;
446                     pairDone = false; 
447                     count++;
448                 }
449             }
450                 }
451                 in.close();
452         out.close();
453                 
454         if (rest != "") {
455             vector<string> pieces = m->splitWhiteSpace(rest);
456             
457             for (int i = 0; i < pieces.size(); i++) {
458                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
459                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
460                 
461                 if (pairDone) { 
462                     //parse names into vector
463                     vector<string> theseNames;
464                     m->splitAtComma(secondCol, theseNames);
465                     for (int i = 0; i < theseNames.size(); i++) {  out << theseNames[i] << '\t' << count << endl;  }
466                     indexToNames[count] = firstCol;
467                     pairDone = false; 
468                     count++;
469                 }
470             }
471
472         }
473         
474         return indexToNames;
475     }
476         catch(exception& e) {
477                 m->errorOut(e, "CountSeqsCommand", "processNameFile");
478                 exit(1);
479         }
480 }
481 /**************************************************************************************************/
482 map<int, string> CountSeqsCommand::getGroupNames(string filename, set<string>& namesOfGroups) {
483         try {
484         map<int, string> indexToGroups;
485         map<string, int> groupIndex;
486         map<string, int>::iterator it;
487         
488         ofstream out;
489         m->openOutputFile(filename, out);
490         
491         //open input file
492                 ifstream in;
493                 m->openInputFile(groupfile, in);
494         
495         string rest = "";
496         char buffer[4096];
497         bool pairDone = false;
498         bool columnOne = true;
499         string firstCol, secondCol;
500         int count = 0;
501         
502                 while (!in.eof()) {
503                         if (m->control_pressed) { break; }
504                         
505             in.read(buffer, 4096);
506             vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
507             
508             for (int i = 0; i < pieces.size(); i++) {
509                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
510                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
511                 
512                 if (pairDone) { 
513                     it = groupIndex.find(secondCol);
514                     if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
515                         groupIndex[secondCol] = count;
516                         count++;
517                     }
518                     out << firstCol << '\t' << groupIndex[secondCol] << endl; 
519                     namesOfGroups.insert(secondCol);
520                     pairDone = false; 
521                 }
522             }
523                 }
524                 in.close();
525         out.close();
526         
527         if (rest != "") {
528             vector<string> pieces = m->splitWhiteSpace(rest);
529             
530             for (int i = 0; i < pieces.size(); i++) {
531                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
532                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
533                 
534                 if (pairDone) { 
535                     it = groupIndex.find(secondCol);
536                     if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
537                         groupIndex[secondCol] = count;
538                         count++;
539                     }
540                     out << firstCol << '\t' << groupIndex[secondCol] << endl; 
541                     namesOfGroups.insert(secondCol);
542                     pairDone = false; 
543                 }
544             }
545         }
546                 
547         for (it = groupIndex.begin(); it != groupIndex.end(); it++) {  indexToGroups[it->second] = it->first;  }
548         
549         return indexToGroups;
550         }
551         catch(exception& e) {
552                 m->errorOut(e, "CountSeqsCommand", "getGroupNames");
553                 exit(1);
554         }
555 }
556 //**********************************************************************************************************************
557
558
559