]> git.donarmstrong.com Git - mothur.git/blob - countseqscommand.cpp
Merge remote-tracking branch 'origin/master'
[mothur.git] / countseqscommand.cpp
1 /*
2  *  countseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 6/1/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "countseqscommand.h"
11 #include "groupmap.h"
12 #include "sharedutilities.h"
13 #include "counttable.h"
14
15 //**********************************************************************************************************************
16 vector<string> CountSeqsCommand::setParameters(){       
17         try {
18                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none","count",false,true,true); parameters.push_back(pname);
19                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(pgroup);
20         CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
21                 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
22                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
23                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
24                 
25                 vector<string> myArray;
26                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
27                 return myArray;
28         }
29         catch(exception& e) {
30                 m->errorOut(e, "CountSeqsCommand", "setParameters");
31                 exit(1);
32         }
33 }
34 //**********************************************************************************************************************
35 string CountSeqsCommand::getHelpString(){       
36         try {
37                 string helpString = "";
38                 helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count_table file.  You may also provide a group file to get the counts broken down by group.\n";
39                 helpString += "The groups parameter allows you to indicate which groups you want to include in the counts, by default all groups in your groupfile are used.\n";
40         helpString += "The large parameter indicates the name and group files are too large to fit in RAM.\n";
41                 helpString += "When you use the groups parameter and a sequence does not represent any sequences from the groups you specify it is not included in the .count.summary file.\n";
42                 helpString += "The count.seqs command should be in the following format: count.seqs(name=yourNameFile).\n";
43                 helpString += "Example count.seqs(name=amazon.names) or make.table(name=amazon.names).\n";
44                 helpString += "Note: No spaces between parameter labels (i.e. name), '=' and parameters (i.e.yourNameFile).\n";
45                 return helpString;
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "CountSeqsCommand", "getHelpString");
49                 exit(1);
50         }
51 }
52 //**********************************************************************************************************************
53 string CountSeqsCommand::getOutputPattern(string type) {
54     try {
55         string pattern = "";
56         
57         if (type == "count") {  pattern = "[filename],count_table"; }
58         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
59         
60         return pattern;
61     }
62     catch(exception& e) {
63         m->errorOut(e, "CountSeqsCommand", "getOutputPattern");
64         exit(1);
65     }
66 }
67 //**********************************************************************************************************************
68 CountSeqsCommand::CountSeqsCommand(){   
69         try {
70                 abort = true; calledHelp = true; 
71                 setParameters();
72                 vector<string> tempOutNames;
73                 outputTypes["count"] = tempOutNames;
74         }
75         catch(exception& e) {
76                 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
77                 exit(1);
78         }
79 }
80 //**********************************************************************************************************************
81
82 CountSeqsCommand::CountSeqsCommand(string option)  {
83         try {
84                 abort = false; calledHelp = false;   
85                 
86                 //allow user to run help
87                 if(option == "help") { help(); abort = true; calledHelp = true; }
88                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
89                 else {
90                         vector<string> myArray = setParameters();
91                         
92                         OptionParser parser(option);
93                         map<string,string> parameters = parser.getParameters();
94                         
95                         ValidParameters validParameter;
96                         map<string,string>::iterator it;
97                         
98                         //check to make sure all parameters are valid for command
99                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
100                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
101                         }
102                         
103                         //initialize outputTypes
104                         vector<string> tempOutNames;
105                         outputTypes["count"] = tempOutNames;
106                         
107                         
108                         //if the user changes the input directory command factory will send this info to us in the output parameter 
109                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
110                         if (inputDir == "not found"){   inputDir = "";          }
111                         else {
112                                 string path;
113                                 it = parameters.find("name");
114                                 //user has given a template file
115                                 if(it != parameters.end()){ 
116                                         path = m->hasPath(it->second);
117                                         //if the user has not given a path then, add inputdir. else leave path alone.
118                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
119                                 }
120                                 
121                                 it = parameters.find("group");
122                                 //user has given a template file
123                                 if(it != parameters.end()){ 
124                                         path = m->hasPath(it->second);
125                                         //if the user has not given a path then, add inputdir. else leave path alone.
126                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
127                                 }
128                         }
129                         
130                         //check for required parameters
131                         namefile = validParameter.validFile(parameters, "name", true);
132                         if (namefile == "not open") { namefile = ""; abort = true; }
133                         else if (namefile == "not found"){                                      
134                                 namefile = m->getNameFile(); 
135                                 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
136                                 else {  m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }
137                         }else { m->setNameFile(namefile); }
138                         
139                         groupfile = validParameter.validFile(parameters, "group", true);
140                         if (groupfile == "not open") { abort = true; }
141                         else if (groupfile == "not found") {  groupfile = "";  }        
142                         else { m->setGroupFile(groupfile); }
143                         
144                         groups = validParameter.validFile(parameters, "groups", false);                 
145                         if (groups == "not found") { groups = "all"; }
146                         m->splitAtDash(groups, Groups);
147             
148             string temp = validParameter.validFile(parameters, "large", false);         if (temp == "not found") {      temp = "F";     }
149                         large = m->isTrue(temp);
150                         
151                         //if the user changes the output directory command factory will send this info to us in the output parameter 
152                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(namefile);               }
153
154                 }
155                 
156         }
157         catch(exception& e) {
158                 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
159                 exit(1);
160         }
161 }
162 //**********************************************************************************************************************
163
164 int CountSeqsCommand::execute(){
165         try {
166                 
167                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
168                 
169         map<string, string> variables; 
170         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile));
171                 string outputFileName = getOutputFileName("count", variables);
172                 
173         int total = 0;
174         if (!large) { total = processSmall(outputFileName); }
175         else { total = processLarge(outputFileName);  }
176         
177                 if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
178                 
179         //set rabund file as new current rabundfile
180                 itTypes = outputTypes.find("count");
181                 if (itTypes != outputTypes.end()) {
182                         if ((itTypes->second).size() != 0) { string current = (itTypes->second)[0]; m->setCountTableFile(current); }
183                 }
184         
185         m->mothurOutEndLine();
186                 m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine();
187                 m->mothurOutEndLine();
188                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
189                 m->mothurOut(outputFileName); m->mothurOutEndLine();    
190                 m->mothurOutEndLine();
191                 
192                 return 0;               
193         }
194         
195         catch(exception& e) {
196                 m->errorOut(e, "CountSeqsCommand", "execute");
197                 exit(1);
198         }
199 }
200 //**********************************************************************************************************************
201
202 int CountSeqsCommand::processSmall(string outputFileName){
203         try {
204         ofstream out;
205         m->openOutputFile(outputFileName, out); outputTypes["count"].push_back(outputFileName);
206         outputNames.push_back(outputFileName); outputTypes["count"].push_back(outputFileName);
207                 out << "Representative_Sequence\ttotal\t";
208         
209         GroupMap* groupMap;
210                 if (groupfile != "") { 
211                         groupMap = new GroupMap(groupfile); groupMap->readMap(); 
212                         
213                         //make sure groups are valid. takes care of user setting groupNames that are invalid or setting groups=all
214                         SharedUtil* util = new SharedUtil();
215                         vector<string> nameGroups = groupMap->getNamesOfGroups();
216                         util->setGroups(Groups, nameGroups);
217                         delete util;
218                         
219                         //sort groupNames so that the group title match the counts below, this is needed because the map object automatically sorts
220                         sort(Groups.begin(), Groups.end());
221                         
222                         //print groupNames
223                         for (int i = 0; i < Groups.size(); i++) {
224                                 out << Groups[i] << '\t';
225                         }
226                 }
227                 out << endl;
228                 
229                 //open input file
230                 ifstream in;
231                 m->openInputFile(namefile, in);
232         
233                 int total = 0;
234                 while (!in.eof()) {
235                         if (m->control_pressed) { break; }
236                         
237                         string firstCol, secondCol;
238                         in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in);
239                         
240                         vector<string> names;
241                         m->splitAtChar(secondCol, names, ',');
242                         
243                         if (groupfile != "") {
244                                 //set to 0
245                                 map<string, int> groupCounts;
246                                 int total = 0;
247                                 for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
248                                 
249                                 //get counts for each of the users groups
250                                 for (int i = 0; i < names.size(); i++) {
251                                         string group = groupMap->getGroup(names[i]);
252                                         
253                                         if (group == "not found") { m->mothurOut("[ERROR]: " + names[i] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); }
254                                         else {
255                                                 map<string, int>::iterator it = groupCounts.find(group);
256                                                 
257                                                 //if not found, then this sequence is not from a group we care about
258                                                 if (it != groupCounts.end()) {
259                                                         it->second++;
260                                                         total++;
261                                                 }
262                                         }
263                                 }
264                                 
265                                 if (total != 0) {
266                                         out << firstCol << '\t' << total << '\t';
267                                         for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
268                                                 out << it->second << '\t';
269                                         }
270                                         out << endl;
271                                 }
272                         }else {
273                                 out << firstCol << '\t' << names.size() << endl;
274                         }
275                         
276                         total += names.size();
277                 }
278                 in.close();
279         out.close();
280                 
281                 if (groupfile != "") { delete groupMap; }
282
283         return total;
284     }
285         catch(exception& e) {
286                 m->errorOut(e, "CountSeqsCommand", "processSmall");
287                 exit(1);
288         }
289 }
290 //**********************************************************************************************************************
291
292 int CountSeqsCommand::processLarge(string outputFileName){
293         try {
294         set<string> namesOfGroups;
295         map<string, int> initial;
296         for (set<string>::iterator it = namesOfGroups.begin(); it != namesOfGroups.end(); it++) { initial[(*it)] = 0;  }
297         ofstream out;
298         m->openOutputFile(outputFileName, out); 
299         outputNames.push_back(outputFileName); outputTypes["count"].push_back(outputFileName);
300                 out << "Representative_Sequence\ttotal\t";
301         if (groupfile == "") { out << endl; }
302         
303         map<string, unsigned long long> namesToIndex;
304         string outfile = m->getRootName(groupfile) + "sorted.groups.temp";
305         string outName = m->getRootName(namefile) + "sorted.name.temp";
306         map<int, string> indexToName;
307         map<int, string> indexToGroup;
308         if (groupfile != "") { 
309             time_t estart = time(NULL);
310             //convert name file to redundant -> unique.  set unique name equal to index so we can use vectors, save name for later.
311             string newNameFile = m->getRootName(namefile) + ".name.temp";
312             string newGroupFile = m->getRootName(groupfile) + ".group.temp";
313             indexToName = processNameFile(newNameFile);
314             indexToGroup = getGroupNames(newGroupFile, namesOfGroups);
315             
316             //sort file by first column so the names of sequences will be easier to find
317             //use the unix sort 
318             #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
319                 string command = "sort -n " + newGroupFile + " -o " + outfile;
320                 system(command.c_str());
321                 command = "sort -n " + newNameFile + " -o " + outName;
322                 system(command.c_str());
323             #else //sort using windows sort
324                 string command = "sort " + newGroupFile + " /O " + outfile;
325                 system(command.c_str());
326                 command = "sort " + newNameFile + " /O " + outName;
327                 system(command.c_str());
328             #endif
329             m->mothurRemove(newNameFile);
330             m->mothurRemove(newGroupFile);
331             
332             m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to sort and index the group and name files. "); m->mothurOutEndLine();
333         }else { outName = namefile; }
334          
335         time_t estart = time(NULL);
336         //open input file
337                 ifstream in;
338                 m->openInputFile(outName, in);
339         
340         //open input file
341                 ifstream in2;
342                 
343                 int total = 0;
344         vector< vector<int> > nameMapCount;
345         if (groupfile != "") {
346             m->openInputFile(outfile, in2);
347             nameMapCount.resize(indexToName.size());
348             for (int i = 0; i < nameMapCount.size(); i++) {
349                 nameMapCount[i].resize(indexToGroup.size(), 0);
350             }
351         }
352         
353                 while (!in.eof()) {
354                         if (m->control_pressed) { break; }
355                         
356                         string firstCol;
357                         in >> firstCol;  m->gobble(in);
358                         
359                         if (groupfile != "") {
360                 int uniqueIndex;
361                 in >> uniqueIndex; m->gobble(in);
362                 
363                 string name; int groupIndex;
364                 in2 >> name >> groupIndex; m->gobble(in2);
365                 
366                 if (name != firstCol) { m->mothurOut("[ERROR]: found " + name + " in your groupfile, but " + firstCol + " was in your namefile, please correct.\n"); m->control_pressed = true; }
367                 
368                 nameMapCount[uniqueIndex][groupIndex]++;
369                 total++;
370             }else { 
371                 string secondCol;
372                 in >> secondCol; m->gobble(in);
373                 int num = m->getNumNames(secondCol);
374                 out << firstCol << '\t' << num << endl;
375                 total += num;
376             }
377                 }
378                 in.close();
379         
380         if (groupfile != "") {
381             m->mothurRemove(outfile);
382             m->mothurRemove(outName);
383             in2.close();
384             for (map<int, string>::iterator it = indexToGroup.begin(); it != indexToGroup.end(); it++) { out << it->second << '\t';  }
385             out << endl;
386             for (int i = 0; i < nameMapCount.size(); i++) {
387                 string totalsLine = "";
388                 int seqTotal = 0;
389                 for (int j = 0; j < nameMapCount[i].size(); j++) {
390                     seqTotal += nameMapCount[i][j];
391                     totalsLine += toString(nameMapCount[i][j]) + '\t';
392                 }
393                 out << indexToName[i] << '\t' << seqTotal << '\t' << totalsLine << endl;
394             }
395         }
396         
397         out.close();
398         
399         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to create the count table file. "); m->mothurOutEndLine();
400         
401         return total;
402     }
403         catch(exception& e) {
404                 m->errorOut(e, "CountSeqsCommand", "processLarge");
405                 exit(1);
406         }
407 }
408 /**************************************************************************************************/
409 map<int, string> CountSeqsCommand::processNameFile(string name) {
410         try {
411         map<int, string> indexToNames;
412         
413         ofstream out;
414         m->openOutputFile(name, out);
415         
416         //open input file
417                 ifstream in;
418                 m->openInputFile(namefile, in);
419         
420         string rest = "";
421         char buffer[4096];
422         bool pairDone = false;
423         bool columnOne = true;
424         string firstCol, secondCol;
425         int count = 0;
426         
427                 while (!in.eof()) {
428                         if (m->control_pressed) { break; }
429                         
430             in.read(buffer, 4096);
431             vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
432             
433             for (int i = 0; i < pieces.size(); i++) {
434                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
435                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
436                 
437                 if (pairDone) { 
438                     //parse names into vector
439                     vector<string> theseNames;
440                     m->splitAtComma(secondCol, theseNames);
441                     for (int i = 0; i < theseNames.size(); i++) {  out << theseNames[i] << '\t' << count << endl;  }
442                     indexToNames[count] = firstCol;
443                     pairDone = false; 
444                     count++;
445                 }
446             }
447                 }
448                 in.close();
449         out.close();
450                 
451         if (rest != "") {
452             vector<string> pieces = m->splitWhiteSpace(rest);
453             
454             for (int i = 0; i < pieces.size(); i++) {
455                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
456                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
457                 
458                 if (pairDone) { 
459                     //parse names into vector
460                     vector<string> theseNames;
461                     m->splitAtComma(secondCol, theseNames);
462                     for (int i = 0; i < theseNames.size(); i++) {  out << theseNames[i] << '\t' << count << endl;  }
463                     indexToNames[count] = firstCol;
464                     pairDone = false; 
465                     count++;
466                 }
467             }
468
469         }
470         
471         return indexToNames;
472     }
473         catch(exception& e) {
474                 m->errorOut(e, "CountSeqsCommand", "processNameFile");
475                 exit(1);
476         }
477 }
478 /**************************************************************************************************/
479 map<int, string> CountSeqsCommand::getGroupNames(string filename, set<string>& namesOfGroups) {
480         try {
481         map<int, string> indexToGroups;
482         map<string, int> groupIndex;
483         map<string, int>::iterator it;
484         
485         ofstream out;
486         m->openOutputFile(filename, out);
487         
488         //open input file
489                 ifstream in;
490                 m->openInputFile(groupfile, in);
491         
492         string rest = "";
493         char buffer[4096];
494         bool pairDone = false;
495         bool columnOne = true;
496         string firstCol, secondCol;
497         int count = 0;
498         
499                 while (!in.eof()) {
500                         if (m->control_pressed) { break; }
501                         
502             in.read(buffer, 4096);
503             vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
504             
505             for (int i = 0; i < pieces.size(); i++) {
506                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
507                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
508                 
509                 if (pairDone) { 
510                     it = groupIndex.find(secondCol);
511                     if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
512                         groupIndex[secondCol] = count;
513                         count++;
514                     }
515                     out << firstCol << '\t' << groupIndex[secondCol] << endl; 
516                     namesOfGroups.insert(secondCol);
517                     pairDone = false; 
518                 }
519             }
520                 }
521                 in.close();
522         out.close();
523         
524         if (rest != "") {
525             vector<string> pieces = m->splitWhiteSpace(rest);
526             
527             for (int i = 0; i < pieces.size(); i++) {
528                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
529                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
530                 
531                 if (pairDone) { 
532                     it = groupIndex.find(secondCol);
533                     if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
534                         groupIndex[secondCol] = count;
535                         count++;
536                     }
537                     out << firstCol << '\t' << groupIndex[secondCol] << endl; 
538                     namesOfGroups.insert(secondCol);
539                     pairDone = false; 
540                 }
541             }
542         }
543                 
544         for (it = groupIndex.begin(); it != groupIndex.end(); it++) {  indexToGroups[it->second] = it->first;  }
545         
546         return indexToGroups;
547         }
548         catch(exception& e) {
549                 m->errorOut(e, "CountSeqsCommand", "getGroupNames");
550                 exit(1);
551         }
552 }
553 //**********************************************************************************************************************
554
555
556