]> git.donarmstrong.com Git - mothur.git/blob - countseqscommand.cpp
sffinfo bug with flow grams right index when clipQualRight=0
[mothur.git] / countseqscommand.cpp
1 /*
2  *  countseqscommand.cpp
3  *  Mothur
4  *
5  *  Created by westcott on 6/1/11.
6  *  Copyright 2011 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "countseqscommand.h"
11 #include "groupmap.h"
12 #include "sharedutilities.h"
13 #include "counttable.h"
14
15 //**********************************************************************************************************************
16 vector<string> CountSeqsCommand::setParameters(){       
17         try {
18                 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none","count",false,true,true); parameters.push_back(pname);
19                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(pgroup);
20         CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
21                 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
22                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
23                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
24                 
25                 vector<string> myArray;
26                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
27                 return myArray;
28         }
29         catch(exception& e) {
30                 m->errorOut(e, "CountSeqsCommand", "setParameters");
31                 exit(1);
32         }
33 }
34 //**********************************************************************************************************************
35 string CountSeqsCommand::getHelpString(){       
36         try {
37                 string helpString = "";
38                 helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count_table file.  You may also provide a group file to get the counts broken down by group.\n";
39                 helpString += "The groups parameter allows you to indicate which groups you want to include in the counts, by default all groups in your groupfile are used.\n";
40         helpString += "The large parameter indicates the name and group files are too large to fit in RAM.\n";
41                 helpString += "When you use the groups parameter and a sequence does not represent any sequences from the groups you specify it is not included in the .count.summary file.\n";
42                 helpString += "The count.seqs command should be in the following format: count.seqs(name=yourNameFile).\n";
43                 helpString += "Example count.seqs(name=amazon.names) or make.table(name=amazon.names).\n";
44                 helpString += "Note: No spaces between parameter labels (i.e. name), '=' and parameters (i.e.yourNameFile).\n";
45                 return helpString;
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "CountSeqsCommand", "getHelpString");
49                 exit(1);
50         }
51 }
52 //**********************************************************************************************************************
53 string CountSeqsCommand::getOutputPattern(string type) {
54     try {
55         string pattern = "";
56         
57         if (type == "count") {  pattern = "[filename],count_table"; }
58         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
59         
60         return pattern;
61     }
62     catch(exception& e) {
63         m->errorOut(e, "CountSeqsCommand", "getOutputPattern");
64         exit(1);
65     }
66 }
67 //**********************************************************************************************************************
68 CountSeqsCommand::CountSeqsCommand(){   
69         try {
70                 abort = true; calledHelp = true; 
71                 setParameters();
72                 vector<string> tempOutNames;
73                 outputTypes["count"] = tempOutNames;
74         }
75         catch(exception& e) {
76                 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
77                 exit(1);
78         }
79 }
80 //**********************************************************************************************************************
81
82 CountSeqsCommand::CountSeqsCommand(string option)  {
83         try {
84                 abort = false; calledHelp = false;   
85                 
86                 //allow user to run help
87                 if(option == "help") { help(); abort = true; calledHelp = true; }
88                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
89                 else {
90                         vector<string> myArray = setParameters();
91                         
92                         OptionParser parser(option);
93                         map<string,string> parameters = parser.getParameters();
94                         
95                         ValidParameters validParameter;
96                         map<string,string>::iterator it;
97                         
98                         //check to make sure all parameters are valid for command
99                         for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
100                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
101                         }
102                         
103                         //initialize outputTypes
104                         vector<string> tempOutNames;
105                         outputTypes["count"] = tempOutNames;
106                         
107                         
108                         //if the user changes the input directory command factory will send this info to us in the output parameter 
109                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
110                         if (inputDir == "not found"){   inputDir = "";          }
111                         else {
112                                 string path;
113                                 it = parameters.find("name");
114                                 //user has given a template file
115                                 if(it != parameters.end()){ 
116                                         path = m->hasPath(it->second);
117                                         //if the user has not given a path then, add inputdir. else leave path alone.
118                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
119                                 }
120                                 
121                                 it = parameters.find("group");
122                                 //user has given a template file
123                                 if(it != parameters.end()){ 
124                                         path = m->hasPath(it->second);
125                                         //if the user has not given a path then, add inputdir. else leave path alone.
126                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
127                                 }
128                         }
129                         
130                         //check for required parameters
131                         namefile = validParameter.validFile(parameters, "name", true);
132                         if (namefile == "not open") { namefile = ""; abort = true; }
133                         else if (namefile == "not found"){                                      
134                                 namefile = m->getNameFile(); 
135                                 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
136                                 else {  m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }
137                         }else { m->setNameFile(namefile); }
138                         
139                         groupfile = validParameter.validFile(parameters, "group", true);
140                         if (groupfile == "not open") { abort = true; }
141                         else if (groupfile == "not found") {  groupfile = "";  }        
142                         else { m->setGroupFile(groupfile); }
143                         
144                         groups = validParameter.validFile(parameters, "groups", false);                 
145                         if (groups == "not found") { groups = "all"; }
146                         m->splitAtDash(groups, Groups);
147             
148             string temp = validParameter.validFile(parameters, "large", false);         if (temp == "not found") {      temp = "F";     }
149                         large = m->isTrue(temp);
150                         
151                         //if the user changes the output directory command factory will send this info to us in the output parameter 
152                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(namefile);               }
153
154                 }
155                 
156         }
157         catch(exception& e) {
158                 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
159                 exit(1);
160         }
161 }
162 //**********************************************************************************************************************
163
164 int CountSeqsCommand::execute(){
165         try {
166                 
167                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
168                 
169         map<string, string> variables; 
170         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile));
171                 string outputFileName = getOutputFileName("count", variables);
172                 
173         int total = 0;
174         if (!large) { total = processSmall(outputFileName); }
175         else { total = processLarge(outputFileName);  }
176         
177                 if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
178                 
179         //set rabund file as new current rabundfile
180                 itTypes = outputTypes.find("count");
181                 if (itTypes != outputTypes.end()) {
182                         if ((itTypes->second).size() != 0) { string current = (itTypes->second)[0]; m->setCountTableFile(current); }
183                 }
184         
185         m->mothurOutEndLine();
186                 m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine();
187                 m->mothurOutEndLine();
188                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
189                 m->mothurOut(outputFileName); m->mothurOutEndLine();    
190                 m->mothurOutEndLine();
191                 
192                 return 0;               
193         }
194         
195         catch(exception& e) {
196                 m->errorOut(e, "CountSeqsCommand", "execute");
197                 exit(1);
198         }
199 }
200 //**********************************************************************************************************************
201
202 int CountSeqsCommand::processSmall(string outputFileName){
203         try {
204         ofstream out;
205         m->openOutputFile(outputFileName, out); outputTypes["count"].push_back(outputFileName);
206         outputNames.push_back(outputFileName); outputTypes["count"].push_back(outputFileName);
207                 out << "Representative_Sequence\ttotal\t";
208         
209         GroupMap* groupMap;
210                 if (groupfile != "") { 
211                         groupMap = new GroupMap(groupfile); groupMap->readMap(); 
212                         
213                         //make sure groups are valid. takes care of user setting groupNames that are invalid or setting groups=all
214                         SharedUtil* util = new SharedUtil();
215                         vector<string> nameGroups = groupMap->getNamesOfGroups();
216                         util->setGroups(Groups, nameGroups);
217                         delete util;
218                         
219                         //sort groupNames so that the group title match the counts below, this is needed because the map object automatically sorts
220                         sort(Groups.begin(), Groups.end());
221                         
222                         //print groupNames
223                         for (int i = 0; i < Groups.size(); i++) {
224                                 out << Groups[i] << '\t';
225                         }
226                 }
227                 out << endl;
228                 
229                 //open input file
230                 ifstream in;
231                 m->openInputFile(namefile, in);
232         
233                 int total = 0;
234                 while (!in.eof()) {
235                         if (m->control_pressed) { break; }
236                         
237                         string firstCol, secondCol;
238                         in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in);
239             //cout << firstCol << '\t' << secondCol << endl;
240             m->checkName(firstCol);
241             m->checkName(secondCol);
242                         //cout << firstCol << '\t' << secondCol << endl;
243            
244                         vector<string> names;
245                         m->splitAtChar(secondCol, names, ',');
246                         
247                         if (groupfile != "") {
248                                 //set to 0
249                                 map<string, int> groupCounts;
250                                 int total = 0;
251                                 for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
252                                 
253                                 //get counts for each of the users groups
254                                 for (int i = 0; i < names.size(); i++) {
255                                         string group = groupMap->getGroup(names[i]);
256                                         
257                                         if (group == "not found") { m->mothurOut("[ERROR]: " + names[i] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); }
258                                         else {
259                                                 map<string, int>::iterator it = groupCounts.find(group);
260                                                 
261                                                 //if not found, then this sequence is not from a group we care about
262                                                 if (it != groupCounts.end()) {
263                                                         it->second++;
264                                                         total++;
265                                                 }
266                                         }
267                                 }
268                                 
269                                 if (total != 0) {
270                                         out << firstCol << '\t' << total << '\t';
271                                         for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
272                                                 out << it->second << '\t';
273                                         }
274                                         out << endl;
275                                 }
276                         }else {
277                                 out << firstCol << '\t' << names.size() << endl;
278                         }
279                         
280                         total += names.size();
281                 }
282                 in.close();
283         out.close();
284                 
285                 if (groupfile != "") { delete groupMap; }
286
287         return total;
288     }
289         catch(exception& e) {
290                 m->errorOut(e, "CountSeqsCommand", "processSmall");
291                 exit(1);
292         }
293 }
294 //**********************************************************************************************************************
295
296 int CountSeqsCommand::processLarge(string outputFileName){
297         try {
298         set<string> namesOfGroups;
299         map<string, int> initial;
300         for (set<string>::iterator it = namesOfGroups.begin(); it != namesOfGroups.end(); it++) { initial[(*it)] = 0;  }
301         ofstream out;
302         m->openOutputFile(outputFileName, out); 
303         outputNames.push_back(outputFileName); outputTypes["count"].push_back(outputFileName);
304                 out << "Representative_Sequence\ttotal\t";
305         if (groupfile == "") { out << endl; }
306         
307         map<string, unsigned long long> namesToIndex;
308         string outfile = m->getRootName(groupfile) + "sorted.groups.temp";
309         string outName = m->getRootName(namefile) + "sorted.name.temp";
310         map<int, string> indexToName;
311         map<int, string> indexToGroup;
312         if (groupfile != "") { 
313             time_t estart = time(NULL);
314             //convert name file to redundant -> unique.  set unique name equal to index so we can use vectors, save name for later.
315             string newNameFile = m->getRootName(namefile) + ".name.temp";
316             string newGroupFile = m->getRootName(groupfile) + ".group.temp";
317             indexToName = processNameFile(newNameFile);
318             indexToGroup = getGroupNames(newGroupFile, namesOfGroups);
319             
320             //sort file by first column so the names of sequences will be easier to find
321             //use the unix sort 
322             #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
323                 string command = "sort -n " + newGroupFile + " -o " + outfile;
324                 system(command.c_str());
325                 command = "sort -n " + newNameFile + " -o " + outName;
326                 system(command.c_str());
327             #else //sort using windows sort
328                 string command = "sort " + newGroupFile + " /O " + outfile;
329                 system(command.c_str());
330                 command = "sort " + newNameFile + " /O " + outName;
331                 system(command.c_str());
332             #endif
333             m->mothurRemove(newNameFile);
334             m->mothurRemove(newGroupFile);
335             
336             m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to sort and index the group and name files. "); m->mothurOutEndLine();
337         }else { outName = namefile; }
338          
339         time_t estart = time(NULL);
340         //open input file
341                 ifstream in;
342                 m->openInputFile(outName, in);
343         
344         //open input file
345                 ifstream in2;
346                 
347                 int total = 0;
348         vector< vector<int> > nameMapCount;
349         if (groupfile != "") {
350             m->openInputFile(outfile, in2);
351             nameMapCount.resize(indexToName.size());
352             for (int i = 0; i < nameMapCount.size(); i++) {
353                 nameMapCount[i].resize(indexToGroup.size(), 0);
354             }
355         }
356         
357                 while (!in.eof()) {
358                         if (m->control_pressed) { break; }
359                         
360                         string firstCol;
361                         in >> firstCol;  m->gobble(in);
362                         
363                         if (groupfile != "") {
364                 int uniqueIndex;
365                 in >> uniqueIndex; m->gobble(in);
366                 
367                 string name; int groupIndex;
368                 in2 >> name >> groupIndex; m->gobble(in2);
369                 
370                 if (name != firstCol) { m->mothurOut("[ERROR]: found " + name + " in your groupfile, but " + firstCol + " was in your namefile, please correct.\n"); m->control_pressed = true; }
371                 
372                 nameMapCount[uniqueIndex][groupIndex]++;
373                 total++;
374             }else { 
375                 string secondCol;
376                 in >> secondCol; m->gobble(in);
377                 int num = m->getNumNames(secondCol);
378                 out << firstCol << '\t' << num << endl;
379                 total += num;
380             }
381                 }
382                 in.close();
383         
384         if (groupfile != "") {
385             m->mothurRemove(outfile);
386             m->mothurRemove(outName);
387             in2.close();
388             for (map<int, string>::iterator it = indexToGroup.begin(); it != indexToGroup.end(); it++) { out << it->second << '\t';  }
389             out << endl;
390             for (int i = 0; i < nameMapCount.size(); i++) {
391                 string totalsLine = "";
392                 int seqTotal = 0;
393                 for (int j = 0; j < nameMapCount[i].size(); j++) {
394                     seqTotal += nameMapCount[i][j];
395                     totalsLine += toString(nameMapCount[i][j]) + '\t';
396                 }
397                 out << indexToName[i] << '\t' << seqTotal << '\t' << totalsLine << endl;
398             }
399         }
400         
401         out.close();
402         
403         m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to create the count table file. "); m->mothurOutEndLine();
404         
405         return total;
406     }
407         catch(exception& e) {
408                 m->errorOut(e, "CountSeqsCommand", "processLarge");
409                 exit(1);
410         }
411 }
412 /**************************************************************************************************/
413 map<int, string> CountSeqsCommand::processNameFile(string name) {
414         try {
415         map<int, string> indexToNames;
416         
417         ofstream out;
418         m->openOutputFile(name, out);
419         
420         //open input file
421                 ifstream in;
422                 m->openInputFile(namefile, in);
423         
424         string rest = "";
425         char buffer[4096];
426         bool pairDone = false;
427         bool columnOne = true;
428         string firstCol, secondCol;
429         int count = 0;
430         
431                 while (!in.eof()) {
432                         if (m->control_pressed) { break; }
433                         
434             in.read(buffer, 4096);
435             vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
436             
437             for (int i = 0; i < pieces.size(); i++) {
438                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
439                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
440                 
441                 if (pairDone) { 
442                     m->checkName(firstCol);
443                     m->checkName(secondCol);
444                     //parse names into vector
445                     vector<string> theseNames;
446                     m->splitAtComma(secondCol, theseNames);
447                     for (int i = 0; i < theseNames.size(); i++) {  out << theseNames[i] << '\t' << count << endl;  }
448                     indexToNames[count] = firstCol;
449                     pairDone = false; 
450                     count++;
451                 }
452             }
453                 }
454                 in.close();
455         out.close();
456                 
457         if (rest != "") {
458             vector<string> pieces = m->splitWhiteSpace(rest);
459             
460             for (int i = 0; i < pieces.size(); i++) {
461                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
462                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
463                 
464                 if (pairDone) { 
465                     m->checkName(firstCol);
466                     m->checkName(secondCol);
467                     //parse names into vector
468                     vector<string> theseNames;
469                     m->splitAtComma(secondCol, theseNames);
470                     for (int i = 0; i < theseNames.size(); i++) {  out << theseNames[i] << '\t' << count << endl;  }
471                     indexToNames[count] = firstCol;
472                     pairDone = false; 
473                     count++;
474                 }
475             }
476
477         }
478         
479         return indexToNames;
480     }
481         catch(exception& e) {
482                 m->errorOut(e, "CountSeqsCommand", "processNameFile");
483                 exit(1);
484         }
485 }
486 /**************************************************************************************************/
487 map<int, string> CountSeqsCommand::getGroupNames(string filename, set<string>& namesOfGroups) {
488         try {
489         map<int, string> indexToGroups;
490         map<string, int> groupIndex;
491         map<string, int>::iterator it;
492         
493         ofstream out;
494         m->openOutputFile(filename, out);
495         
496         //open input file
497                 ifstream in;
498                 m->openInputFile(groupfile, in);
499         
500         string rest = "";
501         char buffer[4096];
502         bool pairDone = false;
503         bool columnOne = true;
504         string firstCol, secondCol;
505         int count = 0;
506         
507                 while (!in.eof()) {
508                         if (m->control_pressed) { break; }
509                         
510             in.read(buffer, 4096);
511             vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
512             
513             for (int i = 0; i < pieces.size(); i++) {
514                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
515                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
516                 
517                 if (pairDone) { 
518                     m->checkName(firstCol);
519                     it = groupIndex.find(secondCol);
520                     if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
521                         groupIndex[secondCol] = count;
522                         count++;
523                     }
524                     out << firstCol << '\t' << groupIndex[secondCol] << endl; 
525                     namesOfGroups.insert(secondCol);
526                     pairDone = false; 
527                 }
528             }
529                 }
530                 in.close();
531         out.close();
532         
533         if (rest != "") {
534             vector<string> pieces = m->splitWhiteSpace(rest);
535             
536             for (int i = 0; i < pieces.size(); i++) {
537                 if (columnOne) {  firstCol = pieces[i]; columnOne=false; }
538                 else  { secondCol = pieces[i]; pairDone = true; columnOne=true; }
539                 
540                 if (pairDone) { 
541                     m->checkName(firstCol);
542                     it = groupIndex.find(secondCol);
543                     if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
544                         groupIndex[secondCol] = count;
545                         count++;
546                     }
547                     out << firstCol << '\t' << groupIndex[secondCol] << endl; 
548                     namesOfGroups.insert(secondCol);
549                     pairDone = false; 
550                 }
551             }
552         }
553                 
554         for (it = groupIndex.begin(); it != groupIndex.end(); it++) {  indexToGroups[it->second] = it->first;  }
555         
556         return indexToGroups;
557         }
558         catch(exception& e) {
559                 m->errorOut(e, "CountSeqsCommand", "getGroupNames");
560                 exit(1);
561         }
562 }
563 //**********************************************************************************************************************
564
565
566