5 * Created by westcott on 6/1/11.
6 * Copyright 2011 Schloss Lab. All rights reserved.
10 #include "countseqscommand.h"
12 #include "sharedutilities.h"
13 #include "counttable.h"
15 //**********************************************************************************************************************
16 vector<string> CountSeqsCommand::setParameters(){
18 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none","count",false,true,true); parameters.push_back(pname);
19 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(pgroup);
20 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
21 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
22 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
23 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
24 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
26 vector<string> myArray;
27 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
31 m->errorOut(e, "CountSeqsCommand", "setParameters");
35 //**********************************************************************************************************************
36 string CountSeqsCommand::getHelpString(){
38 string helpString = "";
39 helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count_table file. You may also provide a group file to get the counts broken down by group.\n";
40 helpString += "The groups parameter allows you to indicate which groups you want to include in the counts, by default all groups in your groupfile are used.\n";
41 helpString += "The large parameter indicates the name and group files are too large to fit in RAM.\n";
42 helpString += "When you use the groups parameter and a sequence does not represent any sequences from the groups you specify it is not included in the .count.summary file.\n";
43 helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n";
44 helpString += "The count.seqs command should be in the following format: count.seqs(name=yourNameFile).\n";
45 helpString += "Example count.seqs(name=amazon.names) or make.table(name=amazon.names).\n";
46 helpString += "Note: No spaces between parameter labels (i.e. name), '=' and parameters (i.e.yourNameFile).\n";
50 m->errorOut(e, "CountSeqsCommand", "getHelpString");
54 //**********************************************************************************************************************
55 string CountSeqsCommand::getOutputPattern(string type) {
59 if (type == "count") { pattern = "[filename],count_table"; }
60 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
65 m->errorOut(e, "CountSeqsCommand", "getOutputPattern");
69 //**********************************************************************************************************************
70 CountSeqsCommand::CountSeqsCommand(){
72 abort = true; calledHelp = true;
74 vector<string> tempOutNames;
75 outputTypes["count"] = tempOutNames;
78 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
82 //**********************************************************************************************************************
84 CountSeqsCommand::CountSeqsCommand(string option) {
86 abort = false; calledHelp = false;
88 //allow user to run help
89 if(option == "help") { help(); abort = true; calledHelp = true; }
90 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
92 vector<string> myArray = setParameters();
94 OptionParser parser(option);
95 map<string,string> parameters = parser.getParameters();
97 ValidParameters validParameter;
98 map<string,string>::iterator it;
100 //check to make sure all parameters are valid for command
101 for (map<string,string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
102 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
105 //initialize outputTypes
106 vector<string> tempOutNames;
107 outputTypes["count"] = tempOutNames;
110 //if the user changes the input directory command factory will send this info to us in the output parameter
111 string inputDir = validParameter.validFile(parameters, "inputdir", false);
112 if (inputDir == "not found"){ inputDir = ""; }
115 it = parameters.find("name");
116 //user has given a template file
117 if(it != parameters.end()){
118 path = m->hasPath(it->second);
119 //if the user has not given a path then, add inputdir. else leave path alone.
120 if (path == "") { parameters["name"] = inputDir + it->second; }
123 it = parameters.find("group");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["group"] = inputDir + it->second; }
132 //check for required parameters
133 namefile = validParameter.validFile(parameters, "name", true);
134 if (namefile == "not open") { namefile = ""; abort = true; }
135 else if (namefile == "not found"){
136 namefile = m->getNameFile();
137 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
138 else { m->mothurOut("You have no current namefile and the name parameter is required."); m->mothurOutEndLine(); abort = true; }
139 }else { m->setNameFile(namefile); }
141 groupfile = validParameter.validFile(parameters, "group", true);
142 if (groupfile == "not open") { abort = true; }
143 else if (groupfile == "not found") { groupfile = ""; }
144 else { m->setGroupFile(groupfile); }
146 groups = validParameter.validFile(parameters, "groups", false);
147 if (groups == "not found") { groups = "all"; }
148 m->splitAtDash(groups, Groups);
150 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
151 large = m->isTrue(temp);
153 //if the user changes the output directory command factory will send this info to us in the output parameter
154 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(namefile); }
159 catch(exception& e) {
160 m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand");
164 //**********************************************************************************************************************
166 int CountSeqsCommand::execute(){
169 if (abort == true) { if (calledHelp) { return 0; } return 2; }
171 map<string, string> variables;
172 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(namefile));
173 string outputFileName = getOutputFileName("count", variables);
176 if (!large) { total = processSmall(outputFileName); }
177 else { total = processLarge(outputFileName); }
179 if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
181 //set rabund file as new current rabundfile
182 itTypes = outputTypes.find("count");
183 if (itTypes != outputTypes.end()) {
184 if ((itTypes->second).size() != 0) { string current = (itTypes->second)[0]; m->setCountTableFile(current); }
187 m->mothurOutEndLine();
188 m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine();
189 m->mothurOutEndLine();
190 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
191 m->mothurOut(outputFileName); m->mothurOutEndLine();
192 m->mothurOutEndLine();
197 catch(exception& e) {
198 m->errorOut(e, "CountSeqsCommand", "execute");
202 //**********************************************************************************************************************
204 int CountSeqsCommand::processSmall(string outputFileName){
207 m->openOutputFile(outputFileName, out); outputTypes["count"].push_back(outputFileName);
208 outputNames.push_back(outputFileName); outputTypes["count"].push_back(outputFileName);
209 out << "Representative_Sequence\ttotal\t";
212 if (groupfile != "") {
213 groupMap = new GroupMap(groupfile); groupMap->readMap();
215 //make sure groups are valid. takes care of user setting groupNames that are invalid or setting groups=all
216 SharedUtil* util = new SharedUtil();
217 vector<string> nameGroups = groupMap->getNamesOfGroups();
218 util->setGroups(Groups, nameGroups);
221 //sort groupNames so that the group title match the counts below, this is needed because the map object automatically sorts
222 sort(Groups.begin(), Groups.end());
225 for (int i = 0; i < Groups.size(); i++) {
226 out << Groups[i] << '\t';
233 m->openInputFile(namefile, in);
237 if (m->control_pressed) { break; }
239 string firstCol, secondCol;
240 in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in);
241 //cout << firstCol << '\t' << secondCol << endl;
242 m->checkName(firstCol);
243 m->checkName(secondCol);
244 //cout << firstCol << '\t' << secondCol << endl;
246 vector<string> names;
247 m->splitAtChar(secondCol, names, ',');
249 if (groupfile != "") {
251 map<string, int> groupCounts;
253 for (int i = 0; i < Groups.size(); i++) { groupCounts[Groups[i]] = 0; }
255 //get counts for each of the users groups
256 for (int i = 0; i < names.size(); i++) {
257 string group = groupMap->getGroup(names[i]);
259 if (group == "not found") { m->mothurOut("[ERROR]: " + names[i] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); }
261 map<string, int>::iterator it = groupCounts.find(group);
263 //if not found, then this sequence is not from a group we care about
264 if (it != groupCounts.end()) {
272 out << firstCol << '\t' << total << '\t';
273 for (map<string, int>::iterator it = groupCounts.begin(); it != groupCounts.end(); it++) {
274 out << it->second << '\t';
279 out << firstCol << '\t' << names.size() << endl;
282 total += names.size();
287 if (groupfile != "") { delete groupMap; }
291 catch(exception& e) {
292 m->errorOut(e, "CountSeqsCommand", "processSmall");
296 //**********************************************************************************************************************
298 int CountSeqsCommand::processLarge(string outputFileName){
300 set<string> namesOfGroups;
301 map<string, int> initial;
302 for (set<string>::iterator it = namesOfGroups.begin(); it != namesOfGroups.end(); it++) { initial[(*it)] = 0; }
304 m->openOutputFile(outputFileName, out);
305 outputNames.push_back(outputFileName); outputTypes["count"].push_back(outputFileName);
306 out << "Representative_Sequence\ttotal\t";
307 if (groupfile == "") { out << endl; }
309 map<string, unsigned long long> namesToIndex;
310 string outfile = m->getRootName(groupfile) + "sorted.groups.temp";
311 string outName = m->getRootName(namefile) + "sorted.name.temp";
312 map<int, string> indexToName;
313 map<int, string> indexToGroup;
314 if (groupfile != "") {
315 time_t estart = time(NULL);
316 //convert name file to redundant -> unique. set unique name equal to index so we can use vectors, save name for later.
317 string newNameFile = m->getRootName(namefile) + ".name.temp";
318 string newGroupFile = m->getRootName(groupfile) + ".group.temp";
319 indexToName = processNameFile(newNameFile);
320 indexToGroup = getGroupNames(newGroupFile, namesOfGroups);
322 //sort file by first column so the names of sequences will be easier to find
324 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
325 string command = "sort -n " + newGroupFile + " -o " + outfile;
326 system(command.c_str());
327 command = "sort -n " + newNameFile + " -o " + outName;
328 system(command.c_str());
329 #else //sort using windows sort
330 string command = "sort " + newGroupFile + " /O " + outfile;
331 system(command.c_str());
332 command = "sort " + newNameFile + " /O " + outName;
333 system(command.c_str());
335 m->mothurRemove(newNameFile);
336 m->mothurRemove(newGroupFile);
338 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to sort and index the group and name files. "); m->mothurOutEndLine();
339 }else { outName = namefile; }
341 time_t estart = time(NULL);
344 m->openInputFile(outName, in);
350 vector< vector<int> > nameMapCount;
351 if (groupfile != "") {
352 m->openInputFile(outfile, in2);
353 nameMapCount.resize(indexToName.size());
354 for (int i = 0; i < nameMapCount.size(); i++) {
355 nameMapCount[i].resize(indexToGroup.size(), 0);
360 if (m->control_pressed) { break; }
363 in >> firstCol; m->gobble(in);
365 if (groupfile != "") {
367 in >> uniqueIndex; m->gobble(in);
369 string name; int groupIndex;
370 in2 >> name >> groupIndex; m->gobble(in2);
372 if (name != firstCol) { m->mothurOut("[ERROR]: found " + name + " in your groupfile, but " + firstCol + " was in your namefile, please correct.\n"); m->control_pressed = true; }
374 nameMapCount[uniqueIndex][groupIndex]++;
378 in >> secondCol; m->gobble(in);
379 int num = m->getNumNames(secondCol);
380 out << firstCol << '\t' << num << endl;
386 if (groupfile != "") {
387 m->mothurRemove(outfile);
388 m->mothurRemove(outName);
390 for (map<int, string>::iterator it = indexToGroup.begin(); it != indexToGroup.end(); it++) { out << it->second << '\t'; }
392 for (int i = 0; i < nameMapCount.size(); i++) {
393 string totalsLine = "";
395 for (int j = 0; j < nameMapCount[i].size(); j++) {
396 seqTotal += nameMapCount[i][j];
397 totalsLine += toString(nameMapCount[i][j]) + '\t';
399 out << indexToName[i] << '\t' << seqTotal << '\t' << totalsLine << endl;
405 m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to create the count table file. "); m->mothurOutEndLine();
409 catch(exception& e) {
410 m->errorOut(e, "CountSeqsCommand", "processLarge");
414 /**************************************************************************************************/
415 map<int, string> CountSeqsCommand::processNameFile(string name) {
417 map<int, string> indexToNames;
420 m->openOutputFile(name, out);
424 m->openInputFile(namefile, in);
428 bool pairDone = false;
429 bool columnOne = true;
430 string firstCol, secondCol;
434 if (m->control_pressed) { break; }
436 in.read(buffer, 4096);
437 vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
439 for (int i = 0; i < pieces.size(); i++) {
440 if (columnOne) { firstCol = pieces[i]; columnOne=false; }
441 else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
444 m->checkName(firstCol);
445 m->checkName(secondCol);
446 //parse names into vector
447 vector<string> theseNames;
448 m->splitAtComma(secondCol, theseNames);
449 for (int i = 0; i < theseNames.size(); i++) { out << theseNames[i] << '\t' << count << endl; }
450 indexToNames[count] = firstCol;
460 vector<string> pieces = m->splitWhiteSpace(rest);
462 for (int i = 0; i < pieces.size(); i++) {
463 if (columnOne) { firstCol = pieces[i]; columnOne=false; }
464 else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
467 m->checkName(firstCol);
468 m->checkName(secondCol);
469 //parse names into vector
470 vector<string> theseNames;
471 m->splitAtComma(secondCol, theseNames);
472 for (int i = 0; i < theseNames.size(); i++) { out << theseNames[i] << '\t' << count << endl; }
473 indexToNames[count] = firstCol;
483 catch(exception& e) {
484 m->errorOut(e, "CountSeqsCommand", "processNameFile");
488 /**************************************************************************************************/
489 map<int, string> CountSeqsCommand::getGroupNames(string filename, set<string>& namesOfGroups) {
491 map<int, string> indexToGroups;
492 map<string, int> groupIndex;
493 map<string, int>::iterator it;
496 m->openOutputFile(filename, out);
500 m->openInputFile(groupfile, in);
504 bool pairDone = false;
505 bool columnOne = true;
506 string firstCol, secondCol;
510 if (m->control_pressed) { break; }
512 in.read(buffer, 4096);
513 vector<string> pieces = m->splitWhiteSpace(rest, buffer, in.gcount());
515 for (int i = 0; i < pieces.size(); i++) {
516 if (columnOne) { firstCol = pieces[i]; columnOne=false; }
517 else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
520 m->checkName(firstCol);
521 it = groupIndex.find(secondCol);
522 if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
523 groupIndex[secondCol] = count;
526 out << firstCol << '\t' << groupIndex[secondCol] << endl;
527 namesOfGroups.insert(secondCol);
536 vector<string> pieces = m->splitWhiteSpace(rest);
538 for (int i = 0; i < pieces.size(); i++) {
539 if (columnOne) { firstCol = pieces[i]; columnOne=false; }
540 else { secondCol = pieces[i]; pairDone = true; columnOne=true; }
543 m->checkName(firstCol);
544 it = groupIndex.find(secondCol);
545 if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above
546 groupIndex[secondCol] = count;
549 out << firstCol << '\t' << groupIndex[secondCol] << endl;
550 namesOfGroups.insert(secondCol);
556 for (it = groupIndex.begin(); it != groupIndex.end(); it++) { indexToGroups[it->second] = it->first; }
558 return indexToGroups;
560 catch(exception& e) {
561 m->errorOut(e, "CountSeqsCommand", "getGroupNames");
565 //**********************************************************************************************************************