From: Sarah Westcott Date: Tue, 26 Jun 2012 15:27:14 +0000 (-0400) Subject: Merge remote-tracking branch 'origin/master' X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=2ecee16fec29d4c525f740ec19b27962ca09c050;hp=2aa152a03b9efe77ecfbbd4e0e8e128f74724cc1 Merge remote-tracking branch 'origin/master' --- diff --git a/commandfactory.cpp b/commandfactory.cpp index 54ff4a1..02af676 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -289,6 +289,7 @@ CommandFactory::CommandFactory(){ commands["remove.otulabels"] = "remove.otulabels"; commands["make.contigs"] = "make.contigs"; commands["load.logfile"] = "load.logfile"; + commands["make.table"] = "make.table"; commands["quit"] = "MPIEnabled"; } @@ -482,7 +483,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "make.shared") { command = new SharedCommand(optionString); } else if(commandName == "get.commandinfo") { command = new GetCommandInfoCommand(optionString); } else if(commandName == "deunique.tree") { command = new DeuniqueTreeCommand(optionString); } - else if(commandName == "count.seqs") { command = new CountSeqsCommand(optionString); } + else if((commandName == "count.seqs") || (commandName == "make.table")) { command = new CountSeqsCommand(optionString); } else if(commandName == "count.groups") { command = new CountGroupsCommand(optionString); } else if(commandName == "clear.memory") { command = new ClearMemoryCommand(optionString); } else if(commandName == "summary.tax") { command = new SummaryTaxCommand(optionString); } @@ -636,7 +637,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "make.shared") { pipecommand = new SharedCommand(optionString); } else if(commandName == "get.commandinfo") { pipecommand = new GetCommandInfoCommand(optionString); } else if(commandName == "deunique.tree") { pipecommand = new DeuniqueTreeCommand(optionString); } - else if(commandName == "count.seqs") { pipecommand = new CountSeqsCommand(optionString); } + else if((commandName == "count.seqs") || (commandName == "make.table")) { pipecommand = new CountSeqsCommand(optionString); } else if(commandName == "count.groups") { pipecommand = new CountGroupsCommand(optionString); } else if(commandName == "clear.memory") { pipecommand = new ClearMemoryCommand(optionString); } else if(commandName == "summary.tax") { pipecommand = new SummaryTaxCommand(optionString); } @@ -776,7 +777,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "make.shared") { shellcommand = new SharedCommand(); } else if(commandName == "get.commandinfo") { shellcommand = new GetCommandInfoCommand(); } else if(commandName == "deunique.tree") { shellcommand = new DeuniqueTreeCommand(); } - else if(commandName == "count.seqs") { shellcommand = new CountSeqsCommand(); } + else if((commandName == "count.seqs") || (commandName == "make.table")) { shellcommand = new CountSeqsCommand(); } else if(commandName == "count.groups") { shellcommand = new CountGroupsCommand(); } else if(commandName == "clear.memory") { shellcommand = new ClearMemoryCommand(); } else if(commandName == "summary.tax") { shellcommand = new SummaryTaxCommand(); } diff --git a/countseqscommand.cpp b/countseqscommand.cpp index 8a18fc7..210dd96 100644 --- a/countseqscommand.cpp +++ b/countseqscommand.cpp @@ -16,6 +16,7 @@ vector CountSeqsCommand::setParameters(){ try { CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname); CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); + CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -33,11 +34,12 @@ vector CountSeqsCommand::setParameters(){ string CountSeqsCommand::getHelpString(){ try { string helpString = ""; - helpString += "The count.seqs command reads a name file and outputs a .seq.count file. You may also provide a group file to get the counts broken down by group.\n"; + helpString += "The count.seqs aka. make.table command reads a name file and outputs a .count.table file. You may also provide a group file to get the counts broken down by group.\n"; helpString += "The groups parameter allows you to indicate which groups you want to include in the counts, by default all groups in your groupfile are used.\n"; + helpString += "The large parameter indicates the name and group files are too large to fit in RAM.\n"; helpString += "When you use the groups parameter and a sequence does not represent any sequences from the groups you specify it is not included in the .count.summary file.\n"; helpString += "The count.seqs command should be in the following format: count.seqs(name=yourNameFile).\n"; - helpString += "Example count.seqs(name=amazon.names).\n"; + helpString += "Example count.seqs(name=amazon.names) or make.table(name=amazon.names).\n"; helpString += "Note: No spaces between parameter labels (i.e. name), '=' and parameters (i.e.yourNameFile).\n"; return helpString; } @@ -56,7 +58,7 @@ string CountSeqsCommand::getOutputFileNameTag(string type, string inputName=""){ it = outputTypes.find(type); if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); } else { - if (type == "summary") { outputFileName = "seq.count"; } + if (type == "counttable") { outputFileName = "count.table"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; } } return outputFileName; @@ -72,7 +74,7 @@ CountSeqsCommand::CountSeqsCommand(){ abort = true; calledHelp = true; setParameters(); vector tempOutNames; - outputTypes["summary"] = tempOutNames; + outputTypes["counttable"] = tempOutNames; } catch(exception& e) { m->errorOut(e, "CountSeqsCommand", "CountSeqsCommand"); @@ -104,7 +106,7 @@ CountSeqsCommand::CountSeqsCommand(string option) { //initialize outputTypes vector tempOutNames; - outputTypes["summary"] = tempOutNames; + outputTypes["counttable"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter @@ -146,6 +148,9 @@ CountSeqsCommand::CountSeqsCommand(string option) { groups = validParameter.validFile(parameters, "groups", false); if (groups == "not found") { groups = "all"; } m->splitAtDash(groups, Groups); + + string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; } + large = m->isTrue(temp); //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(namefile); } @@ -165,12 +170,45 @@ int CountSeqsCommand::execute(){ if (abort == true) { if (calledHelp) { return 0; } return 2; } - ofstream out; - string outputFileName = outputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("summary"); - m->openOutputFile(outputFileName, out); outputTypes["summary"].push_back(outputFileName); - out << "Representative_Sequence\ttotal\t"; + string outputFileName = outputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("counttable"); + + int total = 0; + if (!large) { total = processSmall(outputFileName); } + else { total = processLarge(outputFileName); } + + if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; } + + //set rabund file as new current rabundfile + itTypes = outputTypes.find("counttable"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { string current = (itTypes->second)[0]; m->setCountTableFile(current); } + } + + m->mothurOutEndLine(); + m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine(); + m->mothurOutEndLine(); + m->mothurOut("Output File Name: "); m->mothurOutEndLine(); + m->mothurOut(outputFileName); m->mothurOutEndLine(); + m->mothurOutEndLine(); - GroupMap* groupMap; + return 0; + } + + catch(exception& e) { + m->errorOut(e, "CountSeqsCommand", "execute"); + exit(1); + } +} +//********************************************************************************************************************** + +int CountSeqsCommand::processSmall(string outputFileName){ + try { + ofstream out; + m->openOutputFile(outputFileName, out); outputTypes["counttable"].push_back(outputFileName); + outputNames.push_back(outputFileName); outputTypes["counttable"].push_back(outputFileName); + out << "Representative_Sequence\ttotal\t"; + + GroupMap* groupMap; if (groupfile != "") { groupMap = new GroupMap(groupfile); groupMap->readMap(); @@ -199,7 +237,7 @@ int CountSeqsCommand::execute(){ if (m->control_pressed) { break; } string firstCol, secondCol; - in >> firstCol >> secondCol; m->gobble(in); + in >> firstCol; m->gobble(in); in >> secondCol; m->gobble(in); vector names; m->splitAtChar(secondCol, names, ','); @@ -240,24 +278,241 @@ int CountSeqsCommand::execute(){ total += names.size(); } in.close(); + out.close(); if (groupfile != "") { delete groupMap; } + + return total; + } + catch(exception& e) { + m->errorOut(e, "CountSeqsCommand", "processSmall"); + exit(1); + } +} +//********************************************************************************************************************** + +int CountSeqsCommand::processLarge(string outputFileName){ + try { + set namesOfGroups; + map initial; + for (set::iterator it = namesOfGroups.begin(); it != namesOfGroups.end(); it++) { initial[(*it)] = 0; } + ofstream out; + m->openOutputFile(outputFileName, out); + outputNames.push_back(outputFileName); outputTypes["counttable"].push_back(outputFileName); + out << "Representative_Sequence\ttotal\t"; + if (groupfile == "") { out << endl; } + + map namesToIndex; + string outfile = m->getRootName(groupfile) + "sorted.groups.temp"; + string outName = m->getRootName(namefile) + "sorted.name.temp"; + map indexToName; + map indexToGroup; + if (groupfile != "") { + time_t estart = time(NULL); + //convert name file to redundant -> unique. set unique name equal to index so we can use vectors, save name for later. + string newNameFile = m->getRootName(namefile) + ".name.temp"; + string newGroupFile = m->getRootName(groupfile) + ".group.temp"; + indexToName = processNameFile(newNameFile); + indexToGroup = getGroupNames(newGroupFile, namesOfGroups); + + //sort file by first column so the names of sequences will be easier to find + //use the unix sort + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + string command = "sort -n " + newGroupFile + " -o " + outfile; + system(command.c_str()); + command = "sort -n " + newNameFile + " -o " + outName; + system(command.c_str()); + #else //sort using windows sort + string command = "sort " + newGroupFile + " /O " + outfile; + system(command.c_str()); + command = "sort " + newNameFile + " /O " + outName; + system(command.c_str()); + #endif + m->mothurRemove(newNameFile); + m->mothurRemove(newGroupFile); + + m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to sort and index the group and name files. "); m->mothurOutEndLine(); + }else { outName = namefile; } + + time_t estart = time(NULL); + //open input file + ifstream in; + m->openInputFile(outName, in); + + //open input file + ifstream in2; - if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; } + int total = 0; + vector< vector > nameMapCount; + if (groupfile != "") { + m->openInputFile(outfile, in2); + nameMapCount.resize(indexToName.size()); + for (int i = 0; i < nameMapCount.size(); i++) { + nameMapCount[i].resize(indexToGroup.size(), 0); + } + } + + while (!in.eof()) { + if (m->control_pressed) { break; } + + string firstCol; + in >> firstCol; m->gobble(in); + + if (groupfile != "") { + int uniqueIndex; + in >> uniqueIndex; m->gobble(in); + + string name; int groupIndex; + in2 >> name >> groupIndex; m->gobble(in2); + + if (name != firstCol) { m->mothurOut("[ERROR]: found " + name + " in your groupfile, but " + firstCol + " was in your namefile, please correct.\n"); m->control_pressed = true; } + + nameMapCount[uniqueIndex][groupIndex]++; + total++; + }else { + string secondCol; + in >> secondCol; m->gobble(in); + int num = m->getNumNames(secondCol); + out << firstCol << '\t' << num << endl; + total += num; + } + } + in.close(); + + if (groupfile != "") { + m->mothurRemove(outfile); + m->mothurRemove(outName); + in2.close(); + for (map::iterator it = indexToGroup.begin(); it != indexToGroup.end(); it++) { out << it->second << '\t'; } + out << endl; + for (int i = 0; i < nameMapCount.size(); i++) { + string totalsLine = ""; + int seqTotal = 0; + for (int j = 0; j < nameMapCount[i].size(); j++) { + seqTotal += nameMapCount[i][j]; + totalsLine += toString(nameMapCount[i][j]) + '\t'; + } + out << indexToName[i] << '\t' << seqTotal << '\t' << totalsLine << endl; + } + } + + out.close(); + + m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to create the count table file. "); m->mothurOutEndLine(); + + return total; + } + catch(exception& e) { + m->errorOut(e, "CountSeqsCommand", "processLarge"); + exit(1); + } +} +/**************************************************************************************************/ +map CountSeqsCommand::processNameFile(string name) { + try { + map indexToNames; + + ofstream out; + m->openOutputFile(name, out); + + //open input file + ifstream in; + m->openInputFile(namefile, in); + + string rest = ""; + char buffer[4096]; + bool pairDone = false; + bool columnOne = true; + string firstCol, secondCol; + int count = 0; + + while (!in.eof()) { + if (m->control_pressed) { break; } + + in.read(buffer, 4096); + vector pieces = m->splitWhiteSpace(rest, buffer, in.gcount()); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + //parse names into vector + vector theseNames; + m->splitAtComma(secondCol, theseNames); + for (int i = 0; i < theseNames.size(); i++) { out << theseNames[i] << '\t' << count << endl; } + indexToNames[count] = firstCol; + pairDone = false; + count++; + } + } + } + in.close(); + out.close(); - m->mothurOutEndLine(); - m->mothurOut("Total number of sequences: " + toString(total)); m->mothurOutEndLine(); - m->mothurOutEndLine(); - m->mothurOut("Output File Name: "); m->mothurOutEndLine(); - m->mothurOut(outputFileName); m->mothurOutEndLine(); - m->mothurOutEndLine(); + return indexToNames; + } + catch(exception& e) { + m->errorOut(e, "CountSeqsCommand", "processNameFile"); + exit(1); + } +} +/**************************************************************************************************/ +map CountSeqsCommand::getGroupNames(string filename, set& namesOfGroups) { + try { + map indexToGroups; + map groupIndex; + map::iterator it; + + ofstream out; + m->openOutputFile(filename, out); + + //open input file + ifstream in; + m->openInputFile(groupfile, in); + + string rest = ""; + char buffer[4096]; + bool pairDone = false; + bool columnOne = true; + string firstCol, secondCol; + int count = 0; + + while (!in.eof()) { + if (m->control_pressed) { break; } + + in.read(buffer, 4096); + vector pieces = m->splitWhiteSpace(rest, buffer, in.gcount()); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + it = groupIndex.find(secondCol); + if (it == groupIndex.end()) { //add group, assigning the group and number so we can use vectors above + groupIndex[secondCol] = count; + count++; + } + out << firstCol << '\t' << groupIndex[secondCol] << endl; + namesOfGroups.insert(secondCol); + pairDone = false; + } + } + } + in.close(); + out.close(); - return 0; + for (it = groupIndex.begin(); it != groupIndex.end(); it++) { indexToGroups[it->second] = it->first; } + + return indexToGroups; } - catch(exception& e) { - m->errorOut(e, "CountSeqsCommand", "execute"); + m->errorOut(e, "CountSeqsCommand", "getGroupNames"); exit(1); } } //********************************************************************************************************************** + + + diff --git a/countseqscommand.h b/countseqscommand.h index 54982c1..555d6a7 100644 --- a/countseqscommand.h +++ b/countseqscommand.h @@ -34,8 +34,14 @@ public: private: string namefile, groupfile, outputDir, groups; - bool abort; - vector Groups; + bool abort, large; + vector Groups, outputNames; + + int processSmall(string); + int processLarge(string); + map processNameFile(string); + map getGroupNames(string, set&); + }; #endif diff --git a/createdatabasecommand.cpp b/createdatabasecommand.cpp index febd762..5919f0c 100644 --- a/createdatabasecommand.cpp +++ b/createdatabasecommand.cpp @@ -15,7 +15,8 @@ vector CreateDatabaseCommand::setParameters(){ CommandParameter pfasta("repfasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); CommandParameter pname("repname", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname); CommandParameter pcontaxonomy("contaxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pcontaxonomy); - CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist); + CommandParameter plist("list", "InputTypes", "", "", "ListShared", "ListShared", "none",false,false); parameters.push_back(plist); + CommandParameter pshared("shared", "InputTypes", "", "", "ListShared", "ListShared", "none",false,false); parameters.push_back(pshared); CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); @@ -34,8 +35,8 @@ vector CreateDatabaseCommand::setParameters(){ string CreateDatabaseCommand::getHelpString(){ try { string helpString = ""; - helpString += "The create.database command reads a listfile, *.cons.taxonomy, *.rep.fasta, *.rep.names and optional groupfile, and creates a database file.\n"; - helpString += "The create.database command parameters are repfasta, list, repname, contaxonomy, group and label. List, repfasta, repnames, and contaxonomy are required.\n"; + helpString += "The create.database command reads a list file or a shared file, *.cons.taxonomy, *.rep.fasta, *.rep.names and optional groupfile, and creates a database file.\n"; + helpString += "The create.database command parameters are repfasta, list, shared, repname, contaxonomy, group and label. List, repfasta, repnames, and contaxonomy are required.\n"; helpString += "The repfasta file is fasta file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n"; helpString += "The repname file is the name file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n"; helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile).\n"; @@ -159,6 +160,14 @@ CreateDatabaseCommand::CreateDatabaseCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("shared"); + //user has given a template file + if(it != parameters.end()){ + path = m->hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["shared"] = inputDir + it->second; } + } } @@ -167,14 +176,33 @@ CreateDatabaseCommand::CreateDatabaseCommand(string option) { //check for required parameters listfile = validParameter.validFile(parameters, "list", true); - if (listfile == "not found") { - //if there is a current list file, use it + if (listfile == "not found") { listfile = ""; } + else if (listfile == "not open") { listfile = ""; abort = true; } + else { m->setListFile(listfile); } + + sharedfile = validParameter.validFile(parameters, "shared", true); + if (sharedfile == "not found") { sharedfile = ""; } + else if (sharedfile == "not open") { sharedfile = ""; abort = true; } + else { m->setSharedFile(sharedfile); } + + if ((sharedfile == "") && (listfile == "")) { + //is there are current file available for either of these? + //give priority to list, then shared listfile = m->getListFile(); if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); } - else { m->mothurOut("You have no current listfile and the list parameter is required."); m->mothurOutEndLine(); abort = true; } + else { + sharedfile = m->getSharedFile(); + if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); } + else { + m->mothurOut("No valid current files. You must provide a shared or list file before you can use the create.database command."); m->mothurOutEndLine(); + abort = true; + } + } } - else if (listfile == "not open") { abort = true; } - else { m->setListFile(listfile); } + else if ((sharedfile != "") && (listfile != "")) { m->mothurOut("When executing a create.database command you must enter ONLY ONE of the following: shared or list."); m->mothurOutEndLine(); abort = true; } + + if (sharedfile != "") { if (outputDir == "") { outputDir = m->hasPath(sharedfile); } } + else { if (outputDir == "") { outputDir = m->hasPath(listfile); } } contaxonomyfile = validParameter.validFile(parameters, "contaxonomy", true); if (contaxonomyfile == "not found") { //if there is a current list file, use it @@ -219,7 +247,8 @@ int CreateDatabaseCommand::execute(){ //taxonomies holds the taxonomy info for each Otu //classifyOtuSizes holds the size info of each Otu to help with error checking vector taxonomies; - vector classifyOtuSizes = readTax(taxonomies); + vector otuLabels; + vector classifyOtuSizes = readTax(taxonomies, otuLabels); if (m->control_pressed) { return 0; } @@ -230,7 +259,7 @@ int CreateDatabaseCommand::execute(){ //names redundants to uniques. backwards to how we normally do it, but each bin is the list file will be a key entry in the map. map repNames; - int numUniqueNamesFile = m->readNames(repnamesfile, repNames); + int numUniqueNamesFile = m->readNames(repnamesfile, repNames, 1); //are there the same number of otus in the fasta and name files if (repOtusSizes.size() != numUniqueNamesFile) { m->mothurOut("[ERROR]: you have " + toString(numUniqueNamesFile) + " unique seqs in your repname file, but " + toString(repOtusSizes.size()) + " seqs in your repfasta file. These should match.\n"); m->control_pressed = true; } @@ -251,86 +280,130 @@ int CreateDatabaseCommand::execute(){ if (m->control_pressed) { return 0; } - //at this point we are fairly sure the repfasta, repnames and contaxonomy files match so lets proceed with the listfile - ListVector* list = getList(); - - if (m->control_pressed) { delete list; return 0; } - GroupMap* groupmap = NULL; - if (groupfile != "") { - groupmap = new GroupMap(groupfile); - groupmap->readMap(); - } - - if (m->control_pressed) { delete list; if (groupfile != "") { delete groupmap; } return 0; } - - if (outputDir == "") { outputDir += m->hasPath(listfile); } - string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + getOutputFileNameTag("database"); + string outputFileName = ""; + if (listfile != "") { outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + getOutputFileNameTag("database"); } + else { outputFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + getOutputFileNameTag("database"); } outputNames.push_back(outputFileName); outputTypes["database"].push_back(outputFileName); ofstream out; m->openOutputFile(outputFileName, out); string header = "OTUNumber\tAbundance\t"; - if (groupfile != "") { - header = "OTUNumber\t"; - for (int i = 0; i < groupmap->getNamesOfGroups().size(); i++) { header += (groupmap->getNamesOfGroups())[i] + '\t'; } - } - header += "repSeqName\trepSeq\tOTUConTaxonomy"; - out << header << endl; + - for (int i = 0; i < list->getNumBins(); i++) { + if (listfile != "") { + //at this point we are fairly sure the repfasta, repnames and contaxonomy files match so lets proceed with the listfile + ListVector* list = getList(); - if (m->control_pressed) { break; } + if (otuLabels.size() != list->getNumBins()) { + m->mothurOut("[ERROR]: you have " + toString(otuLabels.size()) + " otus in your contaxonomy file, but your list file has " + toString(list->getNumBins()) + " otus. These should match. Make sure you are using files for the same distance.\n"); m->control_pressed = true; } - out << (i+1) << '\t'; + if (m->control_pressed) { delete list; return 0; } - vector binNames; - string bin = list->get(i); - - map::iterator it = repNames.find(bin); - if (it == repNames.end()) { - m->mothurOut("[ERROR: OTU " + toString(i+1) + " is not in the repnames file. Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; + GroupMap* groupmap = NULL; + if (groupfile != "") { + groupmap = new GroupMap(groupfile); + groupmap->readMap(); } - m->splitAtComma(bin, binNames); + if (m->control_pressed) { delete list; if (groupfile != "") { delete groupmap; } return 0; } - //sanity check - if (binNames.size() != classifyOtuSizes[i]) { - m->mothurOut("[ERROR: OTU " + toString(i+1) + " contains " + toString(binNames.size()) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[i]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; + if (groupfile != "") { + header = "OTUNumber\t"; + for (int i = 0; i < groupmap->getNamesOfGroups().size(); i++) { header += (groupmap->getNamesOfGroups())[i] + '\t'; } } + header += "repSeqName\trepSeq\tOTUConTaxonomy"; + out << header << endl; - //output abundances - if (groupfile != "") { - string groupAbunds = ""; - map counts; - //initialize counts to 0 - for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { counts[(groupmap->getNamesOfGroups())[j]] = 0; } + for (int i = 0; i < list->getNumBins(); i++) { + + if (m->control_pressed) { break; } + + out << otuLabels[i] << '\t'; + + vector binNames; + string bin = list->get(i); - //find abundances by group - bool error = false; - for (int j = 0; j < binNames.size(); j++) { - string group = groupmap->getGroup(binNames[j]); - if (group == "not found") { - m->mothurOut("[ERROR]: " + binNames[j] + " is not in your groupfile, please correct.\n"); - error = true; - }else { counts[group]++; } + map::iterator it = repNames.find(bin); + if (it == repNames.end()) { + m->mothurOut("[ERROR: OTU " + otuLabels[i] + " is not in the repnames file. Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; } - //output counts - for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { out << counts[(groupmap->getNamesOfGroups())[j]] << '\t'; } + m->splitAtComma(bin, binNames); - if (error) { m->control_pressed = true; } - }else { out << binNames.size() << '\t'; } + //sanity check + if (binNames.size() != classifyOtuSizes[i]) { + m->mothurOut("[ERROR: OTU " + otuLabels[i] + " contains " + toString(binNames.size()) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[i]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; + } + + //output abundances + if (groupfile != "") { + string groupAbunds = ""; + map counts; + //initialize counts to 0 + for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { counts[(groupmap->getNamesOfGroups())[j]] = 0; } + + //find abundances by group + bool error = false; + for (int j = 0; j < binNames.size(); j++) { + string group = groupmap->getGroup(binNames[j]); + if (group == "not found") { + m->mothurOut("[ERROR]: " + binNames[j] + " is not in your groupfile, please correct.\n"); + error = true; + }else { counts[group]++; } + } + + //output counts + for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { out << counts[(groupmap->getNamesOfGroups())[j]] << '\t'; } + + if (error) { m->control_pressed = true; } + }else { out << binNames.size() << '\t'; } + + //output repSeq + out << it->second << '\t' << seqs[i].getAligned() << '\t' << taxonomies[i] << endl; + } + + + delete list; + if (groupfile != "") { delete groupmap; } + + }else { + vector lookup = getShared(); + + header = "OTUNumber\t"; + for (int i = 0; i < lookup.size(); i++) { header += lookup[i]->getGroup() + '\t'; } + header += "repSeqName\trepSeq\tOTUConTaxonomy"; + out << header << endl; - //output repSeq - out << it->second << '\t' << seqs[i].getAligned() << '\t' << taxonomies[i] << endl; + for (int h = 0; h < lookup[0]->getNumBins(); h++) { + + if (m->control_pressed) { break; } + + int index = findIndex(otuLabels, m->currentBinLabels[h]); + if (index == -1) { m->mothurOut("[ERROR]: " + m->currentBinLabels[h] + " is not in your constaxonomy file, aborting.\n"); m->control_pressed = true; } + + if (m->control_pressed) { break; } + + out << otuLabels[index] << '\t'; + + int totalAbund = 0; + for (int i = 0; i < lookup.size(); i++) { + int abund = lookup[i]->getAbundance(h); + totalAbund += abund; + out << abund << '\t'; + } + + //sanity check + if (totalAbund != classifyOtuSizes[index]) { + m->mothurOut("[ERROR: OTU " + m->currentBinLabels[h] + " contains " + toString(totalAbund) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[index]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true; break; + } + + //output repSeq + out << seqs[index].getName() << '\t' << seqs[index].getAligned() << '\t' << taxonomies[index] << endl; + } } out.close(); - - delete list; - if (groupfile != "") { delete groupmap; } - if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; } m->mothurOutEndLine(); @@ -347,7 +420,21 @@ int CreateDatabaseCommand::execute(){ } } //********************************************************************************************************************** -vector CreateDatabaseCommand::readTax(vector& taxonomies){ +int CreateDatabaseCommand::findIndex(vector& otuLabels, string label){ + try { + int index = -1; + for (int i = 0; i < otuLabels.size(); i++) { + if (otuLabels[i] == label) { index = i; break; } + } + return index; + } + catch(exception& e) { + m->errorOut(e, "CreateDatabaseCommand", "findIndex"); + exit(1); + } +} +//********************************************************************************************************************** +vector CreateDatabaseCommand::readTax(vector& taxonomies, vector& otuLabels){ try { vector sizes; @@ -369,6 +456,7 @@ vector CreateDatabaseCommand::readTax(vector& taxonomies){ sizes.push_back(size); taxonomies.push_back(tax); + otuLabels.push_back(otu); } in.close(); @@ -493,5 +581,81 @@ ListVector* CreateDatabaseCommand::getList(){ } } //********************************************************************************************************************** +vector CreateDatabaseCommand::getShared(){ + try { + InputData input(sharedfile, "sharedfile"); + vector lookup = input.getSharedRAbundVectors(); + string lastLabel = lookup[0]->getLabel(); + + if (label == "") { label = lastLabel; return lookup; } + + //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label. + set labels; labels.insert(label); + set processedLabels; + set userLabels = labels; + + //as long as you are not at the end of the file or done wih the lines you want + while((lookup[0] != NULL) && (userLabels.size() != 0)) { + if (m->control_pressed) { return lookup; } + + if(labels.count(lookup[0]->getLabel()) == 1){ + processedLabels.insert(lookup[0]->getLabel()); + userLabels.erase(lookup[0]->getLabel()); + break; + } + + if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) { + string saveLabel = lookup[0]->getLabel(); + + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } + lookup = input.getSharedRAbundVectors(lastLabel); + + processedLabels.insert(lookup[0]->getLabel()); + userLabels.erase(lookup[0]->getLabel()); + + //restore real lastlabel to save below + lookup[0]->setLabel(saveLabel); + break; + } + + lastLabel = lookup[0]->getLabel(); + + //get next line to process + //prevent memory leak + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } + lookup = input.getSharedRAbundVectors(); + } + + + if (m->control_pressed) { return lookup; } + + //output error messages about any remaining user labels + set::iterator it; + bool needToRun = false; + for (it = userLabels.begin(); it != userLabels.end(); it++) { + m->mothurOut("Your file does not include the label " + *it); + if (processedLabels.count(lastLabel) != 1) { + m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine(); + needToRun = true; + }else { + m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine(); + } + } + + //run last label if you need to + if (needToRun == true) { + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } + lookup = input.getSharedRAbundVectors(lastLabel); + } + + return lookup; + } + catch(exception& e) { + m->errorOut(e, "CreateDatabaseCommand", "getList"); + exit(1); + } +} + +//********************************************************************************************************************** diff --git a/createdatabasecommand.h b/createdatabasecommand.h index 85e2367..741feba 100644 --- a/createdatabasecommand.h +++ b/createdatabasecommand.h @@ -34,13 +34,15 @@ public: private: bool abort; - string listfile, groupfile, repfastafile, repnamesfile, contaxonomyfile, label, outputDir; + string sharedfile, listfile, groupfile, repfastafile, repnamesfile, contaxonomyfile, label, outputDir; vector outputNames; vector readFasta(vector&); - vector readTax(vector&); + vector readTax(vector&, vector&); ListVector* getList(); + vector getShared(); + int findIndex(vector&, string); }; diff --git a/distancecommand.cpp b/distancecommand.cpp index 99e89ba..16407d5 100644 --- a/distancecommand.cpp +++ b/distancecommand.cpp @@ -15,7 +15,7 @@ vector DistanceCommand::setParameters(){ CommandParameter pcolumn("column", "InputTypes", "", "", "none", "none", "OldFastaColumn",false,false); parameters.push_back(pcolumn); CommandParameter poldfasta("oldfasta", "InputTypes", "", "", "none", "none", "OldFastaColumn",false,false); parameters.push_back(poldfasta); CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter poutput("output", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(poutput); + CommandParameter poutput("output", "Multiple", "column-lt-square-phylip", "column", "", "", "",false,false); parameters.push_back(poutput); CommandParameter pcalc("calc", "Multiple", "nogaps-eachgap-onegap", "onegap", "", "", "",false,false); parameters.push_back(pcalc); CommandParameter pcountends("countends", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pcountends); CommandParameter pcompress("compress", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pcompress); @@ -210,6 +210,7 @@ DistanceCommand::DistanceCommand(string option) { convert(temp, compress); output = validParameter.validFile(parameters, "output", false); if(output == "not found"){ output = "column"; } + if (output == "phylip") { output = "lt"; } if (((column != "") && (oldfastafile == "")) || ((column == "") && (oldfastafile != ""))) { m->mothurOut("If you provide column or oldfasta, you must provide both."); m->mothurOutEndLine(); abort=true; } diff --git a/getcurrentcommand.cpp b/getcurrentcommand.cpp index ca83231..c3b5f0a 100644 --- a/getcurrentcommand.cpp +++ b/getcurrentcommand.cpp @@ -140,6 +140,8 @@ int GetCurrentCommand::execute(){ m->setFlowFile(""); }else if (types[i] == "biom") { m->setBiomFile(""); + }else if (types[i] == "counttable") { + m->setCountTableFile(""); }else if (types[i] == "processors") { m->setProcessors("1"); }else if (types[i] == "all") { diff --git a/mothurout.cpp b/mothurout.cpp index eacad37..14840c1 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -43,6 +43,7 @@ set MothurOut::getCurrentTypes() { types.insert("tree"); types.insert("flow"); types.insert("biom"); + types.insert("counttable"); types.insert("processors"); return types; @@ -78,6 +79,7 @@ void MothurOut::printCurrentFiles() { if (treefile != "") { mothurOut("tree=" + treefile); mothurOutEndLine(); } if (flowfile != "") { mothurOut("flow=" + flowfile); mothurOutEndLine(); } if (biomfile != "") { mothurOut("biom=" + biomfile); mothurOutEndLine(); } + if (counttablefile != "") { mothurOut("counttable=" + counttablefile); mothurOutEndLine(); } if (processors != "1") { mothurOut("processors=" + processors); mothurOutEndLine(); } } @@ -112,6 +114,7 @@ bool MothurOut::hasCurrentFiles() { if (treefile != "") { return true; } if (flowfile != "") { return true; } if (biomfile != "") { return true; } + if (counttablefile != "") { return true; } if (processors != "1") { return true; } return hasCurrent; @@ -147,6 +150,7 @@ void MothurOut::clearCurrentFiles() { taxonomyfile = ""; flowfile = ""; biomfile = ""; + counttablefile = ""; processors = "1"; } catch(exception& e) { @@ -1291,16 +1295,6 @@ vector MothurOut::setFilePosEachLine(string filename, int& n positions.push_back(0); while(!in.eof()){ - //unsigned long long lastpos = in.tellg(); - //input = getline(in); - //if (input.length() != 0) { - //unsigned long long pos = in.tellg(); - //if (pos != -1) { positions.push_back(pos - input.length() - 1); } - //else { positions.push_back(lastpos); } - //} - //gobble(in); //has to be here since windows line endings are 2 characters and mess up the positions - - //getline counting reads char d = in.get(); count++; while ((d != '\n') && (d != '\r') && (d != '\f') && (d != in.eof())) { @@ -1503,7 +1497,7 @@ vector MothurOut::splitWhiteSpace(string& rest, char buffer[], int size) for (int i = 0; i < size; i++) { if (!isspace(buffer[i])) { rest += buffer[i]; } else { - pieces.push_back(rest); rest = ""; + if (rest != "") { pieces.push_back(rest); rest = ""; } while (i < size) { //gobble white space if (isspace(buffer[i])) { i++; } else { rest = buffer[i]; break; } //cout << "next piece buffer = " << nextPiece << endl; @@ -1527,7 +1521,7 @@ vector MothurOut::splitWhiteSpace(string input){ for (int i = 0; i < input.length(); i++) { if (!isspace(input[i])) { rest += input[i]; } else { - pieces.push_back(rest); rest = ""; + if (rest != "") { pieces.push_back(rest); rest = ""; } while (i < input.length()) { //gobble white space if (isspace(input[i])) { i++; } else { rest = input[i]; break; } //cout << "next piece buffer = " << nextPiece << endl; @@ -1631,6 +1625,46 @@ int MothurOut::readNames(string namefile, map& nameMap, bool red } } /**********************************************************************************************************************/ +int MothurOut::readNames(string namefile, map& nameMap, int flip) { + try { + + //open input file + ifstream in; + openInputFile(namefile, in); + + string rest = ""; + char buffer[4096]; + bool pairDone = false; + bool columnOne = true; + string firstCol, secondCol; + + while (!in.eof()) { + if (control_pressed) { break; } + + in.read(buffer, 4096); + vector pieces = splitWhiteSpace(rest, buffer, in.gcount()); + + for (int i = 0; i < pieces.size(); i++) { + if (columnOne) { firstCol = pieces[i]; columnOne=false; } + else { secondCol = pieces[i]; pairDone = true; columnOne=true; } + + if (pairDone) { + nameMap[secondCol] = firstCol; + pairDone = false; + } + } + } + in.close(); + + return nameMap.size(); + + } + catch(exception& e) { + errorOut(e, "MothurOut", "readNames"); + exit(1); + } +} +/**********************************************************************************************************************/ int MothurOut::readNames(string namefile, map& nameMap, map& nameCount) { try { nameMap.clear(); nameCount.clear(); diff --git a/mothurout.h b/mothurout.h index b2c638a..98565dc 100644 --- a/mothurout.h +++ b/mothurout.h @@ -106,6 +106,7 @@ class MothurOut { int readNames(string, map&, map&); int readNames(string, map&); int readNames(string, map&, bool); + int readNames(string, map&, int); int readNames(string, map >&); int readNames(string, vector&, map&); int mothurRemove(string); @@ -173,6 +174,7 @@ class MothurOut { string getTaxonomyFile() { return taxonomyfile; } string getFlowFile() { return flowfile; } string getBiomFile() { return biomfile; } + string getCountTableFile() { return counttablefile; } string getProcessors() { return processors; } void setListFile(string f) { listfile = getFullPathName(f); } @@ -196,6 +198,7 @@ class MothurOut { void setTaxonomyFile(string f) { taxonomyfile = getFullPathName(f); } void setFlowFile(string f) { flowfile = getFullPathName(f); } void setBiomFile(string f) { biomfile = getFullPathName(f); } + void setCountTableFile(string f) { counttablefile = getFullPathName(f); } void setProcessors(string p) { processors = p; mothurOut("\nUsing " + toString(p) + " processors.\n"); } void printCurrentFiles(); @@ -231,6 +234,7 @@ class MothurOut { processors = "1"; flowfile = ""; biomfile = ""; + counttablefile = ""; gui = false; printedHeaders = false; commandInputsConvertError = false; @@ -245,7 +249,7 @@ class MothurOut { string releaseDate, version; string accnosfile, phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, designfile, taxonomyfile, biomfile; - string orderfile, treefile, sharedfile, ordergroupfile, relabundfile, fastafile, qualfile, sfffile, oligosfile, processors, flowfile; + string orderfile, treefile, sharedfile, ordergroupfile, relabundfile, fastafile, qualfile, sfffile, oligosfile, processors, flowfile, counttablefile; vector Groups; vector namesOfGroups; diff --git a/nameassignment.cpp b/nameassignment.cpp index 3d99551..126c4e0 100644 --- a/nameassignment.cpp +++ b/nameassignment.cpp @@ -22,7 +22,7 @@ void NameAssignment::readMap(){ int rowIndex = 0; while(fileHandle){ - fileHandle >> firstCol; //read from first column + fileHandle >> firstCol; m->gobble(fileHandle); //read from first column fileHandle >> secondCol; //read from second column itData = (*this).find(firstCol); diff --git a/optionparser.cpp b/optionparser.cpp index 2d13cd0..e3850e5 100644 --- a/optionparser.cpp +++ b/optionparser.cpp @@ -124,6 +124,8 @@ map OptionParser::getParameters() { it->second = m->getTaxonomyFile(); }else if (it->first == "biom") { it->second = m->getBiomFile(); + }else if (it->first == "counttable") { + it->second = m->getCountTableFile(); }else { m->mothurOut("[ERROR]: mothur does not save a current file for " + it->first); m->mothurOutEndLine(); } diff --git a/pairwiseseqscommand.cpp b/pairwiseseqscommand.cpp index 1fa96e3..f69faff 100644 --- a/pairwiseseqscommand.cpp +++ b/pairwiseseqscommand.cpp @@ -19,7 +19,7 @@ vector PairwiseSeqsCommand::setParameters(){ CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen); CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pgapextend); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter poutput("output", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(poutput); + CommandParameter poutput("output", "Multiple", "column-lt-square-phylip", "column", "", "", "",false,false); parameters.push_back(poutput); CommandParameter pcalc("calc", "Multiple", "nogaps-eachgap-onegap", "onegap", "", "", "",false,false); parameters.push_back(pcalc); CommandParameter pcountends("countends", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pcountends); CommandParameter pcompress("compress", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pcompress); @@ -249,6 +249,7 @@ PairwiseSeqsCommand::PairwiseSeqsCommand(string option) { align = validParameter.validFile(parameters, "align", false); if (align == "not found"){ align = "needleman"; } output = validParameter.validFile(parameters, "output", false); if(output == "not found"){ output = "column"; } + if (output=="phylip") { output = "lt"; } if ((output != "column") && (output != "lt") && (output != "square")) { m->mothurOut(output + " is not a valid output form. Options are column, lt and square. I will use column."); m->mothurOutEndLine(); output = "column"; } calc = validParameter.validFile(parameters, "calc", false); diff --git a/rarefactsharedcommand.cpp b/rarefactsharedcommand.cpp index a312df0..6451041 100644 --- a/rarefactsharedcommand.cpp +++ b/rarefactsharedcommand.cpp @@ -11,6 +11,7 @@ #include "sharedsobs.h" #include "sharednseqs.h" #include "sharedutilities.h" +#include "subsample.h" //********************************************************************************************************************** vector RareFactSharedCommand::setParameters(){ @@ -21,6 +22,8 @@ vector RareFactSharedCommand::setParameters(){ CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq); CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter pcalc("calc", "Multiple", "sharednseqs-sharedobserved", "sharedobserved", "", "", "",true,false); parameters.push_back(pcalc); + CommandParameter psubsampleiters("subsampleiters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(psubsampleiters); + CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample); CommandParameter pjumble("jumble", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pjumble); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter psets("sets", "String", "", "", "", "", "",false,false); parameters.push_back(psets); @@ -50,6 +53,8 @@ string RareFactSharedCommand::getHelpString(){ helpString += "The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n"; helpString += "Example rarefaction.shared(label=unique-0.01-0.03, iters=10000, groups=B-C, jumble=T, calc=sharedobserved).\n"; helpString += "The default values for iters is 1000, freq is 100, and calc is sharedobserved which calculates the shared rarefaction curve for the observed richness.\n"; + helpString += "The subsampleiters parameter allows you to choose the number of times you would like to run the subsample.\n"; + helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group.\n"; helpString += "The default value for groups is all the groups in your groupfile, and jumble is true.\n"; helpString += validCalculator.printCalc("sharedrarefaction"); helpString += "The label parameter is used to analyze specific labels in your input.\n"; @@ -219,6 +224,18 @@ RareFactSharedCommand::RareFactSharedCommand(string option) { temp = validParameter.validFile(parameters, "groupmode", false); if (temp == "not found") { temp = "T"; } groupMode = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "subsampleiters", false); if (temp == "not found") { temp = "1000"; } + m->mothurConvert(temp, iters); + + temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; } + if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; } + else { + if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later + else { subsample = false; } + } + + if (subsample == false) { iters = 1; } } @@ -301,7 +318,35 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ } } } - + + /******************************************************/ + if (subsample) { + if (subsampleSize == -1) { //user has not set size, set size = smallest samples size + subsampleSize = subset[0]->getNumSeqs(); + for (int i = 1; i < subset.size(); i++) { + int thisSize = subset[i]->getNumSeqs(); + + if (thisSize < subsampleSize) { subsampleSize = thisSize; } + } + }else { + newGroups.clear(); + vector temp; + for (int i = 0; i < subset.size(); i++) { + if (subset[i]->getNumSeqs() < subsampleSize) { + m->mothurOut(subset[i]->getGroup() + " contains " + toString(subset[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine(); + delete subset[i]; + }else { + newGroups.push_back(subset[i]->getGroup()); + temp.push_back(subset[i]); + } + } + subset = temp; + } + + if (subset.size() < 2) { m->mothurOut("You have not provided enough valid groups. I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; return 0; } + } + /******************************************************/ + ValidCalculators validCalculator; for (int i=0; igetSharedCurve(freq, nIters); delete rCurve; + if (subsample) { subsampleLookup(subset, fileNameRoot); } + processedLabels.insert(subset[0]->getLabel()); userLabels.erase(subset[0]->getLabel()); } @@ -372,6 +419,8 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ rCurve->getSharedCurve(freq, nIters); delete rCurve; + if (subsample) { subsampleLookup(subset, fileNameRoot); } + processedLabels.insert(subset[0]->getLabel()); userLabels.erase(subset[0]->getLabel()); @@ -444,6 +493,9 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ rCurve = new Rarefact(subset, rDisplays); rCurve->getSharedCurve(freq, nIters); delete rCurve; + + if (subsample) { subsampleLookup(subset, fileNameRoot); } + for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } } @@ -458,6 +510,158 @@ int RareFactSharedCommand::process(GroupMap& designMap, string thisSet){ } } //********************************************************************************************************************** +int RareFactSharedCommand::subsampleLookup(vector& thisLookup, string fileNameRoot) { + try { + + map > filenames; + for (int thisIter = 0; thisIter < iters; thisIter++) { + + vector thisItersLookup = thisLookup; + + //we want the summary results for the whole dataset, then the subsampling + SubSample sample; + vector tempLabels; //dont need since we arent printing the sampled sharedRabunds + + //make copy of lookup so we don't get access violations + vector newLookup; + for (int k = 0; k < thisItersLookup.size(); k++) { + SharedRAbundVector* temp = new SharedRAbundVector(); + temp->setLabel(thisItersLookup[k]->getLabel()); + temp->setGroup(thisItersLookup[k]->getGroup()); + newLookup.push_back(temp); + } + + //for each bin + for (int k = 0; k < thisItersLookup[0]->getNumBins(); k++) { + if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; } + for (int j = 0; j < thisItersLookup.size(); j++) { newLookup[j]->push_back(thisItersLookup[j]->getAbundance(k), thisItersLookup[j]->getGroup()); } + } + + tempLabels = sample.getSample(newLookup, subsampleSize); + thisItersLookup = newLookup; + + + Rarefact* rCurve; + vector rDisplays; + + string thisfileNameRoot = fileNameRoot + toString(thisIter); + + ValidCalculators validCalculator; + for (int i=0; igetSharedCurve(freq, nIters); + delete rCurve; + + //clean up memory + for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; } + thisItersLookup.clear(); + for(int i=0;i > > results; //iter -> numSampled -> data + for (map >::iterator it = filenames.begin(); it != filenames.end(); it++) { + vector thisTypesFiles = it->second; + vector columnHeaders; + for (int i = 0; i < thisTypesFiles.size(); i++) { + ifstream in; + m->openInputFile(thisTypesFiles[i], in); + + string headers = m->getline(in); m->gobble(in); + columnHeaders = m->splitWhiteSpace(headers); + int numCols = columnHeaders.size(); + + vector > thisFilesLines; + while (!in.eof()) { + if (m->control_pressed) { break; } + vector data; data.resize(numCols, 0); + //read numSampled line + for (int j = 0; j < numCols; j++) { in >> data[j]; m->gobble(in); } + thisFilesLines.push_back(data); + } + in.close(); + results.push_back(thisFilesLines); + m->mothurRemove(thisTypesFiles[i]); + } + + if (!m->control_pressed) { + //process results + string outputFile = fileNameRoot + "ave-std." + thisLookup[0]->getLabel() + "." + getOutputFileNameTag(it->first); + ofstream out; + m->openOutputFile(outputFile, out); + outputNames.push_back(outputFile); outputTypes[it->first].push_back(outputFile); + + out << columnHeaders[0] << '\t' << "method\t"; + for (int i = 1; i < columnHeaders.size(); i++) { out << columnHeaders[i] << '\t'; } + out << endl; + + vector< vector > aveResults; aveResults.resize(results[0].size()); + for (int i = 0; i < aveResults.size(); i++) { aveResults[i].resize(results[0][i].size(), 0.0); } + + for (int thisIter = 0; thisIter < iters; thisIter++) { //sum all groups dists for each calculator + for (int i = 0; i < aveResults.size(); i++) { //initialize sums to zero. + aveResults[i][0] = results[thisIter][i][0]; + for (int j = 1; j < aveResults[i].size(); j++) { + aveResults[i][j] += results[thisIter][i][j]; + } + } + } + + for (int i = 0; i < aveResults.size(); i++) { //finds average. + for (int j = 1; j < aveResults[i].size(); j++) { + aveResults[i][j] /= (float) iters; + } + } + + //standard deviation + vector< vector > stdResults; stdResults.resize(results[0].size()); + for (int i = 0; i < stdResults.size(); i++) { stdResults[i].resize(results[0][i].size(), 0.0); } + + for (int thisIter = 0; thisIter < iters; thisIter++) { //compute the difference of each dist from the mean, and square the result of each + for (int i = 0; i < stdResults.size(); i++) { + stdResults[i][0] = aveResults[i][0]; + for (int j = 1; j < stdResults[i].size(); j++) { + stdResults[i][j] += ((results[thisIter][i][j] - aveResults[i][j]) * (results[thisIter][i][j] - aveResults[i][j])); + } + } + } + + for (int i = 0; i < stdResults.size(); i++) { //finds average. + out << aveResults[i][0] << '\t' << "ave\t"; + for (int j = 1; j < aveResults[i].size(); j++) { out << aveResults[i][j] << '\t'; } + out << endl; + out << stdResults[i][0] << '\t' << "std\t"; + for (int j = 1; j < stdResults[i].size(); j++) { + stdResults[i][j] /= (float) iters; + stdResults[i][j] = sqrt(stdResults[i][j]); + out << stdResults[i][j] << '\t'; + } + out << endl; + } + out.close(); + } + } + + + return 0; + } + catch(exception& e) { + m->errorOut(e, "RareFactSharedCommand", "subsample"); + exit(1); + } +} +//********************************************************************************************************************** vector RareFactSharedCommand::createGroupFile(vector& outputNames) { try { diff --git a/rarefactsharedcommand.h b/rarefactsharedcommand.h index a5b4546..af73e13 100644 --- a/rarefactsharedcommand.h +++ b/rarefactsharedcommand.h @@ -37,18 +37,19 @@ public: private: vector lookup; - int nIters; + int nIters, subsampleSize, iters; string format; float freq; map file2Group; //index in outputNames[i] -> group - bool abort, allLines, jumble, groupMode; + bool abort, allLines, jumble, groupMode, subsample; set labels; //holds labels to be used string label, calc, groups, outputDir, sharedfile, designfile; vector Estimators, Groups, outputNames, Sets; int process(GroupMap&, string); vector createGroupFile(vector&); + int subsampleLookup(vector&, string); }; diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index 0650ea0..0749cb7 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -22,7 +22,7 @@ vector UnifracUnweightedCommand::setParameters(){ CommandParameter piters("iters", "Number", "", "1000", "", "", "",false,false); parameters.push_back(piters); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter prandom("random", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prandom); - CommandParameter pdistance("distance", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(pdistance); + CommandParameter pdistance("distance", "Multiple", "column-lt-square-phylip", "column", "", "", "",false,false); parameters.push_back(pdistance); CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample); CommandParameter pconsensus("consensus", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pconsensus); CommandParameter proot("root", "Boolean", "F", "", "", "", "",false,false); parameters.push_back(proot); @@ -204,6 +204,7 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { string temp = validParameter.validFile(parameters, "distance", false); if (temp == "not found") { phylip = false; outputForm = ""; } else{ + if (temp=="phylip") { temp = "lt"; } if ((temp == "lt") || (temp == "column") || (temp == "square")) { phylip = true; outputForm = temp; } else { m->mothurOut("Options for distance are: lt, square, or column. Using lt."); m->mothurOutEndLine(); phylip = true; outputForm = "lt"; } } diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp index 13d10fd..d1e8833 100644 --- a/unifracweightedcommand.cpp +++ b/unifracweightedcommand.cpp @@ -24,7 +24,7 @@ vector UnifracWeightedCommand::setParameters(){ CommandParameter psubsample("subsample", "String", "", "", "", "", "",false,false); parameters.push_back(psubsample); CommandParameter pconsensus("consensus", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pconsensus); CommandParameter prandom("random", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(prandom); - CommandParameter pdistance("distance", "Multiple", "column-lt-square", "column", "", "", "",false,false); parameters.push_back(pdistance); + CommandParameter pdistance("distance", "Multiple", "column-lt-square-phylip", "column", "", "", "",false,false); parameters.push_back(pdistance); CommandParameter proot("root", "Boolean", "F", "", "", "", "",false,false); parameters.push_back(proot); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -204,6 +204,7 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { string temp = validParameter.validFile(parameters, "distance", false); if (temp == "not found") { phylip = false; outputForm = ""; } else{ + if (temp=="phylip") { temp = "lt"; } if ((temp == "lt") || (temp == "column") || (temp == "square")) { phylip = true; outputForm = temp; } else { m->mothurOut("Options for distance are: lt, square, or column. Using lt."); m->mothurOutEndLine(); phylip = true; outputForm = "lt"; } } diff --git a/venncommand.cpp b/venncommand.cpp index 18bb635..c918500 100644 --- a/venncommand.cpp +++ b/venncommand.cpp @@ -250,8 +250,7 @@ int VennCommand::execute(){ }else if (Estimators[i] == "chao") { vennCalculators.push_back(new Chao1()); }else if (Estimators[i] == "ace") { - if(abund < 5) - abund = 10; + if(abund < 5) { abund = 10; } vennCalculators.push_back(new Ace(abund)); } }