//bootstrap - to set confidenceScore
int numToSelect = queryKmers.size() / 8;
+ if (m->debug) { m->mothurOut(seq->getName() + "\t"); }
+
tax = bootstrapResults(queryKmers, index, numToSelect);
+
+ if (m->debug) { m->mothurOut("\n"); }
return tax;
}
int seqTaxIndex = tax;
TaxNode seqTax = phyloTree->get(tax);
+
while (seqTax.level != 0) { //while you are not at the root
itBoot2 = confidenceScores.find(seqTaxIndex); //is this a classification we already have a count on
confidence = itBoot2->second;
}
+ if (m->debug) { m->mothurOut(seqTax.name + "(" + toString(((confidence/(float)iters) * 100)) + ");"); }
+
if (((confidence/(float)iters) * 100) >= confidenceThreshold) {
confidenceTax = seqTax.name + "(" + toString(((confidence/(float)iters) * 100)) + ");" + confidenceTax;
simpleTax = seqTax.name + ";" + simpleTax;
}
-
+
seqTaxIndex = seqTax.parent;
seqTax = phyloTree->get(seqTax.parent);
}
CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount);
CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
+ CommandParameter pstrand("strand", "String", "", "", "", "", "","",false,false); parameters.push_back(pstrand);
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
CommandParameter pabskew("abskew", "Number", "", "1.9", "", "", "","",false,false); parameters.push_back(pabskew);
string helpString = "";
helpString += "The chimera.uchime command reads a fastafile and referencefile and outputs potentially chimeric sequences.\n";
helpString += "This command is a wrapper for uchime written by Robert C. Edgar.\n";
- helpString += "The chimera.uchime command parameters are fasta, name, count, reference, processors, dereplicate, abskew, chimealns, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, skipgaps, skipgaps2, minlen, maxlen, ucl and queryfact.\n";
+ helpString += "The chimera.uchime command parameters are fasta, name, count, reference, processors, dereplicate, abskew, chimealns, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, skipgaps, skipgaps2, minlen, maxlen, ucl, strand and queryfact.\n";
helpString += "The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required, unless you have a valid current fasta file. \n";
helpString += "The name parameter allows you to provide a name file, if you are using template=self. \n";
helpString += "The count parameter allows you to provide a count file, if you are using template=self. \n";
maxp = validParameter.validFile(parameters, "maxp", false); if (maxp == "not found") { useMaxp = false; maxp = "2"; } else{ useMaxp = true; }
minlen = validParameter.validFile(parameters, "minlen", false); if (minlen == "not found") { useMinlen = false; minlen = "10"; } else{ useMinlen = true; }
maxlen = validParameter.validFile(parameters, "maxlen", false); if (maxlen == "not found") { useMaxlen = false; maxlen = "10000"; } else{ useMaxlen = true; }
+
+ strand = validParameter.validFile(parameters, "strand", false); if (strand == "not found") { strand = ""; }
temp = validParameter.validFile(parameters, "ucl", false); if (temp == "not found") { temp = "f"; }
ucl = m->isTrue(temp);
*tempa = '\0'; strncat(tempa, alns.c_str(), alns.length());
cPara.push_back(tempa);
}
+
+ if (strand != "") {
+ char* tempA = new char[9];
+ *tempA = '\0'; strncat(tempA, "--strand", 8);
+ cPara.push_back(tempA);
+ char* tempa = new char[strand.length()+1];
+ *tempa = '\0'; strncat(tempa, strand.c_str(), strand.length());
+ cPara.push_back(tempa);
+ }
if (useAbskew) {
char* tempskew = new char[9];
uchimeData* tempUchime = new uchimeData(outputFileName+extension, uchimeLocation, templatefile, files[i], "", "", "", accnos+extension, alns+extension, dummy, m, 0, 0, i);
tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount);
- tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract);
+ tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, strand);
pDataArray.push_back(tempUchime);
processIDS.push_back(i);
uchimeData* tempUchime = new uchimeData(outputFName+extension, uchimeLocation, templatefile, filename+extension, fastaFile, nameFile, groupFile, accnos+extension, alns+extension, groups, m, lines[i].start, lines[i].end, i);
tempUchime->setBooleans(useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount);
- tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract);
+ tempUchime->setVariables(abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, strand);
pDataArray.push_back(tempUchime);
processIDS.push_back(i);
int createProcesses(string, string, string, string, int&);
bool abort, useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount, hasName, dups;
- string fastafile, groupfile, templatefile, outputDir, namefile, countfile, abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, uchimeLocation;
+ string fastafile, groupfile, templatefile, outputDir, namefile, countfile, abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, uchimeLocation, strand;
int processors;
SequenceParser* sparser;
int threadID, count, numChimeras;
vector<string> groups;
bool useAbskew, chimealns, useMinH, useMindiv, useXn, useDn, useXa, useChunks, useMinchunk, useIdsmoothwindow, useMinsmoothid, useMaxp, skipgaps, skipgaps2, useMinlen, useMaxlen, ucl, useQueryfract, hasCount;
- string abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract;
+ string abskew, minh, mindiv, xn, dn, xa, chunks, minchunk, idsmoothwindow, minsmoothid, maxp, minlen, maxlen, queryfract, strand;
uchimeData(){}
uchimeData(string o, string uloc, string t, string file, string f, string n, string g, string ac, string al, vector<string> gr, MothurOut* mout, int st, int en, int tid) {
hasCount = hc;
}
- void setVariables(string abske, string min, string mindi, string x, string d, string xa2, string chunk, string minchun, string idsmoothwindo, string minsmoothi, string max, string minle, string maxle, string queryfrac) {
+ void setVariables(string abske, string min, string mindi, string x, string d, string xa2, string chunk, string minchun, string idsmoothwindo, string minsmoothi, string max, string minle, string maxle, string queryfrac, string stra) {
abskew = abske;
minh = min;
mindiv = mindi;
+ strand = stra;
xn = x;
dn = d;
xa = xa2;
cPara.push_back(tempa);
}
+ if (pDataArray->strand != "") {
+ char* tempA = new char[9];
+ *tempA = '\0'; strncat(tempA, "--strand", 8);
+ cPara.push_back(tempA);
+ char* tempa = new char[pDataArray->strand.length()+1];
+ *tempa = '\0'; strncat(tempa, pDataArray->strand.c_str(), pDataArray->strand.length());
+ cPara.push_back(tempa);
+ }
+
if (pDataArray->useAbskew) {
char* tempskew = new char[9];
*tempskew = '\0'; strncat(tempskew, "--abskew", 8);
cPara.push_back(tempa);
}
+ if (pDataArray->strand != "") {
+ char* tempA = new char[9];
+ *tempA = '\0'; strncat(tempA, "--strand", 8);
+ cPara.push_back(tempA);
+ char* tempa = new char[pDataArray->strand.length()+1];
+ *tempa = '\0'; strncat(tempa, pDataArray->strand.c_str(), pDataArray->strand.length());
+ cPara.push_back(tempa);
+ }
+
if (pDataArray->useAbskew) {
char* tempskew = new char[9];
*tempskew = '\0'; strncat(tempskew, "--abskew", 8);
try {
if (abort == true) { if (calledHelp) { return 0; } return 2; }
- string outputMethodTag = method + ".";
+ string outputMethodTag = method;
if(method == "wang"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); }
else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, rand()); }
else if(method == "zap"){
m->mothurOut("It took " + toString(time(NULL) - estart) + " seconds to split the distance file."); m->mothurOutEndLine();
estart = time(NULL);
-
+
if (!runCluster) {
-#ifdef USE_MPI
- }
-#endif
+
m->mothurOutEndLine();
m->mothurOut("Output File Names: "); m->mothurOutEndLine();
for (int i = 0; i < distName.size(); i++) { m->mothurOut(distName[i].begin()->first); m->mothurOutEndLine(); m->mothurOut(distName[i].begin()->second); m->mothurOutEndLine(); }
return 0;
}
-
+
//****************** break up files between processes and cluster each file set ******************************//
#ifdef USE_MPI
////you are process 0 from above////
}
}
/***********************************************************************/
+void CommandFactory::printCommandsCategories(ostream& out) {
+ try {
+ map<string, string> commands = getListCommands();
+ map<string, string>::iterator it;
+
+ map<string, string> categories;
+ map<string, string>::iterator itCat;
+ //loop through each command outputting info
+ for (it = commands.begin(); it != commands.end(); it++) {
+
+ Command* thisCommand = getCommand(it->first);
+
+ //don't add hidden commands
+ if (thisCommand->getCommandCategory() != "Hidden") {
+ itCat = categories.find(thisCommand->getCommandCategory());
+ if (itCat == categories.end()) {
+ categories[thisCommand->getCommandCategory()] = thisCommand->getCommandName();
+ }else {
+ categories[thisCommand->getCommandCategory()] += ", " + thisCommand->getCommandName();
+ }
+ }
+ }
+
+ for (itCat = categories.begin(); itCat != categories.end(); itCat++) {
+ out << itCat->first << " commmands include: " << itCat->second << endl;
+ }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "CommandFactory", "printCommandsCategories");
+ exit(1);
+ }
+}
+
+/***********************************************************************/
bool isValidCommand(string);\r
bool isValidCommand(string, string);\r
void printCommands(ostream&);\r
+ void printCommandsCategories(ostream&);\r
void setOutputDirectory(string o) { outputDir = o; m->setOutputDir(o); }\r
void setInputDirectory(string i) { inputDir = i; }\r
void setLogfileName(string n, bool a) { logFileName = n; append = a; }\r
for (int j = 0; j < rareCounts.size(); j++) { //add "rare" OTU to the filtered lookup
filteredLookup[j]->push_back(rareCounts[j], thislookup[j]->getGroup());
}
-
- //create new label
- string oldLastLabel = saveBinLabels[saveBinLabels.size()-1];
- string tag = "";
- string otuNumber = "";
- for (int i = 0;i < oldLastLabel.length(); i++){
- //add numbers
- if( oldLastLabel[i]>47 && oldLastLabel[i]<58){ otuNumber += oldLastLabel[i]; }
- else { tag += oldLastLabel[i]; }
- }
-
- int oldLastBin;
- m->mothurConvert(otuNumber, oldLastBin);
- oldLastBin++;
- string newLabel = tag + toString(oldLastBin);
- filteredLabels.push_back(newLabel);
+ //create label for rare OTUs
+ filteredLabels.push_back("rareOTUs");
}
}
~ParseFastaQCommand() {}
vector<string> setParameters();
- string getCommandName() { return "parse.fastq"; }
+ string getCommandName() { return "fastq.info"; }
string getCommandCategory() { return "Sequence Processing"; }
string getHelpString();
CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
CommandParameter pdiffs("diffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(pdiffs);
CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
+ CommandParameter ptopdown("topdown", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(ptopdown);
+
CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
string helpString = "";
helpString += "The pre.cluster command groups sequences that are within a given number of base mismatches.\n";
helpString += "The pre.cluster command outputs a new fasta and name file.\n";
- helpString += "The pre.cluster command parameters are fasta, name, group, count, processors and diffs. The fasta parameter is required. \n";
+ helpString += "The pre.cluster command parameters are fasta, name, group, count, topdown, processors and diffs. The fasta parameter is required. \n";
helpString += "The name parameter allows you to give a list of seqs that are identical. This file is 2 columns, first column is name or representative sequence, second column is a list of its identical sequences separated by commas.\n";
helpString += "The group parameter allows you to provide a group file so you can cluster by group. \n";
helpString += "The count parameter allows you to provide a count file so you can cluster by group. \n";
helpString += "The diffs parameter allows you to specify maximum number of mismatched bases allowed between sequences in a grouping. The default is 1.\n";
+ helpString += "The topdown parameter allows you to specify whether to cluster from largest abundance to smallest or smallest to largest. Default=T, meanging largest to smallest.\n";
helpString += "The pre.cluster command should be in the following format: \n";
helpString += "pre.cluster(fasta=yourFastaFile, names=yourNamesFile, diffs=yourMaxDiffs) \n";
helpString += "Example pre.cluster(fasta=amazon.fasta, diffs=2).\n";
m->setProcessors(temp);
m->mothurConvert(temp, processors);
+ temp = validParameter.validFile(parameters, "topdown", false); if(temp == "not found"){ temp = "T"; }
+ topdown = m->isTrue(temp);
+
if (countfile == "") {
if (namefile == "") {
vector<string> files; files.push_back(fastafile);
// Allocate memory for thread data.
string extension = toString(i) + ".temp";
- preClusterData* tempPreCluster = new preClusterData(fastafile, namefile, groupfile, countfile, (newFName+extension), (newNName+extension), newMFile, groups, m, lines[i].start, lines[i].end, diffs, i);
+ preClusterData* tempPreCluster = new preClusterData(fastafile, namefile, groupfile, countfile, (newFName+extension), (newNName+extension), newMFile, groups, m, lines[i].start, lines[i].end, diffs, topdown, i);
pDataArray.push_back(tempPreCluster);
processIDS.push_back(i);
m->openOutputFile(newMapFile, out);
//sort seqs by number of identical seqs
- sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
+ if (topdown) { sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityTopDown); }
+ else { sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityDownTop); }
int count = 0;
int numSeqs = alignSeqs.size();
~seqPNode() {}
};
/************************************************************/
-inline bool comparePriority(seqPNode first, seqPNode second) {
+inline bool comparePriorityTopDown(seqPNode first, seqPNode second) {
if (first.numIdentical > second.numIdentical) { return true; }
else if (first.numIdentical == second.numIdentical) {
if (first.seq.getName() > second.seq.getName()) { return true; }
}
return false;
}
+/************************************************************/
+inline bool comparePriorityDownTop(seqPNode first, seqPNode second) {
+ if (first.numIdentical < second.numIdentical) { return true; }
+ else if (first.numIdentical == second.numIdentical) {
+ if (first.seq.getName() > second.seq.getName()) { return true; }
+ }
+ return false;
+}
//************************************************************/
class PreClusterCommand : public Command {
CountTable ct;
int diffs, length, processors;
- bool abort, bygroup;
+ bool abort, bygroup, topdown;
string fastafile, namefile, outputDir, groupfile, countfile;
vector<seqPNode> alignSeqs; //maps the number of identical seqs to a sequence
map<string, string> names; //represents the names file first column maps to second column
int diffs, threadID;
vector<string> groups;
vector<string> mapFileNames;
+ bool topdown;
preClusterData(){}
- preClusterData(string f, string n, string g, string c, string nff, string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int d, int tid) {
+ preClusterData(string f, string n, string g, string c, string nff, string nnf, string nmf, vector<string> gr, MothurOut* mout, int st, int en, int d, bool td, int tid) {
fastafile = f;
namefile = n;
groupfile = g;
threadID = tid;
groups = gr;
countfile = c;
+ topdown = td;
}
};
pDataArray->m->openOutputFile(pDataArray->newMName+pDataArray->groups[k]+".map", out);
pDataArray->mapFileNames.push_back(pDataArray->newMName+pDataArray->groups[k]+".map");
- //sort seqs by number of identical seqs
- sort(alignSeqs.begin(), alignSeqs.end(), comparePriority);
-
+ //sort seqs by number of identical seqs
+ if (pDataArray->topdown) { sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityTopDown); }
+ else { sort(alignSeqs.begin(), alignSeqs.end(), comparePriorityDownTop); }
+
int count = 0;
//think about running through twice...
for(int i=0;i<numRefSeqs;i++){
double length = 0;
- int diffs = alignQueryToReferences(querySeq, referenceSeqs[i], queryAlign[i], refAlign[i], length);
+ double diffs = alignQueryToReferences(querySeq, referenceSeqs[i], queryAlign[i], refAlign[i], length);
if(diffs < bestRefDiffs){
bestRefDiffs = diffs;
bestRefLength = length;
int end = refLength - 1;
int maxRow = 0;
- double maxRowValue = -100000000000;
+ double maxRowValue = -2147483647;
for(int i=0;i<queryLength;i++){
if(alignMatrix[i][end] > maxRowValue){
maxRow = i;
end = queryLength - 1;
int maxColumn = 0;
- double maxColumnValue = -100000000000;
+ double maxColumnValue = -2147483647;
for(int j=0;j<refLength;j++){
if(alignMatrix[end][j] > maxColumnValue){
~SplitGroupCommand() {}
vector<string> setParameters();
- string getCommandName() { return "split.group"; }
+ string getCommandName() { return "split.groups"; }
string getCommandCategory() { return "Sequence Processing"; }
string getHelpString();
//**********************************************************************************************************************
vector<string> TrimFlowsCommand::setParameters(){
try {
- CommandParameter pflow("flow", "InputTypes", "", "", "none", "none", "none","flow",false,true,true); parameters.push_back(pflow);
+ CommandParameter pflow("flow", "InputTypes", "", "", "none", "none", "none","flow-file",false,true,true); parameters.push_back(pflow);
CommandParameter poligos("oligos", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(poligos);
CommandParameter pmaxhomop("maxhomop", "Number", "", "9", "", "", "","",false,false); parameters.push_back(pmaxhomop);
CommandParameter pmaxflows("maxflows", "Number", "", "450", "", "", "","",false,false); parameters.push_back(pmaxflows);
if (type == "flow") { pattern = "[filename],[tag],flow"; }
else if (type == "fasta") { pattern = "[filename],flow.fasta"; }
- else if (type == "file") { pattern = "[filename],[tag],flow.files"; }
+ else if (type == "file") { pattern = "[filename],flow.files"; }
else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
return pattern;
if(allFiles){
set<string> namesAlreadyProcessed;
- variables["[tag]"] = "";
flowFilesFileName = getOutputFileName("file",variables);
m->openOutputFile(flowFilesFileName, output);
output.close();
}
else{
- variables["[tag]"] = "";
flowFilesFileName = getOutputFileName("file",variables);
m->openOutputFile(flowFilesFileName, output);