X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=blobdiff_plain;f=classifyseqscommand.cpp;h=1b56433073d404e2c7f33513f7362f8306afcb42;hp=4291132c74ea9537e9abfdea7779ac404d636c2a;hb=cf9987b67aa49777a4c91c2d21f96e58bf17aa82;hpb=55386dddad84cc1140d736cabaf4dd0ae16f2e01 diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 4291132..1b56433 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -8,37 +8,35 @@ */ #include "classifyseqscommand.h" -#include "sequence.hpp" -#include "bayesian.h" -#include "phylotree.h" -#include "phylosummary.h" -#include "knn.h" //********************************************************************************************************************** vector ClassifySeqsCommand::setParameters(){ try { - CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptaxonomy); - CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(ptemplate); - CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); - CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); - CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup); - CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance", "kmer", "", "", "",false,false); parameters.push_back(psearch); - CommandParameter pksize("ksize", "Number", "", "8", "", "", "",false,false); parameters.push_back(pksize); - CommandParameter pmethod("method", "Multiple", "bayesian-knn", "bayesian", "", "", "",false,false); parameters.push_back(pmethod); - CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); - CommandParameter pmatch("match", "Number", "", "1.0", "", "", "",false,false); parameters.push_back(pmatch); - CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pmismatch); - CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "",false,false); parameters.push_back(pgapopen); - CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "",false,false); parameters.push_back(pgapextend); - CommandParameter pcutoff("cutoff", "Number", "", "0", "", "", "",false,true); parameters.push_back(pcutoff); - CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs); - CommandParameter piters("iters", "Number", "", "100", "", "", "",false,true); parameters.push_back(piters); - CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave); - CommandParameter pnumwanted("numwanted", "Number", "", "10", "", "", "",false,true); parameters.push_back(pnumwanted); - CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); - CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); + CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(ptaxonomy); + CommandParameter ptemplate("reference", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(ptemplate); + CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","taxonomy",false,true,true); parameters.push_back(pfasta); + CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","",false,false,true); parameters.push_back(pname); + CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","",false,false,true); parameters.push_back(pcount); + CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup); + + CommandParameter psearch("search", "Multiple", "kmer-blast-suffix-distance-align", "kmer", "", "", "","",false,false); parameters.push_back(psearch); + CommandParameter pksize("ksize", "Number", "", "8", "", "", "","",false,false); parameters.push_back(pksize); + CommandParameter pmethod("method", "Multiple", "wang-knn-zap", "wang", "", "", "","",false,false); parameters.push_back(pmethod); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); + CommandParameter pmatch("match", "Number", "", "1.0", "", "", "","",false,false); parameters.push_back(pmatch); + CommandParameter pmismatch("mismatch", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pmismatch); + CommandParameter pgapopen("gapopen", "Number", "", "-2.0", "", "", "","",false,false); parameters.push_back(pgapopen); + CommandParameter pgapextend("gapextend", "Number", "", "-1.0", "", "", "","",false,false); parameters.push_back(pgapextend); + CommandParameter pcutoff("cutoff", "Number", "", "0", "", "", "","",false,true); parameters.push_back(pcutoff); + CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pprobs); + CommandParameter piters("iters", "Number", "", "100", "", "", "","",false,true); parameters.push_back(piters); + CommandParameter psave("save", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(psave); + CommandParameter pshortcuts("shortcuts", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pshortcuts); + CommandParameter pnumwanted("numwanted", "Number", "", "10", "", "", "","",false,true); parameters.push_back(pnumwanted); + CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); + CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } @@ -54,12 +52,13 @@ string ClassifySeqsCommand::getHelpString(){ try { string helpString = ""; helpString += "The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n"; - helpString += "The classify.seqs command parameters are reference, fasta, name, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n"; + helpString += "The classify.seqs command parameters are reference, fasta, name, group, count, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend, numwanted and probs.\n"; helpString += "The reference, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n"; - helpString += "The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast and distance. The default is kmer.\n"; + helpString += "The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast, align and distance. The default is kmer.\n"; helpString += "The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n"; helpString += "The group parameter allows you add a group file so you can have the summary totals broken up by group.\n"; - helpString += "The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n"; + helpString += "The count parameter allows you add a count file so you can have the summary totals broken up by group.\n"; + helpString += "The method parameter allows you to specify classification method to use. Your options are: wang, knn and zap. The default is wang.\n"; helpString += "The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"; helpString += "The processors parameter allows you to specify the number of processors to use. The default is 1.\n"; #ifdef USE_MPI @@ -72,8 +71,9 @@ string ClassifySeqsCommand::getHelpString(){ helpString += "The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n"; helpString += "The numwanted parameter allows you to specify the number of sequence matches you want with the knn method. The default is 10.\n"; helpString += "The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy. The default is 0.\n"; - helpString += "The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n"; - helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method. The default is 100.\n"; + helpString += "The probs parameter shuts off the bootstrapping results for the wang and zap method. The default is true, meaning you want the bootstrapping to be shown.\n"; + helpString += "The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the wang method. The default is 100.\n"; + //helpString += "The flip parameter allows you shut off mothur's The default is T.\n"; helpString += "The classify.seqs command should be in the following format: \n"; helpString += "classify.seqs(reference=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n"; helpString += "Example classify.seqs(fasta=amazon.fasta, reference=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n"; @@ -88,12 +88,31 @@ string ClassifySeqsCommand::getHelpString(){ } } //********************************************************************************************************************** +string ClassifySeqsCommand::getOutputPattern(string type) { + try { + string pattern = ""; + + if (type == "taxonomy") { pattern = "[filename],[tag],[tag2],taxonomy"; } + else if (type == "taxsummary") { pattern = "[filename],[tag],[tag2],tax.summary"; } + else if (type == "accnos") { pattern = "[filename],[tag],[tag2],flip.accnos"; } + else if (type == "matchdist") { pattern = "[filename],[tag],[tag2],match.dist"; } + else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } + + return pattern; + } + catch(exception& e) { + m->errorOut(e, "ClassifySeqsCommand", "getOutputPattern"); + exit(1); + } +} +//********************************************************************************************************************** ClassifySeqsCommand::ClassifySeqsCommand(){ try { abort = true; calledHelp = true; setParameters(); vector tempOutNames; outputTypes["taxonomy"] = tempOutNames; + outputTypes["accnos"] = tempOutNames; outputTypes["taxsummary"] = tempOutNames; outputTypes["matchdist"] = tempOutNames; } @@ -106,7 +125,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(){ ClassifySeqsCommand::ClassifySeqsCommand(string option) { try { abort = false; calledHelp = false; - rdb = ReferenceDB::getInstance(); + rdb = ReferenceDB::getInstance(); hasName = false; hasCount=false; //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } @@ -131,6 +150,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { outputTypes["taxonomy"] = tempOutNames; outputTypes["taxsummary"] = tempOutNames; outputTypes["matchdist"] = tempOutNames; + outputTypes["accnos"] = tempOutNames; //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -155,15 +175,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } - - it = parameters.find("group"); - //user has given a template file - if(it != parameters.end()){ - path = m->hasPath(it->second); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { parameters["group"] = inputDir + it->second; } - } - } + } fastaFileName = validParameter.validFile(parameters, "fasta", false); if (fastaFileName == "not found") { @@ -246,7 +258,6 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { namefile = validParameter.validFile(parameters, "name", false); if (namefile == "not found") { namefile = ""; } - else { m->splitAtDash(namefile, namefileNames); @@ -311,11 +322,90 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { } } } - + + if (namefileNames.size() != 0) { hasName = true; } + if (namefile != "") { if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); } } + //check for required parameters + countfile = validParameter.validFile(parameters, "count", false); + if (countfile == "not found") { + countfile = ""; + }else { + m->splitAtDash(countfile, countfileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < countfileNames.size(); i++) { + + bool ignore = false; + if (countfileNames[i] == "current") { + countfileNames[i] = m->getCountTableFile(); + if (countfileNames[i] != "") { m->mothurOut("Using " + countfileNames[i] + " as input file for the count parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current count file, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + } + } + + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(countfileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { countfileNames[i] = inputDir + countfileNames[i]; } + } + + int ableToOpen; + ifstream in; + + ableToOpen = m->openInputFile(countfileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(countfileNames[i]); + m->mothurOut("Unable to open " + countfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + countfileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + countfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + countfileNames.erase(countfileNames.begin()+i); + i--; + }else { + m->setCountTableFile(countfileNames[i]); + } + } + } + } + + if (countfileNames.size() != 0) { hasCount = true; if (countfileNames.size() != fastaFileNames.size()) {m->mothurOut("If you provide a count file, you must have one for each fasta file."); m->mothurOutEndLine(); } } + + //make sure there is at least one valid file left + if (hasName && hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or name."); m->mothurOutEndLine(); abort = true; } + groupfile = validParameter.validFile(parameters, "group", false); if (groupfile == "not found") { groupfile = ""; } else { @@ -323,54 +413,74 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { //go through files and make sure they are good, if not, then disregard them for (int i = 0; i < groupfileNames.size(); i++) { - if (inputDir != "") { - string path = m->hasPath(groupfileNames[i]); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { groupfileNames[i] = inputDir + groupfileNames[i]; } - } - int ableToOpen; - ifstream in; - ableToOpen = m->openInputFile(groupfileNames[i], in, "noerror"); - - //if you can't open it, try default location - if (ableToOpen == 1) { - if (m->getDefaultPath() != "") { //default path is set - string tryPath = m->getDefaultPath() + m->getSimpleName(groupfileNames[i]); - m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); - ifstream in2; - ableToOpen = m->openInputFile(tryPath, in2, "noerror"); - in2.close(); - groupfileNames[i] = tryPath; + bool ignore = false; + if (groupfileNames[i] == "current") { + groupfileNames[i] = m->getGroupFile(); + if (groupfileNames[i] != "") { m->mothurOut("Using " + groupfileNames[i] + " as input file for the group parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current group file, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + groupfileNames.erase(groupfileNames.begin()+i); + i--; } } - if (ableToOpen == 1) { - if (m->getOutputDir() != "") { //default path is set - string tryPath = m->getOutputDir() + m->getSimpleName(groupfileNames[i]); - m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); - ifstream in2; - ableToOpen = m->openInputFile(tryPath, in2, "noerror"); - in2.close(); - groupfileNames[i] = tryPath; + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(groupfileNames[i]); + cout << path << '\t' << inputDir << endl; + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { groupfileNames[i] = inputDir + groupfileNames[i]; } + } + + int ableToOpen; + + ifstream in; + ableToOpen = m->openInputFile(groupfileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(groupfileNames[i]); + m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + groupfileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(groupfileNames[i]); + m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + groupfileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + groupfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + groupfileNames.erase(groupfileNames.begin()+i); + i--; + }else { + m->setGroupFile(groupfileNames[i]); } } - in.close(); - - if (ableToOpen == 1) { - m->mothurOut("Unable to open " + groupfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); groupfileNames[i] = ""; - //erase from file list - groupfileNames.erase(groupfileNames.begin()+i); - i--; - }else { - m->setGroupFile(groupfileNames[i]); - } } } if (groupfile != "") { if (groupfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each fasta file."); m->mothurOutEndLine(); } + if (hasCount) { m->mothurOut("[ERROR]: You must enter ONLY ONE of the following: count or group."); m->mothurOutEndLine(); abort = true; } }else { for (int i = 0; i < fastaFileNames.size(); i++) { groupfileNames.push_back(""); } } @@ -378,8 +488,9 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { //check for optional parameter and set defaults // ...at some point should added some additional type checking... string temp; - temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ temp = "8"; } - convert(temp, kmerSize); + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } + m->setProcessors(temp); + m->mothurConvert(temp, processors); temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } save = m->isTrue(temp); @@ -416,46 +527,69 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { }else if (taxonomyFileName == "not open") { abort = true; } else { if (save) { rdb->setSavedTaxonomy(taxonomyFileName); } } - temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } - m->setProcessors(temp); - convert(temp, processors); - search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; } - method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "bayesian"; } + method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "wang"; } + + temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ + temp = "8"; + if (method == "zap") { temp = "7"; } + } + m->mothurConvert(temp, kmerSize); temp = validParameter.validFile(parameters, "match", false); if (temp == "not found"){ temp = "1.0"; } - convert(temp, match); + m->mothurConvert(temp, match); temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found"){ temp = "-1.0"; } - convert(temp, misMatch); + m->mothurConvert(temp, misMatch); temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-2.0"; } - convert(temp, gapOpen); + m->mothurConvert(temp, gapOpen); temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-1.0"; } - convert(temp, gapExtend); + m->mothurConvert(temp, gapExtend); temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found"){ temp = "10"; } - convert(temp, numWanted); + m->mothurConvert(temp, numWanted); temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0"; } - convert(temp, cutoff); + m->mothurConvert(temp, cutoff); temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; } probs = m->isTrue(temp); + + temp = validParameter.validFile(parameters, "shortcuts", false); if (temp == "not found"){ temp = "true"; } + writeShortcuts = m->isTrue(temp); + + //temp = validParameter.validFile(parameters, "flip", false); if (temp == "not found"){ temp = "T"; } + //flip = m->isTrue(temp); + flip = true; temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "100"; } - convert(temp, iters); - + m->mothurConvert(temp, iters); - if ((method == "bayesian") && (search != "kmer")) { - m->mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); m->mothurOutEndLine(); + if ((method == "wang") && (search != "kmer")) { + m->mothurOut("The wang method requires the kmer search. " + search + " will be disregarded, and kmer will be used." ); m->mothurOutEndLine(); search = "kmer"; } - } - + + if ((method == "zap") && ((search != "kmer") && (search != "align"))) { + m->mothurOut("The zap method requires the kmer or align search. " + search + " will be disregarded, and kmer will be used." ); m->mothurOutEndLine(); + search = "kmer"; + } + + if (!abort) { + if (!hasCount) { + if (namefileNames.size() == 0){ + if (fastaFileNames.size() != 0) { + vector files; files.push_back(fastaFileNames[fastaFileNames.size()-1]); + parser.getNameFile(files); + } + } + } + } + } } catch(exception& e) { m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand"); @@ -474,37 +608,52 @@ ClassifySeqsCommand::~ClassifySeqsCommand(){ int ClassifySeqsCommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } - - if(method == "bayesian"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand()); } + + string outputMethodTag = method; + if(method == "wang"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); } else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, rand()); } + else if(method == "zap"){ + outputMethodTag = search + "_" + outputMethodTag; + if (search == "kmer") { classify = new KmerTree(templateFileName, taxonomyFileName, kmerSize, cutoff); } + else { classify = new AlignTree(templateFileName, taxonomyFileName, cutoff); } + } else { - m->mothurOut(search + " is not a valid method option. I will run the command using bayesian."); + m->mothurOut(search + " is not a valid method option. I will run the command using wang."); m->mothurOutEndLine(); - classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand()); + classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand(), flip, writeShortcuts); } if (m->control_pressed) { delete classify; return 0; } - for (int s = 0; s < fastaFileNames.size(); s++) { m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine(); - string baseTName = taxonomyFileName; - if (taxonomyFileName == "saved") {baseTName = rdb->getSavedTaxonomy(); } + string baseTName = m->getSimpleName(taxonomyFileName); + if (taxonomyFileName == "saved") { baseTName = rdb->getSavedTaxonomy(); } - string RippedTaxName = m->getRootName(m->getSimpleName(baseTName)); - RippedTaxName = m->getExtension(RippedTaxName.substr(0, RippedTaxName.length()-1)); - if (RippedTaxName[0] == '.') { RippedTaxName = RippedTaxName.substr(1, RippedTaxName.length()); } - RippedTaxName += "."; - + //set rippedTaxName to + string RippedTaxName = ""; + bool foundDot = false; + for (int i = baseTName.length()-1; i >= 0; i--) { + if (foundDot && (baseTName[i] != '.')) { RippedTaxName = baseTName[i] + RippedTaxName; } + else if (foundDot && (baseTName[i] == '.')) { break; } + else if (!foundDot && (baseTName[i] == '.')) { foundDot = true; } + } + //if (RippedTaxName != "") { RippedTaxName += "."; } + if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); } - string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "taxonomy"; + map variables; + variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])); + variables["[tag]"] = RippedTaxName; + variables["[tag2]"] = outputMethodTag; + string newTaxonomyFile = getOutputFileName("taxonomy", variables); + string newaccnosFile = getOutputFileName("accnos", variables); string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp"; - string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "tax.summary"; + string taxSummary = getOutputFileName("taxsummary", variables); if ((method == "knn") && (search == "distance")) { - string DistName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "match.dist"; + string DistName = getOutputFileName("matchdist", variables); classify->setDistName(DistName); outputNames.push_back(DistName); outputTypes["matchdist"].push_back(DistName); } @@ -518,7 +667,7 @@ int ClassifySeqsCommand::execute(){ #ifdef USE_MPI int pid, numSeqsPerProcessor; int tag = 2001; - vector MPIPos; + vector MPIPos; MPI_Status status; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are @@ -527,6 +676,7 @@ int ClassifySeqsCommand::execute(){ MPI_File inMPI; MPI_File outMPINewTax; MPI_File outMPITempTax; + MPI_File outMPIAcc; int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; int inMode=MPI_MODE_RDONLY; @@ -536,6 +686,9 @@ int ClassifySeqsCommand::execute(){ char outTempTax[1024]; strcpy(outTempTax, tempTaxonomyFile.c_str()); + + char outAcc[1024]; + strcpy(outAcc, newaccnosFile.c_str()); char inFileName[1024]; strcpy(inFileName, fastaFileNames[s].c_str()); @@ -543,8 +696,9 @@ int ClassifySeqsCommand::execute(){ MPI_File_open(MPI_COMM_WORLD, inFileName, inMode, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax); MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax); + MPI_File_open(MPI_COMM_WORLD, outAcc, outMode, MPI_INFO_NULL, &outMPIAcc); - if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPIAcc); MPI_File_close(&outMPITempTax); delete classify; return 0; } if (pid == 0) { //you are the root process @@ -563,9 +717,9 @@ int ClassifySeqsCommand::execute(){ //align your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos); + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, outMPIAcc, MPIPos); - if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPIAcc); MPI_File_close(&outMPITempTax); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } for (int i = 1; i < processors; i++) { int done; @@ -583,9 +737,9 @@ int ClassifySeqsCommand::execute(){ //align your part - driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos); + driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, outMPIAcc, MPIPos); - if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPIAcc); MPI_File_close(&outMPITempTax); delete classify; return 0; } int done = 0; MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); @@ -595,35 +749,47 @@ int ClassifySeqsCommand::execute(){ MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); + MPI_File_close(&outMPIAcc); MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else - vector positions = m->divideFile(fastaFileNames[s], processors); + vector positions; +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + positions = m->divideFile(fastaFileNames[s], processors); + for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(new linePair(positions[i], positions[(i+1)])); } +#else + if (processors == 1) { + lines.push_back(new linePair(0, 1000)); + }else { + positions = m->setFilePosFasta(fastaFileNames[s], numFastaSeqs); + if (positions.size() < processors) { processors = positions.size(); } - for (int i = 0; i < (positions.size()-1); i++) { - lines.push_back(new linePair(positions[i], positions[(i+1)])); - } - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - if(processors == 1){ - numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); + //figure out how many sequences you have to process + int numSeqsPerProcessor = numFastaSeqs / processors; + for (int i = 0; i < processors; i++) { + int startIndex = i * numSeqsPerProcessor; + if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; } + lines.push_back(new linePair(positions[startIndex], numSeqsPerProcessor)); + } } - else{ - processIDS.resize(0); - - numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); - +#endif + if(processors == 1){ + numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, newaccnosFile, fastaFileNames[s]); + }else{ + numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, newaccnosFile, fastaFileNames[s]); } - #else - numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); - #endif #endif + + if (!m->isBlank(newaccnosFile)) { m->mothurOutEndLine(); m->mothurOut("[WARNING]: mothur reversed some your sequences for a better classification. If you would like to take a closer look, please check " + newaccnosFile + " for the list of the sequences."); m->mothurOutEndLine(); + outputNames.push_back(newaccnosFile); outputTypes["accnos"].push_back(newaccnosFile); + }else { m->mothurRemove(newaccnosFile); } m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); start = time(NULL); - + + #ifdef USE_MPI if (pid == 0) { //this part does not need to be paralellized @@ -634,67 +800,64 @@ int ClassifySeqsCommand::execute(){ if(namefile != "") { m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush(); - nameMap.clear(); //remove old names - - ifstream inNames; - m->openInputFile(namefileNames[s], inNames); - - string firstCol, secondCol; - while(!inNames.eof()) { - inNames >> firstCol >> secondCol; m->gobble(inNames); - - vector temp; - m->splitAtComma(secondCol, temp); - - nameMap[firstCol] = temp; - } - inNames.close(); - + m->readNames(namefileNames[s], nameMap); m->mothurOut(" Done."); m->mothurOutEndLine(); } #endif - string group = ""; - if (groupfile != "") { group = groupfileNames[s]; } - - PhyloSummary taxaSum(baseTName, group); - - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } - - if (namefile == "") { taxaSum.summarize(tempTaxonomyFile); } - else { - ifstream in; - m->openInputFile(tempTaxonomyFile, in); - - //read in users taxonomy file and add sequences to tree - string name, taxon; - - while(!in.eof()){ - in >> name >> taxon; m->gobble(in); - - itNames = nameMap.find(name); - - if (itNames == nameMap.end()) { - m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1); - }else{ - for (int i = 0; i < itNames->second.size(); i++) { - taxaSum.addSeqToTree(itNames->second[i], taxon); //add it as many times as there are identical seqs - } - itNames->second.clear(); - nameMap.erase(itNames->first); - } - } - in.close(); - } + string group = ""; + GroupMap* groupMap = NULL; + CountTable* ct = NULL; + PhyloSummary* taxaSum; + if (hasCount) { + ct = new CountTable(); + ct->readTable(countfileNames[s], true, false); + taxaSum = new PhyloSummary(taxonomyFileName, ct); + taxaSum->summarize(tempTaxonomyFile); + }else { + if (groupfile != "") { group = groupfileNames[s]; groupMap = new GroupMap(group); groupMap->readMap(); } + + taxaSum = new PhyloSummary(taxonomyFileName, groupMap); + + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } + + if (namefile == "") { taxaSum->summarize(tempTaxonomyFile); } + else { + ifstream in; + m->openInputFile(tempTaxonomyFile, in); + + //read in users taxonomy file and add sequences to tree + string name, taxon; + + while(!in.eof()){ + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } + + in >> name >> taxon; m->gobble(in); + + itNames = nameMap.find(name); + + if (itNames == nameMap.end()) { + m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1); + }else{ + for (int i = 0; i < itNames->second.size(); i++) { + taxaSum->addSeqToTree(itNames->second[i], taxon); //add it as many times as there are identical seqs + } + itNames->second.clear(); + nameMap.erase(itNames->first); + } + } + in.close(); + } + } m->mothurRemove(tempTaxonomyFile); - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } //print summary file ofstream outTaxTree; m->openOutputFile(taxSummary, outTaxTree); - taxaSum.print(outTaxTree); + taxaSum->print(outTaxTree); outTaxTree.close(); //output taxonomy with the unclassified bins added @@ -706,12 +869,12 @@ int ClassifySeqsCommand::execute(){ m->openOutputFile(unclass, outTax); //get maxLevel from phylotree so you know how many 'unclassified's to add - int maxLevel = taxaSum.getMaxLevel(); + int maxLevel = taxaSum->getMaxLevel(); //read taxfile - this reading and rewriting is done to preserve the confidence scores. string name, taxon; while (!inTax.eof()) { - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } m->mothurRemove(unclass); delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); if (ct != NULL) { delete ct; } if (groupMap != NULL) { delete groupMap; } delete taxaSum; for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } m->mothurRemove(unclass); delete classify; return 0; } inTax >> name >> taxon; m->gobble(inTax); @@ -722,6 +885,8 @@ int ClassifySeqsCommand::execute(){ inTax.close(); outTax.close(); + if (ct != NULL) { delete ct; } + if (groupMap != NULL) { delete groupMap; } delete taxaSum; m->mothurRemove(newTaxonomyFile); rename(unclass.c_str(), newTaxonomyFile.c_str()); @@ -731,12 +896,13 @@ int ClassifySeqsCommand::execute(){ #ifdef USE_MPI } #endif - - m->mothurOutEndLine(); - m->mothurOut("Output File Names: "); m->mothurOutEndLine(); - for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } - m->mothurOutEndLine(); } + delete classify; + + m->mothurOutEndLine(); + m->mothurOut("Output File Names: "); m->mothurOutEndLine(); + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } + m->mothurOutEndLine(); //set taxonomy file as new current taxonomyfile string current = ""; @@ -745,7 +911,14 @@ int ClassifySeqsCommand::execute(){ if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } } - delete classify; + current = ""; + itTypes = outputTypes.find("accnos"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); } + } + + + return 0; } catch(exception& e) { @@ -785,11 +958,14 @@ string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) { /**************************************************************************************************/ -int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) { +int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string accnos, string filename) { try { -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 1; + int num = 0; + processIDS.clear(); + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + int process = 1; //loop through and create all the processes you want while (process != processors) { @@ -799,7 +975,7 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename); + num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", accnos + toString(getpid()) + ".temp", filename); //pass numSeqs to parent ofstream out; @@ -817,7 +993,7 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, } //parent does its part - num = driver(lines[0], taxFileName, tempTaxFile, filename); + num = driver(lines[0], taxFileName, tempTaxFile, accnos, filename); //force parent to wait until all the processes are done for (int i=0;i> tempNum; num += tempNum; } in.close(); m->mothurRemove(m->getFullPathName(tempFile)); } +#else + ////////////////////////////////////////////////////////////////////////////////////////////////////// + //Windows version shared memory, so be careful when passing variables through the alignData struct. + //Above fork() will clone, so memory is separate, but that's not the case with windows, + ////////////////////////////////////////////////////////////////////////////////////////////////////// + + vector pDataArray; + DWORD dwThreadIdArray[processors-1]; + HANDLE hThreadArray[processors-1]; + + //Create processor worker threads. + for( int i=0; istart, lines[i]->end, match, misMatch, gapOpen, gapExtend, cutoff, i, flip, writeShortcuts); + pDataArray.push_back(tempclass); + + //MySeqSumThreadFunction is in header. It must be global or static to work with the threads. + //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier + hThreadArray[i] = CreateThread(NULL, 0, MyClassThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); + + } + //parent does its part + num = driver(lines[processors-1], taxFileName + toString(processors-1) + ".temp", tempTaxFile + toString(processors-1) + ".temp", accnos + toString(processors-1) + ".temp", filename); + processIDS.push_back((processors-1)); + + //Wait until all threads have terminated. + WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); + + //Close all thread handles and free memory allocations. + for(int i=0; i < pDataArray.size(); i++){ + num += pDataArray[i]->count; + if (pDataArray[i]->count != pDataArray[i]->end) { + m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->end) + " sequences assigned to it, quitting. \n"); m->control_pressed = true; + } + CloseHandle(hThreadArray[i]); + delete pDataArray[i]; + } + + #endif + vector nonBlankAccnosFiles; + if (!(m->isBlank(accnos))) { nonBlankAccnosFiles.push_back(accnos); } + else { m->mothurRemove(accnos); } //remove so other files can be renamed to it + for(int i=0;iappendFiles((taxFileName + toString(processIDS[i]) + ".temp"), taxFileName); + m->appendFiles((tempTaxFile + toString(processIDS[i]) + ".temp"), tempTaxFile); + if (!(m->isBlank(accnos + toString(processIDS[i]) + ".temp"))) { + nonBlankAccnosFiles.push_back(accnos + toString(processIDS[i]) + ".temp"); + }else { m->mothurRemove((accnos + toString(processIDS[i]) + ".temp")); } + m->mothurRemove((m->getFullPathName(taxFileName) + toString(processIDS[i]) + ".temp")); m->mothurRemove((m->getFullPathName(tempTaxFile) + toString(processIDS[i]) + ".temp")); } - return num; -#endif - } - catch(exception& e) { - m->errorOut(e, "ClassifySeqsCommand", "createProcesses"); - exit(1); - } -} -/**************************************************************************************************/ - -void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) { - try{ - - ofstream output; - ifstream input; - m->openOutputFileAppend(filename, output); - m->openInputFile(temp, input); - - while(char c = input.get()){ - if(input.eof()) { break; } - else { output << c; } + //append accnos files + if (nonBlankAccnosFiles.size() != 0) { + rename(nonBlankAccnosFiles[0].c_str(), accnos.c_str()); + + for (int h=1; h < nonBlankAccnosFiles.size(); h++) { + m->appendFiles(nonBlankAccnosFiles[h], accnos); + m->mothurRemove(nonBlankAccnosFiles[h]); + } + }else { //recreate the accnosfile if needed + ofstream out; + m->openOutputFile(accnos, out); + out.close(); } + + return num; - input.close(); - output.close(); } catch(exception& e) { - m->errorOut(e, "ClassifySeqsCommand", "appendTaxFiles"); + m->errorOut(e, "ClassifySeqsCommand", "createProcesses"); exit(1); } } - //********************************************************************************************************************** -int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempTFName, string filename){ +int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempTFName, string accnos, string filename){ try { ofstream outTax; m->openOutputFile(taxFName, outTax); ofstream outTaxSimple; m->openOutputFile(tempTFName, outTaxSimple); + + ofstream outAcc; + m->openOutputFile(accnos, outAcc); ifstream inFASTA; m->openInputFile(filename, inFASTA); @@ -893,7 +1112,11 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT int count = 0; while (!done) { - if (m->control_pressed) { return 0; } + if (m->control_pressed) { + inFASTA.close(); + outTax.close(); + outTaxSimple.close(); + outAcc.close(); return 0; } Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA); @@ -902,38 +1125,42 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT taxonomy = classify->getTaxonomy(candidateSeq); if (m->control_pressed) { delete candidateSeq; return 0; } - - if (taxonomy != "bad seq") { - //output confidence scores or not - if (probs) { - outTax << candidateSeq->getName() << '\t' << taxonomy << endl; - }else{ - outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; - } - - outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; + + if (taxonomy == "unknown;") { m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences."); m->mothurOutEndLine(); } + + //output confidence scores or not + if (probs) { + outTax << candidateSeq->getName() << '\t' << taxonomy << endl; + }else{ + outTax << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; } + + if (classify->getFlipped()) { outAcc << candidateSeq->getName() << endl; } + + outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; + count++; } delete candidateSeq; - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - unsigned long int pos = inFASTA.tellg(); + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) + unsigned long long pos = inFASTA.tellg(); if ((pos == -1) || (pos >= filePos->end)) { break; } #else if (inFASTA.eof()) { break; } #endif //report progress - if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + if((count) % 100 == 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count) +"\n"); } } //report progress - if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + if((count) % 100 != 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n"); } inFASTA.close(); outTax.close(); outTaxSimple.close(); + outAcc.close(); return count; } @@ -944,10 +1171,11 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT } //********************************************************************************************************************** #ifdef USE_MPI -int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector& MPIPos){ +int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, MPI_File& accFile, vector& MPIPos){ try { MPI_Status statusNew; MPI_Status statusTemp; + MPI_Status statusAcc; MPI_Status status; int pid; @@ -975,29 +1203,40 @@ int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File if (candidateSeq->getName() != "") { taxonomy = classify->getTaxonomy(candidateSeq); - if (taxonomy != "bad seq") { - //output confidence scores or not - if (probs) { - outputString = candidateSeq->getName() + "\t" + taxonomy + "\n"; - }else{ - outputString = candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n"; - } - - int length = outputString.length(); - char* buf2 = new char[length]; - memcpy(buf2, outputString.c_str(), length); + if (taxonomy == "unknown;") { m->mothurOut("[WARNING]: " + candidateSeq->getName() + " could not be classified. You can use the remove.lineage command with taxon=unknown; to remove such sequences."); m->mothurOutEndLine(); } - MPI_File_write_shared(newFile, buf2, length, MPI_CHAR, &statusNew); - delete buf2; - + //output confidence scores or not + if (probs) { + outputString = candidateSeq->getName() + "\t" + taxonomy + "\n"; + }else{ outputString = candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n"; - length = outputString.length(); - char* buf = new char[length]; - memcpy(buf, outputString.c_str(), length); + } + + int length = outputString.length(); + char* buf2 = new char[length]; + memcpy(buf2, outputString.c_str(), length); - MPI_File_write_shared(tempFile, buf, length, MPI_CHAR, &statusTemp); - delete buf; + MPI_File_write_shared(newFile, buf2, length, MPI_CHAR, &statusNew); + delete buf2; + + outputString = candidateSeq->getName() + "\t" + classify->getSimpleTax() + "\n"; + length = outputString.length(); + char* buf = new char[length]; + memcpy(buf, outputString.c_str(), length); + + MPI_File_write_shared(tempFile, buf, length, MPI_CHAR, &statusTemp); + delete buf; + + if (classify->getFlipped()) { + outputString = candidateSeq->getName() + "\n"; + length = outputString.length(); + char* buf3 = new char[length]; + memcpy(buf3, outputString.c_str(), length); + + MPI_File_write_shared(accFile, buf3, length, MPI_CHAR, &statusAcc); + delete buf3; } + } delete candidateSeq;