X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=classifyseqscommand.cpp;h=4291132c74ea9537e9abfdea7779ac404d636c2a;hb=55386dddad84cc1140d736cabaf4dd0ae16f2e01;hp=1619656e792cfa14ea1bec54d558cf560a61861d;hpb=1d898dc6edaf9e9f287fab53bf1f21fb29757a17;p=mothur.git diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 1619656..4291132 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -15,6 +15,7 @@ #include "knn.h" + //********************************************************************************************************************** vector ClassifySeqsCommand::setParameters(){ try { @@ -34,6 +35,7 @@ vector ClassifySeqsCommand::setParameters(){ CommandParameter pcutoff("cutoff", "Number", "", "0", "", "", "",false,true); parameters.push_back(pcutoff); CommandParameter pprobs("probs", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pprobs); CommandParameter piters("iters", "Number", "", "100", "", "", "",false,true); parameters.push_back(piters); + CommandParameter psave("save", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(psave); CommandParameter pnumwanted("numwanted", "Number", "", "10", "", "", "",false,true); parameters.push_back(pnumwanted); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); @@ -63,6 +65,7 @@ string ClassifySeqsCommand::getHelpString(){ #ifdef USE_MPI helpString += "When using MPI, the processors parameter is set to the number of MPI processes running. \n"; #endif + helpString += "If the save parameter is set to true the reference sequences will be saved in memory, to clear them later you can use the clear.memory command. Default=f."; helpString += "The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n"; helpString += "The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"; helpString += "The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -2.0.\n"; @@ -103,9 +106,11 @@ ClassifySeqsCommand::ClassifySeqsCommand(){ ClassifySeqsCommand::ClassifySeqsCommand(string option) { try { abort = false; calledHelp = false; + rdb = ReferenceDB::getInstance(); //allow user to run help if(option == "help") { help(); abort = true; calledHelp = true; } + else if(option == "citation") { citation(); abort = true; calledHelp = true;} else { vector myArray = setParameters(); @@ -160,16 +165,6 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { } } - //check for required parameters - templateFileName = validParameter.validFile(parameters, "reference", true); - if (templateFileName == "not found") { - m->mothurOut("reference is a required parameter for the classify.seqs command."); - m->mothurOutEndLine(); - abort = true; - } - else if (templateFileName == "not open") { abort = true; } - - fastaFileName = validParameter.validFile(parameters, "fasta", false); if (fastaFileName == "not found") { //if there is a current fasta file, use it @@ -182,47 +177,65 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { //go through files and make sure they are good, if not, then disregard them for (int i = 0; i < fastaFileNames.size(); i++) { - if (inputDir != "") { - string path = m->hasPath(fastaFileNames[i]); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { fastaFileNames[i] = inputDir + fastaFileNames[i]; } - } - int ableToOpen; - - ifstream in; - ableToOpen = m->openInputFile(fastaFileNames[i], in, "noerror"); - - //if you can't open it, try default location - if (ableToOpen == 1) { - if (m->getDefaultPath() != "") { //default path is set - string tryPath = m->getDefaultPath() + m->getSimpleName(fastaFileNames[i]); - m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); - ifstream in2; - ableToOpen = m->openInputFile(tryPath, in2, "noerror"); - in2.close(); - fastaFileNames[i] = tryPath; + bool ignore = false; + if (fastaFileNames[i] == "current") { + fastaFileNames[i] = m->getFastaFile(); + if (fastaFileNames[i] != "") { m->mothurOut("Using " + fastaFileNames[i] + " as input file for the fasta parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current fastafile, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + fastaFileNames.erase(fastaFileNames.begin()+i); + i--; } } - if (ableToOpen == 1) { - if (m->getOutputDir() != "") { //default path is set - string tryPath = m->getOutputDir() + m->getSimpleName(fastaFileNames[i]); - m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); - ifstream in2; - ableToOpen = m->openInputFile(tryPath, in2, "noerror"); - in2.close(); - fastaFileNames[i] = tryPath; + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(fastaFileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { fastaFileNames[i] = inputDir + fastaFileNames[i]; } } - } - - in.close(); + + int ableToOpen; + + ifstream in; + ableToOpen = m->openInputFile(fastaFileNames[i], in, "noerror"); - if (ableToOpen == 1) { - m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); - //erase from file list - fastaFileNames.erase(fastaFileNames.begin()+i); - i--; + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(fastaFileNames[i]); + m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + fastaFileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(fastaFileNames[i]); + m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + fastaFileNames[i] = tryPath; + } + } + + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); + //erase from file list + fastaFileNames.erase(fastaFileNames.begin()+i); + i--; + }else { + m->setFastaFile(fastaFileNames[i]); + } } } @@ -231,16 +244,6 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { if (fastaFileNames.size() == 0) { m->mothurOut("no valid files."); m->mothurOutEndLine(); abort = true; } } - - taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true); - if (taxonomyFileName == "not found") { - m->mothurOut("taxonomy is a required parameter for the classify.seqs command."); - m->mothurOutEndLine(); - abort = true; - } - else if (taxonomyFileName == "not open") { abort = true; } - - namefile = validParameter.validFile(parameters, "name", false); if (namefile == "not found") { namefile = ""; } @@ -249,47 +252,63 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { //go through files and make sure they are good, if not, then disregard them for (int i = 0; i < namefileNames.size(); i++) { - if (inputDir != "") { - string path = m->hasPath(namefileNames[i]); - //if the user has not given a path then, add inputdir. else leave path alone. - if (path == "") { namefileNames[i] = inputDir + namefileNames[i]; } - } - int ableToOpen; - - ifstream in; - ableToOpen = m->openInputFile(namefileNames[i], in, "noerror"); - - //if you can't open it, try default location - if (ableToOpen == 1) { - if (m->getDefaultPath() != "") { //default path is set - string tryPath = m->getDefaultPath() + m->getSimpleName(namefileNames[i]); - m->mothurOut("Unable to open " + namefileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); - ifstream in2; - ableToOpen = m->openInputFile(tryPath, in2, "noerror"); - in2.close(); - namefileNames[i] = tryPath; + bool ignore = false; + if (namefileNames[i] == "current") { + namefileNames[i] = m->getNameFile(); + if (namefileNames[i] != "") { m->mothurOut("Using " + namefileNames[i] + " as input file for the name parameter where you had given current."); m->mothurOutEndLine(); } + else { + m->mothurOut("You have no current namefile, ignoring current."); m->mothurOutEndLine(); ignore=true; + //erase from file list + namefileNames.erase(namefileNames.begin()+i); + i--; } } - if (ableToOpen == 1) { - if (m->getOutputDir() != "") { //default path is set - string tryPath = m->getOutputDir() + m->getSimpleName(namefileNames[i]); - m->mothurOut("Unable to open " + namefileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); - ifstream in2; - ableToOpen = m->openInputFile(tryPath, in2, "noerror"); - in2.close(); - namefileNames[i] = tryPath; + if (!ignore) { + + if (inputDir != "") { + string path = m->hasPath(namefileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { namefileNames[i] = inputDir + namefileNames[i]; } } - } - in.close(); + int ableToOpen; + + ifstream in; + ableToOpen = m->openInputFile(namefileNames[i], in, "noerror"); - if (ableToOpen == 1) { - m->mothurOut("Unable to open " + namefileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); abort = true; - //erase from file list - namefileNames.erase(namefileNames.begin()+i); - i--; + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(namefileNames[i]); + m->mothurOut("Unable to open " + namefileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + namefileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(namefileNames[i]); + m->mothurOut("Unable to open " + namefileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + namefileNames[i] = tryPath; + } + } + in.close(); + + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + namefileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); abort = true; + //erase from file list + namefileNames.erase(namefileNames.begin()+i); + i--; + }else { + m->setNameFile(namefileNames[i]); + } } - } } @@ -344,6 +363,8 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { //erase from file list groupfileNames.erase(groupfileNames.begin()+i); i--; + }else { + m->setGroupFile(groupfileNames[i]); } } } @@ -360,6 +381,41 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ temp = "8"; } convert(temp, kmerSize); + temp = validParameter.validFile(parameters, "save", false); if (temp == "not found"){ temp = "f"; } + save = m->isTrue(temp); + rdb->save = save; + if (save) { //clear out old references + rdb->clearMemory(); + } + + //this has to go after save so that if the user sets save=t and provides no reference we abort + templateFileName = validParameter.validFile(parameters, "reference", true); + if (templateFileName == "not found") { + //check for saved reference sequences + if (rdb->referenceSeqs.size() != 0) { + templateFileName = "saved"; + }else { + m->mothurOut("[ERROR]: You don't have any saved reference sequences and the reference parameter is a required for the classify.seqs command."); + m->mothurOutEndLine(); + abort = true; + } + }else if (templateFileName == "not open") { abort = true; } + else { if (save) { rdb->setSavedReference(templateFileName); } } + + //this has to go after save so that if the user sets save=t and provides no reference we abort + taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true); + if (taxonomyFileName == "not found") { + //check for saved reference sequences + if (rdb->wordGenusProb.size() != 0) { + taxonomyFileName = "saved"; + }else { + m->mothurOut("[ERROR]: You don't have any saved taxonomy information and the taxonomy parameter is a required for the classify.seqs command."); + m->mothurOutEndLine(); + abort = true; + } + }else if (taxonomyFileName == "not open") { abort = true; } + else { if (save) { rdb->setSavedTaxonomy(taxonomyFileName); } } + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); } m->setProcessors(temp); convert(temp, processors); @@ -419,12 +475,12 @@ int ClassifySeqsCommand::execute(){ try { if (abort == true) { if (calledHelp) { return 0; } return 2; } - if(method == "bayesian"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters); } - else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted); } + if(method == "bayesian"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand()); } + else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted, rand()); } else { m->mothurOut(search + " is not a valid method option. I will run the command using bayesian."); m->mothurOutEndLine(); - classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters); + classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters, rand()); } if (m->control_pressed) { delete classify; return 0; } @@ -434,7 +490,10 @@ int ClassifySeqsCommand::execute(){ m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine(); - string RippedTaxName = m->getRootName(m->getSimpleName(taxonomyFileName)); + string baseTName = taxonomyFileName; + if (taxonomyFileName == "saved") {baseTName = rdb->getSavedTaxonomy(); } + + string RippedTaxName = m->getRootName(m->getSimpleName(baseTName)); RippedTaxName = m->getExtension(RippedTaxName.substr(0, RippedTaxName.length()-1)); if (RippedTaxName[0] == '.') { RippedTaxName = RippedTaxName.substr(1, RippedTaxName.length()); } RippedTaxName += "."; @@ -506,7 +565,7 @@ int ClassifySeqsCommand::execute(){ //align your part driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos); - if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } for (int i = 1; i < processors; i++) { int done; @@ -599,9 +658,9 @@ int ClassifySeqsCommand::execute(){ string group = ""; if (groupfile != "") { group = groupfileNames[s]; } - PhyloSummary taxaSum(taxonomyFileName, group); + PhyloSummary taxaSum(baseTName, group); - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } if (namefile == "") { taxaSum.summarize(tempTaxonomyFile); } else { @@ -628,9 +687,9 @@ int ClassifySeqsCommand::execute(){ } in.close(); } - remove(tempTaxonomyFile.c_str()); + m->mothurRemove(tempTaxonomyFile); - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } delete classify; return 0; } //print summary file ofstream outTaxTree; @@ -652,7 +711,7 @@ int ClassifySeqsCommand::execute(){ //read taxfile - this reading and rewriting is done to preserve the confidence scores. string name, taxon; while (!inTax.eof()) { - if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } remove(unclass.c_str()); delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } m->mothurRemove(unclass); delete classify; return 0; } inTax >> name >> taxon; m->gobble(inTax); @@ -663,7 +722,7 @@ int ClassifySeqsCommand::execute(){ inTax.close(); outTax.close(); - remove(newTaxonomyFile.c_str()); + m->mothurRemove(newTaxonomyFile); rename(unclass.c_str(), newTaxonomyFile.c_str()); m->mothurOutEndLine(); @@ -741,7 +800,7 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, process++; }else if (pid == 0){ num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename); - + //pass numSeqs to parent ofstream out; string tempFile = filename + toString(getpid()) + ".num.temp"; @@ -771,14 +830,14 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string tempFile = filename + toString(processIDS[i]) + ".num.temp"; m->openInputFile(tempFile, in); if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } - in.close(); remove(tempFile.c_str()); + in.close(); m->mothurRemove(m->getFullPathName(tempFile)); } for(int i=0;imothurRemove((m->getFullPathName(taxFileName) + toString(processIDS[i]) + ".temp")); + m->mothurRemove((m->getFullPathName(tempTaxFile) + toString(processIDS[i]) + ".temp")); } return num;