X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=classifyseqscommand.cpp;h=6cfa3321c1cfd9b14cc3bf4b86d400c46feecb3c;hb=348de0f8b17d84ede77081dcf67bd6ef43496677;hp=33f49e5fc4b322301c4620cf4c9263fe751ab259;hpb=8c616f2509abd2fb9485a38607c1b439d243b85c;p=mothur.git diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 33f49e5..6cfa332 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -14,14 +14,63 @@ #include "phylosummary.h" #include "knn.h" -//********************************************************************************************************************** +//********************************************************************************************************************** +vector ClassifySeqsCommand::getValidParameters(){ + try { + string AlignArray[] = {"template","fasta","name","group","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters", "outputdir","inputdir"}; + vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); + return myArray; + } + catch(exception& e) { + m->errorOut(e, "ClassifySeqsCommand", "getValidParameters"); + exit(1); + } +} +//********************************************************************************************************************** +ClassifySeqsCommand::ClassifySeqsCommand(){ + try { + abort = true; calledHelp = true; + vector tempOutNames; + outputTypes["taxonomy"] = tempOutNames; + outputTypes["taxsummary"] = tempOutNames; + outputTypes["matchdist"] = tempOutNames; + } + catch(exception& e) { + m->errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand"); + exit(1); + } +} +//********************************************************************************************************************** +vector ClassifySeqsCommand::getRequiredParameters(){ + try { + string Array[] = {"fasta","template","taxonomy"}; + vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); + return myArray; + } + catch(exception& e) { + m->errorOut(e, "ClassifySeqsCommand", "getRequiredParameters"); + exit(1); + } +} +//********************************************************************************************************************** +vector ClassifySeqsCommand::getRequiredFiles(){ + try { + vector myArray; + return myArray; + } + catch(exception& e) { + m->errorOut(e, "ClassifySeqsCommand", "getRequiredFiles"); + exit(1); + } +} +//********************************************************************************************************************** ClassifySeqsCommand::ClassifySeqsCommand(string option) { try { - abort = false; + abort = false; calledHelp = false; //allow user to run help - if(option == "help") { help(); abort = true; } + if(option == "help") { help(); abort = true; calledHelp = true; } else { @@ -40,6 +89,12 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } } + //initialize outputTypes + vector tempOutNames; + outputTypes["taxonomy"] = tempOutNames; + outputTypes["taxsummary"] = tempOutNames; + outputTypes["matchdist"] = tempOutNames; + //if the user changes the output directory command factory will send this info to us in the output parameter outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; } @@ -106,10 +161,24 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { if (m->getDefaultPath() != "") { //default path is set string tryPath = m->getDefaultPath() + m->getSimpleName(fastaFileNames[i]); m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); - ableToOpen = m->openInputFile(tryPath, in, "noerror"); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + fastaFileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(fastaFileNames[i]); + m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); fastaFileNames[i] = tryPath; } } + in.close(); if (ableToOpen == 1) { @@ -158,7 +227,20 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { if (m->getDefaultPath() != "") { //default path is set string tryPath = m->getDefaultPath() + m->getSimpleName(namefileNames[i]); m->mothurOut("Unable to open " + namefileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); - ableToOpen = m->openInputFile(tryPath, in, "noerror"); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + namefileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(namefileNames[i]); + m->mothurOut("Unable to open " + namefileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); namefileNames[i] = tryPath; } } @@ -200,10 +282,24 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { if (m->getDefaultPath() != "") { //default path is set string tryPath = m->getDefaultPath() + m->getSimpleName(groupfileNames[i]); m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); - ableToOpen = m->openInputFile(tryPath, in, "noerror"); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); + groupfileNames[i] = tryPath; + } + } + + if (ableToOpen == 1) { + if (m->getOutputDir() != "") { //default path is set + string tryPath = m->getOutputDir() + m->getSimpleName(groupfileNames[i]); + m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying output directory " + tryPath); m->mothurOutEndLine(); + ifstream in2; + ableToOpen = m->openInputFile(tryPath, in2, "noerror"); + in2.close(); groupfileNames[i] = tryPath; } } + in.close(); if (ableToOpen == 1) { @@ -324,7 +420,7 @@ void ClassifySeqsCommand::help(){ int ClassifySeqsCommand::execute(){ try { - if (abort == true) { return 0; } + if (abort == true) { if (calledHelp) { return 0; } return 2; } if(method == "bayesian"){ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff, iters); } else if(method == "knn"){ classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted); } @@ -336,7 +432,6 @@ int ClassifySeqsCommand::execute(){ if (m->control_pressed) { delete classify; return 0; } - vector outputNames; for (int s = 0; s < fastaFileNames.size(); s++) { @@ -354,18 +449,18 @@ int ClassifySeqsCommand::execute(){ if ((method == "knn") && (search == "distance")) { string DistName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "match.dist"; - classify->setDistName(DistName); outputNames.push_back(DistName); + classify->setDistName(DistName); outputNames.push_back(DistName); outputTypes["matchdist"].push_back(DistName); } - outputNames.push_back(newTaxonomyFile); - outputNames.push_back(taxSummary); + outputNames.push_back(newTaxonomyFile); outputTypes["taxonomy"].push_back(newTaxonomyFile); + outputNames.push_back(taxSummary); outputTypes["taxsummary"].push_back(taxSummary); int start = time(NULL); int numFastaSeqs = 0; for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); #ifdef USE_MPI - int pid, end, numSeqsPerProcessor; + int pid, numSeqsPerProcessor; int tag = 2001; vector MPIPos; @@ -380,20 +475,11 @@ int ClassifySeqsCommand::execute(){ int outMode=MPI_MODE_CREATE|MPI_MODE_WRONLY; int inMode=MPI_MODE_RDONLY; - //char* outNewTax = new char[newTaxonomyFile.length()]; - //memcpy(outNewTax, newTaxonomyFile.c_str(), newTaxonomyFile.length()); - char outNewTax[1024]; strcpy(outNewTax, newTaxonomyFile.c_str()); - - //char* outTempTax = new char[tempTaxonomyFile.length()]; - //memcpy(outTempTax, tempTaxonomyFile.c_str(), tempTaxonomyFile.length()); char outTempTax[1024]; strcpy(outTempTax, tempTaxonomyFile.c_str()); - - //char* inFileName = new char[fastaFileNames[s].length()]; - //memcpy(inFileName, fastaFileNames[s].c_str(), fastaFileNames[s].length()); char inFileName[1024]; strcpy(inFileName, fastaFileNames[s].c_str()); @@ -402,11 +488,7 @@ int ClassifySeqsCommand::execute(){ MPI_File_open(MPI_COMM_WORLD, outNewTax, outMode, MPI_INFO_NULL, &outMPINewTax); MPI_File_open(MPI_COMM_WORLD, outTempTax, outMode, MPI_INFO_NULL, &outMPITempTax); - //delete outNewTax; - //delete outTempTax; - //delete inFileName; - - if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } if (pid == 0) { //you are the root process @@ -427,7 +509,7 @@ int ClassifySeqsCommand::execute(){ //align your part driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos); - if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } for (int i = 1; i < processors; i++) { int done; @@ -447,7 +529,7 @@ int ClassifySeqsCommand::execute(){ //align your part driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos); - if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } int done = 0; MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD); @@ -476,16 +558,6 @@ int ClassifySeqsCommand::execute(){ numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); - rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str()); - rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str()); - - for(int i=1;icontrol_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } if (namefile == "") { taxaSum.summarize(tempTaxonomyFile); } else { @@ -561,7 +633,7 @@ int ClassifySeqsCommand::execute(){ } remove(tempTaxonomyFile.c_str()); - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } //print summary file ofstream outTaxTree; @@ -579,11 +651,11 @@ int ClassifySeqsCommand::execute(){ //get maxLevel from phylotree so you know how many 'unclassified's to add int maxLevel = taxaSum.getMaxLevel(); - + //read taxfile - this reading and rewriting is done to preserve the confidence scores. string name, taxon; while (!inTax.eof()) { - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } remove(unclass.c_str()); delete classify; return 0; } + if (m->control_pressed) { outputTypes.clear(); for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } remove(unclass.c_str()); delete classify; return 0; } inTax >> name >> taxon; m->gobble(inTax); @@ -610,6 +682,13 @@ int ClassifySeqsCommand::execute(){ m->mothurOutEndLine(); } + //set taxonomy file as new current taxonomyfile + string current = ""; + itTypes = outputTypes.find("taxonomy"); + if (itTypes != outputTypes.end()) { + if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); } + } + delete classify; return 0; } @@ -653,7 +732,7 @@ string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) { int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) - int process = 0; + int process = 1; int num = 0; //loop through and create all the processes you want @@ -674,11 +753,18 @@ int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, out.close(); exit(0); - }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } + }else { + m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); + for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); } + exit(0); + } } + //parent does its part + num = driver(lines[0], taxFileName, tempTaxFile, filename); + //force parent to wait until all the processes are done - for (int i=0;icontrol_pressed) { return 0; } Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA); - + if (candidateSeq->getName() != "") { + taxonomy = classify->getTaxonomy(candidateSeq); if (m->control_pressed) { delete candidateSeq; return 0; } @@ -776,10 +870,11 @@ int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempT //report progress if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + } //report progress if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } - + inFASTA.close(); outTax.close(); outTaxSimple.close();