X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=classifyseqscommand.cpp;h=e934d9d6d114a18759ec317185cd01085ac72052;hb=a98eb683e17d8e49583bf2d215ab7562a4cdca75;hp=2a210b01ad3a8fb0812c1877595028a851bdbee7;hpb=956cdff34f2d609a7736838b1631cd7957580b8b;p=mothur.git diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 2a210b0..e934d9d 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -32,7 +32,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { OptionParser parser(option); map parameters = parser.getParameters(); - ValidParameters validParameter; + ValidParameters validParameter("classify.seqs"); map::iterator it; //check to make sure all parameters are valid for command @@ -51,7 +51,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { it = parameters.find("template"); //user has given a template file if(it != parameters.end()){ - path = hasPath(it->second); + path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["template"] = inputDir + it->second; } } @@ -59,7 +59,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { it = parameters.find("taxonomy"); //user has given a template file if(it != parameters.end()){ - path = hasPath(it->second); + path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } @@ -67,7 +67,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { it = parameters.find("group"); //user has given a template file if(it != parameters.end()){ - path = hasPath(it->second); + path = m->hasPath(it->second); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } @@ -86,43 +86,34 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { fastaFileName = validParameter.validFile(parameters, "fasta", false); if (fastaFileName == "not found") { m->mothurOut("fasta is a required parameter for the classify.seqs command."); m->mothurOutEndLine(); abort = true; } else { - splitAtDash(fastaFileName, fastaFileNames); + m->splitAtDash(fastaFileName, fastaFileNames); //go through files and make sure they are good, if not, then disregard them for (int i = 0; i < fastaFileNames.size(); i++) { if (inputDir != "") { - string path = hasPath(fastaFileNames[i]); + string path = m->hasPath(fastaFileNames[i]); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { fastaFileNames[i] = inputDir + fastaFileNames[i]; } } int ableToOpen; - #ifdef USE_MPI - int pid; - MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - - if (pid == 0) { - #endif - ifstream in; - ableToOpen = openInputFile(fastaFileNames[i], in); - in.close(); - - #ifdef USE_MPI - for (int j = 1; j < processors; j++) { - MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD); - } - }else{ - MPI_Status status; - MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + ableToOpen = m->openInputFile(fastaFileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(fastaFileNames[i]); + m->mothurOut("Unable to open " + fastaFileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ableToOpen = m->openInputFile(tryPath, in, "noerror"); + fastaFileNames[i] = tryPath; } - - #endif + } + in.close(); if (ableToOpen == 1) { - m->mothurOut(fastaFileNames[i] + " will be disregarded."); m->mothurOutEndLine(); + m->mothurOut("Unable to open " + fastaFileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); //erase from file list fastaFileNames.erase(fastaFileNames.begin()+i); i--; @@ -148,41 +139,38 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { if (namefile == "not found") { namefile = ""; } else { - splitAtDash(namefile, namefileNames); + m->splitAtDash(namefile, namefileNames); //go through files and make sure they are good, if not, then disregard them for (int i = 0; i < namefileNames.size(); i++) { if (inputDir != "") { - string path = hasPath(namefileNames[i]); + string path = m->hasPath(namefileNames[i]); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { namefileNames[i] = inputDir + namefileNames[i]; } } int ableToOpen; - #ifdef USE_MPI - int pid; - MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - - if (pid == 0) { - #endif - ifstream in; - ableToOpen = openInputFile(namefileNames[i], in); - in.close(); - - #ifdef USE_MPI - for (int j = 1; j < processors; j++) { - MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD); - } - }else{ - MPI_Status status; - MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + ableToOpen = m->openInputFile(namefileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(namefileNames[i]); + m->mothurOut("Unable to open " + namefileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ableToOpen = m->openInputFile(tryPath, in, "noerror"); + namefileNames[i] = tryPath; } - - #endif - if (ableToOpen == 1) { m->mothurOut("Unable to match name file with fasta file."); m->mothurOutEndLine(); abort = true; } + } + in.close(); + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + namefileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); abort = true; + //erase from file list + namefileNames.erase(namefileNames.begin()+i); + i--; + } + } } @@ -193,41 +181,37 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { groupfile = validParameter.validFile(parameters, "group", false); if (groupfile == "not found") { groupfile = ""; } else { - splitAtDash(groupfile, groupfileNames); + m->splitAtDash(groupfile, groupfileNames); //go through files and make sure they are good, if not, then disregard them for (int i = 0; i < groupfileNames.size(); i++) { if (inputDir != "") { - string path = hasPath(groupfileNames[i]); + string path = m->hasPath(groupfileNames[i]); //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { groupfileNames[i] = inputDir + groupfileNames[i]; } } int ableToOpen; - #ifdef USE_MPI - int pid; - MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - - if (pid == 0) { - #endif - ifstream in; - ableToOpen = openInputFile(groupfileNames[i], in); - in.close(); - - #ifdef USE_MPI - for (int j = 1; j < processors; j++) { - MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD); - } - }else{ - MPI_Status status; - MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + ableToOpen = m->openInputFile(groupfileNames[i], in, "noerror"); + + //if you can't open it, try default location + if (ableToOpen == 1) { + if (m->getDefaultPath() != "") { //default path is set + string tryPath = m->getDefaultPath() + m->getSimpleName(groupfileNames[i]); + m->mothurOut("Unable to open " + groupfileNames[i] + ". Trying default " + tryPath); m->mothurOutEndLine(); + ableToOpen = m->openInputFile(tryPath, in, "noerror"); + groupfileNames[i] = tryPath; } - - #endif - if (ableToOpen == 1) { m->mothurOut("Unable to match group file with fasta file, not using " + groupfileNames[i] + "."); m->mothurOutEndLine(); groupfileNames[i] = ""; } + } + in.close(); + if (ableToOpen == 1) { + m->mothurOut("Unable to open " + groupfileNames[i] + ". It will be disregarded."); m->mothurOutEndLine(); groupfileNames[i] = ""; + //erase from file list + groupfileNames.erase(groupfileNames.begin()+i); + i--; + } } } @@ -269,7 +253,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { convert(temp, cutoff); temp = validParameter.validFile(parameters, "probs", false); if (temp == "not found"){ temp = "true"; } - probs = isTrue(temp); + probs = m->isTrue(temp); temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "100"; } convert(temp, iters); @@ -358,10 +342,20 @@ int ClassifySeqsCommand::execute(){ m->mothurOut("Classifying sequences from " + fastaFileNames[s] + " ..." ); m->mothurOutEndLine(); - if (outputDir == "") { outputDir += hasPath(fastaFileNames[s]); } - string newTaxonomyFile = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + getRootName(getSimpleName(taxonomyFileName)) + "taxonomy"; - string tempTaxonomyFile = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + "taxonomy.temp"; - string taxSummary = outputDir + getRootName(getSimpleName(fastaFileNames[s])) + getRootName(getSimpleName(taxonomyFileName)) + "tax.summary"; + string RippedTaxName = m->getRootName(m->getSimpleName(taxonomyFileName)); + RippedTaxName = m->getExtension(RippedTaxName.substr(0, RippedTaxName.length()-1)); + if (RippedTaxName[0] == '.') { RippedTaxName = RippedTaxName.substr(1, RippedTaxName.length()); } + RippedTaxName += "."; + + if (outputDir == "") { outputDir += m->hasPath(fastaFileNames[s]); } + string newTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "taxonomy"; + string tempTaxonomyFile = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "taxonomy.temp"; + string taxSummary = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + RippedTaxName + "tax.summary"; + + if ((method == "knn") && (search == "distance")) { + string DistName = outputDir + m->getRootName(m->getSimpleName(fastaFileNames[s])) + "match.dist"; + classify->setDistName(DistName); outputNames.push_back(DistName); + } outputNames.push_back(newTaxonomyFile); outputNames.push_back(taxSummary); @@ -373,7 +367,7 @@ int ClassifySeqsCommand::execute(){ #ifdef USE_MPI int pid, end, numSeqsPerProcessor; int tag = 2001; - vector MPIPos; + vector MPIPos; MPI_Status status; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are @@ -416,7 +410,7 @@ int ClassifySeqsCommand::execute(){ if (pid == 0) { //you are the root process - MPIPos = setFilePosFasta(fastaFileNames[s], numFastaSeqs); //fills MPIPos, returns numSeqs + MPIPos = m->setFilePosFasta(fastaFileNames[s], numFastaSeqs); //fills MPIPos, returns numSeqs //send file positions to all processes for(int i = 1; i < processors; i++) { @@ -466,45 +460,21 @@ int ClassifySeqsCommand::execute(){ MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else + + vector positions = m->divideFile(fastaFileNames[s], processors); + + for (int i = 0; i < (positions.size()-1); i++) { + lines.push_back(new linePair(positions[i], positions[(i+1)])); + } + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - getNumSeqs(inFASTA, numFastaSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numFastaSeqs)); - - driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); + numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); } else{ - vector positions; processIDS.resize(0); - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - - string input; - while(!inFASTA.eof()){ - input = getline(inFASTA); - if (input.length() != 0) { - if(input[0] == '>'){ unsigned long int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } - } - } - inFASTA.close(); - - numFastaSeqs = positions.size(); - - int numSeqsPerProcessor = numFastaSeqs / processors; - - for (int i = 0; i < processors; i++) { - unsigned long int startPos = positions[ i * numSeqsPerProcessor ]; - if(i == processors - 1){ - numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; - } - lines.push_back(new linePair(startPos, numSeqsPerProcessor)); - } - createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); + numFastaSeqs = createProcesses(newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str()); rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str()); @@ -518,14 +488,7 @@ int ClassifySeqsCommand::execute(){ } #else - ifstream inFASTA; - openInputFile(fastaFileNames[s], inFASTA); - getNumSeqs(inFASTA, numFastaSeqs); - inFASTA.close(); - - lines.push_back(new linePair(0, numFastaSeqs)); - - driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); + numFastaSeqs = driver(lines[0], newTaxonomyFile, tempTaxonomyFile, fastaFileNames[s]); #endif #endif @@ -547,14 +510,14 @@ int ClassifySeqsCommand::execute(){ nameMap.clear(); //remove old names ifstream inNames; - openInputFile(namefileNames[s], inNames); + m->openInputFile(namefileNames[s], inNames); string firstCol, secondCol; while(!inNames.eof()) { - inNames >> firstCol >> secondCol; gobble(inNames); + inNames >> firstCol >> secondCol; m->gobble(inNames); vector temp; - splitAtComma(secondCol, temp); + m->splitAtComma(secondCol, temp); nameMap[firstCol] = temp; } @@ -574,13 +537,13 @@ int ClassifySeqsCommand::execute(){ if (namefile == "") { taxaSum.summarize(tempTaxonomyFile); } else { ifstream in; - openInputFile(tempTaxonomyFile, in); + m->openInputFile(tempTaxonomyFile, in); //read in users taxonomy file and add sequences to tree string name, taxon; while(!in.eof()){ - in >> name >> taxon; gobble(in); + in >> name >> taxon; m->gobble(in); itNames = nameMap.find(name); @@ -602,17 +565,17 @@ int ClassifySeqsCommand::execute(){ //print summary file ofstream outTaxTree; - openOutputFile(taxSummary, outTaxTree); + m->openOutputFile(taxSummary, outTaxTree); taxaSum.print(outTaxTree); outTaxTree.close(); //output taxonomy with the unclassified bins added ifstream inTax; - openInputFile(newTaxonomyFile, inTax); + m->openInputFile(newTaxonomyFile, inTax); ofstream outTax; string unclass = newTaxonomyFile + ".unclass.temp"; - openOutputFile(unclass, outTax); + m->openOutputFile(unclass, outTax); //get maxLevel from phylotree so you know how many 'unclassified's to add int maxLevel = taxaSum.getMaxLevel(); @@ -622,7 +585,7 @@ int ClassifySeqsCommand::execute(){ while (!inTax.eof()) { if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } remove(unclass.c_str()); delete classify; return 0; } - inTax >> name >> taxon; gobble(inTax); + inTax >> name >> taxon; m->gobble(inTax); string newTax = addUnclassifieds(taxon, maxLevel); @@ -687,11 +650,11 @@ string ClassifySeqsCommand::addUnclassifieds(string tax, int maxlevel) { /**************************************************************************************************/ -void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) { +int ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile, string filename) { try { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) int process = 0; - // processIDS.resize(0); + int num = 0; //loop through and create all the processes you want while (process != processors) { @@ -701,7 +664,15 @@ void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later process++; }else if (pid == 0){ - driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename); + num = driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp", filename); + + //pass numSeqs to parent + ofstream out; + string tempFile = filename + toString(getpid()) + ".num.temp"; + m->openOutputFile(tempFile, out); + out << num << endl; + out.close(); + exit(0); }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } } @@ -711,6 +682,16 @@ void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile int temp = processIDS[i]; wait(&temp); } + + for (int i = 0; i < processIDS.size(); i++) { + ifstream in; + string tempFile = filename + toString(processIDS[i]) + ".num.temp"; + m->openInputFile(tempFile, in); + if (!in.eof()) { int tempNum = 0; in >> tempNum; num += tempNum; } + in.close(); remove(tempFile.c_str()); + } + + return num; #endif } catch(exception& e) { @@ -725,8 +706,8 @@ void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) { ofstream output; ifstream input; - openOutputFileAppend(filename, output); - openInputFile(temp, input); + m->openOutputFileAppend(filename, output); + m->openInputFile(temp, input); while(char c = input.get()){ if(input.eof()) { break; } @@ -744,25 +725,28 @@ void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) { //********************************************************************************************************************** -int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFName, string filename){ +int ClassifySeqsCommand::driver(linePair* filePos, string taxFName, string tempTFName, string filename){ try { ofstream outTax; - openOutputFile(taxFName, outTax); + m->openOutputFile(taxFName, outTax); ofstream outTaxSimple; - openOutputFile(tempTFName, outTaxSimple); + m->openOutputFile(tempTFName, outTaxSimple); ifstream inFASTA; - openInputFile(filename, inFASTA); - - inFASTA.seekg(line->start); + m->openInputFile(filename, inFASTA); string taxonomy; - for(int i=0;inumSeqs;i++){ + inFASTA.seekg(filePos->start); + + bool done = false; + int count = 0; + + while (!done) { if (m->control_pressed) { return 0; } - Sequence* candidateSeq = new Sequence(inFASTA); gobble(inFASTA); + Sequence* candidateSeq = new Sequence(inFASTA); m->gobble(inFASTA); if (candidateSeq->getName() != "") { taxonomy = classify->getTaxonomy(candidateSeq); @@ -779,19 +763,24 @@ int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFNa outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl; } - } + count++; + } delete candidateSeq; - if((i+1) % 100 == 0){ - m->mothurOut("Classifying sequence " + toString(i+1)); m->mothurOutEndLine(); - } + unsigned long int pos = inFASTA.tellg(); + if ((pos == -1) || (pos >= filePos->end)) { break; } + + //report progress + if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } } - + //report progress + if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); } + inFASTA.close(); outTax.close(); outTaxSimple.close(); - return 1; + return count; } catch(exception& e) { m->errorOut(e, "ClassifySeqsCommand", "driver"); @@ -800,7 +789,7 @@ int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFNa } //********************************************************************************************************************** #ifdef USE_MPI -int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector& MPIPos){ +int ClassifySeqsCommand::driverMPI(int start, int num, MPI_File& inMPI, MPI_File& newFile, MPI_File& tempFile, vector& MPIPos){ try { MPI_Status statusNew; MPI_Status statusTemp; @@ -901,10 +890,10 @@ int ClassifySeqsCommand::MPIReadNamesFile(string nameFilename){ string firstCol, secondCol; while(!iss.eof()) { - iss >> firstCol >> secondCol; gobble(iss); + iss >> firstCol >> secondCol; m->gobble(iss); vector temp; - splitAtComma(secondCol, temp); + m->splitAtComma(secondCol, temp); nameMap[firstCol] = temp; }