X-Git-Url: https://git.donarmstrong.com/?a=blobdiff_plain;f=classifyseqscommand.cpp;h=6c5d827d6553eac7f8ca8696091dd3ab7c87e4f3;hb=cd985cf388dcc4c7de8251339206aec5f7e12f1e;hp=bed65806ca72aecc1c26536e1dcbbe6d74b8b67b;hpb=a61980e36d61937acccadc41be260165a1e389f4;p=mothur.git diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index bed6580..6c5d827 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -11,6 +11,7 @@ #include "sequence.hpp" #include "bayesian.h" #include "phylotree.h" +#include "phylosummary.h" #include "knn.h" //********************************************************************************************************************** @@ -25,7 +26,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { else { //valid paramters for this command - string AlignArray[] = {"template","fasta","name","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters", "outputdir","inputdir"}; + string AlignArray[] = {"template","fasta","name","group","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters", "outputdir","inputdir"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); OptionParser parser(option); @@ -62,6 +63,14 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } } //check for required parameters @@ -73,6 +82,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { } else if (templateFileName == "not open") { abort = true; } + fastaFileName = validParameter.validFile(parameters, "fasta", false); if (fastaFileName == "not found") { m->mothurOut("fasta is a required parameter for the classify.seqs command."); m->mothurOutEndLine(); abort = true; } else { @@ -180,6 +190,53 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { if (namefileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a name file, you must have one for each fasta file."); m->mothurOutEndLine(); } } + groupfile = validParameter.validFile(parameters, "group", false); + if (groupfile == "not found") { groupfile = ""; } + else { + splitAtDash(groupfile, groupfileNames); + + //go through files and make sure they are good, if not, then disregard them + for (int i = 0; i < groupfileNames.size(); i++) { + if (inputDir != "") { + string path = hasPath(groupfileNames[i]); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { groupfileNames[i] = inputDir + groupfileNames[i]; } + } + int ableToOpen; + + #ifdef USE_MPI + int pid; + MPI_Comm_size(MPI_COMM_WORLD, &processors); //set processors to the number of mpi processes running + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid == 0) { + #endif + + ifstream in; + ableToOpen = openInputFile(groupfileNames[i], in); + in.close(); + + #ifdef USE_MPI + for (int j = 1; j < processors; j++) { + MPI_Send(&ableToOpen, 1, MPI_INT, j, 2001, MPI_COMM_WORLD); + } + }else{ + MPI_Status status; + MPI_Recv(&ableToOpen, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD, &status); + } + + #endif + if (ableToOpen == 1) { m->mothurOut("Unable to match group file with fasta file, not using " + groupfileNames[i] + "."); m->mothurOutEndLine(); groupfileNames[i] = ""; } + + } + } + + if (groupfile != "") { + if (groupfileNames.size() != fastaFileNames.size()) { abort = true; m->mothurOut("If you provide a group file, you must have one for each fasta file."); m->mothurOutEndLine(); } + }else { + for (int i = 0; i < fastaFileNames.size(); i++) { groupfileNames.push_back(""); } + } + //check for optional parameter and set defaults // ...at some point should added some additional type checking... string temp; @@ -250,6 +307,7 @@ void ClassifySeqsCommand::help(){ m->mothurOut("The template, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n"); m->mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast and distance. The default is kmer.\n"); m->mothurOut("The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n"); + m->mothurOut("The group parameter allows you add a group file so you can have the summary totals broken up by group.\n"); m->mothurOut("The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n"); m->mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"); m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n"); @@ -262,7 +320,7 @@ void ClassifySeqsCommand::help(){ m->mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -1.0.\n"); m->mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method. The default is 10.\n"); m->mothurOut("The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy. The default is 0.\n"); - m->mothurOut("The probs parameter shut off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be run.\n"); + m->mothurOut("The probs parameter shuts off the bootstrapping results for the bayesian method. The default is true, meaning you want the bootstrapping to be shown.\n"); m->mothurOut("The iters parameter allows you to specify how many iterations to do when calculating the bootstrap confidence score for your taxonomy with the bayesian method. The default is 100.\n"); m->mothurOut("The classify.seqs command should be in the following format: \n"); m->mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n"); @@ -355,21 +413,22 @@ int ClassifySeqsCommand::execute(){ //delete inFileName; if (m->control_pressed) { MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); delete classify; return 0; } - - if(namefile != "") { MPIReadNamesFile(namefileNames[s]); } if (pid == 0) { //you are the root process MPIPos = setFilePosFasta(fastaFileNames[s], numFastaSeqs); //fills MPIPos, returns numSeqs //send file positions to all processes - MPI_Bcast(&numFastaSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs - MPI_Bcast(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + for(int i = 1; i < processors; i++) { + MPI_Send(&numFastaSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD); + MPI_Send(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, i, tag, MPI_COMM_WORLD); + } //figure out how many sequences you have to align numSeqsPerProcessor = numFastaSeqs / processors; - if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; } int startIndex = pid * numSeqsPerProcessor; + if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; } + //align your part driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos); @@ -381,14 +440,15 @@ int ClassifySeqsCommand::execute(){ MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status); } }else{ //you are a child process - MPI_Bcast(&numFastaSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + MPI_Recv(&numFastaSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status); MPIPos.resize(numFastaSeqs+1); - MPI_Bcast(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + MPI_Recv(&MPIPos[0], (numFastaSeqs+1), MPI_LONG, 0, tag, MPI_COMM_WORLD, &status); //figure out how many sequences you have to align numSeqsPerProcessor = numFastaSeqs / processors; - if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; } int startIndex = pid * numSeqsPerProcessor; + if(pid == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - pid * numSeqsPerProcessor; } + //align your part driverMPI(startIndex, numSeqsPerProcessor, inMPI, outMPINewTax, outMPITempTax, MPIPos); @@ -403,28 +463,14 @@ int ClassifySeqsCommand::execute(){ MPI_File_close(&inMPI); MPI_File_close(&outMPINewTax); MPI_File_close(&outMPITempTax); + MPI_Barrier(MPI_COMM_WORLD); //make everyone wait - just in case #else - //read namefile - if(namefile != "") { - nameMap.clear(); //remove old names - - ifstream inNames; - openInputFile(namefileNames[s], inNames); - - string firstCol, secondCol; - while(!inNames.eof()) { - inNames >> firstCol >> secondCol; gobble(inNames); - nameMap[firstCol] = getNumNames(secondCol); //ex. seq1 seq1,seq3,seq5 -> seq1 = 3. - } - inNames.close(); - } - - #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ ifstream inFASTA; openInputFile(fastaFileNames[s], inFASTA); - numFastaSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + getNumSeqs(inFASTA, numFastaSeqs); inFASTA.close(); lines.push_back(new linePair(0, numFastaSeqs)); @@ -474,7 +520,7 @@ int ClassifySeqsCommand::execute(){ #else ifstream inFASTA; openInputFile(fastaFileNames[s], inFASTA); - numFastaSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + getNumSeqs(inFASTA, numFastaSeqs); inFASTA.close(); lines.push_back(new linePair(0, numFastaSeqs)); @@ -483,54 +529,81 @@ int ClassifySeqsCommand::execute(){ #endif #endif + m->mothurOutEndLine(); + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); + start = time(NULL); + + #ifdef USE_MPI if (pid == 0) { //this part does not need to be paralellized + + if(namefile != "") { m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush(); MPIReadNamesFile(namefileNames[s]); m->mothurOut(" Done."); m->mothurOutEndLine(); } + #else + //read namefile + if(namefile != "") { + + m->mothurOut("Reading " + namefileNames[s] + "..."); cout.flush(); + + nameMap.clear(); //remove old names + + ifstream inNames; + openInputFile(namefileNames[s], inNames); + + string firstCol, secondCol; + while(!inNames.eof()) { + inNames >> firstCol >> secondCol; gobble(inNames); + + vector temp; + splitAtComma(secondCol, temp); + + nameMap[firstCol] = temp; + } + inNames.close(); + + m->mothurOut(" Done."); m->mothurOutEndLine(); + } #endif - //make taxonomy tree from new taxonomy file - PhyloTree taxaBrowser; + string group = ""; + if (groupfile != "") { group = groupfileNames[s]; } - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } + PhyloSummary taxaSum(taxonomyFileName, group); - ifstream in; - openInputFile(tempTaxonomyFile, in); + if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } - //read in users taxonomy file and add sequences to tree - string name, taxon; - while(!in.eof()){ - in >> name >> taxon; gobble(in); + if (namefile == "") { taxaSum.summarize(tempTaxonomyFile); } + else { + ifstream in; + openInputFile(tempTaxonomyFile, in); - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } remove(tempTaxonomyFile.c_str()); delete classify; return 0; } + //read in users taxonomy file and add sequences to tree + string name, taxon; - if (namefile != "") { + while(!in.eof()){ + in >> name >> taxon; gobble(in); + itNames = nameMap.find(name); if (itNames == nameMap.end()) { m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1); }else{ - for (int i = 0; i < itNames->second; i++) { - taxaBrowser.addSeqToTree(name+toString(i), taxon); //add it as many times as there are identical seqs + for (int i = 0; i < itNames->second.size(); i++) { + taxaSum.addSeqToTree(itNames->second[i], taxon); //add it as many times as there are identical seqs } + itNames->second.clear(); + nameMap.erase(itNames->first); } - }else { taxaBrowser.addSeqToTree(name, taxon); } //add it once + } + in.close(); } - in.close(); - - taxaBrowser.assignHeirarchyIDs(0); - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } remove(tempTaxonomyFile.c_str()); delete classify; return 0; } - - taxaBrowser.binUnclassified(); - remove(tempTaxonomyFile.c_str()); if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } - //print summary file ofstream outTaxTree; openOutputFile(taxSummary, outTaxTree); - taxaBrowser.print(outTaxTree); + taxaSum.print(outTaxTree); outTaxTree.close(); //output taxonomy with the unclassified bins added @@ -542,9 +615,10 @@ int ClassifySeqsCommand::execute(){ openOutputFile(unclass, outTax); //get maxLevel from phylotree so you know how many 'unclassified's to add - int maxLevel = taxaBrowser.getMaxLevel(); + int maxLevel = taxaSum.getMaxLevel(); //read taxfile - this reading and rewriting is done to preserve the confidence scores. + string name, taxon; while (!inTax.eof()) { if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } remove(unclass.c_str()); delete classify; return 0; } @@ -560,6 +634,9 @@ int ClassifySeqsCommand::execute(){ remove(newTaxonomyFile.c_str()); rename(unclass.c_str(), newTaxonomyFile.c_str()); + m->mothurOutEndLine(); + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); + #ifdef USE_MPI } #endif @@ -568,10 +645,6 @@ int ClassifySeqsCommand::execute(){ m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); - - - m->mothurOutEndLine(); - m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); } delete classify; @@ -829,7 +902,11 @@ int ClassifySeqsCommand::MPIReadNamesFile(string nameFilename){ string firstCol, secondCol; while(!iss.eof()) { iss >> firstCol >> secondCol; gobble(iss); - nameMap[firstCol] = getNumNames(secondCol); //ex. seq1 seq1,seq3,seq5 -> seq1 = 3. + + vector temp; + splitAtComma(secondCol, temp); + + nameMap[firstCol] = temp; } MPI_File_close(&inMPI);