From: westcott Date: Mon, 4 Oct 2010 18:03:32 +0000 (+0000) Subject: paralellized rarefaction.single X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=a537fefa3bd2c2eba7d532249375b6a11b295e25 paralellized rarefaction.single --- diff --git a/classifyotucommand.cpp b/classifyotucommand.cpp index b724a68..d27a881 100644 --- a/classifyotucommand.cpp +++ b/classifyotucommand.cpp @@ -399,7 +399,7 @@ string ClassifyOtuCommand::findConsensusTaxonomy(int bin, ListVector* thisList, } - if (conTax == "") { conTax = "unclassified;"; } + if (conTax == "") { conTax = "no_consensus;"; } delete phylo; diff --git a/display.h b/display.h index 8c73b57..8067737 100644 --- a/display.h +++ b/display.h @@ -17,6 +17,8 @@ public: virtual void init(string) = 0; virtual void reset() = 0; virtual void close() = 0; + virtual void outputTempFiles(string) {} + virtual void inputTempFiles(string) {} virtual bool isCalcMultiple() = 0; virtual void setAll(bool){} virtual bool hasLciHci(){ return false; } diff --git a/filterseqscommand.cpp b/filterseqscommand.cpp index 798ae64..ffffe4e 100644 --- a/filterseqscommand.cpp +++ b/filterseqscommand.cpp @@ -147,13 +147,14 @@ void FilterSeqsCommand::help(){ try { m->mothurOut("The filter.seqs command reads a file containing sequences and creates a .filter and .filter.fasta file.\n"); - m->mothurOut("The filter.seqs command parameters are fasta, trump, soft, hard and vertical. \n"); + m->mothurOut("The filter.seqs command parameters are fasta, trump, soft, hard, processors and vertical. \n"); m->mothurOut("The fasta parameter is required. You may enter several fasta files to build the filter from and filter, by separating their names with -'s.\n"); m->mothurOut("For example: fasta=abrecovery.fasta-amazon.fasta \n"); m->mothurOut("The trump parameter .... The default is ...\n"); m->mothurOut("The soft parameter .... The default is ....\n"); m->mothurOut("The hard parameter allows you to enter a file containing the filter you want to use.\n"); m->mothurOut("The vertical parameter removes columns where all sequences contain a gap character. The default is T.\n"); + m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n"); m->mothurOut("The filter.seqs command should be in the following format: \n"); m->mothurOut("filter.seqs(fasta=yourFastaFile, trump=yourTrump) \n"); m->mothurOut("Example filter.seqs(fasta=abrecovery.fasta, trump=.).\n"); diff --git a/mothur b/mothur index 977c74f..648dccd 100755 Binary files a/mothur and b/mothur differ diff --git a/raredisplay.cpp b/raredisplay.cpp index 5922822..b82127c 100644 --- a/raredisplay.cpp +++ b/raredisplay.cpp @@ -135,6 +135,55 @@ void RareDisplay::close(){ exit(1); } } +/***********************************************************************/ + +void RareDisplay::inputTempFiles(string filename){ + try { + ifstream in; + m->openInputFile(filename, in); + + int thisIters; + in >> thisIters; m->gobble(in); + + for (int i = 0; i < seqs.size(); i++) { + double tempresult, tempvar; + in >> tempresult >> tempvar; m->gobble(in); + + //find weighted result + results[i] = ((nIters * results[i]) + (thisIters * tempresult)) / (float)(nIters + thisIters); + + var[i] = ((nIters * var[i]) + (thisIters * tempvar)) / (float)(nIters + thisIters); + } + + in.close(); + } + catch(exception& e) { + m->errorOut(e, "RareDisplay", "inputTempFiles"); + exit(1); + } +} + +/***********************************************************************/ + +void RareDisplay::outputTempFiles(string filename){ + try { + ofstream out; + m->openOutputFile(filename, out); + + out << nIters << endl; + + for (int i = 0; i < seqs.size(); i++) { + out << results[i] << '\t' << var[i] << endl; + } + + out.close(); + } + catch(exception& e) { + m->errorOut(e, "RareDisplay", "outputTempFiles"); + exit(1); + } +} + /***********************************************************************/ diff --git a/raredisplay.h b/raredisplay.h index 72147ad..a875962 100644 --- a/raredisplay.h +++ b/raredisplay.h @@ -20,6 +20,9 @@ public: void close(); bool isCalcMultiple() { return estimate->getMultiple(); } + void outputTempFiles(string); + void inputTempFiles(string); + private: Calculator* estimate; FileOutput* output; diff --git a/rarefact.cpp b/rarefact.cpp index 691d017..d04885f 100644 --- a/rarefact.cpp +++ b/rarefact.cpp @@ -23,6 +23,45 @@ int Rarefact::getCurve(float percentFreq = 0.01, int nIters = 1000){ int increment = 1; if (percentFreq < 1.0) { increment = numSeqs * percentFreq; } else { increment = percentFreq; } + + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ + driver(rcd, increment, nIters); + }else{ + vector procIters; + + int numItersPerProcessor = nIters / processors; + + //divide iters between processes + for (int i = 0; i < processors; i++) { + if(i == processors - 1){ + numItersPerProcessor = nIters - i * numItersPerProcessor; + } + procIters.push_back(numItersPerProcessor); + } + + createProcesses(procIters, rcd, increment); + } + + #else + driver(rcd, increment, nIters); + #endif + + for(int i=0;iclose(); + } + + delete rcd; + return 0; + } + catch(exception& e) { + m->errorOut(e, "Rarefact", "getCurve"); + exit(1); + } +} +/***********************************************************************/ +int Rarefact::driver(RarefactionCurveData* rcd, int increment, int nIters = 1000){ + try { for(int iter=0;iterclose(); - } - delete rcd; return 0; } catch(exception& e) { - m->errorOut(e, "Rarefact", "getCurve"); + m->errorOut(e, "Rarefact", "driver"); exit(1); } } +/**************************************************************************************************/ +int Rarefact::createProcesses(vector& procIters, RarefactionCurveData* rcd, int increment) { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 1; + int num = 0; + vector processIDS; + + EstOutput results; + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + driver(rcd, increment, procIters[process]); + + //pass numSeqs to parent + for(int i=0;ioutputTempFiles(tempFile); + } + exit(0); + }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } + } + + driver(rcd, increment, procIters[0]); + + //force parent to wait until all the processes are done + for (int i=0;i<(processors-1);i++) { + int temp = processIDS[i]; + wait(&temp); + } + + //get data created by processes + for (int i=0;i<(processors-1);i++) { + for(int j=0;jinputTempFiles(s); + remove(s.c_str()); + } + } + + return 0; +#endif + } + catch(exception& e) { + m->errorOut(e, "Rarefact", "createProcesses"); + exit(1); + } +} /***********************************************************************/ int Rarefact::getSharedCurve(float percentFreq = 0.01, int nIters = 1000){ diff --git a/rarefact.h b/rarefact.h index 20c1925..7d1fab4 100644 --- a/rarefact.h +++ b/rarefact.h @@ -11,8 +11,8 @@ class Rarefact { public: - Rarefact(OrderVector* o, vector disp) : - numSeqs(o->getNumSeqs()), order(o), displays(disp), label(o->getLabel()) { m = MothurOut::getInstance(); } + Rarefact(OrderVector* o, vector disp, int p) : + numSeqs(o->getNumSeqs()), order(o), displays(disp), label(o->getLabel()), processors(p) { m = MothurOut::getInstance(); } Rarefact(vector shared, vector disp) : lookup(shared), displays(disp) { globaldata = GlobalData::getInstance(); m = MothurOut::getInstance(); } @@ -24,11 +24,14 @@ private: GlobalData* globaldata; OrderVector* order; vector displays; - int numSeqs, numGroupComb; + int numSeqs, numGroupComb, processors; string label; void mergeVectors(SharedRAbundVector*, SharedRAbundVector*); vector lookup; MothurOut* m; + + int createProcesses(vector&, RarefactionCurveData*, int); + int driver(RarefactionCurveData*, int, int); }; diff --git a/rarefactcommand.cpp b/rarefactcommand.cpp index 73d09b1..ae28d23 100644 --- a/rarefactcommand.cpp +++ b/rarefactcommand.cpp @@ -40,7 +40,7 @@ RareFactCommand::RareFactCommand(string option) { else { //valid paramters for this command - string Array[] = {"iters","freq","label","calc","abund","outputdir","inputdir"}; + string Array[] = {"iters","freq","label","calc","abund","processors","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -93,6 +93,9 @@ RareFactCommand::RareFactCommand(string option) { temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; } convert(temp, nIters); + + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; } + convert(temp, processors); } } @@ -107,8 +110,9 @@ void RareFactCommand::help(){ try { m->mothurOut("The rarefaction.single command can only be executed after a successful read.otu WTIH ONE EXECEPTION.\n"); m->mothurOut("The rarefaction.single command can be executed after a successful cluster command. It will use the .list file from the output of the cluster.\n"); - m->mothurOut("The rarefaction.single command parameters are label, iters, freq, calc and abund. No parameters are required. \n"); + m->mothurOut("The rarefaction.single command parameters are label, iters, freq, calc, processors and abund. No parameters are required. \n"); m->mothurOut("The freq parameter is used indicate when to output your data, by default it is set to 100. But you can set it to a percentage of the number of sequence. For example freq=0.10, means 10%. \n"); + m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n"); m->mothurOut("The rarefaction.single command should be in the following format: \n"); m->mothurOut("rarefaction.single(label=yourLabel, iters=yourIters, freq=yourFreq, calc=yourEstimators).\n"); m->mothurOut("Example rarefaction.single(label=unique-.01-.03, iters=10000, freq=10, calc=sobs-rchao-race-rjack-rbootstrap-rshannon-rnpshannon-rsimpson).\n"); @@ -235,7 +239,7 @@ int RareFactCommand::execute(){ if(allLines == 1 || labels.count(order->getLabel()) == 1){ m->mothurOut(order->getLabel()); m->mothurOutEndLine(); - rCurve = new Rarefact(order, rDisplays); + rCurve = new Rarefact(order, rDisplays, processors); rCurve->getCurve(freq, nIters); delete rCurve; @@ -250,7 +254,7 @@ int RareFactCommand::execute(){ order = (input->getOrderVector(lastLabel)); m->mothurOut(order->getLabel()); m->mothurOutEndLine(); - rCurve = new Rarefact(order, rDisplays); + rCurve = new Rarefact(order, rDisplays, processors); rCurve->getCurve(freq, nIters); delete rCurve; @@ -290,7 +294,7 @@ int RareFactCommand::execute(){ order = (input->getOrderVector(lastLabel)); m->mothurOut(order->getLabel()); m->mothurOutEndLine(); - rCurve = new Rarefact(order, rDisplays); + rCurve = new Rarefact(order, rDisplays, processors); rCurve->getCurve(freq, nIters); delete rCurve; diff --git a/rarefactcommand.h b/rarefactcommand.h index f4492a2..3cf0ef7 100644 --- a/rarefactcommand.h +++ b/rarefactcommand.h @@ -36,7 +36,7 @@ private: InputData* input; ValidCalculators* validCalculator; Rarefact* rCurve; - int nIters, abund; + int nIters, abund, processors; float freq; bool abort, allLines; diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index a50d147..88df73d 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -104,11 +104,12 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { void UnifracUnweightedCommand::help(){ try { m->mothurOut("The unifrac.unweighted command can only be executed after a successful read.tree command.\n"); - m->mothurOut("The unifrac.unweighted command parameters are groups, iters, distance and random. No parameters are required.\n"); + m->mothurOut("The unifrac.unweighted command parameters are groups, iters, distance, processors and random. No parameters are required.\n"); m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 1 valid group.\n"); m->mothurOut("The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n"); m->mothurOut("The distance parameter allows you to create a distance file from the results. The default is false.\n"); m->mothurOut("The random parameter allows you to shut off the comparison to random trees. The default is false, meaning compare don't your trees with randomly generated trees.\n"); + m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n"); m->mothurOut("The unifrac.unweighted command should be in the following format: unifrac.unweighted(groups=yourGroups, iters=yourIters).\n"); m->mothurOut("Example unifrac.unweighted(groups=A-B-C, iters=500).\n"); m->mothurOut("The default value for groups is all the groups in your groupfile, and iters is 1000.\n"); diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp index e4a0a9d..e8add90 100644 --- a/unifracweightedcommand.cpp +++ b/unifracweightedcommand.cpp @@ -96,11 +96,12 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { void UnifracWeightedCommand::help(){ try { m->mothurOut("The unifrac.weighted command can only be executed after a successful read.tree command.\n"); - m->mothurOut("The unifrac.weighted command parameters are groups, iters, distance and random. No parameters are required.\n"); + m->mothurOut("The unifrac.weighted command parameters are groups, iters, distance, processors and random. No parameters are required.\n"); m->mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n"); m->mothurOut("The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n"); m->mothurOut("The distance parameter allows you to create a distance file from the results. The default is false.\n"); m->mothurOut("The random parameter allows you to shut off the comparison to random trees. The default is false, meaning don't compare your trees with randomly generated trees.\n"); + m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n"); m->mothurOut("The unifrac.weighted command should be in the following format: unifrac.weighted(groups=yourGroups, iters=yourIters).\n"); m->mothurOut("Example unifrac.weighted(groups=A-B-C, iters=500).\n"); m->mothurOut("The default value for groups is all the groups in your groupfile, and iters is 1000.\n");