From: westcott Date: Tue, 28 Sep 2010 16:31:21 +0000 (+0000) Subject: added distance option to summary.shared X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=6446d2e9713a95db5f772135b7aa3387f7ebf7bb added distance option to summary.shared --- diff --git a/clustersplitcommand.cpp b/clustersplitcommand.cpp index 050a615..10579a3 100644 --- a/clustersplitcommand.cpp +++ b/clustersplitcommand.cpp @@ -206,7 +206,7 @@ void ClusterSplitCommand::help(){ m->mothurOut("The method allows you to specify what clustering algorythm you want to use, default=furthest, option furthest, nearest, or average. \n"); m->mothurOut("The splitmethod parameter allows you to specify how you want to split your distance file before you cluster, default=distance, options distance, classify or fasta. \n"); m->mothurOut("The taxonomy parameter allows you to enter the taxonomy file for your sequences, this is only valid if you are using splitmethod=classify. Be sure your taxonomy file does not include the probability scores. \n"); - m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1. \n"); + m->mothurOut("The taxlevel parameter allows you to specify the taxonomy level you want to use to split the distance file, default=1, meaning use the first taxon in each list. \n"); m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n"); #ifdef USE_MPI m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n"); diff --git a/makefile b/makefile index 92c2013..773b52f 100644 --- a/makefile +++ b/makefile @@ -34,11 +34,15 @@ endif 64BIT_VERSION ?= yes ifeq ($(strip $(64BIT_VERSION)),yes) - TARGET_ARCH += -arch x86_64 CXXFLAGS += -DBIT_VERSION #if you are using centos uncomment the following lines #CXX = g++44 + + #if you are a mac user use the following line + TARGET_ARCH += -arch x86_64 + + #if you are a linux user use the following line #CXXFLAGS += -mtune=native -march=native -m64 endif diff --git a/mothur b/mothur index cd70f8b..d4e222c 100755 Binary files a/mothur and b/mothur differ diff --git a/splitmatrix.cpp b/splitmatrix.cpp index a4e1f98..b7a3b49 100644 --- a/splitmatrix.cpp +++ b/splitmatrix.cpp @@ -222,6 +222,7 @@ int SplitMatrix::createDistanceFilesFromTax(map& seqGroup, int numG for(int i=0;ihasPath(fastafile); } string tempDistFile = outputDir + m->getRootName(m->getSimpleName((fastafile + "." + toString(i) + ".temp"))) + "dist"; //if there are valid distances diff --git a/summarysharedcommand.cpp b/summarysharedcommand.cpp index e6472a8..3069a6f 100644 --- a/summarysharedcommand.cpp +++ b/summarysharedcommand.cpp @@ -48,7 +48,7 @@ SummarySharedCommand::SummarySharedCommand(string option) { else { //valid paramters for this command - string Array[] = {"label","calc","groups","all","outputdir","inputdir", "processors"}; + string Array[] = {"label","calc","groups","all","outputdir","distance","inputdir", "processors"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -104,6 +104,9 @@ SummarySharedCommand::SummarySharedCommand(string option) { string temp = validParameter.validFile(parameters, "all", false); if (temp == "not found") { temp = "false"; } all = m->isTrue(temp); + temp = validParameter.validFile(parameters, "distance", false); if (temp == "not found") { temp = "false"; } + createPhylip = m->isTrue(temp); + temp = validParameter.validFile(parameters, "processors", false); if(temp == "not found"){ temp = "1"; } convert(temp, processors); @@ -396,6 +399,7 @@ int SummarySharedCommand::execute(){ m->mothurOut("Output File Names: "); m->mothurOutEndLine(); m->mothurOut(outputFileName); m->mothurOutEndLine(); if (mult) { m->mothurOut(outAllFileName); m->mothurOutEndLine(); } + for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); return 0; @@ -409,10 +413,12 @@ int SummarySharedCommand::execute(){ /***********************************************************/ int SummarySharedCommand::process(vector thisLookup, string sumFileName, string sumAllFileName) { try { + vector< vector > calcDists; //vector containing vectors that contains the summary results for each group compare + calcDists.resize(sumCalculators.size()); //one for each calc, this will be used to make .dist files #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ - driver(thisLookup, 0, numGroups, sumFileName+".temp", sumAllFileName+".temp"); + driver(thisLookup, 0, numGroups, sumFileName+".temp", sumAllFileName+".temp", calcDists); m->appendFiles((sumFileName + ".temp"), sumFileName); remove((sumFileName + ".temp").c_str()); if (mult) { @@ -420,7 +426,7 @@ int SummarySharedCommand::process(vector thisLookup, string remove((sumAllFileName + ".temp").c_str()); } }else{ - int process = 0; + int process = 1; vector processIDS; //loop through and create all the processes you want @@ -431,11 +437,34 @@ int SummarySharedCommand::process(vector thisLookup, string processIDS.push_back(pid); process++; }else if (pid == 0){ - driver(thisLookup, lines[process].start, lines[process].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp"); + driver(thisLookup, lines[process].start, lines[process].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp", calcDists); + + //only do this if you want a distance file + if (createPhylip) { + string tempdistFileName = m->getRootName(m->getSimpleName(sumFileName)) + toString(getpid()) + ".dist"; + ofstream outtemp; + m->openOutputFile(tempdistFileName, outtemp); + + for (int i = 0; i < calcDists.size(); i++) { + outtemp << calcDists[i].size() << endl; + + for (int j = 0; j < calcDists[i].size(); j++) { + outtemp << calcDists[i][j].seq1 << '\t' << calcDists[i][j].seq2 << '\t' << calcDists[i][j].dist << endl; + } + } + outtemp.close(); + } + exit(0); }else { m->mothurOut("unable to spawn the necessary processes."); m->mothurOutEndLine(); exit(0); } } - + + //parent do your part + driver(thisLookup, lines[0].start, lines[0].end, sumFileName + toString(getpid()) + ".temp", sumAllFileName + toString(getpid()) + ".temp", calcDists); + m->appendFiles((sumFileName + toString(getpid()) + ".temp"), sumFileName); + remove((sumFileName + toString(getpid()) + ".temp").c_str()); + if (mult) { m->appendFiles((sumAllFileName + toString(getpid()) + ".temp"), sumAllFileName); } + //force parent to wait until all the processes are done for (int i = 0; i < processIDS.size(); i++) { int temp = processIDS[i]; @@ -445,15 +474,36 @@ int SummarySharedCommand::process(vector thisLookup, string for (int i = 0; i < processIDS.size(); i++) { m->appendFiles((sumFileName + toString(processIDS[i]) + ".temp"), sumFileName); remove((sumFileName + toString(processIDS[i]) + ".temp").c_str()); - if (mult) { - if (i == 0) { m->appendFiles((sumAllFileName + toString(processIDS[i]) + ".temp"), sumAllFileName); } - remove((sumAllFileName + toString(processIDS[i]) + ".temp").c_str()); + if (mult) { remove((sumAllFileName + toString(processIDS[i]) + ".temp").c_str()); } + + if (createPhylip) { + string tempdistFileName = m->getRootName(m->getSimpleName(sumFileName)) + toString(processIDS[i]) + ".dist"; + ifstream intemp; + m->openInputFile(tempdistFileName, intemp); + + for (int i = 0; i < calcDists.size(); i++) { + int size = 0; + intemp >> size; m->gobble(intemp); + + for (int j = 0; j < size; j++) { + int seq1 = 0; + int seq2 = 0; + float dist = 1.0; + + intemp >> seq1 >> seq2 >> dist; m->gobble(intemp); + + seqDist tempDist(seq1, seq2, dist); + calcDists[i].push_back(tempDist); + } + } + intemp.close(); + remove(tempdistFileName.c_str()); } } } #else - driver(thisLookup, 0, numGroups, (sumFileName + ".temp"), (sumAllFileName + ".temp")); + driver(thisLookup, 0, numGroups, (sumFileName + ".temp"), (sumAllFileName + ".temp"), calcDists); m->appendFiles((sumFileName + ".temp"), sumFileName); remove((sumFileName + ".temp").c_str()); if (mult) { @@ -461,6 +511,50 @@ int SummarySharedCommand::process(vector thisLookup, string remove((sumAllFileName + ".temp").c_str()); } #endif + + if (createPhylip) { + for (int i = 0; i < calcDists.size(); i++) { + if (m->control_pressed) { break; } + + string distFileName = outputDir + m->getRootName(m->getSimpleName(sumFileName)) + sumCalculators[i]->getName() + "." + thisLookup[0]->getLabel() + ".dist"; + outputNames.push_back(distFileName); + ofstream outDist; + m->openOutputFile(distFileName, outDist); + outDist.setf(ios::fixed, ios::floatfield); outDist.setf(ios::showpoint); + + //initialize matrix + vector< vector > matrix; //square matrix to represent the distance + matrix.resize(thisLookup.size()); + for (int k = 0; k < thisLookup.size(); k++) { matrix[k].resize(thisLookup.size(), 0.0); } + + + for (int j = 0; j < calcDists[i].size(); j++) { + int row = calcDists[i][j].seq1; + int column = calcDists[i][j].seq2; + float dist = calcDists[i][j].dist; + + matrix[row][column] = dist; + matrix[column][row] = dist; + } + + //output to file + outDist << thisLookup.size() << endl; + for (int r=0; rgetGroup(); + if (name.length() < 10) { //pad with spaces to make compatible + while (name.length() < 10) { name += " "; } + } + outDist << name << '\t'; + + //output distances + for (int l = 0; l < r; l++) { outDist << matrix[r][l] << '\t'; } + outDist << endl; + } + + outDist.close(); + } + } } catch(exception& e) { m->errorOut(e, "SummarySharedCommand", "process"); @@ -468,7 +562,7 @@ int SummarySharedCommand::process(vector thisLookup, string } } /**************************************************************************************************/ -int SummarySharedCommand::driver(vector thisLookup, int start, int end, string sumFile, string sumAllFile) { +int SummarySharedCommand::driver(vector thisLookup, int start, int end, string sumFile, string sumAllFile, vector< vector >& calcDists) { try { //loop through calculators and add to file all for all calcs that can do mutiple groups @@ -524,12 +618,15 @@ int SummarySharedCommand::driver(vector thisLookup, int sta for(int i=0;igetValues(subset); //saves the calculator outputs + vector tempdata = sumCalculators[i]->getValues(subset); //saves the calculator outputs if (m->control_pressed) { outputFileHandle.close(); return 1; } outputFileHandle << '\t'; sumCalculators[i]->print(outputFileHandle); + + seqDist temp(l, k, tempdata[0]); + calcDists[i].push_back(temp); } outputFileHandle << endl; } diff --git a/summarysharedcommand.h b/summarysharedcommand.h index b99e0f0..c017264 100644 --- a/summarysharedcommand.h +++ b/summarysharedcommand.h @@ -39,7 +39,7 @@ private: InputData* input; ValidCalculators* validCalculator; - bool abort, allLines, mult, all; + bool abort, allLines, mult, all, createPhylip; set labels; //holds labels to be used string label, calc, groups; vector Estimators, Groups, outputNames; @@ -47,7 +47,7 @@ private: string format, outputDir; int numGroups, processors; int process(vector, string, string); - int driver(vector, int, int, string, string); + int driver(vector, int, int, string, string, vector< vector >&); }; diff --git a/unweighted.cpp b/unweighted.cpp index 8103688..2afc4a7 100644 --- a/unweighted.cpp +++ b/unweighted.cpp @@ -401,7 +401,7 @@ EstOutput Unweighted::driver(Tree* t, vector< vector > namesOfGroupCombo double totalBL = 0.00; //all branch lengths double UW = 0.00; //Unweighted Value = UniqueBL / totalBL; - for(int i=0;igetNumNodes();i++){ + for(int i=0;igetNumNodes();i++){ if (m->control_pressed) { return data; } @@ -411,15 +411,15 @@ EstOutput Unweighted::driver(Tree* t, vector< vector > namesOfGroupCombo int pcountSize = 0; for (int j = 0; j < namesOfGroupCombos[h].size(); j++) { - map::iterator itGroup = t->tree[i].pcount.find(namesOfGroupCombos[h][j]); - if (itGroup != t->tree[i].pcount.end()) { pcountSize++; if (pcountSize > 1) { break; } } + map::iterator itGroup = copyTree->tree[i].pcount.find(namesOfGroupCombos[h][j]); + if (itGroup != copyTree->tree[i].pcount.end()) { pcountSize++; if (pcountSize > 1) { break; } } } if (pcountSize == 0) { } - else if ((t->tree[i].getBranchLength() != -1) && (pcountSize == 1)) { UniqueBL += abs(t->tree[i].getBranchLength()); } + else if ((copyTree->tree[i].getBranchLength() != -1) && (pcountSize == 1)) { UniqueBL += abs(copyTree->tree[i].getBranchLength()); } - if ((t->tree[i].getBranchLength() != -1) && (pcountSize != 0)) { - totalBL += abs(t->tree[i].getBranchLength()); + if ((copyTree->tree[i].getBranchLength() != -1) && (pcountSize != 0)) { + totalBL += abs(copyTree->tree[i].getBranchLength()); } }