From: westcott Date: Tue, 5 Jan 2010 21:29:35 +0000 (+0000) Subject: you can now use a distance matrix as input for the heatmap.sim command. X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=d5bf2c1354d0811a33394d918b15620606560d58 you can now use a distance matrix as input for the heatmap.sim command. added distance parameter to unifrac.weighted and unifrac.unweighted commands that allows you to output a distance matrix from the command. added random parameter to unifrac.weighted and unifrac.unweighted commands that allows you to shut off the comparison to random trees. --- diff --git a/bayesian.cpp b/bayesian.cpp index e6adfc2..e59545a 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -185,7 +185,7 @@ string Bayesian::bootstrapResults(vector kmers, int tax, int numToSelect) { } if (confidence >= confidenceThreshold) { - confidenceTax = seqTax.name + "(" + toString(confidence) + ");" + confidenceTax; + confidenceTax = seqTax.name + "(" + toString(((confidence/(float)iters) * 100)) + ");" + confidenceTax; simpleTax = seqTax.name + ";" + simpleTax; } diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index 7221ac0..768cb0d 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -261,6 +261,7 @@ int ClassifySeqsCommand::execute(){ remove(tempTaxonomyFile.c_str()); taxaBrowser.assignHeirarchyIDs(0); + taxaBrowser.binUnclassified(); ofstream outTaxTree; openOutputFile(taxSummary, outTaxTree); diff --git a/heatmapsim.cpp b/heatmapsim.cpp index 1544dc2..88a49ed 100644 --- a/heatmapsim.cpp +++ b/heatmapsim.cpp @@ -24,7 +24,6 @@ HeatMapSim::HeatMapSim(){ globaldata = GlobalData::getInstance(); } //********************************************************************************************************************** - void HeatMapSim::getPic(vector lookup, vector calcs) { try { EstOutput data; @@ -101,6 +100,73 @@ void HeatMapSim::getPic(vector lookup, vector exit(1); } } +//********************************************************************************************************************** +void HeatMapSim::getPic(vector< vector > dists, vector groups) { + try { + + vector sims; + + string filenamesvg = getRootName(globaldata->inputFileName) + "heatmap.sim.svg"; + openOutputFile(filenamesvg, outsvg); + + //svg image + outsvg << "\n"; + outsvg << "\n"; + + //white backround + outsvg << ""; + outsvg << "Heatmap for " + globaldata->inputFileName + "\n"; + + //column labels + for (int h = 0; h < groups.size(); h++) { + outsvg << "" + groups[h] + "\n"; + outsvg << "" + groups[h] + "\n"; + } + + double biggest = -1; + float scaler; + + //get sim for each comparison and save them so you can find the relative similairity + for(int i = 0; i < (dists.size()-1); i++){ + for(int j = (i+1); j < dists.size(); j++){ + + float sim = 1.0 - dists[i][j]; + sims.push_back(sim); + + //save biggest similairity to set relative sim + if (sim > biggest) { biggest = sim; } + } + } + + //map biggest similairity found to red + scaler = 255.0 / biggest; + + int count = 0; + //output similairites to file + for(int i = 0; i < (dists.size()-1); i++){ + for(int j = (i+1); j < dists.size(); j++){ + + //find relative color + int color = scaler * sims[count]; + //draw box + outsvg << "\n"; + count++; + } + } + + int y = ((dists.size() * 150) + 120); + printLegend(y, biggest); + + outsvg << "\n\n"; + outsvg.close(); + + } + catch(exception& e) { + errorOut(e, "HeatMapSim", "getPic"); + exit(1); + } +} + //********************************************************************************************************************** void HeatMapSim::printLegend(int y, float maxSim) { diff --git a/heatmapsim.h b/heatmapsim.h index c3ec2a7..d38683c 100644 --- a/heatmapsim.h +++ b/heatmapsim.h @@ -24,6 +24,7 @@ class HeatMapSim { ~HeatMapSim(){}; void getPic(vector, vector); + void getPic(vector< vector >, vector); private: void printLegend(int, float); diff --git a/heatmapsimcommand.cpp b/heatmapsimcommand.cpp index 3803692..3287e62 100644 --- a/heatmapsimcommand.cpp +++ b/heatmapsimcommand.cpp @@ -36,7 +36,7 @@ HeatMapSimCommand::HeatMapSimCommand(string option){ else { //valid paramters for this command - string AlignArray[] = {"groups","label", "calc"}; + string AlignArray[] = {"groups","label", "calc","phylip","column","name"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); OptionParser parser(option); @@ -49,43 +49,67 @@ HeatMapSimCommand::HeatMapSimCommand(string option){ if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } } - //make sure the user has already run the read.otu command - if (globaldata->getSharedFile() == "") { - mothurOut("You must read a list and a group, or a shared before you can use the heatmap.sim command."); mothurOutEndLine(); abort = true; - } - - //check for optional parameter and set defaults - // ...at some point should added some additional type checking... - label = validParameter.validFile(parameters, "label", false); - if (label == "not found") { label = ""; } - else { - if(label != "all") { splitAtDash(label, labels); allLines = 0; } - else { allLines = 1; } - } + format = ""; - //if the user has not specified any labels use the ones from read.otu - if (label == "") { - allLines = globaldata->allLines; - labels = globaldata->labels; - } + //required parameters + phylipfile = validParameter.validFile(parameters, "phylip", true); + if (phylipfile == "not open") { abort = true; } + else if (phylipfile == "not found") { phylipfile = ""; } + else { format = "phylip"; } - calc = validParameter.validFile(parameters, "calc", false); - if (calc == "not found") { calc = "jest-thetayc"; } - else { - if (calc == "default") { calc = "jest-thetayc"; } + columnfile = validParameter.validFile(parameters, "column", true); + if (columnfile == "not open") { abort = true; } + else if (columnfile == "not found") { columnfile = ""; } + else { format = "column"; } + + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { abort = true; } + else if (namefile == "not found") { namefile = ""; } + + + //error checking on files + if ((globaldata->getSharedFile() == "") && ((phylipfile == "") && (columnfile == ""))) { mothurOut("You must run the read.otu command or provide a distance file before running the heatmap.sim command."); mothurOutEndLine(); abort = true; } + else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When running the heatmap.sim command with a distance file you may not use both the column and the phylip parameters."); mothurOutEndLine(); abort = true; } + + if (columnfile != "") { + if (namefile == "") { mothurOut("You need to provide a namefile if you are going to use the column format."); mothurOutEndLine(); abort = true; } } - splitAtDash(calc, Estimators); - groups = validParameter.validFile(parameters, "groups", false); - if (groups == "not found") { groups = ""; } - else { - splitAtDash(groups, Groups); - globaldata->Groups = Groups; + if (format == "") { format = "shared"; } + + //check for optional parameter and set defaults + // ...at some point should added some additional type checking... + if (format == "shared") { + label = validParameter.validFile(parameters, "label", false); + if (label == "not found") { label = ""; } + else { + if(label != "all") { splitAtDash(label, labels); allLines = 0; } + else { allLines = 1; } + } + + //if the user has not specified any labels use the ones from read.otu + if (label == "") { + allLines = globaldata->allLines; + labels = globaldata->labels; + } + + calc = validParameter.validFile(parameters, "calc", false); + if (calc == "not found") { calc = "jest-thetayc"; } + else { + if (calc == "default") { calc = "jest-thetayc"; } + } + splitAtDash(calc, Estimators); + + groups = validParameter.validFile(parameters, "groups", false); + if (groups == "not found") { groups = ""; } + else { + splitAtDash(groups, Groups); + globaldata->Groups = Groups; + } } if (abort == false) { validCalculator = new ValidCalculators(); - heatmap = new HeatMapSim(); int i; for (i=0; iprintCalc("heat", cout); mothurOut("The default value for calc is jclass-thetayc.\n"); mothurOut("The heatmap.sim command outputs a .svg file for each calculator you choose at each label you specify.\n"); + mothurOut("The second way to use the heatmap.sim command is with a distance file representing the distance bewteen your groups. \n"); + mothurOut("Using the command this way, the phylip or column parameter are required, and only one may be used. If you use a column file the name filename is required. \n"); + mothurOut("The heatmap.sim command should be in the following format: heatmap.sim(phylip=yourDistanceFile).\n"); + mothurOut("Example heatmap.sim(phylip=amazonGroups.dist).\n"); mothurOut("Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n\n"); } @@ -151,14 +181,7 @@ void HeatMapSimCommand::help(){ //********************************************************************************************************************** -HeatMapSimCommand::~HeatMapSimCommand(){ - if (abort == false) { - delete input; globaldata->ginput = NULL; - delete read; - delete heatmap; - delete validCalculator; - } -} +HeatMapSimCommand::~HeatMapSimCommand(){} //********************************************************************************************************************** @@ -167,6 +190,32 @@ int HeatMapSimCommand::execute(){ if (abort == true) { return 0; } + heatmap = new HeatMapSim(); + + if (format == "shared") { + runCommandShared(); + }else if (format == "phylip") { + globaldata->inputFileName = phylipfile; + runCommandDist(); + }else if (format == "column") { + globaldata->inputFileName = columnfile; + runCommandDist(); + } + + delete heatmap; + delete validCalculator; + + return 0; + } + catch(exception& e) { + errorOut(e, "HeatMapSimCommand", "execute"); + exit(1); + } +} + +//********************************************************************************************************************** +int HeatMapSimCommand::runCommandShared() { + try { //if the users entered no valid calculators don't execute command if (heatCalculators.size() == 0) { mothurOut("No valid calculators."); mothurOutEndLine(); return 0; } @@ -248,12 +297,142 @@ int HeatMapSimCommand::execute(){ //reset groups parameter globaldata->Groups.clear(); + delete input; globaldata->ginput = NULL; + delete read; + return 0; } catch(exception& e) { - errorOut(e, "HeatMapSimCommand", "execute"); + errorOut(e, "HeatMapSimCommand", "runCommandShared"); exit(1); } } +//********************************************************************************************************************** +int HeatMapSimCommand::runCommandDist() { + try { + + vector< vector > matrix; + vector names; + ifstream in; + + //read distance file and create distance vector and names vector + if (format == "phylip") { + //read phylip file + openInputFile(phylipfile, in); + + string name; + int numSeqs; + in >> numSeqs >> name; + + //save name + names.push_back(name); + + //resize the matrix and fill with zeros + matrix.resize(numSeqs); + for(int i = 0; i < numSeqs; i++) { + matrix[i].resize(numSeqs, 0.0); + } + + //determine if matrix is square or lower triangle + //if it is square read the distances for the first sequence + char d; + bool square; + while((d=in.get()) != EOF){ + + //is d a number meaning its square + if(isalnum(d)){ + square = true; + in.putback(d); + + for(int i=0;i> matrix[0][i]; + } + break; + } + + //is d a line return meaning its lower triangle + if(d == '\n'){ + square = false; + break; + } + } + + //read rest of matrix + if (square == true) { + for(int i=1;i> name; + names.push_back(name); + + for(int j=0;j> matrix[i][j]; } + gobble(in); + } + }else { + double dist; + for(int i=1;i> name; + names.push_back(name); + + for(int j=0;j> dist; + matrix[i][j] = dist; matrix[j][i] = dist; + } + gobble(in); + } + } + in.close(); + }else { + //read names file + NameAssignment* nameMap = new NameAssignment(namefile); + nameMap->readMap(); + + //put names in order in vector + for (int i = 0; i < nameMap->size(); i++) { + names.push_back(nameMap->get(i)); + } + + //resize matrix + matrix.resize(nameMap->size()); + for (int i = 0; i < nameMap->size(); i++) { + matrix[i].resize(nameMap->size(), 0.0); + } + + //read column file + string first, second; + double dist; + openInputFile(columnfile, in); + + while (!in.eof()) { + in >> first >> second >> dist; gobble(in); + + map::iterator itA = nameMap->find(first); + map::iterator itB = nameMap->find(second); + + if(itA == nameMap->end()){ cerr << "AAError: Sequence '" << first << "' was not found in the names file, please correct\n"; exit(1); } + if(itB == nameMap->end()){ cerr << "ABError: Sequence '" << second << "' was not found in the names file, please correct\n"; exit(1); } + + //save distance + matrix[itA->second][itB->second] = dist; + matrix[itB->second][itA->second] = dist; + } + in.close(); + + delete nameMap; + } + + heatmap->getPic(matrix, names); //vector>, vector + + return 0; + } + catch(exception& e) { + errorOut(e, "HeatMapSimCommand", "runCommandDist"); + exit(1); + } +} //********************************************************************************************************************** + + + + + + diff --git a/heatmapsimcommand.h b/heatmapsimcommand.h index 595f918..49924eb 100644 --- a/heatmapsimcommand.h +++ b/heatmapsimcommand.h @@ -39,8 +39,11 @@ private: map::iterator it; bool abort, allLines; set labels; //holds labels to be used - string format, groups, label, calc; + string format, groups, label, calc, phylipfile, columnfile, namefile; vector Estimators, Groups; + + int runCommandShared(); + int runCommandDist(); }; diff --git a/pcacommand.cpp b/pcacommand.cpp index ef89ae5..09324c9 100644 --- a/pcacommand.cpp +++ b/pcacommand.cpp @@ -21,7 +21,7 @@ PCACommand::PCACommand(string option){ else { //valid paramters for this command - string Array[] = {"phylip","lt"}; + string Array[] = {"phylip"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -59,11 +59,11 @@ PCACommand::PCACommand(string option){ // if (namefile == "") { mothurOut("You need to provide a namefile if you are going to use the column format."); mothurOutEndLine(); abort = true; } //} - string temp = validParameter.validFile(parameters, "lt", false); if (temp == "not found") { temp = "false"; } - bool lt = isTrue(temp); + //string temp = validParameter.validFile(parameters, "lt", false); if (temp == "not found") { temp = "false"; } + //bool lt = isTrue(temp); - if (lt) { matrix = 2; } - else { matrix = 1; } + //if (lt) { matrix = 2; } + //else { matrix = 1; } } @@ -108,14 +108,14 @@ int PCACommand::execute(){ else{ fbase += "."; } - read(filename, matrix, names, D); + read(filename, names, D); double offset = 0.0000; vector d; vector e; vector > G = D; vector > copy_G; - int rank = D.size(); + //int rank = D.size(); cout << "\nProcessing...\n"; @@ -160,7 +160,7 @@ void PCACommand::get_comment(istream& f, char begin, char end){ /*********************************************************************************************************************************/ -void PCACommand::read_mega(istream& f, int square_m, vector& name_list, vector >& d){ +void PCACommand::read_mega(istream& f, vector& name_list, vector >& d){ try { get_comment(f, '#', '\n'); @@ -262,23 +262,47 @@ void PCACommand::read_phylip(istream& f, int square_m, vector& name_list /*********************************************************************************************************************************/ -void PCACommand::read(string fname, int m, vector& names, vector >& D){ +void PCACommand::read(string fname, vector& names, vector >& D){ try { - ifstream f(fname.c_str()); - if(!f) { - cerr << "Error: Could not open " << fname << endl; - exit(1); - } + ifstream f; + openInputFile(fname, f); + char test = f.peek(); if(test == '#'){ - read_mega(f, m, names, D); + read_mega(f, names, D); } else{ + //check whether matrix is square + char d; + int m = 1; + int numSeqs; + string name; + + f >> numSeqs >> name; + + while((d=f.get()) != EOF){ + + //is d a number meaning its square + if(isalnum(d)){ + m = 1; + break; + } + + //is d a line return meaning its lower triangle + if(d == '\n'){ + m = 2; + break; + } + } + f.close(); + + //reopen to get back to beginning + openInputFile(fname, f); read_phylip(f, m, names, D); } - int rank = D.size(); + //int rank = D.size(); } catch(exception& e) { errorOut(e, "PCACommand", "read"); diff --git a/pcacommand.h b/pcacommand.h index 822be3b..143e83d 100644 --- a/pcacommand.h +++ b/pcacommand.h @@ -27,12 +27,11 @@ private: bool abort; string phylipfile, columnfile, namefile, format, filename, fbase; float cutoff, precision; - int matrix; void get_comment(istream&, char, char); - void read_mega(istream&, int, vector&, vector >&); + void read_mega(istream&, vector&, vector >&); void read_phylip(istream&, int, vector&, vector >&); - void read(string, int, vector&, vector >&); + void read(string, vector&, vector >&); double pythag(double, double); void matrix_mult(vector >, vector >, vector >&); void recenter(double, vector >, vector >&); diff --git a/phylotree.cpp b/phylotree.cpp index 2a44192..8d03607 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -31,6 +31,7 @@ PhyloTree::PhyloTree(string tfile){ numSeqs = 0; tree.push_back(TaxNode("Root")); tree[0].heirarchyID = "0"; + maxLevel = 0; ifstream in; openInputFile(tfile, in); @@ -151,6 +152,10 @@ void PhyloTree::assignHeirarchyIDs(int index){ tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter); counter++; tree[it->second].level = tree[index].level + 1; + + //save maxLevel for binning the unclassified seqs + if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } + assignHeirarchyIDs(it->second); } } @@ -159,7 +164,60 @@ void PhyloTree::assignHeirarchyIDs(int index){ exit(1); } } - +/**************************************************************************************************/ +void PhyloTree::binUnclassified(){ + try { + map::iterator itBin; + map::iterator childPointer; + + //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary + for (itBin = name2Taxonomy.begin(); itBin != name2Taxonomy.end(); itBin++) { + + int level = tree[itBin->second].level; + int currentNode = itBin->second; + + //this sequence is unclassified at some levels + while(level != maxLevel){ + + level++; + + string taxon = "unclassified"; + + //does the parent have a child names 'unclassified'? + childPointer = tree[currentNode].children.find(taxon); + + if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on + currentNode = childPointer->second; //currentNode becomes 'unclassified' + tree[currentNode].accessions.push_back(itBin->first); //add this seq + name2Taxonomy[itBin->first] = currentNode; + } + else{ //otherwise, create it + tree.push_back(TaxNode(taxon)); + numNodes++; + tree[currentNode].children[taxon] = numNodes-1; + tree[numNodes-1].parent = currentNode; + + currentNode = tree[currentNode].children[taxon]; + tree[currentNode].accessions.push_back(itBin->first); + name2Taxonomy[itBin->first] = currentNode; + } + + if (level == maxLevel) { uniqueTaxonomies[currentNode] = currentNode; } + } + } + + //clear HeirarchyIDs and reset them + for (int i = 1; i < tree.size(); i++) { + tree[i].heirarchyID = ""; + } + assignHeirarchyIDs(0); + + } + catch(exception& e) { + errorOut(e, "PhyloTree", "binUnclassified"); + exit(1); + } +} /**************************************************************************************************/ void PhyloTree::print(ofstream& out){ @@ -182,6 +240,8 @@ void PhyloTree::print(int i, ofstream& out){ out <second].level << '\t' << tree[it->second].heirarchyID << '\t' << tree[it->second].name << '\t' << tree[it->second].children.size() << '\t' << tree[it->second].accessions.size() << endl; print(it->second, out); } + + } catch(exception& e) { errorOut(e, "PhyloTree", "print"); diff --git a/phylotree.h b/phylotree.h index f8365d5..6e5b58d 100644 --- a/phylotree.h +++ b/phylotree.h @@ -35,7 +35,9 @@ public: void addSeqToTree(string, string); void assignHeirarchyIDs(int); void print(ofstream&); - vector getGenusNodes(); + vector getGenusNodes(); + void binUnclassified(); + TaxNode get(int i) { return tree[i]; } TaxNode get(string seqName) { return tree[name2Taxonomy[seqName]]; } int getIndex(string seqName) { return name2Taxonomy[seqName]; } @@ -49,6 +51,7 @@ private: void print(int, ofstream&); int numNodes; int numSeqs; + int maxLevel; }; /**************************************************************************************************/ diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index 326e695..b551e5f 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -21,7 +21,7 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { else { //valid paramters for this command - string Array[] = {"groups","iters"}; + string Array[] = {"groups","iters","distance","random"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -46,10 +46,24 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { globaldata->Groups = Groups; } - itersString = validParameter.validFile(parameters, "iters", false); if (itersString == "not found") { itersString = "1000"; } + itersString = validParameter.validFile(parameters, "iters", false); if (itersString == "not found") { itersString = "1000"; } convert(itersString, iters); + string temp = validParameter.validFile(parameters, "distance", false); if (temp == "not found") { temp = "false"; } + phylip = isTrue(temp); + temp = validParameter.validFile(parameters, "random", false); if (temp == "not found") { temp = "true"; } + random = isTrue(temp); + + if (!random) { iters = 0; } //turn off random calcs + + //if user selects distance = true and no groups it won't calc the pairwise + if ((phylip) && (Groups.size() == 0)) { + groups = "all"; + splitAtDash(groups, Groups); + globaldata->Groups = Groups; + } + if (abort == false) { T = globaldata->gTree; tmap = globaldata->gTreemap; @@ -80,9 +94,11 @@ UnifracUnweightedCommand::UnifracUnweightedCommand(string option) { void UnifracUnweightedCommand::help(){ try { mothurOut("The unifrac.unweighted command can only be executed after a successful read.tree command.\n"); - mothurOut("The unifrac.unweighted command parameters are groups and iters. No parameters are required.\n"); + mothurOut("The unifrac.unweighted command parameters are groups, iters, distance and random. No parameters are required.\n"); mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 1 valid group.\n"); mothurOut("The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n"); + mothurOut("The distance parameter allows you to create a distance file from the results. The default is false.\n"); + mothurOut("The random parameter allows you to shut off the comparison to random trees. The default is true, meaning compare your trees with randomly generated trees.\n"); mothurOut("The unifrac.unweighted command should be in the following format: unifrac.unweighted(groups=yourGroups, iters=yourIters).\n"); mothurOut("Example unifrac.unweighted(groups=A-B-C, iters=500).\n"); mothurOut("The default value for groups is all the groups in your groupfile, and iters is 1000.\n"); @@ -113,7 +129,7 @@ int UnifracUnweightedCommand::execute() { for (int i = 0; i < T.size(); i++) { counter = 0; - output = new ColumnFile(globaldata->getTreeFile() + toString(i+1) + ".unweighted", itersString); + if (random) { output = new ColumnFile(globaldata->getTreeFile() + toString(i+1) + ".unweighted", itersString); } //get unweighted for users tree rscoreFreq.resize(numComp); @@ -127,7 +143,9 @@ int UnifracUnweightedCommand::execute() { for(int k = 0; k < numComp; k++) { //saves users score utreeScores[k].push_back(userData[k]); - + + //add users score to validscores + validScores[userData[k]] = userData[k]; } //get unweighted scores for random trees @@ -160,15 +178,16 @@ int UnifracUnweightedCommand::execute() { if (it2 != rscoreFreq[a].end()) { rscoreFreq[a][it->first] /= iters; rcumul-= it2->second; } else { rscoreFreq[a][it->first] = 0.0000; } //no random trees with that score } - UWScoreSig[a].push_back(rCumul[a][userData[a]]); + + if (random) { UWScoreSig[a].push_back(rCumul[a][userData[a]]); } + else { UWScoreSig[a].push_back(0.0); } } - - - printUnweightedFile(); + //print output files printUWSummaryFile(i); + if (random) { printUnweightedFile(); delete output; } + if (phylip) { createPhylipFile(i); } - delete output; rscoreFreq.clear(); rCumul.clear(); validScores.clear(); @@ -193,13 +212,15 @@ void UnifracUnweightedCommand::printUnweightedFile() { try { vector data; vector tags; - tags.push_back("Score"); tags.push_back("RandFreq"); tags.push_back("RandCumul"); + tags.push_back("Score"); + tags.push_back("RandFreq"); tags.push_back("RandCumul"); + for(int a = 0; a < numComp; a++) { output->initFile(groupComb[a], tags); //print each line for (map::iterator it = validScores.begin(); it != validScores.end(); it++) { - data.push_back(it->first); data.push_back(rscoreFreq[a][it->first]); data.push_back(rCumul[a][it->first]); + data.push_back(it->first); data.push_back(rscoreFreq[a][it->first]); data.push_back(rCumul[a][it->first]); output->output(data); data.clear(); } @@ -225,14 +246,20 @@ void UnifracUnweightedCommand::printUWSummaryFile(int i) { outSum << i+1 << '\t'; mothurOut(toString(i+1) + "\t"); - if (UWScoreSig[a][0] > (1/(float)iters)) { - outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << UWScoreSig[a][0] << endl; - cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << UWScoreSig[a][0] << endl; - mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t" + toString(UWScoreSig[a][0])); mothurOutEndLine(); - }else { - outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl; - cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl; - mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t<" + toString((1/float(iters)))); mothurOutEndLine(); + if (random) { + if (UWScoreSig[a][0] > (1/(float)iters)) { + outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << UWScoreSig[a][0] << endl; + cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << UWScoreSig[a][0] << endl; + mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t" + toString(UWScoreSig[a][0])); mothurOutEndLine(); + }else { + outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl; + cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl; + mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t<" + toString((1/float(iters)))); mothurOutEndLine(); + } + }else{ + outSum << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << "0.00" << endl; + cout << setprecision(6) << groupComb[a] << '\t' << utreeScores[a][0] << '\t' << "0.00" << endl; + mothurOutJustToLog(groupComb[a] + "\t" + toString(utreeScores[a][0]) + "\t0.00"); mothurOutEndLine(); } } @@ -242,7 +269,49 @@ void UnifracUnweightedCommand::printUWSummaryFile(int i) { exit(1); } } - /***********************************************************/ +void UnifracUnweightedCommand::createPhylipFile(int i) { + try { + string phylipFileName = globaldata->getTreeFile() + toString(i+1) + ".unweighted.dist"; + ofstream out; + openOutputFile(phylipFileName, out); + + //output numSeqs + out << globaldata->Groups.size() << endl; + + //make matrix with scores in it + vector< vector > dists; dists.resize(globaldata->Groups.size()); + for (int i = 0; i < globaldata->Groups.size(); i++) { + dists[i].resize(globaldata->Groups.size(), 0.0); + } + + //flip it so you can print it + int count = 0; + for (int r=0; rGroups.size(); r++) { + for (int l = r+1; l < globaldata->Groups.size(); l++) { + dists[r][l] = (1.0 - utreeScores[count][0]); + dists[l][r] = (1.0 - utreeScores[count][0]); + count++; + } + } + + //output to file + for (int r=0; rGroups.size(); r++) { + //output name + out << globaldata->Groups[r] << '\t'; + + //output distances + for (int l = 0; l < r; l++) { out << dists[r][l] << '\t'; } + out << endl; + } + out.close(); + } + catch(exception& e) { + errorOut(e, "UnifracUnweightedCommand", "createPhylipFile"); + exit(1); + } +} +/***********************************************************/ + diff --git a/unifracunweightedcommand.h b/unifracunweightedcommand.h index 2fca41a..155fa75 100644 --- a/unifracunweightedcommand.h +++ b/unifracunweightedcommand.h @@ -45,7 +45,7 @@ class UnifracUnweightedCommand : public Command { vector< map > rscoreFreq; //map -vector entry for each combination. vector< map > rCumul; //map -vector entry for each combination. - bool abort; + bool abort, phylip, random; string groups, itersString; vector Groups; //holds groups to be used @@ -54,6 +54,7 @@ class UnifracUnweightedCommand : public Command { void printUWSummaryFile(int); void printUnweightedFile(); + void createPhylipFile(int); }; diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp index fb3eb3c..f9cdd5a 100644 --- a/unifracweightedcommand.cpp +++ b/unifracweightedcommand.cpp @@ -21,7 +21,7 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { else { //valid paramters for this command - string Array[] = {"groups","iters"}; + string Array[] = {"groups","iters","distance","random"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -48,7 +48,15 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { itersString = validParameter.validFile(parameters, "iters", false); if (itersString == "not found") { itersString = "1000"; } convert(itersString, iters); + + string temp = validParameter.validFile(parameters, "distance", false); if (temp == "not found") { temp = "false"; } + phylip = isTrue(temp); + temp = validParameter.validFile(parameters, "random", false); if (temp == "not found") { temp = "true"; } + random = isTrue(temp); + + if (!random) { iters = 0; } //turn off random calcs + if (abort == false) { T = globaldata->gTree; @@ -78,9 +86,11 @@ UnifracWeightedCommand::UnifracWeightedCommand(string option) { void UnifracWeightedCommand::help(){ try { mothurOut("The unifrac.weighted command can only be executed after a successful read.tree command.\n"); - mothurOut("The unifrac.weighted command parameters are groups and iters. No parameters are required.\n"); + mothurOut("The unifrac.weighted command parameters are groups, iters, distance and random. No parameters are required.\n"); mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n"); mothurOut("The group names are separated by dashes. The iters parameter allows you to specify how many random trees you would like compared to your tree.\n"); + mothurOut("The distance parameter allows you to create a distance file from the results. The default is false.\n"); + mothurOut("The random parameter allows you to shut off the comparison to random trees. The default is true, meaning compare your trees with randomly generated trees.\n"); mothurOut("The unifrac.weighted command should be in the following format: unifrac.weighted(groups=yourGroups, iters=yourIters).\n"); mothurOut("Example unifrac.weighted(groups=A-B-C, iters=500).\n"); mothurOut("The default value for groups is all the groups in your groupfile, and iters is 1000.\n"); @@ -100,7 +110,7 @@ int UnifracWeightedCommand::execute() { if (abort == true) { return 0; } Progress* reading; - reading = new Progress("Comparing to random:", iters); + if (random) { reading = new Progress("Comparing to random:", iters); } //get weighted for users tree userData.resize(numComp,0); //data[0] = weightedscore AB, data[1] = weightedscore AC... @@ -115,7 +125,7 @@ int UnifracWeightedCommand::execute() { rScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC... uScores.resize(numComp); //data[0] = weightedscore AB, data[1] = weightedscore AC... - output = new ColumnFile(globaldata->getTreeFile() + toString(i+1) + ".weighted", itersString); + if (random) { output = new ColumnFile(globaldata->getTreeFile() + toString(i+1) + ".weighted", itersString); } userData = weighted->getValues(T[i]); //userData[0] = weightedscore @@ -154,26 +164,28 @@ int UnifracWeightedCommand::execute() { //removeValidScoresDuplicates(); //find the signifigance of the score for summary file - for (int f = 0; f < numComp; f++) { - //sort random scores - sort(rScores[f].begin(), rScores[f].end()); + if (random) { + for (int f = 0; f < numComp; f++) { + //sort random scores + sort(rScores[f].begin(), rScores[f].end()); + + //the index of the score higher than yours is returned + //so if you have 1000 random trees the index returned is 100 + //then there are 900 trees with a score greater then you. + //giving you a signifigance of 0.900 + int index = findIndex(userData[f], f); if (index == -1) { mothurOut("error in UnifracWeightedCommand"); mothurOutEndLine(); exit(1); } //error code + + //the signifigance is the number of trees with the users score or higher + WScoreSig.push_back((iters-index)/(float)iters); + } - //the index of the score higher than yours is returned - //so if you have 1000 random trees the index returned is 100 - //then there are 900 trees with a score greater then you. - //giving you a signifigance of 0.900 - int index = findIndex(userData[f], f); if (index == -1) { mothurOut("error in UnifracWeightedCommand"); mothurOutEndLine(); exit(1); } //error code - - //the signifigance is the number of trees with the users score or higher - WScoreSig.push_back((iters-index)/(float)iters); + //out << "Tree# " << i << endl; + calculateFreqsCumuls(); + printWeightedFile(); + + delete output; } - //out << "Tree# " << i << endl; - calculateFreqsCumuls(); - printWeightedFile(); - - delete output; - //clear data rScores.clear(); uScores.clear(); @@ -181,11 +193,12 @@ int UnifracWeightedCommand::execute() { } //finish progress bar - reading->finish(); - delete reading; + if (random) { reading->finish(); delete reading; } printWSummaryFile(); + if (phylip) { createPhylipFile(); } + //clear out users groups globaldata->Groups.clear(); @@ -238,14 +251,20 @@ void UnifracWeightedCommand::printWSummaryFile() { int count = 0; for (int i = 0; i < T.size(); i++) { for (int j = 0; j < numComp; j++) { - if (WScoreSig[count] > (1/(float)iters)) { - outSum << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << WScoreSig[count] << endl; - cout << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << WScoreSig[count] << endl; - mothurOutJustToLog(toString(i+1) +"\t" + groupComb[j] +"\t" + toString(utreeScores[count]) +"\t" + toString(WScoreSig[count])); mothurOutEndLine(); + if (random) { + if (WScoreSig[count] > (1/(float)iters)) { + outSum << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << WScoreSig[count] << endl; + cout << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << WScoreSig[count] << endl; + mothurOutJustToLog(toString(i+1) +"\t" + groupComb[j] +"\t" + toString(utreeScores[count]) +"\t" + toString(WScoreSig[count])); mothurOutEndLine(); + }else{ + outSum << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl; + cout << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl; + mothurOutJustToLog(toString(i+1) +"\t" + groupComb[j] +"\t" + toString(utreeScores[count]) +"\t<" + toString((1/float(iters)))); mothurOutEndLine(); + } }else{ - outSum << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl; - cout << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << setprecision(itersString.length()) << "<" << (1/float(iters)) << endl; - mothurOutJustToLog(toString(i+1) +"\t" + groupComb[j] +"\t" + toString(utreeScores[count]) +"\t<" + toString((1/float(iters)))); mothurOutEndLine(); + outSum << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << "0.00" << endl; + cout << setprecision(6) << i+1 << '\t' << groupComb[j] << '\t' << utreeScores[count] << '\t' << "0.00" << endl; + mothurOutJustToLog(toString(i+1) +"\t" + groupComb[j] +"\t" + toString(utreeScores[count]) +"\t0.00"); mothurOutEndLine(); } count++; } @@ -257,7 +276,52 @@ void UnifracWeightedCommand::printWSummaryFile() { exit(1); } } +/***********************************************************/ +void UnifracWeightedCommand::createPhylipFile() { + try { + int count = 0; + //for each tree + for (int i = 0; i < T.size(); i++) { + + string phylipFileName = globaldata->getTreeFile() + toString(i+1) + ".weighted.dist"; + ofstream out; + openOutputFile(phylipFileName, out); + + //output numSeqs + out << globaldata->Groups.size() << endl; + + //make matrix with scores in it + vector< vector > dists; dists.resize(globaldata->Groups.size()); + for (int i = 0; i < globaldata->Groups.size(); i++) { + dists[i].resize(globaldata->Groups.size(), 0.0); + } + + //flip it so you can print it + for (int r=0; rGroups.size(); r++) { + for (int l = r+1; l < globaldata->Groups.size(); l++) { + dists[r][l] = (1.0 - utreeScores[count]); + dists[l][r] = (1.0 - utreeScores[count]); + count++; + } + } + //output to file + for (int r=0; rGroups.size(); r++) { + //output name + out << globaldata->Groups[r] << '\t'; + + //output distances + for (int l = 0; l < r; l++) { out << dists[r][l] << '\t'; } + out << endl; + } + out.close(); + } + } + catch(exception& e) { + errorOut(e, "UnifracWeightedCommand", "createPhylipFile"); + exit(1); + } +} /***********************************************************/ int UnifracWeightedCommand::findIndex(float score, int index) { try{ diff --git a/unifracweightedcommand.h b/unifracweightedcommand.h index 5f55dee..9c0ad99 100644 --- a/unifracweightedcommand.h +++ b/unifracweightedcommand.h @@ -49,7 +49,7 @@ class UnifracWeightedCommand : public Command { vector< map > rCumul; //map -vector entry for each c map validScores; //map contains scores from random - bool abort; + bool abort, phylip, random; string groups, itersString; vector Groups; //holds groups to be used @@ -58,6 +58,7 @@ class UnifracWeightedCommand : public Command { void printWSummaryFile(); void printWeightedFile(); + void createPhylipFile(); //void removeValidScoresDuplicates(); int findIndex(float, int); void calculateFreqsCumuls();