From: westcott Date: Mon, 30 Aug 2010 13:49:18 +0000 (+0000) Subject: sped up unifrac weighted. X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=9b884118e97081d743c7b001216567f9b9131258 sped up unifrac weighted. --- diff --git a/readtree.cpp b/readtree.cpp index f72d698..e98524b 100644 --- a/readtree.cpp +++ b/readtree.cpp @@ -341,7 +341,7 @@ int ReadNewickTree::readNewickInt(istream& f, int& n, Tree* T) { if(f.peek() == ':'){ readSpecialChar(f,':',"colon"); - if(n >= numNodes){ m->mothurOut("Error: Too many nodes in input tree\n"); readOk = -1; return -1; } + if(n >= numNodes){ m->mothurOut("Error: Too many nodes in input tree\n"); readOk = -1; return -1; } T->tree[n].setBranchLength(readBranchLength(f)); }else{ diff --git a/tree.cpp b/tree.cpp index 3cb2611..f8bb76f 100644 --- a/tree.cpp +++ b/tree.cpp @@ -34,17 +34,28 @@ Tree::Tree() { numNodes = 2*numLeaves - 1; tree.resize(numNodes); + + //initialize groupNodeInfo + for (int i = 0; i < globaldata->gTreemap->namesOfGroups.size(); i++) { + groupNodeInfo[globaldata->gTreemap->namesOfGroups[i]].resize(0); + } //initialize tree with correct number of nodes, name and group info. for (int i = 0; i < numNodes; i++) { //initialize leaf nodes if (i <= (numLeaves-1)) { tree[i].setName(globaldata->Treenames[i]); - vector tempGroups; tempGroups.push_back(globaldata->gTreemap->getGroup(globaldata->Treenames[i])); + + //save group info + string group = globaldata->gTreemap->getGroup(globaldata->Treenames[i]); + vector tempGroups; tempGroups.push_back(group); tree[i].setGroup(tempGroups); + groupNodeInfo[group].push_back(i); + //set pcount and pGroup for groupname to 1. - tree[i].pcount[globaldata->gTreemap->getGroup(globaldata->Treenames[i])] = 1; - tree[i].pGroups[globaldata->gTreemap->getGroup(globaldata->Treenames[i])] = 1; + tree[i].pcount[group] = 1; + tree[i].pGroups[group] = 1; + //Treemap knows name, group and index to speed up search globaldata->gTreemap->setIndex(globaldata->Treenames[i], i); @@ -97,10 +108,14 @@ void Tree::addNamesToCounts() { map::iterator itCounts; int maxPars = 1; + set groupsAddedForThisNode; for (int j = 0; j < dupNames.size(); j++) { - + + string group = globaldata->gTreemap->getGroup(dupNames[j]); + if (dupNames[j] != name) {//you already added yourself in the constructor - string group = globaldata->gTreemap->getGroup(dupNames[j]); + + if (groupsAddedForThisNode.count(group) == 0) { groupNodeInfo[group].push_back(i); groupsAddedForThisNode.insert(group); } //if you have not already added this node for this group, then add it //update pcounts itCounts = tree[i].pcount.find(group); @@ -122,7 +137,7 @@ void Tree::addNamesToCounts() { if(tree[i].pGroups[group] > maxPars){ maxPars = tree[i].pGroups[group]; } - }//end if + }else { groupsAddedForThisNode.insert(group); } //add it so you don't add it to groupNodeInfo again }//end for if (maxPars > 1) { //then we have some more dominant groups @@ -208,6 +223,27 @@ int Tree::assembleTree() { exit(1); } } +/*****************************************************************/ +int Tree::assembleTree(string n) { + try { + + //build the pGroups in non leaf nodes to be used in the parsimony calcs. + for (int i = numLeaves; i < numNodes; i++) { + if (m->control_pressed) { return 1; } + + tree[i].pGroups = (mergeGroups(i)); + tree[i].pcount = (mergeGcounts(i)); + } + //float B = clock(); + //cout << "assembleTree\t" << (B-A) / CLOCKS_PER_SEC << endl; + return 0; + } + catch(exception& e) { + m->errorOut(e, "Tree", "assembleTree"); + exit(1); + } +} + /*****************************************************************/ void Tree::getCopy(Tree* copy) { try { @@ -240,6 +276,8 @@ void Tree::getCopy(Tree* copy) { tree[i].pcount = copy->tree[i].pcount; } + groupNodeInfo = copy->groupNodeInfo; + } catch(exception& e) { m->errorOut(e, "Tree", "getCopy"); @@ -392,6 +430,11 @@ map Tree::mergeGcounts(int position) { void Tree::randomLabels(vector g) { try { + + //initialize groupNodeInfo + for (int i = 0; i < globaldata->gTreemap->namesOfGroups.size(); i++) { + groupNodeInfo[globaldata->gTreemap->namesOfGroups[i]].resize(0); + } for(int i = 0; i < numLeaves; i++){ int z; @@ -423,6 +466,9 @@ void Tree::randomLabels(vector g) { tree[z].pcount = (tree[i].pcount); tree[i].pcount = (gcount_hold); } + + for (int k = 0; k < (tree[i].getGroup()).size(); k++) { groupNodeInfo[(tree[i].getGroup())[k]].push_back(i); } + for (int k = 0; k < (tree[z].getGroup()).size(); k++) { groupNodeInfo[(tree[z].getGroup())[k]].push_back(z); } } } catch(exception& e) { @@ -479,14 +525,14 @@ void Tree::randomBlengths() { /*************************************************************************************************/ void Tree::assembleRandomUnifracTree(vector g) { randomLabels(g); - assembleTree(); + assembleTree("noNameCounts"); } /*************************************************************************************************/ void Tree::assembleRandomUnifracTree(string groupA, string groupB) { vector temp; temp.push_back(groupA); temp.push_back(groupB); randomLabels(temp); - assembleTree(); + assembleTree("noNameCounts"); } /*************************************************************************************************/ diff --git a/tree.h b/tree.h index 59343d5..1c44d8c 100644 --- a/tree.h +++ b/tree.h @@ -39,9 +39,10 @@ public: //this function takes the leaf info and populates the non leaf nodes int assembleTree(); + int assembleTree(string); vector tree; //the first n nodes are the leaves, where n is the number of sequences. - + map< string, vector > groupNodeInfo; //maps group to indexes of leaf nodes with that group, different groups may contain same node because of names file. private: GlobalData* globaldata; diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 8a767f6..3c1367a 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -429,7 +429,7 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string vector fastaFileNames; vector qualFileNames; - + cout << "here" << endl; if (oligoFile != "") { m->openOutputFile(groupFile, outGroups); for (int i = 0; i < fastaNames.size(); i++) { @@ -446,17 +446,17 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string #endif } } - +cout << "here " << filename << endl; ifstream inFASTA; m->openInputFile(filename, inFASTA); inFASTA.seekg(line->start); - + cout << "here " << qFileName << endl; ifstream qFile; if(qFileName != "") { m->openInputFile(qFileName, qFile); qFile.seekg(qline->start); } bool done = false; int count = 0; - + cout << "here" << endl; while (!done) { if (m->control_pressed) { diff --git a/unifracunweightedcommand.cpp b/unifracunweightedcommand.cpp index 848c85c..5fd26df 100644 --- a/unifracunweightedcommand.cpp +++ b/unifracunweightedcommand.cpp @@ -125,6 +125,8 @@ int UnifracUnweightedCommand::execute() { if (abort == true) { return 0; } + int start = time(NULL); + userData.resize(numComp,0); //data[0] = unweightedscore randomData.resize(numComp,0); //data[0] = unweightedscore //create new tree with same num nodes and leaves as users @@ -239,6 +241,8 @@ int UnifracUnweightedCommand::execute() { if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; } + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.unweighted."); m->mothurOutEndLine(); + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } diff --git a/unifracweightedcommand.cpp b/unifracweightedcommand.cpp index 28d94d0..9cda8f2 100644 --- a/unifracweightedcommand.cpp +++ b/unifracweightedcommand.cpp @@ -116,6 +116,8 @@ int UnifracWeightedCommand::execute() { if (abort == true) { return 0; } + int start = time(NULL); + Progress* reading; if (random) { reading = new Progress("Comparing to random:", iters); } @@ -264,6 +266,8 @@ int UnifracWeightedCommand::execute() { return 0; } + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to run unifrac.weighted."); m->mothurOutEndLine(); + m->mothurOutEndLine(); m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } diff --git a/weighted.cpp b/weighted.cpp index faa1ff7..c8b64e2 100644 --- a/weighted.cpp +++ b/weighted.cpp @@ -18,6 +18,10 @@ EstOutput Weighted::getValues(Tree* t) { vector D; numGroups = globaldata->Groups.size(); + + vector sums = getBranchLengthSums(t); + + if (m->control_pressed) { return data; } //calculate number of comparisons i.e. with groups A,B,C = AB, AC, BC = 3; int count = 0; @@ -29,31 +33,33 @@ EstOutput Weighted::getValues(Tree* t) { vector groups; groups.push_back(globaldata->Groups[i]); groups.push_back(globaldata->Groups[l]); D.push_back(0.0000); //initialize a spot in D for each combination + + //adding the wieghted sums from group i + for (int j = 0; j < t->groupNodeInfo[groups[0]].size(); j++) { //the leaf nodes that have seqs from group i + map::iterator it = t->tree[t->groupNodeInfo[groups[0]][j]].pcount.find(groups[0]); + int numSeqsInGroupI = it->second; + + double weightedSum = ((numSeqsInGroupI * sums[t->groupNodeInfo[groups[0]][j]]) / (double)tmap->seqsPerGroup[groups[0]]); - /********************************************************/ + D[count] += weightedSum; + } + + //adding the wieghted sums from group l + for (int j = 0; j < t->groupNodeInfo[groups[1]].size(); j++) { //the leaf nodes that have seqs from group l + map::iterator it = t->tree[t->groupNodeInfo[groups[1]][j]].pcount.find(groups[1]); + int numSeqsInGroupL = it->second; + + double weightedSum = ((numSeqsInGroupL * sums[t->groupNodeInfo[groups[1]][j]]) / (double)tmap->seqsPerGroup[groups[1]]); + + D[count] += weightedSum; + } + + /******************************************************** //calculate a D value for each group combo for(int v=0;vgetNumLeaves();v++){ if (m->control_pressed) { return data; } - int index = v; - double sum = 0.0000; - - //while you aren't at root - while(t->tree[index].getParent() != -1){ - - //if you have a BL - if(t->tree[index].getBranchLength() != -1){ - sum += abs(t->tree[index].getBranchLength()); - } - index = t->tree[index].getParent(); - } - - //get last breanch length added - if(t->tree[index].getBranchLength() != -1){ - sum += abs(t->tree[index].getBranchLength()); - } - //is this sum from a sequence which is in one of the users groups if (m->inUsersGroups(t->tree[v].getGroup(), globaldata->Groups) == true) { //is this sum from a sequence which is in this groupCombo @@ -145,13 +151,38 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) { data.clear(); //clear out old values + vector sums = getBranchLengthSums(t); + + if (m->control_pressed) { return data; } + //initialize weighted score WScore[(groupA+groupB)] = 0.0; - float D = 0.0; + double D = 0.0; vector groups; groups.push_back(groupA); groups.push_back(groupB); - - /********************************************************/ + + //adding the wieghted sums from groupA + for (int j = 0; j < t->groupNodeInfo[groupA].size(); j++) { //the leaf nodes that have seqs from group i + map::iterator it = t->tree[j].pcount.find(groupA); + int numSeqsInGroupA = it->second; + + double weightedSum = ((numSeqsInGroupA * sums[j]) / (double)tmap->seqsPerGroup[groupA]); + + D += weightedSum; + } + + //adding the wieghted sums from groupB + for (int j = 0; j < t->groupNodeInfo[groupB].size(); j++) { //the leaf nodes that have seqs from group l + map::iterator it = t->tree[j].pcount.find(groupB); + int numSeqsInGroupB = it->second; + + double weightedSum = ((numSeqsInGroupB * sums[j]) / (double)tmap->seqsPerGroup[groupB]); + + D += weightedSum; + } + + + /******************************************************** //calculate a D value for the group combo for(int v=0;vgetNumLeaves();v++){ if (m->control_pressed) { return data; } @@ -240,12 +271,45 @@ EstOutput Weighted::getValues(Tree* t, string groupA, string groupB) { exit(1); } } - - - - - - +/**************************************************************************************************/ +vector Weighted::getBranchLengthSums(Tree* t) { + try { + + vector sums; + + for(int v=0;vgetNumLeaves();v++){ + + if (m->control_pressed) { return sums; } + + int index = v; + double sum = 0.0000; + + //while you aren't at root + while(t->tree[index].getParent() != -1){ + + //if you have a BL + if(t->tree[index].getBranchLength() != -1){ + sum += abs(t->tree[index].getBranchLength()); + } + index = t->tree[index].getParent(); + } + + //get last breanch length added + if(t->tree[index].getBranchLength() != -1){ + sum += abs(t->tree[index].getBranchLength()); + } + + sums.push_back(sum); + } + + return sums; + } + catch(exception& e) { + m->errorOut(e, "Weighted", "getBranchLengthSums"); + exit(1); + } +} +/**************************************************************************************************/ diff --git a/weighted.h b/weighted.h index 3914ef2..fedfa0c 100644 --- a/weighted.h +++ b/weighted.h @@ -30,6 +30,8 @@ class Weighted : public TreeCalculator { TreeMap* tmap; map::iterator it; map WScore; //a score for each group combination i.e. AB, AC, BC. + + vector getBranchLengthSums(Tree*); }; /***********************************************************************/