From 5f44783e6d74a9c207492ac244210c915cadc272 Mon Sep 17 00:00:00 2001 From: westcott Date: Wed, 3 Feb 2010 12:26:44 +0000 Subject: [PATCH] added name option to read.tree for use in unifrac and parimony commands --- globaldata.hpp | 1 + mothur.h | 2 +- nameassignment.cpp | 2 +- readtreecommand.cpp | 61 +++++++++++++++++++++++++++++--- readtreecommand.h | 5 ++- tree.cpp | 84 +++++++++++++++++++++++++++++++++++++++++++++ tree.h | 7 ++-- 7 files changed, 153 insertions(+), 9 deletions(-) diff --git a/globaldata.hpp b/globaldata.hpp index 0238a0a..f3b9b9a 100644 --- a/globaldata.hpp +++ b/globaldata.hpp @@ -47,6 +47,7 @@ public: vector Estimators, Groups; //holds estimators to be used set labels; //holds labels to be used vector Treenames; + map names; string getPhylipFile(); diff --git a/mothur.h b/mothur.h index 74d71ca..a63d7ae 100644 --- a/mothur.h +++ b/mothur.h @@ -503,7 +503,7 @@ inline string getFullPathName(string fileName){ if (path.rfind("./") == -1) { return fileName; } //already complete name else { newFileName = fileName.substr(fileName.rfind("./")+2); } //save the complete part of the name - char* cwdpath; + char* cwdpath = new char[1024]; size_t size; cwdpath=getcwd(cwdpath,size); cwd = cwdpath; diff --git a/nameassignment.cpp b/nameassignment.cpp index d098a48..bc088f4 100644 --- a/nameassignment.cpp +++ b/nameassignment.cpp @@ -69,7 +69,7 @@ ListVector NameAssignment::getListVector(void){ void NameAssignment::print(ostream& out){ try { map::iterator it; -cout << (*this).size() << endl; +//cout << (*this).size() << endl; for(it = (*this).begin(); it!=(*this).end(); it++){ out << it->first << '\t' << it->second << endl; //prints out keys and values of the map this. } diff --git a/readtreecommand.cpp b/readtreecommand.cpp index 5b371f5..09f899b 100644 --- a/readtreecommand.cpp +++ b/readtreecommand.cpp @@ -20,7 +20,7 @@ ReadTreeCommand::ReadTreeCommand(string option){ else { //valid paramters for this command - string Array[] = {"tree","group","outputdir","inputdir"}; + string Array[] = {"tree","group","name","outputdir","inputdir"}; vector myArray (Array, Array+(sizeof(Array)/sizeof(string))); OptionParser parser(option); @@ -56,6 +56,15 @@ ReadTreeCommand::ReadTreeCommand(string option){ //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["group"] = inputDir + it->second; } } + + it = parameters.find("name"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["name"] = inputDir + it->second; } + } + } @@ -76,6 +85,11 @@ ReadTreeCommand::ReadTreeCommand(string option){ globaldata->gTreemap = treeMap; } + namefile = validParameter.validFile(parameters, "name", true); + if (namefile == "not open") { abort = true; } + else if (namefile == "not found") { treefile = ""; } + else { readNamesFile(); } + if (abort == false) { filename = treefile; read = new ReadNewickTree(filename); @@ -130,7 +144,7 @@ int ReadTreeCommand::execute(){ T[i]->assembleTree(); } - //output any names that are in names file but not in tree + //output any names that are in group file but not in tree if (globaldata->Treenames.size() < treeMap->getNumSeqs()) { for (int i = 0; i < treeMap->namesOfSeqs.size(); i++) { //is that name in the tree? @@ -142,8 +156,13 @@ int ReadTreeCommand::execute(){ //then you did not find it so report it if (count == globaldata->Treenames.size()) { - mothurOut(treeMap->namesOfSeqs[i] + " is in your namefile and not in your tree. It will be disregarded."); mothurOutEndLine(); - treeMap->removeSeq(treeMap->namesOfSeqs[i]); + //if it is in your namefile then don't remove + map::iterator it = nameMap.find(treeMap->namesOfSeqs[i]); + + if (it == nameMap.end()) { + mothurOut(treeMap->namesOfSeqs[i] + " is in your groupfile and not in your tree. It will be disregarded."); mothurOutEndLine(); + treeMap->removeSeq(treeMap->namesOfSeqs[i]); + } } } } @@ -155,5 +174,39 @@ int ReadTreeCommand::execute(){ exit(1); } } +/*****************************************************************/ +int ReadTreeCommand::readNamesFile() { + try { + globaldata->names.clear(); + + ifstream in; + openInputFile(namefile, in); + + string first, second; + map::iterator itNames; + + while(!in.eof()) { + in >> first >> second; gobble(in); + + itNames = globaldata->names.find(first); + if (itNames == globaldata->names.end()) { + globaldata->names[first] = second; + + //we need a list of names in your namefile to use above when removing extra seqs above so we don't remove them + vector dupNames; + splitAtComma(second, dupNames); + + for (int i = 0; i < dupNames.size(); i++) { nameMap[dupNames[i]] = dupNames[i]; } + }else { mothurOut(first + " has already been seen in namefile, disregarding names file."); mothurOutEndLine(); in.close(); globaldata->names.clear(); return 1; } + } + in.close(); + + return 0; + } + catch(exception& e) { + errorOut(e, "ReadTreeCommand", "readNamesFile"); + exit(1); + } +} //********************************************************************************************************************** diff --git a/readtreecommand.h b/readtreecommand.h index 99ddc38..b0d10f7 100644 --- a/readtreecommand.h +++ b/readtreecommand.h @@ -27,8 +27,11 @@ private: GlobalData* globaldata; ReadTree* read; TreeMap* treeMap; - string filename, treefile, groupfile; + string filename, treefile, groupfile, namefile; bool abort; + map nameMap; + + int readNamesFile(); }; diff --git a/tree.cpp b/tree.cpp index ad80b2b..52b7322 100644 --- a/tree.cpp +++ b/tree.cpp @@ -50,6 +50,85 @@ Tree::Tree() { /*****************************************************************/ Tree::~Tree() {} /*****************************************************************/ +void Tree::addNamesToCounts() { + try { + //ex. seq1 seq2,seq3,se4 + // seq1 = pasture + // seq2 = forest + // seq4 = pasture + // seq3 = ocean + + //before this function seq1.pcount = pasture -> 1 + //after seq1.pcount = pasture -> 2, forest -> 1, ocean -> 1 + + //before this function seq1.pgroups = pasture -> 1 + //after seq1.pgroups = pasture -> 1 since that is the dominant group + + + //go through each leaf and update its pcounts and pgroups + for (int i = 0; i < numLeaves; i++) { + string name = tree[i].getName(); + + map::iterator itNames = globaldata->names.find(name); + + if (itNames == globaldata->names.end()) { mothurOut(name + " is not in your name file, please correct."); mothurOutEndLine(); exit(1); } + else { + vector dupNames; + splitAtComma(globaldata->names[name], dupNames); + + map::iterator itCounts; + int maxPars = 1; + for (int j = 0; j < dupNames.size(); j++) { + + if (dupNames[j] != name) {//you already added yourself in the constructor + string group = globaldata->gTreemap->getGroup(dupNames[j]); + + //update pcounts + itCounts = tree[i].pcount.find(group); + if (itCounts == tree[i].pcount.end()) { //new group, add it + tree[i].pcount[group] = 1; + }else { + tree[i].pcount[group]++; + } + + //update pgroups + itCounts = tree[i].pGroups.find(group); + if (itCounts == tree[i].pGroups.end()) { //new group, add it + tree[i].pGroups[group] = 1; + }else { + tree[i].pGroups[group]++; + } + + //keep highest group + if(tree[i].pGroups[group] > maxPars){ + maxPars = tree[i].pGroups[group]; + } + }//end if + }//end for + + if (maxPars > 1) { //then we have some more dominant groups + //erase all the groups that are less than maxPars because you found a more dominant group. + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();){ + if(it->second < maxPars){ + tree[i].pGroups.erase(it++); + }else { it++; } + } + //set one remaining groups to 1 + for(it=tree[i].pGroups.begin();it!=tree[i].pGroups.end();it++){ + tree[i].pGroups[it->first] = 1; + } + }//end if + + }//end else + }//end for + + } + catch(exception& e) { + errorOut(e, "Tree", "addNamesToCounts"); + exit(1); + } +} +/*****************************************************************/ int Tree::getIndex(string searchName) { try { //Treemap knows name, group and index to speed up search @@ -78,6 +157,10 @@ void Tree::setIndex(string searchName, int index) { /*****************************************************************/ void Tree::assembleTree() { try { + + //if user has given a names file we want to include that info in the pgroups and pcount info. + if(globaldata->names.size() != 0) { addNamesToCounts(); } + //build the pGroups in non leaf nodes to be used in the parsimony calcs. for (int i = numLeaves; i < numNodes; i++) { tree[i].pGroups = (mergeGroups(i)); @@ -120,6 +203,7 @@ void Tree::getCopy(Tree* copy) { //copy pcount tree[i].pcount = copy->tree[i].pcount; } + } catch(exception& e) { errorOut(e, "Tree", "getCopy"); diff --git a/tree.h b/tree.h index c90f51a..a076cef 100644 --- a/tree.h +++ b/tree.h @@ -37,9 +37,10 @@ public: int findRoot(); //return index of root node //this function takes the leaf info and populates the non leaf nodes - void assembleTree(); + void assembleTree(); vector tree; //the first n nodes are the leaves, where n is the number of sequences. + private: GlobalData* globaldata; int numNodes, numLeaves; @@ -48,8 +49,9 @@ private: map::iterator it, it2; map mergeGroups(int); //returns a map with a groupname and the number of times that group was seen in the children - map mergeGcounts(int); + + void addNamesToCounts(); void randomTopology(); void randomBlengths(); void randomLabels(vector); @@ -60,6 +62,7 @@ private: //not included in the tree. //only takes names from the first tree in the tree file and assumes that all trees use the same names. int readTreeString(ifstream&); + }; #endif -- 2.39.2