]> git.donarmstrong.com Git - mothur.git/blobdiff - phylotree.cpp
reworked the classifiers summary file added groupfile option to classify.seqs, added...
[mothur.git] / phylotree.cpp
index 33aedc6395f09fc24576b93e1b72261884d98907..8c48e4f1f783c9387c83d4e5d1bfdbb475eb09cf 100644 (file)
@@ -28,34 +28,78 @@ PhyloTree::PhyloTree(){
 }
 /**************************************************************************************************/
 
-PhyloTree::PhyloTree(ifstream& in){
+PhyloTree::PhyloTree(ifstream& in, string filename){
        try {
                m = MothurOut::getInstance();
                calcTotals = false;
                
-               in >> numNodes; gobble(in);
-               
-               tree.resize(numNodes);
-               
-               for (int i = 0; i < tree.size(); i++) {
-                       in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
-               }
-               
-               //read genus nodes
-               int numGenus = 0;
-               in >> numGenus; gobble(in);
-       
-               int gnode, gsize;
-               totals.clear();
-               for (int i = 0; i < numGenus; i++) {
-                       in >> gnode >> gsize; gobble(in);
+               #ifdef USE_MPI
+                       MPI_File inMPI;
+                       MPI_Offset size;
+                       MPI_Status status;
+
+                       char inFileName[1024];
+                       strcpy(inFileName, filename.c_str());
+
+                       MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
+                       MPI_File_get_size(inMPI, &size);
                        
-                       uniqueTaxonomies[gnode] = gnode;
-                       totals.push_back(gsize);
-               }
+                       char* buffer = new char[size];
+                       MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
+
+                       string tempBuf = buffer;
+                       if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
+                       istringstream iss (tempBuf,istringstream::in);
+                       delete buffer;
+                       
+                       iss >> numNodes; gobble(iss);
+                       
+                       tree.resize(numNodes);
+                       
+                       for (int i = 0; i < tree.size(); i++) {
+                               iss >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(iss);
+                       }
                        
-               in.close();
+                       //read genus nodes
+                       int numGenus = 0;
+                       iss >> numGenus; gobble(iss);
+                       
+                       int gnode, gsize;
+                       totals.clear();
+                       for (int i = 0; i < numGenus; i++) {
+                               iss >> gnode >> gsize; gobble(iss);
+                               
+                               uniqueTaxonomies[gnode] = gnode;
+                               totals.push_back(gsize);
+                       }
+                       
+                       MPI_File_close(&inMPI);
+                       
+               #else
+                       in >> numNodes; gobble(in);
+                       
+                       tree.resize(numNodes);
+                       
+                       for (int i = 0; i < tree.size(); i++) {
+                               in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
+                       }
+                       
+                       //read genus nodes
+                       int numGenus = 0;
+                       in >> numGenus; gobble(in);
+                       
+                       int gnode, gsize;
+                       totals.clear();
+                       for (int i = 0; i < numGenus; i++) {
+                               in >> gnode >> gsize; gobble(in);
+                               
+                               uniqueTaxonomies[gnode] = gnode;
+                               totals.push_back(gsize);
+                       }
+                       
+                       in.close();
+                       
+               #endif
                
        }
        catch(exception& e) {
@@ -74,20 +118,70 @@ PhyloTree::PhyloTree(string tfile){
                tree[0].heirarchyID = "0";
                maxLevel = 0;
                calcTotals = true;
+               string name, tax;
+
                
-               ifstream in;
-               openInputFile(tfile, in);
+               #ifdef USE_MPI
+                       int pid, num;
+                       vector<long> positions;
+                       
+                       MPI_Status status; 
+                       MPI_File inMPI;
+                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+                       char inFileName[1024];
+                       strcpy(inFileName, tfile.c_str());
+
+                       MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+
+                       if (pid == 0) {
+                               positions = setFilePosEachLine(tfile, num);
+                               
+                               //send file positions to all processes
+                               MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
+                               MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos 
+                       }else{
+                               MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+                               positions.resize(num);
+                               MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+                       }
                
-               //read in users taxonomy file and add sequences to tree
-               string name, tax;
-               while(!in.eof()){
-                       in >> name >> tax; gobble(in);
+                       //read file 
+                       for(int i=0;i<num;i++){
+                               //read next sequence
+                               int length = positions[i+1] - positions[i];
+                               char* buf4 = new char[length];
+
+                               MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+
+                               string tempBuf = buf4;
+                               if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+                               delete buf4;
+
+                               istringstream iss (tempBuf,istringstream::in);
+                               iss >> name >> tax;
+                               addSeqToTree(name, tax);
+                       }
                        
-                       addSeqToTree(name, tax);
-               }
-               in.close();
-       
+                       MPI_File_close(&inMPI);
+               
+               #else
+                       ifstream in;
+                       openInputFile(tfile, in);
+                       
+                       //read in users taxonomy file and add sequences to tree
+                       while(!in.eof()){
+                               in >> name >> tax; gobble(in);
+                               
+                               addSeqToTree(name, tax);
+                       }
+                       in.close();
+               #endif
+               
                assignHeirarchyIDs(0);
+               
+               //create file for summary if needed
+               setUp(tfile);
        }
        catch(exception& e) {
                m->errorOut(e, "PhyloTree", "PhyloTree");
@@ -232,15 +326,49 @@ void PhyloTree::assignHeirarchyIDs(int index){
        }
 }
 /**************************************************************************************************/
-void PhyloTree::binUnclassified(){
+void PhyloTree::setUp(string tfile){
+       try{
+               string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
+               
+               #ifdef USE_MPI
+                       int pid;
+                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+                       if (pid == 0) {  binUnclassified(taxFileNameTest);  }
+               
+               #else
+                       //create file needed for summary if it doesn't exist
+                       ifstream FileTest(taxFileNameTest.c_str());
+                       
+                       if (!FileTest) { 
+                               binUnclassified(taxFileNameTest); 
+                       }
+               #endif
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloTree", "setUp");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+void PhyloTree::binUnclassified(string file){
        try {
+       
+               ofstream out;
+               openOutputFile(file, out);
+               
                map<string, int>::iterator itBin;
                map<string, int>::iterator childPointer;
                
+               vector<TaxNode> copy = tree;
+               int copyNodes = numNodes;
+               
                //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
                for (itBin = name2Taxonomy.begin(); itBin != name2Taxonomy.end(); itBin++) {
-               
-                       int level = tree[itBin->second].level;
+                       
+                       if (m->control_pressed) {  out.close(); break;  }
+                       
+                       int level = copy[itBin->second].level;
                        int currentNode = itBin->second;
                        
                        //this sequence is unclassified at some levels
@@ -251,34 +379,30 @@ void PhyloTree::binUnclassified(){
                                string taxon = "unclassified";  
                                
                                //does the parent have a child names 'unclassified'?
-                               childPointer = tree[currentNode].children.find(taxon);
+                               childPointer = copy[currentNode].children.find(taxon);
                                
-                               if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
+                               if(childPointer != copy[currentNode].children.end()){   //if the node already exists, move on
                                        currentNode = childPointer->second; //currentNode becomes 'unclassified'
-                                       tree[currentNode].accessions.push_back(itBin->first);  //add this seq
-                                       name2Taxonomy[itBin->first] = currentNode;
+                                       copy[currentNode].accessions.push_back(itBin->first);  //add this seq
                                }
                                else{                                                                                   //otherwise, create it
-                                       tree.push_back(TaxNode(taxon));
-                                       numNodes++;
-                                       tree[currentNode].children[taxon] = numNodes-1;
-                                       tree[numNodes-1].parent = currentNode;
+                                       copy.push_back(TaxNode(taxon));
+                                       copyNodes++;
+                                       copy[currentNode].children[taxon] = copyNodes-1;
+                                       copy[copyNodes-1].parent = currentNode;
+                                       copy[copyNodes-1].level = copy[currentNode].level + 1;
                                                                        
-                                       currentNode = tree[currentNode].children[taxon];
-                                       tree[currentNode].accessions.push_back(itBin->first);
-                                       name2Taxonomy[itBin->first] = currentNode;
+                                       currentNode = copy[currentNode].children[taxon];
+                                       copy[currentNode].accessions.push_back(itBin->first);
                                }
-                               
-                               if (level == maxLevel) {   uniqueTaxonomies[currentNode] = currentNode; }
                        }
                }
                
-               //clear HeirarchyIDs and reset them
-               for (int i = 1; i < tree.size(); i++) {
-                       tree[i].heirarchyID = "";
+               if (!m->control_pressed) {
+                       //print copy tree
+                       print(out, copy);
                }
-               assignHeirarchyIDs(0);
-               
+                               
        }
        catch(exception& e) {
                m->errorOut(e, "PhyloTree", "binUnclassified");
@@ -306,26 +430,22 @@ string PhyloTree::getFullTaxonomy(string seqName) {
 }
 /**************************************************************************************************/
 
-void PhyloTree::print(ofstream& out){
-       try {
-               out << tree[0].level << '\t'<< tree[0].heirarchyID << '\t' << tree[0].name << '\t' << tree[0].children.size() << '\t' << tree[0].accessions.size() << endl;
-               print(0, out);
-       }
-       catch(exception& e) {
-               m->errorOut(e, "PhyloTree", "print");
-               exit(1);
-       }
-}
-
-/**************************************************************************************************/
-
-void PhyloTree::print(int i, ofstream& out){
+void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
        try {
-               map<string,int>::iterator it;
-               for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
-                       out <<tree[it->second].level << '\t' << tree[it->second].heirarchyID << '\t' << tree[it->second].name << '\t' << tree[it->second].children.size() << '\t' << tree[it->second].accessions.size() << endl;
-                       print(it->second, out);
+               out << copy.size() << endl;
+               
+               for (int i = 0; i < copy.size(); i++) {
+       
+                       out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
+                       
+                       map<string,int>::iterator it;
+                       for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
+                               out << it->first << '\t' << it->second << '\t';
+                       }
+                       out << endl;
                }
+               
+               out.close();
        }
        catch(exception& e) {
                m->errorOut(e, "PhyloTree", "print");
@@ -335,23 +455,37 @@ void PhyloTree::print(int i, ofstream& out){
 /**************************************************************************************************/
 void PhyloTree::printTreeNodes(string treefilename) {
        try {
-               ofstream outTree;
-               openOutputFile(treefilename, outTree);
-               
-               //print treenodes
-               outTree << tree.size() << endl;
-               for (int i = 0; i < tree.size(); i++) {
-                       outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
-               }
-               
-               //print genus nodes
-               outTree << endl << uniqueTaxonomies.size() << endl;
-               map<int, int>::iterator it2;
-               for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;  }
-               outTree << endl;
+       
+               #ifdef USE_MPI
+                       int pid;
+                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+                       if (pid == 0) {  
                
+               #endif
+
+                       ofstream outTree;
+                       openOutputFile(treefilename, outTree);
+                       
+                       //print treenodes
+                       outTree << tree.size() << endl;
+                       for (int i = 0; i < tree.size(); i++) {
+                               outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
+                       }
+                       
+                       //print genus nodes
+                       outTree << endl << uniqueTaxonomies.size() << endl;
+                       map<int, int>::iterator it2;
+                       for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;  }
+                       outTree << endl;
+                       
+                       
+                       outTree.close();
                
-               outTree.close();
+               #ifdef USE_MPI
+                       }
+               #endif
+
                
        }
        catch(exception& e) {