]> git.donarmstrong.com Git - mothur.git/commitdiff
reworked the classifiers summary file added groupfile option to classify.seqs, added...
authorwestcott <westcott>
Tue, 27 Apr 2010 17:54:59 +0000 (17:54 +0000)
committerwestcott <westcott>
Tue, 27 Apr 2010 17:54:59 +0000 (17:54 +0000)
14 files changed:
Mothur.xcodeproj/project.pbxproj
bayesian.cpp
bayesian.h
classify.cpp
classifyseqscommand.cpp
classifyseqscommand.h
distancecommand.cpp
makefile
phylosummary.cpp [new file with mode: 0644]
phylosummary.h [new file with mode: 0644]
phylotree.cpp
phylotree.h
rawtrainingdatamaker.cpp [deleted file]
rawtrainingdatamaker.h [deleted file]

index 805bde0484399fcb37491f58a97cec4f775288bd..cc3b4b6a472230fe32f51c559792ecc2f7bb3cc4 100644 (file)
@@ -12,8 +12,8 @@
                A747E81C116365E000FB9042 /* chimeraslayercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayercommand.h; sourceTree = "<group>"; };
                A747E81D116365E000FB9042 /* chimeraslayercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayercommand.cpp; sourceTree = "<group>"; };
                A7639F8D1175DF35008F5578 /* makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = makefile; sourceTree = "<group>"; };
-               A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rawtrainingdatamaker.h; sourceTree = SOURCE_ROOT; };
-               A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rawtrainingdatamaker.cpp; sourceTree = SOURCE_ROOT; };
+               A76AAD02117F322B003D8DA1 /* phylosummary.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylosummary.h; sourceTree = "<group>"; };
+               A76AAD03117F322B003D8DA1 /* phylosummary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylosummary.cpp; sourceTree = "<group>"; };
                A78254461164D7790002E2DD /* chimerapintailcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerapintailcommand.h; sourceTree = "<group>"; };
                A78254471164D7790002E2DD /* chimerapintailcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerapintailcommand.cpp; sourceTree = "<group>"; };
                A7825502116519F70002E2DD /* chimerabellerophoncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerabellerophoncommand.h; sourceTree = "<group>"; };
                                A7DA208A113FECD400BF472F /* knn.h */,
                                A7DA20C4113FECD400BF472F /* phylotree.cpp */,
                                A7DA20C5113FECD400BF472F /* phylotree.h */,
-                               A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */,
-                               A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */,
+                               A76AAD02117F322B003D8DA1 /* phylosummary.h */,
+                               A76AAD03117F322B003D8DA1 /* phylosummary.cpp */,
                                A7DA215C113FECD400BF472F /* taxonomyequalizer.cpp */,
                                A7DA215D113FECD400BF472F /* taxonomyequalizer.h */,
                        );
index 0929749095386a72c378ac1c9ea586e3f3d354d2..e5543cdf0c320847a7cb9bc16957b64bb73d3b6a 100644 (file)
@@ -9,7 +9,7 @@
 
 #include "bayesian.h"
 #include "kmer.hpp"
-#include "rawtrainingdatamaker.h"
+#include "phylosummary.h"
 
 /**************************************************************************************************/
 Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) : 
@@ -18,30 +18,30 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
                                        
                /************calculate the probablity that each word will be in a specific taxonomy*************/
                string phyloTreeName = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.train";
-               ifstream phyloTreeTest(phyloTreeName.c_str());
-               
-               ofstream out;
                string probFileName = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.prob";
-               ifstream probFileTest(probFileName.c_str());
+               string probFileName2 = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.numNonZero";
                
+               ofstream out;
                ofstream out2;
-               string probFileName2 = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.numNonZero";
+               
+               ifstream phyloTreeTest(phyloTreeName.c_str());
                ifstream probFileTest2(probFileName2.c_str());
+               ifstream probFileTest(probFileName.c_str());
                
                int start = time(NULL);
                
                if(probFileTest && probFileTest2 && phyloTreeTest){     
                        m->mothurOut("Reading template taxonomy...     "); cout.flush();
                        
-                       phyloTree = new PhyloTree(phyloTreeTest);
-       
+                       phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
+                       
                        m->mothurOut("DONE."); m->mothurOutEndLine();
                        
                        genusNodes = phyloTree->getGenusNodes(); 
                        genusTotals = phyloTree->getGenusTotals();
                
                        m->mothurOut("Reading template probabilities...     "); cout.flush();
-                       readProbFile(probFileTest, probFileTest2);      
+                       readProbFile(probFileTest, probFileTest2, probFileName, probFileName2); 
                        
                }else{
                
@@ -65,6 +65,14 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
                        wordGenusProb.resize(numKmers);
                
                        for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
+                       
+                       
+                       #ifdef USE_MPI
+                               int pid;
+                               MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+                               if (pid == 0) {  
+                       #endif
 
                        ofstream out;
                        openOutputFile(probFileName, out);
@@ -74,12 +82,27 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
                        ofstream out2;
                        openOutputFile(probFileName2, out2);
                        
+                       #ifdef USE_MPI
+                               }
+                       #endif
+
+                       
                        //for each word
                        for (int i = 0; i < numKmers; i++) {
                                if (m->control_pressed) { break; }
                                
+                               #ifdef USE_MPI
+                                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+                                       if (pid == 0) {  
+                               #endif
+
                                out << i << '\t';
                                
+                               #ifdef USE_MPI
+                                       }
+                               #endif
+                               
                                vector<int> seqsWithWordi = database->getSequencesWithKmer(i);
                                
                                map<int, int> count;
@@ -98,20 +121,52 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
                                for (int k = 0; k < genusNodes.size(); k++) {
                                        //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
                                        wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1));  
-                                       if (count[genusNodes[k]] != 0) {  out << k << '\t' << wordGenusProb[i][k] << '\t';  numNotZero++;  }
+                                       if (count[genusNodes[k]] != 0) { 
+                                               #ifdef USE_MPI
+                                                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+                                                       if (pid == 0) {  
+                                               #endif
+
+                                               out << k << '\t' << wordGenusProb[i][k] << '\t'; 
+                                               
+                                               #ifdef USE_MPI
+                                                       }
+                                               #endif
+
+                                               numNotZero++;  
+                                       }
                                }
+                               
+                               #ifdef USE_MPI
+                                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+                                       if (pid == 0) {  
+                               #endif
+                               
                                out << endl;
                                out2 << probabilityInTemplate << '\t' << numNotZero << endl;
+                               
+                               #ifdef USE_MPI
+                                       }
+                               #endif
                        }
                        
+                       #ifdef USE_MPI
+                               MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+                               if (pid == 0) {  
+                       #endif
+                       
                        out.close();
                        out2.close();
                        
+                       #ifdef USE_MPI
+                               }
+                       #endif
+                       
                        //read in new phylotree with less info. - its faster
                        ifstream phyloTreeTest(phyloTreeName.c_str());
                        delete phyloTree;
                        
-                       phyloTree = new PhyloTree(phyloTreeTest);
+                       phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
                }
        
                m->mothurOut("DONE."); m->mothurOutEndLine();
@@ -292,45 +347,162 @@ map<string, int> Bayesian::parseTaxMap(string newTax) {
        }
 }
 /**************************************************************************************************/
-void Bayesian::readProbFile(ifstream& in, ifstream& inNum) {
+void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string inNumName) {
        try{
                
-               in >> numKmers; gobble(in);
-               
-               //initialze probabilities
-               wordGenusProb.resize(numKmers);
-               
-               for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
+               #ifdef USE_MPI
+                       
+                       int pid, num, num2;
+                       vector<long> positions;
+                       vector<long> positions2;
+                       
+                       MPI_Status status; 
+                       MPI_File inMPI;
+                       MPI_File inMPI2;
+                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+                       char inFileName[1024];
+                       strcpy(inFileName, inNumName.c_str());
+                       
+                       char inFileName2[1024];
+                       strcpy(inFileName2, inName.c_str());
+
+                       MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+                       MPI_File_open(MPI_COMM_WORLD, inFileName2, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI2);  //comm, filename, mode, info, filepointer
+
+                       if (pid == 0) {
+                               positions = setFilePosEachLine(inNumName, num);
+                               
+                               //send file positions to all processes
+                               MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
+                               MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos 
+                               
+                               positions2 = setFilePosEachLine(inName, num2);
+                               
+                               //send file positions to all processes
+                               MPI_Bcast(&num2, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
+                               MPI_Bcast(&positions2[0], (num2+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos       
+
+                       }else{
+                               MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+                               positions.resize(num);
+                               MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+                               
+                               MPI_Bcast(&num2, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+                               positions2.resize(num2);
+                               MPI_Bcast(&positions2[0], (num2+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+
+                       }
                
-               int kmer, name, count;  count = 0;
-               vector<int> num; num.resize(numKmers);
-               float prob;
-               vector<float> zeroCountProb; zeroCountProb.resize(numKmers);            
-       
-               while (inNum) {
-                       inNum >> zeroCountProb[count] >> num[count];  
-                       count++;
-                       gobble(inNum);
-               }
-               inNum.close();
-       
-               while(in) {
-                       in >> kmer;
+                       //read numKmers
+                       int length = positions2[1] - positions2[0];
+                       char* buf = new char[length];
+
+                       MPI_File_read_at(inMPI2, positions2[0], buf, length, MPI_CHAR, &status);
+
+                       string tempBuf = buf;
+                       if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+                       delete buf;
+
+                       istringstream iss (tempBuf,istringstream::in);
+                       iss >> numKmers;  
+                       
+                       //initialze probabilities
+                       wordGenusProb.resize(numKmers);
+                       
+                       for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
                        
-                       //set them all to zero value
-                       for (int i = 0; i < genusNodes.size(); i++) {
-                               wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
+                       int kmer, name;  
+                       vector<int> numbers; numbers.resize(numKmers);
+                       float prob;
+                       vector<float> zeroCountProb; zeroCountProb.resize(numKmers);            
+
+                       //read file 
+                       for(int i=0;i<num;i++){
+                               //read next sequence
+                               length = positions[i+1] - positions[i];
+                               char* buf4 = new char[length];
+
+                               MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+
+                               tempBuf = buf4;
+                               if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+                               delete buf4;
+
+                               istringstream iss (tempBuf,istringstream::in);
+                               iss >> zeroCountProb[i] >> numbers[i];  
                        }
                        
-                       //get probs for nonzero values
-                       for (int i = 0; i < num[kmer]; i++) {
-                               in >> name >> prob;
-                               wordGenusProb[kmer][name] = prob;
+                       MPI_File_close(&inMPI);
+                       
+                       for(int i=1;i<num2;i++){
+                               //read next sequence
+                               length = positions2[i+1] - positions2[i];
+                               char* buf4 = new char[length];
+
+                               MPI_File_read_at(inMPI2, positions2[i], buf4, length, MPI_CHAR, &status);
+
+                               tempBuf = buf4;
+                               if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+                               delete buf4;
+
+                               istringstream iss (tempBuf,istringstream::in);
+                               
+                               iss >> kmer;
+                               
+                               //set them all to zero value
+                               for (int i = 0; i < genusNodes.size(); i++) {
+                                       wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
+                               }
+                               
+                               //get probs for nonzero values
+                               for (int i = 0; i < numbers[kmer]; i++) {
+                                       iss >> name >> prob;
+                                       wordGenusProb[kmer][name] = prob;
+                               }
+                               
                        }
+                       MPI_File_close(&inMPI2);
+               #else
+               
+                       in >> numKmers; gobble(in);
                        
-                       gobble(in);
-               }
-               in.close();
+                       //initialze probabilities
+                       wordGenusProb.resize(numKmers);
+                       
+                       for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
+                       
+                       int kmer, name, count;  count = 0;
+                       vector<int> num; num.resize(numKmers);
+                       float prob;
+                       vector<float> zeroCountProb; zeroCountProb.resize(numKmers);            
+               
+                       while (inNum) {
+                               inNum >> zeroCountProb[count] >> num[count];  
+                               count++;
+                               gobble(inNum);
+                       }
+                       inNum.close();
+               
+                       while(in) {
+                               in >> kmer;
+                               
+                               //set them all to zero value
+                               for (int i = 0; i < genusNodes.size(); i++) {
+                                       wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
+                               }
+                               
+                               //get probs for nonzero values
+                               for (int i = 0; i < num[kmer]; i++) {
+                                       in >> name >> prob;
+                                       wordGenusProb[kmer][name] = prob;
+                               }
+                               
+                               gobble(in);
+                       }
+                       in.close();
+                       
+               #endif
        }
        catch(exception& e) {
                m->errorOut(e, "Bayesian", "readProbFile");
index 6e35e9f8bd485734962e55a140f2dbb134cc8110..70757be9ecf2a4c6791fd0fec7c9f74ab8a13501 100644 (file)
@@ -34,7 +34,7 @@ private:
        
        string bootstrapResults(vector<int>, int, int);
        int getMostProbableTaxonomy(vector<int>);
-       void readProbFile(ifstream&, ifstream&);
+       void readProbFile(ifstream&, ifstream&, string, string);
        
 };
 
index be287a05069c394def3f1cf48257f25b863db3be..147f499b87357f5782b98fc882db80f3233f2aee 100644 (file)
@@ -237,6 +237,8 @@ int Classify::readTaxonomy(string file) {
        
                phyloTree->assignHeirarchyIDs(0);
                
+               phyloTree->setUp(file);
+               
                m->mothurOut("DONE.");
                m->mothurOutEndLine();  cout.flush();
                
index a38bf8f34ad0e15e98e85f5b4ad627babefcbf2b..38378491c072c1ed33cc257ec954de6a81f3bb7d 100644 (file)
@@ -11,6 +11,7 @@
 #include "sequence.hpp"
 #include "bayesian.h"
 #include "phylotree.h"
+#include "phylosummary.h"
 #include "knn.h"
 
 //**********************************************************************************************************************
@@ -25,7 +26,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                else {
                        
                        //valid paramters for this command
-                       string AlignArray[] =  {"template","fasta","name","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters", "outputdir","inputdir"};
+                       string AlignArray[] =  {"template","fasta","name","group","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters", "outputdir","inputdir"};
                        vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
                        
                        OptionParser parser(option);
@@ -62,6 +63,14 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                                        //if the user has not given a path then, add inputdir. else leave path alone.
                                        if (path == "") {       parameters["taxonomy"] = inputDir + it->second;         }
                                }
+                               
+                               it = parameters.find("group");
+                               //user has given a template file
+                               if(it != parameters.end()){ 
+                                       path = hasPath(it->second);
+                                       //if the user has not given a path then, add inputdir. else leave path alone.
+                                       if (path == "") {       parameters["group"] = inputDir + it->second;            }
+                               }
                        }
 
                        //check for required parameters
@@ -73,6 +82,10 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option)  {
                        }
                        else if (templateFileName == "not open") { abort = true; }      
                        
+                       groupfile = validParameter.validFile(parameters, "group", true);
+                       if (groupfile == "not open") { abort = true; }  
+                       else if (groupfile == "not found") { groupfile = ""; }
+                       
                        fastaFileName = validParameter.validFile(parameters, "fasta", false);
                        if (fastaFileName == "not found") { m->mothurOut("fasta is a required parameter for the classify.seqs command."); m->mothurOutEndLine(); abort = true;  }
                        else { 
@@ -250,6 +263,7 @@ void ClassifySeqsCommand::help(){
                m->mothurOut("The template, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
                m->mothurOut("The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer, blast and distance. The default is kmer.\n");
                m->mothurOut("The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n");
+               m->mothurOut("The group parameter allows you add a group file so you can have the summary totals broken up by group.\n");
                m->mothurOut("The method parameter allows you to specify classification method to use.  Your options are: bayesian and knn. The default is bayesian.\n");
                m->mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n");
                m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
@@ -491,50 +505,40 @@ int ClassifySeqsCommand::execute(){
                        m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
                        start = time(NULL);
                        
-                       //make taxonomy tree from new taxonomy file 
-                       PhyloTree taxaBrowser;
+                       PhyloSummary taxaSum(taxonomyFileName, groupfile);
                        
                        if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } delete classify; return 0; }
                        
-                       ifstream in;
-                       openInputFile(tempTaxonomyFile, in);
-               
-                       //read in users taxonomy file and add sequences to tree
-                       string name, taxon;
-                       while(!in.eof()){
-                               in >> name >> taxon; gobble(in);
-                               
-                               if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } remove(tempTaxonomyFile.c_str()); delete classify; return 0; }
+                       if (namefile == "") {  taxaSum.summarize(tempTaxonomyFile);  }
+                       else {
+                               ifstream in;
+                               openInputFile(tempTaxonomyFile, in);
                                
-                               if (namefile != "") {
+                               //read in users taxonomy file and add sequences to tree
+                               string name, taxon;
+                               while(!in.eof()){
+                                       in >> name >> taxon; gobble(in);
+                                       
                                        itNames = nameMap.find(name);
                
                                        if (itNames == nameMap.end()) { 
                                                m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
                                        }else{
                                                for (int i = 0; i < itNames->second; i++) { 
-                                                       taxaBrowser.addSeqToTree(name+toString(i), taxon);  //add it as many times as there are identical seqs
+                                                       taxaSum.addSeqToTree(name+toString(i), taxon);  //add it as many times as there are identical seqs
                                                }
                                        }
-                               }else {  taxaBrowser.addSeqToTree(name, taxon);  } //add it once
+                               }
+                               in.close();
                        }
-                       in.close();
-       
-                       taxaBrowser.assignHeirarchyIDs(0);
-                       
-                       if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } remove(tempTaxonomyFile.c_str()); delete classify; return 0; }
-
-                       taxaBrowser.binUnclassified();
-                       
                        remove(tempTaxonomyFile.c_str());
                        
                        if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } delete classify; return 0; }
-
                        
                        //print summary file
                        ofstream outTaxTree;
                        openOutputFile(taxSummary, outTaxTree);
-                       taxaBrowser.print(outTaxTree);
+                       taxaSum.print(outTaxTree);
                        outTaxTree.close();
                        
                        //output taxonomy with the unclassified bins added
@@ -546,9 +550,10 @@ int ClassifySeqsCommand::execute(){
                        openOutputFile(unclass, outTax);
                        
                        //get maxLevel from phylotree so you know how many 'unclassified's to add
-                       int maxLevel = taxaBrowser.getMaxLevel();
+                       int maxLevel = taxaSum.getMaxLevel();
                        
                        //read taxfile - this reading and rewriting is done to preserve the confidence scores.
+                       string name, taxon;
                        while (!inTax.eof()) {
                                if (m->control_pressed) {  for (int i = 0; i < outputNames.size(); i++) {       remove(outputNames[i].c_str()); } remove(unclass.c_str()); delete classify; return 0; }
 
@@ -564,6 +569,9 @@ int ClassifySeqsCommand::execute(){
                        remove(newTaxonomyFile.c_str());
                        rename(unclass.c_str(), newTaxonomyFile.c_str());
                        
+                       m->mothurOutEndLine();
+                       m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for  " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
+                       
                        #ifdef USE_MPI  
                                }
                        #endif
@@ -572,10 +580,7 @@ int ClassifySeqsCommand::execute(){
                        m->mothurOut("Output File Names: "); m->mothurOutEndLine();
                        for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
                        m->mothurOutEndLine();
-
                        
-                       //m->mothurOutEndLine();
-                       //m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for  " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
                }
                
                delete classify;
index 0ffd18c27c70084acd2cd2c0eb99d6d2055c44b0..d19f862d340983786f00c25eaf02a6df30e0dea8 100644 (file)
@@ -48,7 +48,7 @@ private:
        
        Classify* classify;
        
-       string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir;
+       string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile;
        int processors, kmerSize, numWanted, cutoff, iters;
        float match, misMatch, gapOpen, gapExtend;
        bool abort, probs;
index 10a99dd240ee862884d59b8570581ebbe129ef32..da5d0f157390cb1a70c666ac86a9ecae1221f095 100644 (file)
@@ -278,8 +278,7 @@ int DistanceCommand::execute(){
                                        delete buf;
 
                                        int count = 0;
-                                       while (count < fileSize) { //read 1000 characters at a time
-                                               //send freqs
+                                       while (count < fileSize) { 
                                                char buf2[1];
                                                MPI_File_read(inMPI, buf2, 1, MPI_CHAR, &status);
                                                MPI_File_write(outMPI, buf2, 1, MPI_CHAR, &status);
index 5812d4551f9c6cc635f8ed4ee5bde2505a4b18df..d4aac7f8f4fc2252cb51ab17ca553f36b265de00 100644 (file)
--- a/makefile
+++ b/makefile
@@ -220,7 +220,7 @@ mothur : \
                ./classify.o\\r
                ./phylotree.o\\r
                ./bayesian.o\
-               ./rawtrainingdatamaker.o\\r
+               ./phylosummary.o\\r
                ./alignmentdb.o\\r
                ./knn.o\\r
                ./distancedb.o\\r
@@ -418,7 +418,7 @@ mothur : \
                ./classify.o\\r
                ./phylotree.o\\r
                ./bayesian.o\
-               ./rawtrainingdatamaker.o\\r
+               ./phylosummary.o\\r
                ./alignmentdb.o\\r
                ./knn.o\\r
                ./distancedb.o\\r
@@ -619,7 +619,7 @@ clean :
                ./classify.o\\r
                ./phylotree.o\\r
                ./bayesian.o\
-               ./rawtrainingdatamaker.o\\r
+               ./phylosummary.o\\r
                ./alignmentdb.o\\r
                ./knn.o\\r
                ./distancedb.o\\r
@@ -1623,9 +1623,9 @@ install : mothur
 ./chimerabellerophoncommand.o : chimerabellerophoncommand.cpp\r
        $(CC) $(CC_OPTIONS) chimerabellerophoncommand.cpp -c $(INCLUDE) -o ./chimerabellerophoncommand.o\r
 
-# Item # 171 -- rawtrainingdatamaker --\r
-./rawtrainingdatamaker.o : rawtrainingdatamaker.cpp\r
-       $(CC) $(CC_OPTIONS) rawtrainingdatamaker.cpp -c $(INCLUDE) -o ./rawtrainingdatamaker.o\r
+# Item # 171 -- phylosummary --\r
+./phylosummary.o : phylosummary.cpp\r
+       $(CC) $(CC_OPTIONS) phylosummary.cpp -c $(INCLUDE) -o ./phylosummary.o\r
 \r
 \r
 \r
diff --git a/phylosummary.cpp b/phylosummary.cpp
new file mode 100644 (file)
index 0000000..1e8d1bc
--- /dev/null
@@ -0,0 +1,274 @@
+/*
+ *  rawTrainingDataMaker.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 4/21/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "phylosummary.h"
+
+/**************************************************************************************************/
+
+PhyloSummary::PhyloSummary(string refTfile, string groupFile){
+       try {
+               m = MothurOut::getInstance();
+               maxLevel = 0;
+               
+               if (groupFile != "") {
+                       groupmap = new GroupMap(groupFile);
+                       groupmap->readMap();
+               }else{
+                       groupmap = NULL;
+               }
+                               
+               //check for necessary files
+               string taxFileNameTest = refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum";
+               ifstream FileTest(taxFileNameTest.c_str());
+               
+               if (!FileTest) { 
+                       m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
+               }else{
+                       readTreeStruct(FileTest);
+               }
+               
+               tree[0].rank = "0";
+               assignRank(0);
+
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloSummary", "PhyloSummary");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+void PhyloSummary::summarize(string userTfile){
+       try {
+               
+               ifstream in;
+               openInputFile(userTfile, in);
+               
+               //read in users taxonomy file and add sequences to tree
+               string name, tax;
+               while(!in.eof()){
+                       in >> name >> tax; gobble(in);
+                       
+                       addSeqToTree(name, tax);
+                       
+                       if (m->control_pressed) { break;  }
+               }
+               in.close();
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloSummary", "summarize");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+string PhyloSummary::getNextTaxon(string& heirarchy){
+       try {
+               string currentLevel = "";
+               if(heirarchy != ""){
+                       int pos = heirarchy.find_first_of(';');
+                       currentLevel=heirarchy.substr(0,pos);
+                       if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
+                       else { heirarchy = ""; }
+               }
+               return currentLevel;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloSummary", "getNextTaxon");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
+       try {
+               numSeqs++;
+               
+               map<string, int>::iterator childPointer;
+               
+               int currentNode = 0;
+               string taxon;
+               
+               int level = 0;
+               
+               while (seqTaxonomy != "") {
+                       
+                       if (m->control_pressed) { return 0; }
+                       
+                       //somehow the parent is getting one too many accnos
+                       //use print to reassign the taxa id
+                       taxon = getNextTaxon(seqTaxonomy);
+                       
+                       childPointer = tree[currentNode].children.find(taxon);
+                       
+                       if(childPointer != tree[currentNode].children.end()){   //if the node already exists, update count and move on
+                               if (groupmap != NULL) {
+                                       //find out the sequences group
+                                       string group = groupmap->getGroup(seqName);
+                                       
+                                       //do you have a count for this group?
+                                       map<string, int>::iterator itGroup = tree[currentNode].groupCount.find(group);
+                                       
+                                       //if yes, increment it - there should not be a case where we can't find it since we load group in read
+                                       if (itGroup != tree[currentNode].groupCount.end()) {
+                                               tree[currentNode].groupCount[group]++;
+                                       }
+                               }
+                               
+                               tree[currentNode].total++;
+
+                               currentNode = childPointer->second;
+                       }else{                                                                                  //otherwise, create it
+                               m->mothurOut("Error: cannot find taxonomy in tree for " + seqName + "."); m->mothurOutEndLine();
+                               seqTaxonomy = "";
+                       }
+                       
+                       level++;
+                       
+                       if ((seqTaxonomy == "") && (level < maxLevel)) {  //if you think you are done and you are not.
+                               for (int k = level; k < maxLevel; k++) {  seqTaxonomy += "unclassified;";   }
+                       }
+               }
+
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloSummary", "addSeqToTree");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+void PhyloSummary::assignRank(int index){
+       try {
+               map<string,int>::iterator it;
+               int counter = 1;
+               
+               for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
+                       tree[it->second].rank = tree[index].rank + '.' + toString(counter);
+                       counter++;
+                                                                       
+                       assignRank(it->second);
+               }
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloSummary", "assignRank");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+void PhyloSummary::print(ofstream& out){
+       try {
+               //print labels
+               out << "taxlevel\t rank ID\t label\t daughterlevels\t total\t";
+               if (groupmap != NULL) {
+                       for (int i = 0; i < groupmap->namesOfGroups.size(); i++) {
+                               out << groupmap->namesOfGroups[i] << '\t';
+                       }
+               }
+               
+               out << endl;
+               
+               //print root
+               out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << tree[0].children.size() << "\t" << tree[0].total << "\t";
+               
+               map<string, int>::iterator itGroup;
+               if (groupmap != NULL) {
+                       for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
+                               out << itGroup->second << '\t';
+                       }
+               }
+               out << endl;
+               
+               //print rest
+               print(0, out);
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloSummary", "print");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+void PhyloSummary::print(int i, ofstream& out){
+       try {
+               map<string,int>::iterator it;
+               for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
+                       
+                       if (tree[it->second].total != 0)  {
+                               out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << tree[it->second].children.size() << "\t" << tree[it->second].total << "\t";
+                               
+                               map<string, int>::iterator itGroup;
+                               if (groupmap != NULL) {
+                                       for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
+                                               out << itGroup->second << '\t';
+                                       }
+                               }
+                               out << endl;
+                       }
+                       
+                       print(it->second, out);
+               }
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloSummary", "print");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+void PhyloSummary::readTreeStruct(ifstream& in){
+       try {
+               int num;
+               
+               in >> num; gobble(in);
+               
+               tree.resize(num);
+       
+               //read the tree file
+               for (int i = 0; i < tree.size(); i++) {
+       
+                       in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
+                       
+                       //set children
+                       string childName;
+                       int childIndex;
+                       for (int j = 0; j < num; j++) {
+                               in >> childName >> childIndex;
+                               tree[i].children[childName] = childIndex;
+                       }
+                       
+                       //initialize groupcounts
+                       if (groupmap != NULL) {
+                               for (int j = 0; j < groupmap->namesOfGroups.size(); j++) {
+                                       tree[i].groupCount[groupmap->namesOfGroups[j]] = 0;
+                               }
+                       }
+                       
+                       tree[i].total = 0;
+                       
+                       gobble(in);
+                       
+                       if (tree[i].level > maxLevel) {  maxLevel = tree[i].level;  }
+               }
+
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloSummary", "print");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+
+       
diff --git a/phylosummary.h b/phylosummary.h
new file mode 100644 (file)
index 0000000..9ebdf81
--- /dev/null
@@ -0,0 +1,61 @@
+#ifndef RAWTRAININGDATAMAKER_H
+#define RAWTRAININGDATAMAKER_H
+
+/*
+ *  rawTrainingDataMaker.h
+ *  Mothur
+ *
+ *  Created by westcott on 4/21/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "mothurout.h"
+#include "groupmap.h"
+
+/**************************************************************************************************/
+
+struct rawTaxNode {
+       map<string, int> children;  //childs name to index in tree
+       int parent, level;
+       string name, rank;
+       map<string, int> groupCount;
+       int total;
+       
+       rawTaxNode(string n) : name(n), level(0), parent(-1), total(0) {}
+       rawTaxNode(){}
+};
+
+/**************************************************************************************************/
+//doesn't use MPI ifdefs since only pid 0 uses this class
+class PhyloSummary {
+
+public:
+       PhyloSummary(string, string);
+       ~PhyloSummary() { if (groupmap != NULL)  {  delete groupmap;  }  }
+       
+       void summarize(string);  //pass it a taxonomy file and a group file and it makes the tree
+       int addSeqToTree(string, string);
+       void print(ofstream&);
+       int getMaxLevel() { return maxLevel; }
+       
+private:
+       string getNextTaxon(string&);
+       vector<rawTaxNode> tree;
+       void print(int, ofstream&);
+       void assignRank(int);
+       void readTreeStruct(ifstream&);
+       GroupMap* groupmap;
+       
+       int numNodes;
+       int numSeqs;
+       int maxLevel;
+       MothurOut* m;
+};
+
+/**************************************************************************************************/
+
+#endif
+
+
index 33aedc6395f09fc24576b93e1b72261884d98907..8c48e4f1f783c9387c83d4e5d1bfdbb475eb09cf 100644 (file)
@@ -28,34 +28,78 @@ PhyloTree::PhyloTree(){
 }
 /**************************************************************************************************/
 
-PhyloTree::PhyloTree(ifstream& in){
+PhyloTree::PhyloTree(ifstream& in, string filename){
        try {
                m = MothurOut::getInstance();
                calcTotals = false;
                
-               in >> numNodes; gobble(in);
-               
-               tree.resize(numNodes);
-               
-               for (int i = 0; i < tree.size(); i++) {
-                       in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
-               }
-               
-               //read genus nodes
-               int numGenus = 0;
-               in >> numGenus; gobble(in);
-       
-               int gnode, gsize;
-               totals.clear();
-               for (int i = 0; i < numGenus; i++) {
-                       in >> gnode >> gsize; gobble(in);
+               #ifdef USE_MPI
+                       MPI_File inMPI;
+                       MPI_Offset size;
+                       MPI_Status status;
+
+                       char inFileName[1024];
+                       strcpy(inFileName, filename.c_str());
+
+                       MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  
+                       MPI_File_get_size(inMPI, &size);
                        
-                       uniqueTaxonomies[gnode] = gnode;
-                       totals.push_back(gsize);
-               }
+                       char* buffer = new char[size];
+                       MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
+
+                       string tempBuf = buffer;
+                       if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size);  }
+                       istringstream iss (tempBuf,istringstream::in);
+                       delete buffer;
+                       
+                       iss >> numNodes; gobble(iss);
+                       
+                       tree.resize(numNodes);
+                       
+                       for (int i = 0; i < tree.size(); i++) {
+                               iss >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(iss);
+                       }
                        
-               in.close();
+                       //read genus nodes
+                       int numGenus = 0;
+                       iss >> numGenus; gobble(iss);
+                       
+                       int gnode, gsize;
+                       totals.clear();
+                       for (int i = 0; i < numGenus; i++) {
+                               iss >> gnode >> gsize; gobble(iss);
+                               
+                               uniqueTaxonomies[gnode] = gnode;
+                               totals.push_back(gsize);
+                       }
+                       
+                       MPI_File_close(&inMPI);
+                       
+               #else
+                       in >> numNodes; gobble(in);
+                       
+                       tree.resize(numNodes);
+                       
+                       for (int i = 0; i < tree.size(); i++) {
+                               in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
+                       }
+                       
+                       //read genus nodes
+                       int numGenus = 0;
+                       in >> numGenus; gobble(in);
+                       
+                       int gnode, gsize;
+                       totals.clear();
+                       for (int i = 0; i < numGenus; i++) {
+                               in >> gnode >> gsize; gobble(in);
+                               
+                               uniqueTaxonomies[gnode] = gnode;
+                               totals.push_back(gsize);
+                       }
+                       
+                       in.close();
+                       
+               #endif
                
        }
        catch(exception& e) {
@@ -74,20 +118,70 @@ PhyloTree::PhyloTree(string tfile){
                tree[0].heirarchyID = "0";
                maxLevel = 0;
                calcTotals = true;
+               string name, tax;
+
                
-               ifstream in;
-               openInputFile(tfile, in);
+               #ifdef USE_MPI
+                       int pid, num;
+                       vector<long> positions;
+                       
+                       MPI_Status status; 
+                       MPI_File inMPI;
+                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+                       char inFileName[1024];
+                       strcpy(inFileName, tfile.c_str());
+
+                       MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+
+                       if (pid == 0) {
+                               positions = setFilePosEachLine(tfile, num);
+                               
+                               //send file positions to all processes
+                               MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
+                               MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos 
+                       }else{
+                               MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+                               positions.resize(num);
+                               MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+                       }
                
-               //read in users taxonomy file and add sequences to tree
-               string name, tax;
-               while(!in.eof()){
-                       in >> name >> tax; gobble(in);
+                       //read file 
+                       for(int i=0;i<num;i++){
+                               //read next sequence
+                               int length = positions[i+1] - positions[i];
+                               char* buf4 = new char[length];
+
+                               MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+
+                               string tempBuf = buf4;
+                               if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+                               delete buf4;
+
+                               istringstream iss (tempBuf,istringstream::in);
+                               iss >> name >> tax;
+                               addSeqToTree(name, tax);
+                       }
                        
-                       addSeqToTree(name, tax);
-               }
-               in.close();
-       
+                       MPI_File_close(&inMPI);
+               
+               #else
+                       ifstream in;
+                       openInputFile(tfile, in);
+                       
+                       //read in users taxonomy file and add sequences to tree
+                       while(!in.eof()){
+                               in >> name >> tax; gobble(in);
+                               
+                               addSeqToTree(name, tax);
+                       }
+                       in.close();
+               #endif
+               
                assignHeirarchyIDs(0);
+               
+               //create file for summary if needed
+               setUp(tfile);
        }
        catch(exception& e) {
                m->errorOut(e, "PhyloTree", "PhyloTree");
@@ -232,15 +326,49 @@ void PhyloTree::assignHeirarchyIDs(int index){
        }
 }
 /**************************************************************************************************/
-void PhyloTree::binUnclassified(){
+void PhyloTree::setUp(string tfile){
+       try{
+               string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
+               
+               #ifdef USE_MPI
+                       int pid;
+                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+                       if (pid == 0) {  binUnclassified(taxFileNameTest);  }
+               
+               #else
+                       //create file needed for summary if it doesn't exist
+                       ifstream FileTest(taxFileNameTest.c_str());
+                       
+                       if (!FileTest) { 
+                               binUnclassified(taxFileNameTest); 
+                       }
+               #endif
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloTree", "setUp");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+void PhyloTree::binUnclassified(string file){
        try {
+       
+               ofstream out;
+               openOutputFile(file, out);
+               
                map<string, int>::iterator itBin;
                map<string, int>::iterator childPointer;
                
+               vector<TaxNode> copy = tree;
+               int copyNodes = numNodes;
+               
                //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
                for (itBin = name2Taxonomy.begin(); itBin != name2Taxonomy.end(); itBin++) {
-               
-                       int level = tree[itBin->second].level;
+                       
+                       if (m->control_pressed) {  out.close(); break;  }
+                       
+                       int level = copy[itBin->second].level;
                        int currentNode = itBin->second;
                        
                        //this sequence is unclassified at some levels
@@ -251,34 +379,30 @@ void PhyloTree::binUnclassified(){
                                string taxon = "unclassified";  
                                
                                //does the parent have a child names 'unclassified'?
-                               childPointer = tree[currentNode].children.find(taxon);
+                               childPointer = copy[currentNode].children.find(taxon);
                                
-                               if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
+                               if(childPointer != copy[currentNode].children.end()){   //if the node already exists, move on
                                        currentNode = childPointer->second; //currentNode becomes 'unclassified'
-                                       tree[currentNode].accessions.push_back(itBin->first);  //add this seq
-                                       name2Taxonomy[itBin->first] = currentNode;
+                                       copy[currentNode].accessions.push_back(itBin->first);  //add this seq
                                }
                                else{                                                                                   //otherwise, create it
-                                       tree.push_back(TaxNode(taxon));
-                                       numNodes++;
-                                       tree[currentNode].children[taxon] = numNodes-1;
-                                       tree[numNodes-1].parent = currentNode;
+                                       copy.push_back(TaxNode(taxon));
+                                       copyNodes++;
+                                       copy[currentNode].children[taxon] = copyNodes-1;
+                                       copy[copyNodes-1].parent = currentNode;
+                                       copy[copyNodes-1].level = copy[currentNode].level + 1;
                                                                        
-                                       currentNode = tree[currentNode].children[taxon];
-                                       tree[currentNode].accessions.push_back(itBin->first);
-                                       name2Taxonomy[itBin->first] = currentNode;
+                                       currentNode = copy[currentNode].children[taxon];
+                                       copy[currentNode].accessions.push_back(itBin->first);
                                }
-                               
-                               if (level == maxLevel) {   uniqueTaxonomies[currentNode] = currentNode; }
                        }
                }
                
-               //clear HeirarchyIDs and reset them
-               for (int i = 1; i < tree.size(); i++) {
-                       tree[i].heirarchyID = "";
+               if (!m->control_pressed) {
+                       //print copy tree
+                       print(out, copy);
                }
-               assignHeirarchyIDs(0);
-               
+                               
        }
        catch(exception& e) {
                m->errorOut(e, "PhyloTree", "binUnclassified");
@@ -306,26 +430,22 @@ string PhyloTree::getFullTaxonomy(string seqName) {
 }
 /**************************************************************************************************/
 
-void PhyloTree::print(ofstream& out){
-       try {
-               out << tree[0].level << '\t'<< tree[0].heirarchyID << '\t' << tree[0].name << '\t' << tree[0].children.size() << '\t' << tree[0].accessions.size() << endl;
-               print(0, out);
-       }
-       catch(exception& e) {
-               m->errorOut(e, "PhyloTree", "print");
-               exit(1);
-       }
-}
-
-/**************************************************************************************************/
-
-void PhyloTree::print(int i, ofstream& out){
+void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
        try {
-               map<string,int>::iterator it;
-               for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
-                       out <<tree[it->second].level << '\t' << tree[it->second].heirarchyID << '\t' << tree[it->second].name << '\t' << tree[it->second].children.size() << '\t' << tree[it->second].accessions.size() << endl;
-                       print(it->second, out);
+               out << copy.size() << endl;
+               
+               for (int i = 0; i < copy.size(); i++) {
+       
+                       out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
+                       
+                       map<string,int>::iterator it;
+                       for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
+                               out << it->first << '\t' << it->second << '\t';
+                       }
+                       out << endl;
                }
+               
+               out.close();
        }
        catch(exception& e) {
                m->errorOut(e, "PhyloTree", "print");
@@ -335,23 +455,37 @@ void PhyloTree::print(int i, ofstream& out){
 /**************************************************************************************************/
 void PhyloTree::printTreeNodes(string treefilename) {
        try {
-               ofstream outTree;
-               openOutputFile(treefilename, outTree);
-               
-               //print treenodes
-               outTree << tree.size() << endl;
-               for (int i = 0; i < tree.size(); i++) {
-                       outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
-               }
-               
-               //print genus nodes
-               outTree << endl << uniqueTaxonomies.size() << endl;
-               map<int, int>::iterator it2;
-               for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;  }
-               outTree << endl;
+       
+               #ifdef USE_MPI
+                       int pid;
+                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+                       if (pid == 0) {  
                
+               #endif
+
+                       ofstream outTree;
+                       openOutputFile(treefilename, outTree);
+                       
+                       //print treenodes
+                       outTree << tree.size() << endl;
+                       for (int i = 0; i < tree.size(); i++) {
+                               outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
+                       }
+                       
+                       //print genus nodes
+                       outTree << endl << uniqueTaxonomies.size() << endl;
+                       map<int, int>::iterator it2;
+                       for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;  }
+                       outTree << endl;
+                       
+                       
+                       outTree.close();
                
-               outTree.close();
+               #ifdef USE_MPI
+                       }
+               #endif
+
                
        }
        catch(exception& e) {
index b65a651b7a864f45ee04617ef447f344005a3aba..a57cf256ecf0e0eded1182681f7ab91601e0a1a5 100644 (file)
@@ -32,15 +32,14 @@ class PhyloTree {
 public:
        PhyloTree();
        PhyloTree(string);  //pass it a taxonomy file and it makes the tree
-       PhyloTree(ifstream&);  //pass it a taxonomy file and it makes the train.tree
+       PhyloTree(ifstream&, string);  //pass it a taxonomy file and it makes the train.tree
        ~PhyloTree() {};
        int addSeqToTree(string, string);
        void assignHeirarchyIDs(int);
-       void print(ofstream&);
        void printTreeNodes(string); //used by bayesian to save time
        vector<int> getGenusNodes();
        vector<int> getGenusTotals();   
-       void binUnclassified();
+       void setUp(string);  //used to create file needed for summary file if you use () constructor and add seqs manually instead of passing taxonomyfile
                
        TaxNode get(int i)                              {       return tree[i];                                                 }
        TaxNode get(string seqName)             {       return tree[name2Taxonomy[seqName]];    }
@@ -52,12 +51,15 @@ public:
        
 private:
        string getNextTaxon(string&);
+       void print(ofstream&, vector<TaxNode>&);
+       void binUnclassified(string);
+       
        vector<TaxNode> tree;
        vector<int> genusIndex; //holds the indexes in tree where the genus level taxonomies are stored
        vector<int> totals; //holds the numSeqs at each genus level taxonomy
        map<string, int> name2Taxonomy;  //maps name to index in tree
        map<int, int> uniqueTaxonomies;  //map of unique taxonomies
-       void print(int, ofstream&);
+       //void print(int, ofstream&);
        int numNodes;
        int numSeqs;
        int maxLevel;
diff --git a/rawtrainingdatamaker.cpp b/rawtrainingdatamaker.cpp
deleted file mode 100644 (file)
index 362cdb7..0000000
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- *  rawTrainingDataMaker.cpp
- *  Mothur
- *
- *  Created by westcott on 4/21/10.
- *  Copyright 2010 Schloss Lab. All rights reserved.
- *
- */
-
-#include "rawtrainingdatamaker.h"
-
-/**************************************************************************************************/
-
-RawTrainingDataMaker::RawTrainingDataMaker(){
-       try {
-               m = MothurOut::getInstance();
-               numNodes = 1;
-               numSeqs = 0;
-               tree.push_back(rawTaxNode("Root"));
-               tree[0].rank = "Root";
-               maxLevel = 0;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-
-RawTrainingDataMaker::RawTrainingDataMaker(string tfile){
-       try {
-               m = MothurOut::getInstance();
-               numNodes = 1;
-               numSeqs = 0;
-               tree.push_back(rawTaxNode("Root"));
-               tree[0].rank = "Root";
-               maxLevel = 0;
-               
-               ifstream in;
-               openInputFile(tfile, in);
-               
-               //read in users taxonomy file and add sequences to tree
-               string name, tax;
-               while(!in.eof()){
-                       in >> name >> tax; gobble(in);
-                       
-                       addSeqToTree(name, tax);
-               }
-               in.close();
-       
-               assignRank(0);
-       }
-       catch(exception& e) {
-               m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker");
-               exit(1);
-       }
-}
-
-/**************************************************************************************************/
-
-string RawTrainingDataMaker::getNextTaxon(string& heirarchy){
-       try {
-               string currentLevel = "";
-               if(heirarchy != ""){
-                       int pos = heirarchy.find_first_of(';');
-                       currentLevel=heirarchy.substr(0,pos);
-                       if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
-                       else { heirarchy = ""; }
-               }
-               return currentLevel;
-       }
-       catch(exception& e) {
-               m->errorOut(e, "RawTrainingDataMaker", "getNextTaxon");
-               exit(1);
-       }
-}
-
-/**************************************************************************************************/
-
-int RawTrainingDataMaker::addSeqToTree(string seqName, string seqTaxonomy){
-       try {
-               numSeqs++;
-               
-               map<string, int>::iterator childPointer;
-               
-               int currentNode = 0;
-               string taxon;
-               
-               while(seqTaxonomy != ""){
-                       
-                       if (m->control_pressed) { return 0; }
-                       
-                       //somehow the parent is getting one too many accnos
-                       //use print to reassign the taxa id
-                       taxon = getNextTaxon(seqTaxonomy);
-                       
-                       childPointer = tree[currentNode].children.find(taxon);
-                       
-                       if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
-                               currentNode = childPointer->second;
-                       }else{                                                                                  //otherwise, create it
-                               tree.push_back(rawTaxNode(taxon));
-                               numNodes++;
-                               tree[currentNode].children[taxon] = numNodes-1;
-                               tree[numNodes-1].parent = currentNode;
-                               
-                               currentNode = tree[currentNode].children[taxon];
-                       }
-               }
-
-       }
-       catch(exception& e) {
-               m->errorOut(e, "RawTrainingDataMaker", "addSeqToTree");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-
-void RawTrainingDataMaker::assignRank(int index){
-       try {
-               map<string,int>::iterator it;
-                               
-               string ranks[9] = { "Root","Domain","Kingdom","Phylum","Class","Order","Family","Genus","Species" };
-               
-               for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
-                       tree[it->second].level = tree[index].level + 1;
-                       
-                       if (tree[it->second].level > 8) { 
-                               tree[it->second].rank = ("unknown" + toString(tree[it->second].level));
-                       }else {
-                               tree[it->second].rank = ranks[tree[it->second].level];
-                       }
-                                               
-                       //save maxLevel for binning the unclassified seqs
-                       if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } 
-                       
-                       assignRank(it->second);
-               }
-       }
-       catch(exception& e) {
-               m->errorOut(e, "RawTrainingDataMaker", "assignRank");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-
-void RawTrainingDataMaker::print(ofstream& out){
-       try {
-               //string temp = tree[0].name +" " + tree[0].rank;
-               //sanityCheck[temp] = temp;
-               
-               out << "0" << "*" << tree[0].name << "*" << tree[0].parent << "*" << tree[0].level << "*" << tree[0].rank << endl;
-               print(0, out);
-               
-       }
-       catch(exception& e) {
-               m->errorOut(e, "RawTrainingDataMaker", "print");
-               exit(1);
-       }
-}
-
-/**************************************************************************************************/
-
-void RawTrainingDataMaker::print(int i, ofstream& out){
-       try {
-               map<string,int>::iterator it;
-               for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
-                       //string temp = tree[it->second].name + " " + tree[it->second].rank;
-                       
-                       //map<string, string>::iterator itSan;
-                       //itSan = sanityCheck.find(temp);
-                       
-                       //if (itSan == sanityCheck.end()) {
-                               out << it->second << "*" << tree[it->second].name << "*" << tree[it->second].parent << "*" << tree[it->second].level << "*" << tree[it->second].rank << endl;
-                               //sanityCheck[temp] = temp;
-                       //}
-                       print(it->second, out);
-               }
-       }
-       catch(exception& e) {
-               m->errorOut(e, "RawTrainingDataMaker", "print");
-               exit(1);
-       }
-}
-/**************************************************************************************************/
-
-
-       
diff --git a/rawtrainingdatamaker.h b/rawtrainingdatamaker.h
deleted file mode 100644 (file)
index 912d781..0000000
+++ /dev/null
@@ -1,54 +0,0 @@
-#ifndef RAWTRAININGDATAMAKER_H
-#define RAWTRAININGDATAMAKER_H
-
-/*
- *  rawTrainingDataMaker.h
- *  Mothur
- *
- *  Created by westcott on 4/21/10.
- *  Copyright 2010 Schloss Lab. All rights reserved.
- *
- */
-
-#include "mothur.h"
-#include "mothurout.h"
-
-/**************************************************************************************************/
-
-struct rawTaxNode {
-       map<string, int> children;  //childs name to index in tree
-       int parent, level;
-       string name, rank;
-       
-       rawTaxNode(string n) : name(n), level(0), parent(-1) {          }
-       rawTaxNode(){}
-};
-
-/**************************************************************************************************/
-
-class RawTrainingDataMaker {
-
-public:
-       RawTrainingDataMaker();
-       RawTrainingDataMaker(string);  //pass it a taxonomy file and it makes the tree
-       ~RawTrainingDataMaker() {};
-       int addSeqToTree(string, string);
-       void assignRank(int);
-       void print(ofstream&);
-       
-private:
-       string getNextTaxon(string&);
-       vector<rawTaxNode> tree;
-       void print(int, ofstream&);
-       int numNodes;
-       int numSeqs;
-       int maxLevel;
-       //map<string, string> sanityCheck;
-       MothurOut* m;
-};
-
-/**************************************************************************************************/
-
-#endif
-
-