]> git.donarmstrong.com Git - mothur.git/commitdiff
made classifier faster
authorwestcott <westcott>
Mon, 26 Apr 2010 15:05:21 +0000 (15:05 +0000)
committerwestcott <westcott>
Mon, 26 Apr 2010 15:05:21 +0000 (15:05 +0000)
17 files changed:
Mothur.xcodeproj/project.pbxproj
bayesian.cpp
bellerophon.cpp
chimerabellerophoncommand.cpp
chimerapintailcommand.cpp
chimeraslayercommand.cpp
classify.cpp
classify.h
classifyseqscommand.cpp
getoturepcommand.cpp
knn.cpp
makefile
phylotree.cpp
phylotree.h
rawtrainingdatamaker.cpp [new file with mode: 0644]
rawtrainingdatamaker.h [new file with mode: 0644]
readdistcommand.cpp

index 79c1e5b704ab469a27e074500f36a5dcf710f9fb..805bde0484399fcb37491f58a97cec4f775288bd 100644 (file)
@@ -12,6 +12,8 @@
                A747E81C116365E000FB9042 /* chimeraslayercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayercommand.h; sourceTree = "<group>"; };
                A747E81D116365E000FB9042 /* chimeraslayercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayercommand.cpp; sourceTree = "<group>"; };
                A7639F8D1175DF35008F5578 /* makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = makefile; sourceTree = "<group>"; };
+               A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rawtrainingdatamaker.h; sourceTree = SOURCE_ROOT; };
+               A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rawtrainingdatamaker.cpp; sourceTree = SOURCE_ROOT; };
                A78254461164D7790002E2DD /* chimerapintailcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerapintailcommand.h; sourceTree = "<group>"; };
                A78254471164D7790002E2DD /* chimerapintailcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerapintailcommand.cpp; sourceTree = "<group>"; };
                A7825502116519F70002E2DD /* chimerabellerophoncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerabellerophoncommand.h; sourceTree = "<group>"; };
                                A7DA208A113FECD400BF472F /* knn.h */,
                                A7DA20C4113FECD400BF472F /* phylotree.cpp */,
                                A7DA20C5113FECD400BF472F /* phylotree.h */,
+                               A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */,
+                               A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */,
                                A7DA215C113FECD400BF472F /* taxonomyequalizer.cpp */,
                                A7DA215D113FECD400BF472F /* taxonomyequalizer.h */,
                        );
index b007c006dc741c2fd1755d60dcee7d3f5942aa39..0929749095386a72c378ac1c9ea586e3f3d354d2 100644 (file)
@@ -9,29 +9,17 @@
 
 #include "bayesian.h"
 #include "kmer.hpp"
+#include "rawtrainingdatamaker.h"
 
 /**************************************************************************************************/
 Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) : 
-Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
+Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
        try {
                                        
-               numKmers = database->getMaxKmer() + 1;
-               
-               //initialze probabilities
-               wordGenusProb.resize(numKmers);
-               
-               genusNodes = phyloTree->getGenusNodes(); 
-               
-               for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
-                       
-               //reset counts because we are on a new word
-               for (int j = 0; j < genusNodes.size(); j++) {
-                       TaxNode temp = phyloTree->get(genusNodes[j]);
-                       genusTotals.push_back(temp.accessions.size());
-               }
-
-               
                /************calculate the probablity that each word will be in a specific taxonomy*************/
+               string phyloTreeName = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.train";
+               ifstream phyloTreeTest(phyloTreeName.c_str());
+               
                ofstream out;
                string probFileName = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.prob";
                ifstream probFileTest(probFileName.c_str());
@@ -42,15 +30,47 @@ Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), c
                
                int start = time(NULL);
                
-               if(probFileTest && probFileTest2){      
+               if(probFileTest && probFileTest2 && phyloTreeTest){     
+                       m->mothurOut("Reading template taxonomy...     "); cout.flush();
+                       
+                       phyloTree = new PhyloTree(phyloTreeTest);
+       
+                       m->mothurOut("DONE."); m->mothurOutEndLine();
+                       
+                       genusNodes = phyloTree->getGenusNodes(); 
+                       genusTotals = phyloTree->getGenusTotals();
+               
                        m->mothurOut("Reading template probabilities...     "); cout.flush();
                        readProbFile(probFileTest, probFileTest2);      
+                       
                }else{
+               
+                       //create search database and names vector
+                       generateDatabaseAndNames(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0);
+                       
+                       genusNodes = phyloTree->getGenusNodes(); 
+                       genusTotals = phyloTree->getGenusTotals();
+                       
+                       m->mothurOut("Calculating template taxonomy tree...     "); cout.flush();
+                       
+                       phyloTree->printTreeNodes(phyloTreeName);
+                                               
+                       m->mothurOut("DONE."); m->mothurOutEndLine();
+                       
                        m->mothurOut("Calculating template probabilities...     "); cout.flush();
+                       
+                       numKmers = database->getMaxKmer() + 1;
+               
+                       //initialze probabilities
+                       wordGenusProb.resize(numKmers);
+               
+                       for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
 
                        ofstream out;
                        openOutputFile(probFileName, out);
                        
+                       out << numKmers << endl;
+                       
                        ofstream out2;
                        openOutputFile(probFileName2, out2);
                        
@@ -86,9 +106,14 @@ Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), c
                        
                        out.close();
                        out2.close();
+                       
+                       //read in new phylotree with less info. - its faster
+                       ifstream phyloTreeTest(phyloTreeName.c_str());
+                       delete phyloTree;
+                       
+                       phyloTree = new PhyloTree(phyloTreeTest);
                }
-               
-               
+       
                m->mothurOut("DONE."); m->mothurOutEndLine();
                m->mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); m->mothurOutEndLine();
        }
@@ -113,14 +138,12 @@ string Bayesian::getTaxonomy(Sequence* seq) {
                for (int i = 0; i < queryKmerString.length(); i++) {
                        if (queryKmerString[i] != '!') { //this kmer is in the query
                                queryKmers.push_back(i);
                        }
                }
                
                if (queryKmers.size() == 0) {  m->mothurOut(seq->getName() + "is bad."); m->mothurOutEndLine(); return "bad seq"; }
                
                int index = getMostProbableTaxonomy(queryKmers);
-
                
                if (m->control_pressed) { return tax; }
                                        
@@ -272,18 +295,25 @@ map<string, int> Bayesian::parseTaxMap(string newTax) {
 void Bayesian::readProbFile(ifstream& in, ifstream& inNum) {
        try{
                
+               in >> numKmers; gobble(in);
+               
+               //initialze probabilities
+               wordGenusProb.resize(numKmers);
+               
+               for (int j = 0; j < wordGenusProb.size(); j++) {        wordGenusProb[j].resize(genusNodes.size());             }
+               
                int kmer, name, count;  count = 0;
                vector<int> num; num.resize(numKmers);
                float prob;
                vector<float> zeroCountProb; zeroCountProb.resize(numKmers);            
-               
+       
                while (inNum) {
                        inNum >> zeroCountProb[count] >> num[count];  
                        count++;
                        gobble(inNum);
                }
                inNum.close();
-               
+       
                while(in) {
                        in >> kmer;
                        
index e33230ed8b9949550bf716cb82e4c7bf6a6353ae..54025eb9285c87be35039a3051454cfd763a3c4d 100644 (file)
@@ -132,7 +132,7 @@ int Bellerophon::print(ostream& out, ostream& outAcc) {
 //***************************************************************************************************************
 int Bellerophon::print(MPI_File& out, MPI_File& outAcc) {
        try {
-               
+       
                int pid;
                MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
                
@@ -148,17 +148,26 @@ int Bellerophon::print(MPI_File& out, MPI_File& outAcc) {
 
                        if (m->control_pressed) { return numSeqs; }
                        
-                       outString += "Name\tScore\tLeft\tRight\n";
+                       outString = "Name\tScore\tLeft\tRight\n";
+                       MPI_Status status;
+                       int olength = outString.length();
+                       char* buf5 = new char[olength];
+                       memcpy(buf5, outString.c_str(), olength);
+                                       
+                       MPI_File_write_shared(out, buf5, olength, MPI_CHAR, &status);
+                       
+                       delete buf5;
+
                        //output prefenence structure to .chimeras file
                        for (int i = 0; i < best.size(); i++) {
                                
                                if (m->control_pressed) {  return numSeqs; }
                                
-                               outString += best[i].name + "\t" +  toString(best[i].score) + "\t" + best[i].leftParent + "\t" + best[i].rightParent + "\n";
-                               
+                               outString = best[i].name + "\t" +  toString(best[i].score) + "\t" + best[i].leftParent + "\t" + best[i].rightParent + "\n";
+                       
                                MPI_Status status;
                                int length = outString.length();
-                               char* buf2 = new char[length];\r
+                               char* buf2 = new char[length];
                                memcpy(buf2, outString.c_str(), length);
                                        
                                MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status);
@@ -168,12 +177,12 @@ int Bellerophon::print(MPI_File& out, MPI_File& outAcc) {
                                //calc # of seqs with preference above 95%tile
                                if (best[i].score >= cutoffScore) { 
                                        above1++; 
-                                       string outAccString;
+                                       string outAccString = "";
                                         outAccString += best[i].name + "\n";
                                        
                                        MPI_Status statusAcc;
                                        length = outAccString.length();
-                                       char* buf = new char[length];\r
+                                       char* buf = new char[length];
                                        memcpy(buf, outAccString.c_str(), length);
                                        
                                        MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc);
@@ -291,7 +300,7 @@ int Bellerophon::getChimeras() {
                                if (m->control_pressed) { return 0; }
                                
                                int bestLength = MPIBestSend[i].length();
-                               char* buf = new char[bestLength];\r
+                               char* buf = new char[bestLength];
                                memcpy(buf, MPIBestSend[i].c_str(), bestLength);
                                
                                MPI_Send(&bestLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD);
index 67f6b57be1a64f9ae760c86258cea67cdbd62f02..668956f838306c2e3bbb0eb0a06b7089a228960d 100644 (file)
@@ -90,7 +90,7 @@ void ChimeraBellerophonCommand::help(){
                m->mothurOut("The chimera.bellerophon command reads a fastafile and creates list of potentially chimeric sequences.\n");
                m->mothurOut("The chimera.bellerophon command parameters are fasta, filter, correction, processors, window, increment. The fasta parameter is required.\n");
                m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter, default=false. \n");
-               m->mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.\n");
+               m->mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs, default=true.\n");
                m->mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
                #ifdef USE_MPI
                m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
index 384c88513f06f9bdf8a738a62cb2a11cf6596661..62f1f2bae942f1c91ca1ee16bb21b6a4996c790e 100644 (file)
@@ -147,7 +147,7 @@ void ChimeraPintailCommand::help(){
                #ifdef USE_MPI
                m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
                #endif
-               m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=1/4 sequence length. \n");
+               m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=300. \n");
                m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=25.\n");
                m->mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n");
                m->mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences, if you use the filter the quantile file generated becomes unique to the fasta file you used.\n");
@@ -175,12 +175,11 @@ int ChimeraPintailCommand::execute(){
                
                int start = time(NULL); 
                
-               chimera = new Pintail(fastafile, templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir);
-               
                //set user options
                if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine();  }
                
-
+               chimera = new Pintail(fastafile, templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir);
+               
                string outputFileName, accnosFileName;
                if (maskfile != "") {
                        outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + ".pintail.chimeras";
index 8bb8c2c5e53a860939e092f23ebb5853e5186b44..58aecf58ccd10354c5aca094d20827c44fc00ff9 100644 (file)
@@ -137,14 +137,13 @@ void ChimeraSlayerCommand::help(){
        
                m->mothurOut("The chimera.slayer command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
                m->mothurOut("This command was modeled after the chimeraSlayer written by the Broad Institute.\n");
-               m->mothurOut("The chimera.slayer command parameters are fasta, template, filter, mask, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign,
+               m->mothurOut("The chimera.slayer command parameters are fasta, template, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign,
                m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
                m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
                m->mothurOut("The processors parameter allows you to specify how many processors you would like to use.  The default is 1. \n");
                #ifdef USE_MPI
                m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
                #endif
-               m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences. \n");
                m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=50. \n");
                m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=5.\n");
                m->mothurOut("The numwanted parameter allows you to specify how many sequences you would each query sequence compared with, default=15.\n");
@@ -160,7 +159,6 @@ void ChimeraSlayerCommand::help(){
                m->mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n");
                m->mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. \n");
                //m->mothurOut("The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default false. Found to make results worse. \n");
-               m->mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n");
                m->mothurOut("The chimera.slayer command should be in the following format: \n");
                m->mothurOut("chimera.slayer(fasta=yourFastaFile, template=yourTemplate, search=yourSearch) \n");
                m->mothurOut("Example: chimera.slayer(fasta=AD.align, template=core_set_aligned.imputed.fasta, search=kmer) \n");
index c3e52abc5bcd9347853945346ee12cf641a268e3..be287a05069c394def3f1cf48257f25b863db3be 100644 (file)
-/*\r
- *  classify.cpp\r
- *  Mothur\r
- *\r
- *  Created by westcott on 11/3/09.\r
- *  Copyright 2009 Schloss Lab. All rights reserved.\r
- *\r
- */\r
-\r
-#include "classify.h"\r
-#include "sequence.hpp"\r
-#include "kmerdb.hpp"\r
-#include "suffixdb.hpp"\r
-#include "blastdb.hpp"\r
-#include "distancedb.hpp"\r
-\r
-/**************************************************************************************************/\r
-Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) {         \r
-       try {   \r
-               m = MothurOut::getInstance();                                                                   \r
-               readTaxonomy(taxFile);  \r
-               \r
-               int start = time(NULL);\r
-               int numSeqs = 0;\r
-               \r
-               m->mothurOut("Generating search database...    "); cout.flush();\r
-#ifdef USE_MPI \r
-                       int pid;\r
-                       vector<long> positions;\r
-               \r
-                       MPI_Status status; \r
-                       MPI_File inMPI;\r
-                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are\r
-\r
-                       //char* inFileName = new char[tempFile.length()];\r
-                       //memcpy(inFileName, tempFile.c_str(), tempFile.length());\r
-                       \r
-                       char inFileName[1024];\r
-                       strcpy(inFileName, tempFile.c_str());\r
-\r
-                       MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer\r
-                       //delete inFileName;\r
-\r
-                       if (pid == 0) { //only one process needs to scan file\r
-                               positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs\r
-\r
-                               //send file positions to all processes\r
-                               MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs\r
-                               MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos     \r
-                       }else{\r
-                               MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs\r
-                               positions.resize(numSeqs);\r
-                               MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions\r
-                       }\r
-                       \r
-                       //create database\r
-                       if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }\r
-                       else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }\r
-                       else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }\r
-                       else if(method == "distance")   {       database = new DistanceDB();    }\r
-                       else {\r
-                               m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();\r
-                               database = new KmerDB(tempFile, 8);\r
-                       }\r
-\r
-                       //read file \r
-                       for(int i=0;i<numSeqs;i++){\r
-                               //read next sequence\r
-                               int length = positions[i+1] - positions[i];\r
-                               char* buf4 = new char[length];\r
-                               MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);\r
-                               \r
-                               string tempBuf = buf4;\r
-                               if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }\r
-                               delete buf4;\r
-                               istringstream iss (tempBuf,istringstream::in);\r
-                               \r
-                               Sequence temp(iss);  \r
-                               if (temp.getName() != "") {\r
-                                       names.push_back(temp.getName());\r
-                                       database->addSequence(temp);    \r
-                               }\r
-                       }\r
-                       \r
-                       database->generateDB();\r
-                       MPI_File_close(&inMPI);\r
-       #else\r
-               \r
-               //need to know number of template seqs for suffixdb\r
-               if (method == "suffix") {\r
-                       ifstream inFASTA;\r
-                       openInputFile(tempFile, inFASTA);\r
-                       numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');\r
-                       inFASTA.close();\r
-               }\r
-\r
-               bool needToGenerate = true;\r
-               string kmerDBName;\r
-               if(method == "kmer")                    {       \r
-                       database = new KmerDB(tempFile, kmerSize);                      \r
-                       \r
-                       kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";\r
-                       ifstream kmerFileTest(kmerDBName.c_str());\r
-                       if(kmerFileTest){       needToGenerate = false;         }\r
-               }\r
-               else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }\r
-               else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }\r
-               else if(method == "distance")   {       database = new DistanceDB();    }\r
-               else {\r
-                       m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");\r
-                       m->mothurOutEndLine();\r
-                       database = new KmerDB(tempFile, 8);\r
-               }\r
-               \r
-               if (needToGenerate) {\r
-                       ifstream fastaFile;\r
-                       openInputFile(tempFile, fastaFile);\r
-                       \r
-                       while (!fastaFile.eof()) {\r
-                               Sequence temp(fastaFile);\r
-                               gobble(fastaFile);\r
-                       \r
-                               names.push_back(temp.getName());\r
-                                                               \r
-                               database->addSequence(temp);    \r
-                       }\r
-                       fastaFile.close();\r
-\r
-                       database->generateDB();\r
-                       \r
-               }else if ((method == "kmer") && (!needToGenerate)) {    \r
-                       ifstream kmerFileTest(kmerDBName.c_str());\r
-                       database->readKmerDB(kmerFileTest);     \r
-                       \r
-                       ifstream fastaFile;\r
-                       openInputFile(tempFile, fastaFile);\r
-                       \r
-                       while (!fastaFile.eof()) {\r
-                               Sequence temp(fastaFile);\r
-                               gobble(fastaFile);\r
-                       \r
-                               names.push_back(temp.getName());\r
-                       }\r
-                       fastaFile.close();\r
-               }\r
-#endif         \r
-               database->setNumSeqs(names.size());\r
-               \r
-               m->mothurOut("DONE."); m->mothurOutEndLine();\r
-               m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();\r
-\r
-       }\r
-       catch(exception& e) {\r
-               m->errorOut(e, "Classify", "Classify");\r
-               exit(1);\r
-       }\r
-}\r
-/**************************************************************************************************/\r
-\r
-void Classify::readTaxonomy(string file) {\r
-       try {\r
-               \r
-               phyloTree = new PhyloTree();\r
-               string name, taxInfo;\r
-               \r
-               m->mothurOutEndLine();\r
-               m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();\r
-\r
-#ifdef USE_MPI \r
-               int pid, num;\r
-               vector<long> positions;\r
-               \r
-               MPI_Status status; \r
-               MPI_File inMPI;\r
-               MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are\r
-\r
-               //char* inFileName = new char[file.length()];\r
-               //memcpy(inFileName, file.c_str(), file.length());\r
-               \r
-               char inFileName[1024];\r
-               strcpy(inFileName, file.c_str());\r
-\r
-               MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer\r
-               //delete inFileName;\r
-\r
-               if (pid == 0) {\r
-                       positions = setFilePosEachLine(file, num);\r
-                       \r
-                       //send file positions to all processes\r
-                       MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs\r
-                       MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos \r
-               }else{\r
-                       MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs\r
-                       positions.resize(num);\r
-                       MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions\r
-               }\r
-       \r
-               //read file \r
-               for(int i=0;i<num;i++){\r
-                       //read next sequence\r
-                       int length = positions[i+1] - positions[i];\r
-                       char* buf4 = new char[length];\r
-\r
-                       MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);\r
-\r
-                       string tempBuf = buf4;\r
-                       if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }\r
-                       delete buf4;\r
-\r
-                       istringstream iss (tempBuf,istringstream::in);\r
-                       iss >> name >> taxInfo;\r
-                       taxonomy[name] = taxInfo;\r
-                       phyloTree->addSeqToTree(name, taxInfo);\r
-               }\r
-               \r
-               MPI_File_close(&inMPI);\r
-#else                          \r
-               ifstream inTax;\r
-               openInputFile(file, inTax);\r
-       \r
-               //read template seqs and save\r
-               while (!inTax.eof()) {\r
-                       inTax >> name >> taxInfo;\r
-                       \r
-                       taxonomy[name] = taxInfo;\r
-                       \r
-                       phyloTree->addSeqToTree(name, taxInfo);\r
-               \r
-                       gobble(inTax);\r
-               }\r
-               inTax.close();\r
-#endif \r
-       \r
-               phyloTree->assignHeirarchyIDs(0);\r
-               \r
-               m->mothurOut("DONE.");\r
-               m->mothurOutEndLine();  cout.flush();\r
-       \r
-       }\r
-       catch(exception& e) {\r
-               m->errorOut(e, "Classify", "readTaxonomy");\r
-               exit(1);\r
-       }\r
-}\r
-/**************************************************************************************************/\r
-\r
-vector<string> Classify::parseTax(string tax) {\r
-       try {\r
-               vector<string> taxons;\r
-               \r
-               tax = tax.substr(0, tax.length()-1);  //get rid of last ';'\r
-       \r
-               //parse taxonomy\r
-               string individual;\r
-               while (tax.find_first_of(';') != -1) {\r
-                       individual = tax.substr(0,tax.find_first_of(';'));\r
-                       tax = tax.substr(tax.find_first_of(';')+1, tax.length());\r
-                       taxons.push_back(individual);\r
-                       \r
-               }\r
-               //get last one\r
-               taxons.push_back(tax);\r
-               \r
-               return taxons;\r
-       }\r
-       catch(exception& e) {\r
-               m->errorOut(e, "Classify", "parseTax");\r
-               exit(1);\r
-       }\r
-}\r
-/**************************************************************************************************/\r
-\r
+/*
+ *  classify.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 11/3/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "classify.h"
+#include "sequence.hpp"
+#include "kmerdb.hpp"
+#include "suffixdb.hpp"
+#include "blastdb.hpp"
+#include "distancedb.hpp"
+
+/**************************************************************************************************/
+void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch)  {            
+       try {   
+               taxFile = tfile;
+               readTaxonomy(taxFile);  
+               
+               templateFile = tempFile;        
+               
+               int start = time(NULL);
+               int numSeqs = 0;
+               
+               m->mothurOut("Generating search database...    "); cout.flush();
+#ifdef USE_MPI 
+                       int pid;
+                       vector<long> positions;
+               
+                       MPI_Status status; 
+                       MPI_File inMPI;
+                       MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+                       //char* inFileName = new char[tempFile.length()];
+                       //memcpy(inFileName, tempFile.c_str(), tempFile.length());
+                       
+                       char inFileName[1024];
+                       strcpy(inFileName, tempFile.c_str());
+
+                       MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+                       //delete inFileName;
+
+                       if (pid == 0) { //only one process needs to scan file
+                               positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
+
+                               //send file positions to all processes
+                               MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
+                               MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos     
+                       }else{
+                               MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+                               positions.resize(numSeqs);
+                               MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+                       }
+                       
+                       //create database
+                       if(method == "kmer")                    {       database = new KmerDB(tempFile, kmerSize);                      }
+                       else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
+                       else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }
+                       else if(method == "distance")   {       database = new DistanceDB();    }
+                       else {
+                               m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
+                               database = new KmerDB(tempFile, 8);
+                       }
+
+                       //read file 
+                       for(int i=0;i<numSeqs;i++){
+                               //read next sequence
+                               int length = positions[i+1] - positions[i];
+                               char* buf4 = new char[length];
+                               MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+                               
+                               string tempBuf = buf4;
+                               if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+                               delete buf4;
+                               istringstream iss (tempBuf,istringstream::in);
+                               
+                               Sequence temp(iss);  
+                               if (temp.getName() != "") {
+                                       names.push_back(temp.getName());
+                                       database->addSequence(temp);    
+                               }
+                       }
+                       
+                       database->generateDB();
+                       MPI_File_close(&inMPI);
+       #else
+               
+               //need to know number of template seqs for suffixdb
+               if (method == "suffix") {
+                       ifstream inFASTA;
+                       openInputFile(tempFile, inFASTA);
+                       numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
+                       inFASTA.close();
+               }
+
+               bool needToGenerate = true;
+               string kmerDBName;
+               if(method == "kmer")                    {       
+                       database = new KmerDB(tempFile, kmerSize);                      
+                       
+                       kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+                       ifstream kmerFileTest(kmerDBName.c_str());
+                       if(kmerFileTest){       needToGenerate = false;         }
+               }
+               else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
+               else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }
+               else if(method == "distance")   {       database = new DistanceDB();    }
+               else {
+                       m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
+                       m->mothurOutEndLine();
+                       database = new KmerDB(tempFile, 8);
+               }
+               
+               if (needToGenerate) {
+                       ifstream fastaFile;
+                       openInputFile(tempFile, fastaFile);
+                       
+                       while (!fastaFile.eof()) {
+                               Sequence temp(fastaFile);
+                               gobble(fastaFile);
+                       
+                               names.push_back(temp.getName());
+                                                               
+                               database->addSequence(temp);    
+                       }
+                       fastaFile.close();
+
+                       database->generateDB();
+                       
+               }else if ((method == "kmer") && (!needToGenerate)) {    
+                       ifstream kmerFileTest(kmerDBName.c_str());
+                       database->readKmerDB(kmerFileTest);     
+                       
+                       ifstream fastaFile;
+                       openInputFile(tempFile, fastaFile);
+                       
+                       while (!fastaFile.eof()) {
+                               Sequence temp(fastaFile);
+                               gobble(fastaFile);
+
+                               names.push_back(temp.getName());
+                       }
+                       fastaFile.close();
+               }
+#endif         
+               database->setNumSeqs(names.size());
+               
+               m->mothurOut("DONE."); m->mothurOutEndLine();
+               m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
+
+       }
+       catch(exception& e) {
+               m->errorOut(e, "Classify", "generateDatabaseAndNames");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+Classify::Classify() {         m = MothurOut::getInstance();   database = NULL;        }
+/**************************************************************************************************/
+
+int Classify::readTaxonomy(string file) {
+       try {
+               
+               phyloTree = new PhyloTree();
+               string name, taxInfo;
+               
+               m->mothurOutEndLine();
+               m->mothurOut("Reading in the " + file + " taxonomy...\t");      cout.flush();
+
+#ifdef USE_MPI 
+               int pid, num;
+               vector<long> positions;
+               
+               MPI_Status status; 
+               MPI_File inMPI;
+               MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+               //char* inFileName = new char[file.length()];
+               //memcpy(inFileName, file.c_str(), file.length());
+               
+               char inFileName[1024];
+               strcpy(inFileName, file.c_str());
+
+               MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);  //comm, filename, mode, info, filepointer
+               //delete inFileName;
+
+               if (pid == 0) {
+                       positions = setFilePosEachLine(file, num);
+                       
+                       //send file positions to all processes
+                       MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD);  //send numSeqs
+                       MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos 
+               }else{
+                       MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+                       positions.resize(num);
+                       MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+               }
+       
+               //read file 
+               for(int i=0;i<num;i++){
+                       //read next sequence
+                       int length = positions[i+1] - positions[i];
+                       char* buf4 = new char[length];
+
+                       MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+
+                       string tempBuf = buf4;
+                       if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+                       delete buf4;
+
+                       istringstream iss (tempBuf,istringstream::in);
+                       iss >> name >> taxInfo;
+                       taxonomy[name] = taxInfo;
+                       phyloTree->addSeqToTree(name, taxInfo);
+               }
+               
+               MPI_File_close(&inMPI);
+#else                          
+               ifstream inTax;
+               openInputFile(file, inTax);
+       
+               //read template seqs and save
+               while (!inTax.eof()) {
+                       inTax >> name >> taxInfo;
+                       
+                       taxonomy[name] = taxInfo;
+                       
+                       phyloTree->addSeqToTree(name, taxInfo);
+               
+                       gobble(inTax);
+               }
+               inTax.close();
+#endif 
+       
+               phyloTree->assignHeirarchyIDs(0);
+               
+               m->mothurOut("DONE.");
+               m->mothurOutEndLine();  cout.flush();
+               
+               return phyloTree->getNumSeqs();
+       
+       }
+       catch(exception& e) {
+               m->errorOut(e, "Classify", "readTaxonomy");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+vector<string> Classify::parseTax(string tax) {
+       try {
+               vector<string> taxons;
+               
+               tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
+       
+               //parse taxonomy
+               string individual;
+               while (tax.find_first_of(';') != -1) {
+                       individual = tax.substr(0,tax.find_first_of(';'));
+                       tax = tax.substr(tax.find_first_of(';')+1, tax.length());
+                       taxons.push_back(individual);
+                       
+               }
+               //get last one
+               taxons.push_back(tax);
+               
+               return taxons;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "Classify", "parseTax");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
index 1a34362ce798072d8554bb22a997b11dc16d223b..a2401dde6c10feaa4f83c6f3f7a65b8ff885dafd 100644 (file)
@@ -26,13 +26,14 @@ class Sequence;
 class Classify {
 
 public:
-       Classify(string, string, string, int, float, float, float, float);
+       Classify();
        
-       virtual ~Classify(){  delete phyloTree; delete database;  };
+       virtual ~Classify(){  delete phyloTree; if (database != NULL) {  delete database; } };
        virtual string getTaxonomy(Sequence*) = 0;
        //virtual map<string, int> getConfidenceScores() { return taxConfidenceScore; }
        //virtual vector<string> parseTax(string);
        virtual string getSimpleTax()  { return simpleTax;      }
+       virtual void generateDatabaseAndNames(string, string, string, int, float, float, float, float);
        
 protected:
 
@@ -46,9 +47,10 @@ protected:
        string taxFile, templateFile, simpleTax;
        vector<string> names;
        
-       void readTaxonomy(string);
+       int readTaxonomy(string);
        vector<string> parseTax(string);
        MothurOut* m;
+       
 };
 
 /**************************************************************************************************/
index bed65806ca72aecc1c26536e1dcbbe6d74b8b67b..a38bf8f34ad0e15e98e85f5b4ad627babefcbf2b 100644 (file)
@@ -487,6 +487,10 @@ int ClassifySeqsCommand::execute(){
                        if (pid == 0) {  //this part does not need to be paralellized
                #endif
 
+                       m->mothurOutEndLine();
+                       m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
+                       start = time(NULL);
+                       
                        //make taxonomy tree from new taxonomy file 
                        PhyloTree taxaBrowser;
                        
@@ -570,8 +574,8 @@ int ClassifySeqsCommand::execute(){
                        m->mothurOutEndLine();
 
                        
-                       m->mothurOutEndLine();
-                       m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
+                       //m->mothurOutEndLine();
+                       //m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for  " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
                }
                
                delete classify;
index e9373a93cbf81ae6bcd84bdd063e01947994de6e..2c62efd0e3442f63dcbdbc48d9c7d56072c3012c 100644 (file)
@@ -327,7 +327,7 @@ int GetOTURepCommand::execute(){
                
                if (m->control_pressed) { 
                        if (large) {  inRow.close(); remove(distFile.c_str());  }
-                       delete groupMap; delete fasta; return 0; 
+                       delete fasta; return 0; 
                }
                                
                //if user gave a namesfile then use it
@@ -350,7 +350,7 @@ int GetOTURepCommand::execute(){
                
                if (m->control_pressed) { 
                        if (large) {  inRow.close(); remove(distFile.c_str());  }
-                       delete groupMap; delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
+                       delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
                }
        
                while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
diff --git a/knn.cpp b/knn.cpp
index 93a7aa33faa6e2e1d9cb43c0505464056cbb1f08..64b5b3a288db8ce2f1d6da7e08af90bd8e26e613 100644 (file)
--- a/knn.cpp
+++ b/knn.cpp
 
 /**************************************************************************************************/
 Knn::Knn(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch, int n) 
-: Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch), num(n)  {}
+: Classify(), num(n)  {
+       try {
+               //create search database and names vector
+               generateDatabaseAndNames(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch);
+       }
+       catch(exception& e) {
+               m->errorOut(e, "Knn", "Knn");
+               exit(1);
+       }
+}
 /**************************************************************************************************/
 string Knn::getTaxonomy(Sequence* seq) {
        try {
                string tax;
                
                //use database to find closest seq
-
                vector<int> closest = database->findClosestSequences(seq, num);
                
                if (m->control_pressed) { return tax; }
index d62f0290e5c9dce0b4d8100a6e92940ccca2e1df..5812d4551f9c6cc635f8ed4ee5bde2505a4b18df 100644 (file)
--- a/makefile
+++ b/makefile
@@ -26,7 +26,7 @@ ifeq  ($(strip $(USEREADLINE)),yes)
       -L../readline-6.0\r
 endif\r
 \r
-USEMPI ?= yes\r
+USEMPI ?= no\r
 \r
 ifeq  ($(strip $(USEMPI)),yes)\r
     CC_OPTIONS += -DUSE_MPI\r
@@ -219,7 +219,8 @@ mothur : \
                ./parsesffcommand.o\\r
                ./classify.o\\r
                ./phylotree.o\\r
-               ./bayesian.o\\r
+               ./bayesian.o\
+               ./rawtrainingdatamaker.o\\r
                ./alignmentdb.o\\r
                ./knn.o\\r
                ./distancedb.o\\r
@@ -416,7 +417,8 @@ mothur : \
                ./parsesffcommand.o\\r
                ./classify.o\\r
                ./phylotree.o\\r
-               ./bayesian.o\\r
+               ./bayesian.o\
+               ./rawtrainingdatamaker.o\\r
                ./alignmentdb.o\\r
                ./knn.o\\r
                ./distancedb.o\\r
@@ -616,7 +618,8 @@ clean :
                ./parsesffcommand.o\\r
                ./classify.o\\r
                ./phylotree.o\\r
-               ./bayesian.o\\r
+               ./bayesian.o\
+               ./rawtrainingdatamaker.o\\r
                ./alignmentdb.o\\r
                ./knn.o\\r
                ./distancedb.o\\r
@@ -1619,6 +1622,11 @@ install : mothur
 # Item # 196 -- chimerabellerophoncommand --\r
 ./chimerabellerophoncommand.o : chimerabellerophoncommand.cpp\r
        $(CC) $(CC_OPTIONS) chimerabellerophoncommand.cpp -c $(INCLUDE) -o ./chimerabellerophoncommand.o\r
+
+# Item # 171 -- rawtrainingdatamaker --\r
+./rawtrainingdatamaker.o : rawtrainingdatamaker.cpp\r
+       $(CC) $(CC_OPTIONS) rawtrainingdatamaker.cpp -c $(INCLUDE) -o ./rawtrainingdatamaker.o\r
+\r
 \r
 \r
 ##### END RUN ####\r
index 3cdd6acb2a33c38daf4af882253ad2c34acc39a6..33aedc6395f09fc24576b93e1b72261884d98907 100644 (file)
@@ -19,6 +19,44 @@ PhyloTree::PhyloTree(){
                tree.push_back(TaxNode("Root"));
                tree[0].heirarchyID = "0";
                maxLevel = 0;
+               calcTotals = true;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloTree", "PhyloTree");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+PhyloTree::PhyloTree(ifstream& in){
+       try {
+               m = MothurOut::getInstance();
+               calcTotals = false;
+               
+               in >> numNodes; gobble(in);
+               
+               tree.resize(numNodes);
+               
+               for (int i = 0; i < tree.size(); i++) {
+                       in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
+               }
+               
+               //read genus nodes
+               int numGenus = 0;
+               in >> numGenus; gobble(in);
+       
+               int gnode, gsize;
+               totals.clear();
+               for (int i = 0; i < numGenus; i++) {
+                       in >> gnode >> gsize; gobble(in);
+                       
+                       uniqueTaxonomies[gnode] = gnode;
+                       totals.push_back(gsize);
+               }
+                       
+               in.close();
+               
        }
        catch(exception& e) {
                m->errorOut(e, "PhyloTree", "PhyloTree");
@@ -35,6 +73,7 @@ PhyloTree::PhyloTree(string tfile){
                tree.push_back(TaxNode("Root"));
                tree[0].heirarchyID = "0";
                maxLevel = 0;
+               calcTotals = true;
                
                ifstream in;
                openInputFile(tfile, in);
@@ -149,6 +188,27 @@ vector<int> PhyloTree::getGenusNodes()     {
        }
 }
 /**************************************************************************************************/
+vector<int> PhyloTree::getGenusTotals()        {
+       try {
+       
+               if (calcTotals) {
+                       totals.clear();
+                       //reset counts because we are on a new word
+                       for (int j = 0; j < genusIndex.size(); j++) {
+                               totals.push_back(tree[genusIndex[j]].accessions.size());
+                       }
+                       return totals;
+               }else{
+                       return totals;
+               }
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloTree", "getGenusNodes");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
 
 void PhyloTree::assignHeirarchyIDs(int index){
        try {
@@ -266,15 +326,39 @@ void PhyloTree::print(int i, ofstream& out){
                        out <<tree[it->second].level << '\t' << tree[it->second].heirarchyID << '\t' << tree[it->second].name << '\t' << tree[it->second].children.size() << '\t' << tree[it->second].accessions.size() << endl;
                        print(it->second, out);
                }
+       }
+       catch(exception& e) {
+               m->errorOut(e, "PhyloTree", "print");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+void PhyloTree::printTreeNodes(string treefilename) {
+       try {
+               ofstream outTree;
+               openOutputFile(treefilename, outTree);
+               
+               //print treenodes
+               outTree << tree.size() << endl;
+               for (int i = 0; i < tree.size(); i++) {
+                       outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
+               }
+               
+               //print genus nodes
+               outTree << endl << uniqueTaxonomies.size() << endl;
+               map<int, int>::iterator it2;
+               for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) {  outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl;  }
+               outTree << endl;
                
                
+               outTree.close();
+               
        }
        catch(exception& e) {
-               m->errorOut(e, "PhyloTree", "print");
+               m->errorOut(e, "PhyloTree", "printTreeNodes");
                exit(1);
        }
 }
-
 /**************************************************************************************************/
 
 
index 75abffcf8879898910745343d045b56c619569b9..b65a651b7a864f45ee04617ef447f344005a3aba 100644 (file)
@@ -32,11 +32,14 @@ class PhyloTree {
 public:
        PhyloTree();
        PhyloTree(string);  //pass it a taxonomy file and it makes the tree
+       PhyloTree(ifstream&);  //pass it a taxonomy file and it makes the train.tree
        ~PhyloTree() {};
        int addSeqToTree(string, string);
        void assignHeirarchyIDs(int);
        void print(ofstream&);
+       void printTreeNodes(string); //used by bayesian to save time
        vector<int> getGenusNodes();
+       vector<int> getGenusTotals();   
        void binUnclassified();
                
        TaxNode get(int i)                              {       return tree[i];                                                 }
@@ -45,17 +48,20 @@ public:
        string getName(int i)                   {       return tree[i].name;                                    }
        string getFullTaxonomy(string);  //pass a sequence name return taxonomy
        int getMaxLevel()                               {       return maxLevel;                                                }
+       int getNumSeqs()  {  return numSeqs;  }
        
 private:
        string getNextTaxon(string&);
        vector<TaxNode> tree;
        vector<int> genusIndex; //holds the indexes in tree where the genus level taxonomies are stored
+       vector<int> totals; //holds the numSeqs at each genus level taxonomy
        map<string, int> name2Taxonomy;  //maps name to index in tree
        map<int, int> uniqueTaxonomies;  //map of unique taxonomies
        void print(int, ofstream&);
        int numNodes;
        int numSeqs;
        int maxLevel;
+       bool calcTotals;
        MothurOut* m;
 };
 
diff --git a/rawtrainingdatamaker.cpp b/rawtrainingdatamaker.cpp
new file mode 100644 (file)
index 0000000..362cdb7
--- /dev/null
@@ -0,0 +1,188 @@
+/*
+ *  rawTrainingDataMaker.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 4/21/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "rawtrainingdatamaker.h"
+
+/**************************************************************************************************/
+
+RawTrainingDataMaker::RawTrainingDataMaker(){
+       try {
+               m = MothurOut::getInstance();
+               numNodes = 1;
+               numSeqs = 0;
+               tree.push_back(rawTaxNode("Root"));
+               tree[0].rank = "Root";
+               maxLevel = 0;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+RawTrainingDataMaker::RawTrainingDataMaker(string tfile){
+       try {
+               m = MothurOut::getInstance();
+               numNodes = 1;
+               numSeqs = 0;
+               tree.push_back(rawTaxNode("Root"));
+               tree[0].rank = "Root";
+               maxLevel = 0;
+               
+               ifstream in;
+               openInputFile(tfile, in);
+               
+               //read in users taxonomy file and add sequences to tree
+               string name, tax;
+               while(!in.eof()){
+                       in >> name >> tax; gobble(in);
+                       
+                       addSeqToTree(name, tax);
+               }
+               in.close();
+       
+               assignRank(0);
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+string RawTrainingDataMaker::getNextTaxon(string& heirarchy){
+       try {
+               string currentLevel = "";
+               if(heirarchy != ""){
+                       int pos = heirarchy.find_first_of(';');
+                       currentLevel=heirarchy.substr(0,pos);
+                       if (pos != (heirarchy.length()-1)) {  heirarchy=heirarchy.substr(pos+1);  }
+                       else { heirarchy = ""; }
+               }
+               return currentLevel;
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RawTrainingDataMaker", "getNextTaxon");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+int RawTrainingDataMaker::addSeqToTree(string seqName, string seqTaxonomy){
+       try {
+               numSeqs++;
+               
+               map<string, int>::iterator childPointer;
+               
+               int currentNode = 0;
+               string taxon;
+               
+               while(seqTaxonomy != ""){
+                       
+                       if (m->control_pressed) { return 0; }
+                       
+                       //somehow the parent is getting one too many accnos
+                       //use print to reassign the taxa id
+                       taxon = getNextTaxon(seqTaxonomy);
+                       
+                       childPointer = tree[currentNode].children.find(taxon);
+                       
+                       if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
+                               currentNode = childPointer->second;
+                       }else{                                                                                  //otherwise, create it
+                               tree.push_back(rawTaxNode(taxon));
+                               numNodes++;
+                               tree[currentNode].children[taxon] = numNodes-1;
+                               tree[numNodes-1].parent = currentNode;
+                               
+                               currentNode = tree[currentNode].children[taxon];
+                       }
+               }
+
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RawTrainingDataMaker", "addSeqToTree");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+void RawTrainingDataMaker::assignRank(int index){
+       try {
+               map<string,int>::iterator it;
+                               
+               string ranks[9] = { "Root","Domain","Kingdom","Phylum","Class","Order","Family","Genus","Species" };
+               
+               for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
+                       tree[it->second].level = tree[index].level + 1;
+                       
+                       if (tree[it->second].level > 8) { 
+                               tree[it->second].rank = ("unknown" + toString(tree[it->second].level));
+                       }else {
+                               tree[it->second].rank = ranks[tree[it->second].level];
+                       }
+                                               
+                       //save maxLevel for binning the unclassified seqs
+                       if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } 
+                       
+                       assignRank(it->second);
+               }
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RawTrainingDataMaker", "assignRank");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+void RawTrainingDataMaker::print(ofstream& out){
+       try {
+               //string temp = tree[0].name +" " + tree[0].rank;
+               //sanityCheck[temp] = temp;
+               
+               out << "0" << "*" << tree[0].name << "*" << tree[0].parent << "*" << tree[0].level << "*" << tree[0].rank << endl;
+               print(0, out);
+               
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RawTrainingDataMaker", "print");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
+
+void RawTrainingDataMaker::print(int i, ofstream& out){
+       try {
+               map<string,int>::iterator it;
+               for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
+                       //string temp = tree[it->second].name + " " + tree[it->second].rank;
+                       
+                       //map<string, string>::iterator itSan;
+                       //itSan = sanityCheck.find(temp);
+                       
+                       //if (itSan == sanityCheck.end()) {
+                               out << it->second << "*" << tree[it->second].name << "*" << tree[it->second].parent << "*" << tree[it->second].level << "*" << tree[it->second].rank << endl;
+                               //sanityCheck[temp] = temp;
+                       //}
+                       print(it->second, out);
+               }
+       }
+       catch(exception& e) {
+               m->errorOut(e, "RawTrainingDataMaker", "print");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+
+       
diff --git a/rawtrainingdatamaker.h b/rawtrainingdatamaker.h
new file mode 100644 (file)
index 0000000..912d781
--- /dev/null
@@ -0,0 +1,54 @@
+#ifndef RAWTRAININGDATAMAKER_H
+#define RAWTRAININGDATAMAKER_H
+
+/*
+ *  rawTrainingDataMaker.h
+ *  Mothur
+ *
+ *  Created by westcott on 4/21/10.
+ *  Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "mothurout.h"
+
+/**************************************************************************************************/
+
+struct rawTaxNode {
+       map<string, int> children;  //childs name to index in tree
+       int parent, level;
+       string name, rank;
+       
+       rawTaxNode(string n) : name(n), level(0), parent(-1) {          }
+       rawTaxNode(){}
+};
+
+/**************************************************************************************************/
+
+class RawTrainingDataMaker {
+
+public:
+       RawTrainingDataMaker();
+       RawTrainingDataMaker(string);  //pass it a taxonomy file and it makes the tree
+       ~RawTrainingDataMaker() {};
+       int addSeqToTree(string, string);
+       void assignRank(int);
+       void print(ofstream&);
+       
+private:
+       string getNextTaxon(string&);
+       vector<rawTaxNode> tree;
+       void print(int, ofstream&);
+       int numNodes;
+       int numSeqs;
+       int maxLevel;
+       //map<string, string> sanityCheck;
+       MothurOut* m;
+};
+
+/**************************************************************************************************/
+
+#endif
+
+
index bcecb7844e8a0348180884fbdb54993c286c53ac..7f60d712861f34a79b798e0ef8d1a3cbcbe714a5 100644 (file)
@@ -175,7 +175,7 @@ void ReadDistCommand::help(){
                m->mothurOut("For this use the read.dist command should be in the following format: \n");
                m->mothurOut("read.dist(phylip=yourDistFile, name=yourNameFile, cutoff=yourCutoff, precision=yourPrecision) \n");
                m->mothurOut("The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n");
-               m->mothurOut("The sim parameter is used to indicate that your distance file contains similiarity values instead of distance values. The default is false, if sim=true then mothur will convert the similairity values to distances. \n");
+               m->mothurOut("The sim parameter is used to indicate that your distance file contains similarity values instead of distance values. The default is false, if sim=true then mothur will convert the similarity values to distances. \n");
                m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
                m->mothurOut("The second way to use the read.dist command is to read a phylip or column and a group, so you can use the libshuff command.\n");
                m->mothurOut("For this use the read.dist command should be in the following format: \n");