]> git.donarmstrong.com Git - mothur.git/blobdiff - bayesian.cpp
working on windows paralellization, added trimOligos class to be used by trim.flows...
[mothur.git] / bayesian.cpp
index cf55d5c585ee2c7c2ca6534f7192996f2ef34d02..d62bb23e3e23ae1c98b3757ddd4075b4edf5a806 100644 (file)
 #include "bayesian.h"
 #include "kmer.hpp"
 #include "phylosummary.h"
-
+#include "referencedb.h"
 /**************************************************************************************************/
-Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) : 
+Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i, int tid) : 
 Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
        try {
+               ReferenceDB* rdb = ReferenceDB::getInstance();
+               
+               threadID = tid;
+               string baseName = tempFile;
+                       
+               if (baseName == "saved") { baseName = rdb->getSavedReference(); }
+               
+               string baseTName = tfile;
+               if (baseTName == "saved") { baseTName = rdb->getSavedTaxonomy(); }
                
                /************calculate the probablity that each word will be in a specific taxonomy*************/
-               string tfileroot = tfile.substr(0,tfile.find_last_of(".")+1);
-               string tempfileroot = m->getRootName(m->getSimpleName(tempFile));
+               string tfileroot = baseTName.substr(0,baseTName.find_last_of(".")+1);
+               string tempfileroot = m->getRootName(m->getSimpleName(baseName));
                string phyloTreeName = tfileroot + "tree.train";
                string phyloTreeSumName = tfileroot + "tree.sum";
                string probFileName = tfileroot + tempfileroot + char('0'+ kmerSize) + "mer.prob";
@@ -40,7 +49,23 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
                        FilesGood = checkReleaseDate(probFileTest, probFileTest2, phyloTreeTest, probFileTest3);
                }
                
+               //if you want to save, but you dont need to calculate then just read
+               if (rdb->save && probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood && (tempFile != "saved")) {  
+                       ifstream saveIn;
+                       m->openInputFile(tempFile, saveIn);
+                       
+                       while (!saveIn.eof()) {
+                               Sequence temp(saveIn);
+                               m->gobble(saveIn);
+                               
+                               rdb->referenceSeqs.push_back(temp); 
+                       }
+                       saveIn.close();                 
+               }
+               
                if(probFileTest && probFileTest2 && phyloTreeTest && probFileTest3 && FilesGood){       
+                       if (tempFile == "saved") { m->mothurOutEndLine();  m->mothurOut("Using sequences from " + rdb->getSavedReference() + " that are saved in memory.");     m->mothurOutEndLine(); }
+                       
                        m->mothurOut("Reading template taxonomy...     "); cout.flush();
                        
                        phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
@@ -49,17 +74,24 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
                        
                        genusNodes = phyloTree->getGenusNodes(); 
                        genusTotals = phyloTree->getGenusTotals();
-               
-                       m->mothurOut("Reading template probabilities...     "); cout.flush();
-                       readProbFile(probFileTest, probFileTest2, probFileName, probFileName2); 
                        
+                       if (tfile == "saved") { 
+                               m->mothurOutEndLine();  m->mothurOut("Using probabilties from " + rdb->getSavedTaxonomy() + " that are saved in memory...    ");        cout.flush();; 
+                               wordGenusProb = rdb->wordGenusProb;
+                       }else {
+                               m->mothurOut("Reading template probabilities...     "); cout.flush();
+                               readProbFile(probFileTest, probFileTest2, probFileName, probFileName2);
+                       }       
+                       
+                       //save probabilities
+                       if (rdb->save) { rdb->wordGenusProb = wordGenusProb; }
                }else{
                
                        //create search database and names vector
                        generateDatabaseAndNames(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0);
                        
                        //prevents errors caused by creating shortcut files if you had an error in the sanity check.
-                       if (m->control_pressed) {  remove(phyloTreeName.c_str());  remove(probFileName.c_str()); remove(probFileName2.c_str()); }
+                       if (m->control_pressed) {  m->mothurRemove(phyloTreeName);  m->mothurRemove(probFileName); m->mothurRemove(probFileName2); }
                        else{ 
                                genusNodes = phyloTree->getGenusNodes(); 
                                genusTotals = phyloTree->getGenusTotals();
@@ -194,6 +226,9 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i)  {
                                delete phyloTree;
                                
                                phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
+                               
+                               //save probabilities
+                               if (rdb->save) { rdb->wordGenusProb = wordGenusProb; }
                        }
                }
        
@@ -411,8 +446,8 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string
                #ifdef USE_MPI
                        
                        int pid, num, num2, processors;
-                       vector<unsigned long int> positions;
-                       vector<unsigned long int> positions2;
+                       vector<unsigned long long> positions;
+                       vector<unsigned long long> positions2;
                        
                        MPI_Status status; 
                        MPI_File inMPI;
@@ -541,7 +576,7 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string
                        string line = m->getline(in); m->gobble(in);
                        
                        in >> numKmers; m->gobble(in);
-                       
+                       //cout << threadID << '\t' << line << '\t' << numKmers << &in << '\t' << &inNum << '\t' << genusNodes.size() << endl;
                        //initialze probabilities
                        wordGenusProb.resize(numKmers);
                        
@@ -554,22 +589,25 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string
                        
                        //read version
                        string line2 = m->getline(inNum); m->gobble(inNum);
-                       
+               //cout << threadID << '\t' << line2 << '\t' << this << endl;    
                        while (inNum) {
                                inNum >> zeroCountProb[count] >> num[count];  
                                count++;
                                m->gobble(inNum);
+                               //cout << threadID << '\t' << count << endl;
                        }
                        inNum.close();
-               
+               //cout << threadID << '\t' << "here1 " << &wordGenusProb << '\t' << &num << endl; //
+               //cout << threadID << '\t' << &genusTotals << '\t' << endl; 
+               //cout << threadID << '\t' << genusNodes.size() << endl;
                        while(in) {
                                in >> kmer;
-                               
+                       //cout << threadID << '\t' << kmer << endl;
                                //set them all to zero value
                                for (int i = 0; i < genusNodes.size(); i++) {
                                        wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
                                }
-                               
+                       //cout << threadID << '\t' << num[kmer] << "here" << endl;      
                                //get probs for nonzero values
                                for (int i = 0; i < num[kmer]; i++) {
                                        in >> name >> prob;
@@ -579,7 +617,7 @@ void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string
                                m->gobble(in);
                        }
                        in.close();
-                       
+               //cout << threadID << '\t' << "here" << endl;   
                #endif
        }
        catch(exception& e) {