From 6f4b9401f7deb8aaf0d87659298308f4138cc3b0 Mon Sep 17 00:00:00 2001 From: westcott Date: Mon, 26 Apr 2010 15:05:21 +0000 Subject: [PATCH] made classifier faster --- Mothur.xcodeproj/project.pbxproj | 4 + bayesian.cpp | 78 +++-- bellerophon.cpp | 25 +- chimerabellerophoncommand.cpp | 2 +- chimerapintailcommand.cpp | 7 +- chimeraslayercommand.cpp | 4 +- classify.cpp | 550 ++++++++++++++++--------------- classify.h | 8 +- classifyseqscommand.cpp | 8 +- getoturepcommand.cpp | 4 +- knn.cpp | 12 +- makefile | 16 +- phylotree.cpp | 88 ++++- phylotree.h | 6 + rawtrainingdatamaker.cpp | 188 +++++++++++ rawtrainingdatamaker.h | 54 +++ readdistcommand.cpp | 2 +- 17 files changed, 728 insertions(+), 328 deletions(-) create mode 100644 rawtrainingdatamaker.cpp create mode 100644 rawtrainingdatamaker.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 79c1e5b..805bde0 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -12,6 +12,8 @@ A747E81C116365E000FB9042 /* chimeraslayercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayercommand.h; sourceTree = ""; }; A747E81D116365E000FB9042 /* chimeraslayercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayercommand.cpp; sourceTree = ""; }; A7639F8D1175DF35008F5578 /* makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = makefile; sourceTree = ""; }; + A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rawtrainingdatamaker.h; sourceTree = SOURCE_ROOT; }; + A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rawtrainingdatamaker.cpp; sourceTree = SOURCE_ROOT; }; A78254461164D7790002E2DD /* chimerapintailcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerapintailcommand.h; sourceTree = ""; }; A78254471164D7790002E2DD /* chimerapintailcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerapintailcommand.cpp; sourceTree = ""; }; A7825502116519F70002E2DD /* chimerabellerophoncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerabellerophoncommand.h; sourceTree = ""; }; @@ -829,6 +831,8 @@ A7DA208A113FECD400BF472F /* knn.h */, A7DA20C4113FECD400BF472F /* phylotree.cpp */, A7DA20C5113FECD400BF472F /* phylotree.h */, + A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */, + A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */, A7DA215C113FECD400BF472F /* taxonomyequalizer.cpp */, A7DA215D113FECD400BF472F /* taxonomyequalizer.h */, ); diff --git a/bayesian.cpp b/bayesian.cpp index b007c00..0929749 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -9,29 +9,17 @@ #include "bayesian.h" #include "kmer.hpp" +#include "rawtrainingdatamaker.h" /**************************************************************************************************/ Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) : -Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { +Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { try { - numKmers = database->getMaxKmer() + 1; - - //initialze probabilities - wordGenusProb.resize(numKmers); - - genusNodes = phyloTree->getGenusNodes(); - - for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); } - - //reset counts because we are on a new word - for (int j = 0; j < genusNodes.size(); j++) { - TaxNode temp = phyloTree->get(genusNodes[j]); - genusTotals.push_back(temp.accessions.size()); - } - - /************calculate the probablity that each word will be in a specific taxonomy*************/ + string phyloTreeName = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.train"; + ifstream phyloTreeTest(phyloTreeName.c_str()); + ofstream out; string probFileName = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.prob"; ifstream probFileTest(probFileName.c_str()); @@ -42,15 +30,47 @@ Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), c int start = time(NULL); - if(probFileTest && probFileTest2){ + if(probFileTest && probFileTest2 && phyloTreeTest){ + m->mothurOut("Reading template taxonomy... "); cout.flush(); + + phyloTree = new PhyloTree(phyloTreeTest); + + m->mothurOut("DONE."); m->mothurOutEndLine(); + + genusNodes = phyloTree->getGenusNodes(); + genusTotals = phyloTree->getGenusTotals(); + m->mothurOut("Reading template probabilities... "); cout.flush(); readProbFile(probFileTest, probFileTest2); + }else{ + + //create search database and names vector + generateDatabaseAndNames(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0); + + genusNodes = phyloTree->getGenusNodes(); + genusTotals = phyloTree->getGenusTotals(); + + m->mothurOut("Calculating template taxonomy tree... "); cout.flush(); + + phyloTree->printTreeNodes(phyloTreeName); + + m->mothurOut("DONE."); m->mothurOutEndLine(); + m->mothurOut("Calculating template probabilities... "); cout.flush(); + + numKmers = database->getMaxKmer() + 1; + + //initialze probabilities + wordGenusProb.resize(numKmers); + + for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); } ofstream out; openOutputFile(probFileName, out); + out << numKmers << endl; + ofstream out2; openOutputFile(probFileName2, out2); @@ -86,9 +106,14 @@ Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), c out.close(); out2.close(); + + //read in new phylotree with less info. - its faster + ifstream phyloTreeTest(phyloTreeName.c_str()); + delete phyloTree; + + phyloTree = new PhyloTree(phyloTreeTest); } - - + m->mothurOut("DONE."); m->mothurOutEndLine(); m->mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); m->mothurOutEndLine(); } @@ -113,14 +138,12 @@ string Bayesian::getTaxonomy(Sequence* seq) { for (int i = 0; i < queryKmerString.length(); i++) { if (queryKmerString[i] != '!') { //this kmer is in the query queryKmers.push_back(i); - } } if (queryKmers.size() == 0) { m->mothurOut(seq->getName() + "is bad."); m->mothurOutEndLine(); return "bad seq"; } int index = getMostProbableTaxonomy(queryKmers); - if (m->control_pressed) { return tax; } @@ -272,18 +295,25 @@ map Bayesian::parseTaxMap(string newTax) { void Bayesian::readProbFile(ifstream& in, ifstream& inNum) { try{ + in >> numKmers; gobble(in); + + //initialze probabilities + wordGenusProb.resize(numKmers); + + for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); } + int kmer, name, count; count = 0; vector num; num.resize(numKmers); float prob; vector zeroCountProb; zeroCountProb.resize(numKmers); - + while (inNum) { inNum >> zeroCountProb[count] >> num[count]; count++; gobble(inNum); } inNum.close(); - + while(in) { in >> kmer; diff --git a/bellerophon.cpp b/bellerophon.cpp index e33230e..54025eb 100644 --- a/bellerophon.cpp +++ b/bellerophon.cpp @@ -132,7 +132,7 @@ int Bellerophon::print(ostream& out, ostream& outAcc) { //*************************************************************************************************************** int Bellerophon::print(MPI_File& out, MPI_File& outAcc) { try { - + int pid; MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are @@ -148,17 +148,26 @@ int Bellerophon::print(MPI_File& out, MPI_File& outAcc) { if (m->control_pressed) { return numSeqs; } - outString += "Name\tScore\tLeft\tRight\n"; + outString = "Name\tScore\tLeft\tRight\n"; + MPI_Status status; + int olength = outString.length(); + char* buf5 = new char[olength]; + memcpy(buf5, outString.c_str(), olength); + + MPI_File_write_shared(out, buf5, olength, MPI_CHAR, &status); + + delete buf5; + //output prefenence structure to .chimeras file for (int i = 0; i < best.size(); i++) { if (m->control_pressed) { return numSeqs; } - outString += best[i].name + "\t" + toString(best[i].score) + "\t" + best[i].leftParent + "\t" + best[i].rightParent + "\n"; - + outString = best[i].name + "\t" + toString(best[i].score) + "\t" + best[i].leftParent + "\t" + best[i].rightParent + "\n"; + MPI_Status status; int length = outString.length(); - char* buf2 = new char[length]; + char* buf2 = new char[length]; memcpy(buf2, outString.c_str(), length); MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status); @@ -168,12 +177,12 @@ int Bellerophon::print(MPI_File& out, MPI_File& outAcc) { //calc # of seqs with preference above 95%tile if (best[i].score >= cutoffScore) { above1++; - string outAccString; + string outAccString = ""; outAccString += best[i].name + "\n"; MPI_Status statusAcc; length = outAccString.length(); - char* buf = new char[length]; + char* buf = new char[length]; memcpy(buf, outAccString.c_str(), length); MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc); @@ -291,7 +300,7 @@ int Bellerophon::getChimeras() { if (m->control_pressed) { return 0; } int bestLength = MPIBestSend[i].length(); - char* buf = new char[bestLength]; + char* buf = new char[bestLength]; memcpy(buf, MPIBestSend[i].c_str(), bestLength); MPI_Send(&bestLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD); diff --git a/chimerabellerophoncommand.cpp b/chimerabellerophoncommand.cpp index 67f6b57..668956f 100644 --- a/chimerabellerophoncommand.cpp +++ b/chimerabellerophoncommand.cpp @@ -90,7 +90,7 @@ void ChimeraBellerophonCommand::help(){ m->mothurOut("The chimera.bellerophon command reads a fastafile and creates list of potentially chimeric sequences.\n"); m->mothurOut("The chimera.bellerophon command parameters are fasta, filter, correction, processors, window, increment. The fasta parameter is required.\n"); m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter, default=false. \n"); - m->mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.\n"); + m->mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs, default=true.\n"); m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); #ifdef USE_MPI m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n"); diff --git a/chimerapintailcommand.cpp b/chimerapintailcommand.cpp index 384c885..62f1f2b 100644 --- a/chimerapintailcommand.cpp +++ b/chimerapintailcommand.cpp @@ -147,7 +147,7 @@ void ChimeraPintailCommand::help(){ #ifdef USE_MPI m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n"); #endif - m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=1/4 sequence length. \n"); + m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=300. \n"); m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=25.\n"); m->mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n"); m->mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences, if you use the filter the quantile file generated becomes unique to the fasta file you used.\n"); @@ -175,12 +175,11 @@ int ChimeraPintailCommand::execute(){ int start = time(NULL); - chimera = new Pintail(fastafile, templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir); - //set user options if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine(); } - + chimera = new Pintail(fastafile, templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir); + string outputFileName, accnosFileName; if (maskfile != "") { outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + ".pintail.chimeras"; diff --git a/chimeraslayercommand.cpp b/chimeraslayercommand.cpp index 8bb8c2c..58aecf5 100644 --- a/chimeraslayercommand.cpp +++ b/chimeraslayercommand.cpp @@ -137,14 +137,13 @@ void ChimeraSlayerCommand::help(){ m->mothurOut("The chimera.slayer command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n"); m->mothurOut("This command was modeled after the chimeraSlayer written by the Broad Institute.\n"); - m->mothurOut("The chimera.slayer command parameters are fasta, template, filter, mask, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign, + m->mothurOut("The chimera.slayer command parameters are fasta, template, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign, m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n"); m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n"); m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n"); #ifdef USE_MPI m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n"); #endif - m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences. \n"); m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=50. \n"); m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=5.\n"); m->mothurOut("The numwanted parameter allows you to specify how many sequences you would each query sequence compared with, default=15.\n"); @@ -160,7 +159,6 @@ void ChimeraSlayerCommand::help(){ m->mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n"); m->mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. \n"); //m->mothurOut("The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default false. Found to make results worse. \n"); - m->mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n"); m->mothurOut("The chimera.slayer command should be in the following format: \n"); m->mothurOut("chimera.slayer(fasta=yourFastaFile, template=yourTemplate, search=yourSearch) \n"); m->mothurOut("Example: chimera.slayer(fasta=AD.align, template=core_set_aligned.imputed.fasta, search=kmer) \n"); diff --git a/classify.cpp b/classify.cpp index c3e52ab..be287a0 100644 --- a/classify.cpp +++ b/classify.cpp @@ -1,272 +1,278 @@ -/* - * classify.cpp - * Mothur - * - * Created by westcott on 11/3/09. - * Copyright 2009 Schloss Lab. All rights reserved. - * - */ - -#include "classify.h" -#include "sequence.hpp" -#include "kmerdb.hpp" -#include "suffixdb.hpp" -#include "blastdb.hpp" -#include "distancedb.hpp" - -/**************************************************************************************************/ -Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) { - try { - m = MothurOut::getInstance(); - readTaxonomy(taxFile); - - int start = time(NULL); - int numSeqs = 0; - - m->mothurOut("Generating search database... "); cout.flush(); -#ifdef USE_MPI - int pid; - vector positions; - - MPI_Status status; - MPI_File inMPI; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - - //char* inFileName = new char[tempFile.length()]; - //memcpy(inFileName, tempFile.c_str(), tempFile.length()); - - char inFileName[1024]; - strcpy(inFileName, tempFile.c_str()); - - MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer - //delete inFileName; - - if (pid == 0) { //only one process needs to scan file - positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs - - //send file positions to all processes - MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs - MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos - }else{ - MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs - positions.resize(numSeqs); - MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions - } - - //create database - if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); } - else if(method == "suffix") { database = new SuffixDB(numSeqs); } - else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); } - else if(method == "distance") { database = new DistanceDB(); } - else { - m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine(); - database = new KmerDB(tempFile, 8); - } - - //read file - for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } - delete buf4; - istringstream iss (tempBuf,istringstream::in); - - Sequence temp(iss); - if (temp.getName() != "") { - names.push_back(temp.getName()); - database->addSequence(temp); - } - } - - database->generateDB(); - MPI_File_close(&inMPI); - #else - - //need to know number of template seqs for suffixdb - if (method == "suffix") { - ifstream inFASTA; - openInputFile(tempFile, inFASTA); - numSeqs = count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); - inFASTA.close(); - } - - bool needToGenerate = true; - string kmerDBName; - if(method == "kmer") { - database = new KmerDB(tempFile, kmerSize); - - kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; - ifstream kmerFileTest(kmerDBName.c_str()); - if(kmerFileTest){ needToGenerate = false; } - } - else if(method == "suffix") { database = new SuffixDB(numSeqs); } - else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); } - else if(method == "distance") { database = new DistanceDB(); } - else { - m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); - m->mothurOutEndLine(); - database = new KmerDB(tempFile, 8); - } - - if (needToGenerate) { - ifstream fastaFile; - openInputFile(tempFile, fastaFile); - - while (!fastaFile.eof()) { - Sequence temp(fastaFile); - gobble(fastaFile); - - names.push_back(temp.getName()); - - database->addSequence(temp); - } - fastaFile.close(); - - database->generateDB(); - - }else if ((method == "kmer") && (!needToGenerate)) { - ifstream kmerFileTest(kmerDBName.c_str()); - database->readKmerDB(kmerFileTest); - - ifstream fastaFile; - openInputFile(tempFile, fastaFile); - - while (!fastaFile.eof()) { - Sequence temp(fastaFile); - gobble(fastaFile); - - names.push_back(temp.getName()); - } - fastaFile.close(); - } -#endif - database->setNumSeqs(names.size()); - - m->mothurOut("DONE."); m->mothurOutEndLine(); - m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine(); - - } - catch(exception& e) { - m->errorOut(e, "Classify", "Classify"); - exit(1); - } -} -/**************************************************************************************************/ - -void Classify::readTaxonomy(string file) { - try { - - phyloTree = new PhyloTree(); - string name, taxInfo; - - m->mothurOutEndLine(); - m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush(); - -#ifdef USE_MPI - int pid, num; - vector positions; - - MPI_Status status; - MPI_File inMPI; - MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are - - //char* inFileName = new char[file.length()]; - //memcpy(inFileName, file.c_str(), file.length()); - - char inFileName[1024]; - strcpy(inFileName, file.c_str()); - - MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer - //delete inFileName; - - if (pid == 0) { - positions = setFilePosEachLine(file, num); - - //send file positions to all processes - MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs - MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos - }else{ - MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs - positions.resize(num); - MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions - } - - //read file - for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } - delete buf4; - - istringstream iss (tempBuf,istringstream::in); - iss >> name >> taxInfo; - taxonomy[name] = taxInfo; - phyloTree->addSeqToTree(name, taxInfo); - } - - MPI_File_close(&inMPI); -#else - ifstream inTax; - openInputFile(file, inTax); - - //read template seqs and save - while (!inTax.eof()) { - inTax >> name >> taxInfo; - - taxonomy[name] = taxInfo; - - phyloTree->addSeqToTree(name, taxInfo); - - gobble(inTax); - } - inTax.close(); -#endif - - phyloTree->assignHeirarchyIDs(0); - - m->mothurOut("DONE."); - m->mothurOutEndLine(); cout.flush(); - - } - catch(exception& e) { - m->errorOut(e, "Classify", "readTaxonomy"); - exit(1); - } -} -/**************************************************************************************************/ - -vector Classify::parseTax(string tax) { - try { - vector taxons; - - tax = tax.substr(0, tax.length()-1); //get rid of last ';' - - //parse taxonomy - string individual; - while (tax.find_first_of(';') != -1) { - individual = tax.substr(0,tax.find_first_of(';')); - tax = tax.substr(tax.find_first_of(';')+1, tax.length()); - taxons.push_back(individual); - - } - //get last one - taxons.push_back(tax); - - return taxons; - } - catch(exception& e) { - m->errorOut(e, "Classify", "parseTax"); - exit(1); - } -} -/**************************************************************************************************/ - +/* + * classify.cpp + * Mothur + * + * Created by westcott on 11/3/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "classify.h" +#include "sequence.hpp" +#include "kmerdb.hpp" +#include "suffixdb.hpp" +#include "blastdb.hpp" +#include "distancedb.hpp" + +/**************************************************************************************************/ +void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) { + try { + taxFile = tfile; + readTaxonomy(taxFile); + + templateFile = tempFile; + + int start = time(NULL); + int numSeqs = 0; + + m->mothurOut("Generating search database... "); cout.flush(); +#ifdef USE_MPI + int pid; + vector positions; + + MPI_Status status; + MPI_File inMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + //char* inFileName = new char[tempFile.length()]; + //memcpy(inFileName, tempFile.c_str(), tempFile.length()); + + char inFileName[1024]; + strcpy(inFileName, tempFile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + //delete inFileName; + + if (pid == 0) { //only one process needs to scan file + positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs + + //send file positions to all processes + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + }else{ + MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + positions.resize(numSeqs); + MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + } + + //create database + if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); } + else if(method == "suffix") { database = new SuffixDB(numSeqs); } + else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); } + else if(method == "distance") { database = new DistanceDB(); } + else { + m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine(); + database = new KmerDB(tempFile, 8); + } + + //read file + for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } + delete buf4; + istringstream iss (tempBuf,istringstream::in); + + Sequence temp(iss); + if (temp.getName() != "") { + names.push_back(temp.getName()); + database->addSequence(temp); + } + } + + database->generateDB(); + MPI_File_close(&inMPI); + #else + + //need to know number of template seqs for suffixdb + if (method == "suffix") { + ifstream inFASTA; + openInputFile(tempFile, inFASTA); + numSeqs = count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + } + + bool needToGenerate = true; + string kmerDBName; + if(method == "kmer") { + database = new KmerDB(tempFile, kmerSize); + + kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + ifstream kmerFileTest(kmerDBName.c_str()); + if(kmerFileTest){ needToGenerate = false; } + } + else if(method == "suffix") { database = new SuffixDB(numSeqs); } + else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); } + else if(method == "distance") { database = new DistanceDB(); } + else { + m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); + m->mothurOutEndLine(); + database = new KmerDB(tempFile, 8); + } + + if (needToGenerate) { + ifstream fastaFile; + openInputFile(tempFile, fastaFile); + + while (!fastaFile.eof()) { + Sequence temp(fastaFile); + gobble(fastaFile); + + names.push_back(temp.getName()); + + database->addSequence(temp); + } + fastaFile.close(); + + database->generateDB(); + + }else if ((method == "kmer") && (!needToGenerate)) { + ifstream kmerFileTest(kmerDBName.c_str()); + database->readKmerDB(kmerFileTest); + + ifstream fastaFile; + openInputFile(tempFile, fastaFile); + + while (!fastaFile.eof()) { + Sequence temp(fastaFile); + gobble(fastaFile); + + names.push_back(temp.getName()); + } + fastaFile.close(); + } +#endif + database->setNumSeqs(names.size()); + + m->mothurOut("DONE."); m->mothurOutEndLine(); + m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine(); + + } + catch(exception& e) { + m->errorOut(e, "Classify", "generateDatabaseAndNames"); + exit(1); + } +} +/**************************************************************************************************/ +Classify::Classify() { m = MothurOut::getInstance(); database = NULL; } +/**************************************************************************************************/ + +int Classify::readTaxonomy(string file) { + try { + + phyloTree = new PhyloTree(); + string name, taxInfo; + + m->mothurOutEndLine(); + m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush(); + +#ifdef USE_MPI + int pid, num; + vector positions; + + MPI_Status status; + MPI_File inMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + //char* inFileName = new char[file.length()]; + //memcpy(inFileName, file.c_str(), file.length()); + + char inFileName[1024]; + strcpy(inFileName, file.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + //delete inFileName; + + if (pid == 0) { + positions = setFilePosEachLine(file, num); + + //send file positions to all processes + MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + }else{ + MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + positions.resize(num); + MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + } + + //read file + for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } + delete buf4; + + istringstream iss (tempBuf,istringstream::in); + iss >> name >> taxInfo; + taxonomy[name] = taxInfo; + phyloTree->addSeqToTree(name, taxInfo); + } + + MPI_File_close(&inMPI); +#else + ifstream inTax; + openInputFile(file, inTax); + + //read template seqs and save + while (!inTax.eof()) { + inTax >> name >> taxInfo; + + taxonomy[name] = taxInfo; + + phyloTree->addSeqToTree(name, taxInfo); + + gobble(inTax); + } + inTax.close(); +#endif + + phyloTree->assignHeirarchyIDs(0); + + m->mothurOut("DONE."); + m->mothurOutEndLine(); cout.flush(); + + return phyloTree->getNumSeqs(); + + } + catch(exception& e) { + m->errorOut(e, "Classify", "readTaxonomy"); + exit(1); + } +} +/**************************************************************************************************/ + +vector Classify::parseTax(string tax) { + try { + vector taxons; + + tax = tax.substr(0, tax.length()-1); //get rid of last ';' + + //parse taxonomy + string individual; + while (tax.find_first_of(';') != -1) { + individual = tax.substr(0,tax.find_first_of(';')); + tax = tax.substr(tax.find_first_of(';')+1, tax.length()); + taxons.push_back(individual); + + } + //get last one + taxons.push_back(tax); + + return taxons; + } + catch(exception& e) { + m->errorOut(e, "Classify", "parseTax"); + exit(1); + } +} +/**************************************************************************************************/ + diff --git a/classify.h b/classify.h index 1a34362..a2401dd 100644 --- a/classify.h +++ b/classify.h @@ -26,13 +26,14 @@ class Sequence; class Classify { public: - Classify(string, string, string, int, float, float, float, float); + Classify(); - virtual ~Classify(){ delete phyloTree; delete database; }; + virtual ~Classify(){ delete phyloTree; if (database != NULL) { delete database; } }; virtual string getTaxonomy(Sequence*) = 0; //virtual map getConfidenceScores() { return taxConfidenceScore; } //virtual vector parseTax(string); virtual string getSimpleTax() { return simpleTax; } + virtual void generateDatabaseAndNames(string, string, string, int, float, float, float, float); protected: @@ -46,9 +47,10 @@ protected: string taxFile, templateFile, simpleTax; vector names; - void readTaxonomy(string); + int readTaxonomy(string); vector parseTax(string); MothurOut* m; + }; /**************************************************************************************************/ diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index bed6580..a38bf8f 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -487,6 +487,10 @@ int ClassifySeqsCommand::execute(){ if (pid == 0) { //this part does not need to be paralellized #endif + m->mothurOutEndLine(); + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); + start = time(NULL); + //make taxonomy tree from new taxonomy file PhyloTree taxaBrowser; @@ -570,8 +574,8 @@ int ClassifySeqsCommand::execute(){ m->mothurOutEndLine(); - m->mothurOutEndLine(); - m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); + //m->mothurOutEndLine(); + //m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); } delete classify; diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp index e9373a9..2c62efd 100644 --- a/getoturepcommand.cpp +++ b/getoturepcommand.cpp @@ -327,7 +327,7 @@ int GetOTURepCommand::execute(){ if (m->control_pressed) { if (large) { inRow.close(); remove(distFile.c_str()); } - delete groupMap; delete fasta; return 0; + delete fasta; return 0; } //if user gave a namesfile then use it @@ -350,7 +350,7 @@ int GetOTURepCommand::execute(){ if (m->control_pressed) { if (large) { inRow.close(); remove(distFile.c_str()); } - delete groupMap; delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; + delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; } while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { diff --git a/knn.cpp b/knn.cpp index 93a7aa3..64b5b3a 100644 --- a/knn.cpp +++ b/knn.cpp @@ -11,14 +11,22 @@ /**************************************************************************************************/ Knn::Knn(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch, int n) -: Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch), num(n) {} +: Classify(), num(n) { + try { + //create search database and names vector + generateDatabaseAndNames(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch); + } + catch(exception& e) { + m->errorOut(e, "Knn", "Knn"); + exit(1); + } +} /**************************************************************************************************/ string Knn::getTaxonomy(Sequence* seq) { try { string tax; //use database to find closest seq - vector closest = database->findClosestSequences(seq, num); if (m->control_pressed) { return tax; } diff --git a/makefile b/makefile index d62f029..5812d45 100644 --- a/makefile +++ b/makefile @@ -26,7 +26,7 @@ ifeq ($(strip $(USEREADLINE)),yes) -L../readline-6.0 endif -USEMPI ?= yes +USEMPI ?= no ifeq ($(strip $(USEMPI)),yes) CC_OPTIONS += -DUSE_MPI @@ -219,7 +219,8 @@ mothur : \ ./parsesffcommand.o\ ./classify.o\ ./phylotree.o\ - ./bayesian.o\ + ./bayesian.o\ + ./rawtrainingdatamaker.o\ ./alignmentdb.o\ ./knn.o\ ./distancedb.o\ @@ -416,7 +417,8 @@ mothur : \ ./parsesffcommand.o\ ./classify.o\ ./phylotree.o\ - ./bayesian.o\ + ./bayesian.o\ + ./rawtrainingdatamaker.o\ ./alignmentdb.o\ ./knn.o\ ./distancedb.o\ @@ -616,7 +618,8 @@ clean : ./parsesffcommand.o\ ./classify.o\ ./phylotree.o\ - ./bayesian.o\ + ./bayesian.o\ + ./rawtrainingdatamaker.o\ ./alignmentdb.o\ ./knn.o\ ./distancedb.o\ @@ -1619,6 +1622,11 @@ install : mothur # Item # 196 -- chimerabellerophoncommand -- ./chimerabellerophoncommand.o : chimerabellerophoncommand.cpp $(CC) $(CC_OPTIONS) chimerabellerophoncommand.cpp -c $(INCLUDE) -o ./chimerabellerophoncommand.o + +# Item # 171 -- rawtrainingdatamaker -- +./rawtrainingdatamaker.o : rawtrainingdatamaker.cpp + $(CC) $(CC_OPTIONS) rawtrainingdatamaker.cpp -c $(INCLUDE) -o ./rawtrainingdatamaker.o + ##### END RUN #### diff --git a/phylotree.cpp b/phylotree.cpp index 3cdd6ac..33aedc6 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -19,6 +19,44 @@ PhyloTree::PhyloTree(){ tree.push_back(TaxNode("Root")); tree[0].heirarchyID = "0"; maxLevel = 0; + calcTotals = true; + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "PhyloTree"); + exit(1); + } +} +/**************************************************************************************************/ + +PhyloTree::PhyloTree(ifstream& in){ + try { + m = MothurOut::getInstance(); + calcTotals = false; + + in >> numNodes; gobble(in); + + tree.resize(numNodes); + + for (int i = 0; i < tree.size(); i++) { + in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in); + + } + + //read genus nodes + int numGenus = 0; + in >> numGenus; gobble(in); + + int gnode, gsize; + totals.clear(); + for (int i = 0; i < numGenus; i++) { + in >> gnode >> gsize; gobble(in); + + uniqueTaxonomies[gnode] = gnode; + totals.push_back(gsize); + } + + in.close(); + } catch(exception& e) { m->errorOut(e, "PhyloTree", "PhyloTree"); @@ -35,6 +73,7 @@ PhyloTree::PhyloTree(string tfile){ tree.push_back(TaxNode("Root")); tree[0].heirarchyID = "0"; maxLevel = 0; + calcTotals = true; ifstream in; openInputFile(tfile, in); @@ -149,6 +188,27 @@ vector PhyloTree::getGenusNodes() { } } /**************************************************************************************************/ +vector PhyloTree::getGenusTotals() { + try { + + if (calcTotals) { + totals.clear(); + //reset counts because we are on a new word + for (int j = 0; j < genusIndex.size(); j++) { + totals.push_back(tree[genusIndex[j]].accessions.size()); + } + return totals; + }else{ + return totals; + } + + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "getGenusNodes"); + exit(1); + } +} +/**************************************************************************************************/ void PhyloTree::assignHeirarchyIDs(int index){ try { @@ -266,15 +326,39 @@ void PhyloTree::print(int i, ofstream& out){ out <second].level << '\t' << tree[it->second].heirarchyID << '\t' << tree[it->second].name << '\t' << tree[it->second].children.size() << '\t' << tree[it->second].accessions.size() << endl; print(it->second, out); } + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "print"); + exit(1); + } +} +/**************************************************************************************************/ +void PhyloTree::printTreeNodes(string treefilename) { + try { + ofstream outTree; + openOutputFile(treefilename, outTree); + + //print treenodes + outTree << tree.size() << endl; + for (int i = 0; i < tree.size(); i++) { + outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl; + } + + //print genus nodes + outTree << endl << uniqueTaxonomies.size() << endl; + map::iterator it2; + for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl; } + outTree << endl; + outTree.close(); + } catch(exception& e) { - m->errorOut(e, "PhyloTree", "print"); + m->errorOut(e, "PhyloTree", "printTreeNodes"); exit(1); } } - /**************************************************************************************************/ diff --git a/phylotree.h b/phylotree.h index 75abffc..b65a651 100644 --- a/phylotree.h +++ b/phylotree.h @@ -32,11 +32,14 @@ class PhyloTree { public: PhyloTree(); PhyloTree(string); //pass it a taxonomy file and it makes the tree + PhyloTree(ifstream&); //pass it a taxonomy file and it makes the train.tree ~PhyloTree() {}; int addSeqToTree(string, string); void assignHeirarchyIDs(int); void print(ofstream&); + void printTreeNodes(string); //used by bayesian to save time vector getGenusNodes(); + vector getGenusTotals(); void binUnclassified(); TaxNode get(int i) { return tree[i]; } @@ -45,17 +48,20 @@ public: string getName(int i) { return tree[i].name; } string getFullTaxonomy(string); //pass a sequence name return taxonomy int getMaxLevel() { return maxLevel; } + int getNumSeqs() { return numSeqs; } private: string getNextTaxon(string&); vector tree; vector genusIndex; //holds the indexes in tree where the genus level taxonomies are stored + vector totals; //holds the numSeqs at each genus level taxonomy map name2Taxonomy; //maps name to index in tree map uniqueTaxonomies; //map of unique taxonomies void print(int, ofstream&); int numNodes; int numSeqs; int maxLevel; + bool calcTotals; MothurOut* m; }; diff --git a/rawtrainingdatamaker.cpp b/rawtrainingdatamaker.cpp new file mode 100644 index 0000000..362cdb7 --- /dev/null +++ b/rawtrainingdatamaker.cpp @@ -0,0 +1,188 @@ +/* + * rawTrainingDataMaker.cpp + * Mothur + * + * Created by westcott on 4/21/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "rawtrainingdatamaker.h" + +/**************************************************************************************************/ + +RawTrainingDataMaker::RawTrainingDataMaker(){ + try { + m = MothurOut::getInstance(); + numNodes = 1; + numSeqs = 0; + tree.push_back(rawTaxNode("Root")); + tree[0].rank = "Root"; + maxLevel = 0; + } + catch(exception& e) { + m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker"); + exit(1); + } +} +/**************************************************************************************************/ + +RawTrainingDataMaker::RawTrainingDataMaker(string tfile){ + try { + m = MothurOut::getInstance(); + numNodes = 1; + numSeqs = 0; + tree.push_back(rawTaxNode("Root")); + tree[0].rank = "Root"; + maxLevel = 0; + + ifstream in; + openInputFile(tfile, in); + + //read in users taxonomy file and add sequences to tree + string name, tax; + while(!in.eof()){ + in >> name >> tax; gobble(in); + + addSeqToTree(name, tax); + } + in.close(); + + assignRank(0); + } + catch(exception& e) { + m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker"); + exit(1); + } +} + +/**************************************************************************************************/ + +string RawTrainingDataMaker::getNextTaxon(string& heirarchy){ + try { + string currentLevel = ""; + if(heirarchy != ""){ + int pos = heirarchy.find_first_of(';'); + currentLevel=heirarchy.substr(0,pos); + if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); } + else { heirarchy = ""; } + } + return currentLevel; + } + catch(exception& e) { + m->errorOut(e, "RawTrainingDataMaker", "getNextTaxon"); + exit(1); + } +} + +/**************************************************************************************************/ + +int RawTrainingDataMaker::addSeqToTree(string seqName, string seqTaxonomy){ + try { + numSeqs++; + + map::iterator childPointer; + + int currentNode = 0; + string taxon; + + while(seqTaxonomy != ""){ + + if (m->control_pressed) { return 0; } + + //somehow the parent is getting one too many accnos + //use print to reassign the taxa id + taxon = getNextTaxon(seqTaxonomy); + + childPointer = tree[currentNode].children.find(taxon); + + if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on + currentNode = childPointer->second; + }else{ //otherwise, create it + tree.push_back(rawTaxNode(taxon)); + numNodes++; + tree[currentNode].children[taxon] = numNodes-1; + tree[numNodes-1].parent = currentNode; + + currentNode = tree[currentNode].children[taxon]; + } + } + + } + catch(exception& e) { + m->errorOut(e, "RawTrainingDataMaker", "addSeqToTree"); + exit(1); + } +} +/**************************************************************************************************/ + +void RawTrainingDataMaker::assignRank(int index){ + try { + map::iterator it; + + string ranks[9] = { "Root","Domain","Kingdom","Phylum","Class","Order","Family","Genus","Species" }; + + for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ + tree[it->second].level = tree[index].level + 1; + + if (tree[it->second].level > 8) { + tree[it->second].rank = ("unknown" + toString(tree[it->second].level)); + }else { + tree[it->second].rank = ranks[tree[it->second].level]; + } + + //save maxLevel for binning the unclassified seqs + if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } + + assignRank(it->second); + } + } + catch(exception& e) { + m->errorOut(e, "RawTrainingDataMaker", "assignRank"); + exit(1); + } +} +/**************************************************************************************************/ + +void RawTrainingDataMaker::print(ofstream& out){ + try { + //string temp = tree[0].name +" " + tree[0].rank; + //sanityCheck[temp] = temp; + + out << "0" << "*" << tree[0].name << "*" << tree[0].parent << "*" << tree[0].level << "*" << tree[0].rank << endl; + print(0, out); + + } + catch(exception& e) { + m->errorOut(e, "RawTrainingDataMaker", "print"); + exit(1); + } +} + +/**************************************************************************************************/ + +void RawTrainingDataMaker::print(int i, ofstream& out){ + try { + map::iterator it; + for(it=tree[i].children.begin();it!=tree[i].children.end();it++){ + //string temp = tree[it->second].name + " " + tree[it->second].rank; + + //map::iterator itSan; + //itSan = sanityCheck.find(temp); + + //if (itSan == sanityCheck.end()) { + out << it->second << "*" << tree[it->second].name << "*" << tree[it->second].parent << "*" << tree[it->second].level << "*" << tree[it->second].rank << endl; + //sanityCheck[temp] = temp; + //} + print(it->second, out); + } + } + catch(exception& e) { + m->errorOut(e, "RawTrainingDataMaker", "print"); + exit(1); + } +} +/**************************************************************************************************/ + + + diff --git a/rawtrainingdatamaker.h b/rawtrainingdatamaker.h new file mode 100644 index 0000000..912d781 --- /dev/null +++ b/rawtrainingdatamaker.h @@ -0,0 +1,54 @@ +#ifndef RAWTRAININGDATAMAKER_H +#define RAWTRAININGDATAMAKER_H + +/* + * rawTrainingDataMaker.h + * Mothur + * + * Created by westcott on 4/21/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "mothurout.h" + +/**************************************************************************************************/ + +struct rawTaxNode { + map children; //childs name to index in tree + int parent, level; + string name, rank; + + rawTaxNode(string n) : name(n), level(0), parent(-1) { } + rawTaxNode(){} +}; + +/**************************************************************************************************/ + +class RawTrainingDataMaker { + +public: + RawTrainingDataMaker(); + RawTrainingDataMaker(string); //pass it a taxonomy file and it makes the tree + ~RawTrainingDataMaker() {}; + int addSeqToTree(string, string); + void assignRank(int); + void print(ofstream&); + +private: + string getNextTaxon(string&); + vector tree; + void print(int, ofstream&); + int numNodes; + int numSeqs; + int maxLevel; + //map sanityCheck; + MothurOut* m; +}; + +/**************************************************************************************************/ + +#endif + + diff --git a/readdistcommand.cpp b/readdistcommand.cpp index bcecb78..7f60d71 100644 --- a/readdistcommand.cpp +++ b/readdistcommand.cpp @@ -175,7 +175,7 @@ void ReadDistCommand::help(){ m->mothurOut("For this use the read.dist command should be in the following format: \n"); m->mothurOut("read.dist(phylip=yourDistFile, name=yourNameFile, cutoff=yourCutoff, precision=yourPrecision) \n"); m->mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n"); - m->mothurOut("The sim parameter is used to indicate that your distance file contains similiarity values instead of distance values. The default is false, if sim=true then mothur will convert the similairity values to distances. \n"); + m->mothurOut("The sim parameter is used to indicate that your distance file contains similarity values instead of distance values. The default is false, if sim=true then mothur will convert the similarity values to distances. \n"); m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n"); m->mothurOut("The second way to use the read.dist command is to read a phylip or column and a group, so you can use the libshuff command.\n"); m->mothurOut("For this use the read.dist command should be in the following format: \n"); -- 2.39.2