A747E81C116365E000FB9042 /* chimeraslayercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayercommand.h; sourceTree = "<group>"; };
A747E81D116365E000FB9042 /* chimeraslayercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayercommand.cpp; sourceTree = "<group>"; };
A7639F8D1175DF35008F5578 /* makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = makefile; sourceTree = "<group>"; };
+ A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rawtrainingdatamaker.h; sourceTree = SOURCE_ROOT; };
+ A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rawtrainingdatamaker.cpp; sourceTree = SOURCE_ROOT; };
A78254461164D7790002E2DD /* chimerapintailcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerapintailcommand.h; sourceTree = "<group>"; };
A78254471164D7790002E2DD /* chimerapintailcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerapintailcommand.cpp; sourceTree = "<group>"; };
A7825502116519F70002E2DD /* chimerabellerophoncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerabellerophoncommand.h; sourceTree = "<group>"; };
A7DA208A113FECD400BF472F /* knn.h */,
A7DA20C4113FECD400BF472F /* phylotree.cpp */,
A7DA20C5113FECD400BF472F /* phylotree.h */,
+ A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */,
+ A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */,
A7DA215C113FECD400BF472F /* taxonomyequalizer.cpp */,
A7DA215D113FECD400BF472F /* taxonomyequalizer.h */,
);
#include "bayesian.h"
#include "kmer.hpp"
+#include "rawtrainingdatamaker.h"
/**************************************************************************************************/
Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) :
-Classify(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
+Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) {
try {
- numKmers = database->getMaxKmer() + 1;
-
- //initialze probabilities
- wordGenusProb.resize(numKmers);
-
- genusNodes = phyloTree->getGenusNodes();
-
- for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); }
-
- //reset counts because we are on a new word
- for (int j = 0; j < genusNodes.size(); j++) {
- TaxNode temp = phyloTree->get(genusNodes[j]);
- genusTotals.push_back(temp.accessions.size());
- }
-
-
/************calculate the probablity that each word will be in a specific taxonomy*************/
+ string phyloTreeName = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.train";
+ ifstream phyloTreeTest(phyloTreeName.c_str());
+
ofstream out;
string probFileName = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.prob";
ifstream probFileTest(probFileName.c_str());
int start = time(NULL);
- if(probFileTest && probFileTest2){
+ if(probFileTest && probFileTest2 && phyloTreeTest){
+ m->mothurOut("Reading template taxonomy... "); cout.flush();
+
+ phyloTree = new PhyloTree(phyloTreeTest);
+
+ m->mothurOut("DONE."); m->mothurOutEndLine();
+
+ genusNodes = phyloTree->getGenusNodes();
+ genusTotals = phyloTree->getGenusTotals();
+
m->mothurOut("Reading template probabilities... "); cout.flush();
readProbFile(probFileTest, probFileTest2);
+
}else{
+
+ //create search database and names vector
+ generateDatabaseAndNames(tfile, tempFile, method, ksize, 0.0, 0.0, 0.0, 0.0);
+
+ genusNodes = phyloTree->getGenusNodes();
+ genusTotals = phyloTree->getGenusTotals();
+
+ m->mothurOut("Calculating template taxonomy tree... "); cout.flush();
+
+ phyloTree->printTreeNodes(phyloTreeName);
+
+ m->mothurOut("DONE."); m->mothurOutEndLine();
+
m->mothurOut("Calculating template probabilities... "); cout.flush();
+
+ numKmers = database->getMaxKmer() + 1;
+
+ //initialze probabilities
+ wordGenusProb.resize(numKmers);
+
+ for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); }
ofstream out;
openOutputFile(probFileName, out);
+ out << numKmers << endl;
+
ofstream out2;
openOutputFile(probFileName2, out2);
out.close();
out2.close();
+
+ //read in new phylotree with less info. - its faster
+ ifstream phyloTreeTest(phyloTreeName.c_str());
+ delete phyloTree;
+
+ phyloTree = new PhyloTree(phyloTreeTest);
}
-
-
+
m->mothurOut("DONE."); m->mothurOutEndLine();
m->mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); m->mothurOutEndLine();
}
for (int i = 0; i < queryKmerString.length(); i++) {
if (queryKmerString[i] != '!') { //this kmer is in the query
queryKmers.push_back(i);
-
}
}
if (queryKmers.size() == 0) { m->mothurOut(seq->getName() + "is bad."); m->mothurOutEndLine(); return "bad seq"; }
int index = getMostProbableTaxonomy(queryKmers);
-
if (m->control_pressed) { return tax; }
void Bayesian::readProbFile(ifstream& in, ifstream& inNum) {
try{
+ in >> numKmers; gobble(in);
+
+ //initialze probabilities
+ wordGenusProb.resize(numKmers);
+
+ for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); }
+
int kmer, name, count; count = 0;
vector<int> num; num.resize(numKmers);
float prob;
vector<float> zeroCountProb; zeroCountProb.resize(numKmers);
-
+
while (inNum) {
inNum >> zeroCountProb[count] >> num[count];
count++;
gobble(inNum);
}
inNum.close();
-
+
while(in) {
in >> kmer;
//***************************************************************************************************************
int Bellerophon::print(MPI_File& out, MPI_File& outAcc) {
try {
-
+
int pid;
MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
if (m->control_pressed) { return numSeqs; }
- outString += "Name\tScore\tLeft\tRight\n";
+ outString = "Name\tScore\tLeft\tRight\n";
+ MPI_Status status;
+ int olength = outString.length();
+ char* buf5 = new char[olength];
+ memcpy(buf5, outString.c_str(), olength);
+
+ MPI_File_write_shared(out, buf5, olength, MPI_CHAR, &status);
+
+ delete buf5;
+
//output prefenence structure to .chimeras file
for (int i = 0; i < best.size(); i++) {
if (m->control_pressed) { return numSeqs; }
- outString += best[i].name + "\t" + toString(best[i].score) + "\t" + best[i].leftParent + "\t" + best[i].rightParent + "\n";
-
+ outString = best[i].name + "\t" + toString(best[i].score) + "\t" + best[i].leftParent + "\t" + best[i].rightParent + "\n";
+
MPI_Status status;
int length = outString.length();
- char* buf2 = new char[length];\r
+ char* buf2 = new char[length];
memcpy(buf2, outString.c_str(), length);
MPI_File_write_shared(out, buf2, length, MPI_CHAR, &status);
//calc # of seqs with preference above 95%tile
if (best[i].score >= cutoffScore) {
above1++;
- string outAccString;
+ string outAccString = "";
outAccString += best[i].name + "\n";
MPI_Status statusAcc;
length = outAccString.length();
- char* buf = new char[length];\r
+ char* buf = new char[length];
memcpy(buf, outAccString.c_str(), length);
MPI_File_write_shared(outAcc, buf, length, MPI_CHAR, &statusAcc);
if (m->control_pressed) { return 0; }
int bestLength = MPIBestSend[i].length();
- char* buf = new char[bestLength];\r
+ char* buf = new char[bestLength];
memcpy(buf, MPIBestSend[i].c_str(), bestLength);
MPI_Send(&bestLength, 1, MPI_INT, 0, 2001, MPI_COMM_WORLD);
m->mothurOut("The chimera.bellerophon command reads a fastafile and creates list of potentially chimeric sequences.\n");
m->mothurOut("The chimera.bellerophon command parameters are fasta, filter, correction, processors, window, increment. The fasta parameter is required.\n");
m->mothurOut("The filter parameter allows you to specify if you would like to apply a vertical and 50% soft filter, default=false. \n");
- m->mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs.\n");
+ m->mothurOut("The correction parameter allows you to put more emphasis on the distance between highly similar sequences and less emphasis on the differences between remote homologs, default=true.\n");
m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
#ifdef USE_MPI
m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
#ifdef USE_MPI
m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
#endif
- m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=1/4 sequence length. \n");
+ m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=300. \n");
m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=25.\n");
m->mothurOut("The conservation parameter allows you to enter a frequency file containing the highest bases frequency at each place in the alignment.\n");
m->mothurOut("The quantile parameter allows you to enter a file containing quantiles for a template files sequences, if you use the filter the quantile file generated becomes unique to the fasta file you used.\n");
int start = time(NULL);
- chimera = new Pintail(fastafile, templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir);
-
//set user options
if (maskfile == "default") { m->mothurOut("I am using the default 236627 EU009184.1 Shigella dysenteriae str. FBD013."); m->mothurOutEndLine(); }
-
+ chimera = new Pintail(fastafile, templatefile, filter, processors, maskfile, consfile, quanfile, window, increment, outputDir);
+
string outputFileName, accnosFileName;
if (maskfile != "") {
outputFileName = outputDir + getRootName(getSimpleName(fastafile)) + maskfile + ".pintail.chimeras";
m->mothurOut("The chimera.slayer command reads a fastafile and templatefile and outputs potentially chimeric sequences.\n");
m->mothurOut("This command was modeled after the chimeraSlayer written by the Broad Institute.\n");
- m->mothurOut("The chimera.slayer command parameters are fasta, template, filter, mask, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign,
+ m->mothurOut("The chimera.slayer command parameters are fasta, template, processors, ksize, window, match, mismatch, divergence. minsim, mincov, minbs, minsnp, parents, search, iters, increment and numwanted.\n"); //realign,
m->mothurOut("The fasta parameter allows you to enter the fasta file containing your potentially chimeric sequences, and is required. \n");
m->mothurOut("The template parameter allows you to enter a template file containing known non-chimeric sequences, and is required. \n");
m->mothurOut("The processors parameter allows you to specify how many processors you would like to use. The default is 1. \n");
#ifdef USE_MPI
m->mothurOut("When using MPI, the processors parameter is set to the number of MPI processes running. \n");
#endif
- m->mothurOut("The mask parameter allows you to specify a file containing one sequence you wish to use as a mask for the your sequences. \n");
m->mothurOut("The window parameter allows you to specify the window size for searching for chimeras, default=50. \n");
m->mothurOut("The increment parameter allows you to specify how far you move each window while finding chimeric sequences, default=5.\n");
m->mothurOut("The numwanted parameter allows you to specify how many sequences you would each query sequence compared with, default=15.\n");
m->mothurOut("The minsnp parameter allows you to specify percent of SNPs to sample on each side of breakpoint for computing bootstrap support (default: 10) \n");
m->mothurOut("The search parameter allows you to specify search method for finding the closest parent. Choices are distance, blast, and kmer, default distance. \n");
//m->mothurOut("The realign parameter allows you to realign the query to the potential parents. Choices are true or false, default false. Found to make results worse. \n");
- m->mothurOut("NOT ALL PARAMETERS ARE USED BY ALL METHODS. Please look below for method specifics.\n\n");
m->mothurOut("The chimera.slayer command should be in the following format: \n");
m->mothurOut("chimera.slayer(fasta=yourFastaFile, template=yourTemplate, search=yourSearch) \n");
m->mothurOut("Example: chimera.slayer(fasta=AD.align, template=core_set_aligned.imputed.fasta, search=kmer) \n");
-/*\r
- * classify.cpp\r
- * Mothur\r
- *\r
- * Created by westcott on 11/3/09.\r
- * Copyright 2009 Schloss Lab. All rights reserved.\r
- *\r
- */\r
-\r
-#include "classify.h"\r
-#include "sequence.hpp"\r
-#include "kmerdb.hpp"\r
-#include "suffixdb.hpp"\r
-#include "blastdb.hpp"\r
-#include "distancedb.hpp"\r
-\r
-/**************************************************************************************************/\r
-Classify::Classify(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) : taxFile(tfile), templateFile(tempFile) { \r
- try { \r
- m = MothurOut::getInstance(); \r
- readTaxonomy(taxFile); \r
- \r
- int start = time(NULL);\r
- int numSeqs = 0;\r
- \r
- m->mothurOut("Generating search database... "); cout.flush();\r
-#ifdef USE_MPI \r
- int pid;\r
- vector<long> positions;\r
- \r
- MPI_Status status; \r
- MPI_File inMPI;\r
- MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are\r
-\r
- //char* inFileName = new char[tempFile.length()];\r
- //memcpy(inFileName, tempFile.c_str(), tempFile.length());\r
- \r
- char inFileName[1024];\r
- strcpy(inFileName, tempFile.c_str());\r
-\r
- MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer\r
- //delete inFileName;\r
-\r
- if (pid == 0) { //only one process needs to scan file\r
- positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs\r
-\r
- //send file positions to all processes\r
- MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs\r
- MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos \r
- }else{\r
- MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs\r
- positions.resize(numSeqs);\r
- MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions\r
- }\r
- \r
- //create database\r
- if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); }\r
- else if(method == "suffix") { database = new SuffixDB(numSeqs); }\r
- else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }\r
- else if(method == "distance") { database = new DistanceDB(); }\r
- else {\r
- m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();\r
- database = new KmerDB(tempFile, 8);\r
- }\r
-\r
- //read file \r
- for(int i=0;i<numSeqs;i++){\r
- //read next sequence\r
- int length = positions[i+1] - positions[i];\r
- char* buf4 = new char[length];\r
- MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);\r
- \r
- string tempBuf = buf4;\r
- if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }\r
- delete buf4;\r
- istringstream iss (tempBuf,istringstream::in);\r
- \r
- Sequence temp(iss); \r
- if (temp.getName() != "") {\r
- names.push_back(temp.getName());\r
- database->addSequence(temp); \r
- }\r
- }\r
- \r
- database->generateDB();\r
- MPI_File_close(&inMPI);\r
- #else\r
- \r
- //need to know number of template seqs for suffixdb\r
- if (method == "suffix") {\r
- ifstream inFASTA;\r
- openInputFile(tempFile, inFASTA);\r
- numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');\r
- inFASTA.close();\r
- }\r
-\r
- bool needToGenerate = true;\r
- string kmerDBName;\r
- if(method == "kmer") { \r
- database = new KmerDB(tempFile, kmerSize); \r
- \r
- kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";\r
- ifstream kmerFileTest(kmerDBName.c_str());\r
- if(kmerFileTest){ needToGenerate = false; }\r
- }\r
- else if(method == "suffix") { database = new SuffixDB(numSeqs); }\r
- else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }\r
- else if(method == "distance") { database = new DistanceDB(); }\r
- else {\r
- m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");\r
- m->mothurOutEndLine();\r
- database = new KmerDB(tempFile, 8);\r
- }\r
- \r
- if (needToGenerate) {\r
- ifstream fastaFile;\r
- openInputFile(tempFile, fastaFile);\r
- \r
- while (!fastaFile.eof()) {\r
- Sequence temp(fastaFile);\r
- gobble(fastaFile);\r
- \r
- names.push_back(temp.getName());\r
- \r
- database->addSequence(temp); \r
- }\r
- fastaFile.close();\r
-\r
- database->generateDB();\r
- \r
- }else if ((method == "kmer") && (!needToGenerate)) { \r
- ifstream kmerFileTest(kmerDBName.c_str());\r
- database->readKmerDB(kmerFileTest); \r
- \r
- ifstream fastaFile;\r
- openInputFile(tempFile, fastaFile);\r
- \r
- while (!fastaFile.eof()) {\r
- Sequence temp(fastaFile);\r
- gobble(fastaFile);\r
- \r
- names.push_back(temp.getName());\r
- }\r
- fastaFile.close();\r
- }\r
-#endif \r
- database->setNumSeqs(names.size());\r
- \r
- m->mothurOut("DONE."); m->mothurOutEndLine();\r
- m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();\r
-\r
- }\r
- catch(exception& e) {\r
- m->errorOut(e, "Classify", "Classify");\r
- exit(1);\r
- }\r
-}\r
-/**************************************************************************************************/\r
-\r
-void Classify::readTaxonomy(string file) {\r
- try {\r
- \r
- phyloTree = new PhyloTree();\r
- string name, taxInfo;\r
- \r
- m->mothurOutEndLine();\r
- m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush();\r
-\r
-#ifdef USE_MPI \r
- int pid, num;\r
- vector<long> positions;\r
- \r
- MPI_Status status; \r
- MPI_File inMPI;\r
- MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are\r
-\r
- //char* inFileName = new char[file.length()];\r
- //memcpy(inFileName, file.c_str(), file.length());\r
- \r
- char inFileName[1024];\r
- strcpy(inFileName, file.c_str());\r
-\r
- MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer\r
- //delete inFileName;\r
-\r
- if (pid == 0) {\r
- positions = setFilePosEachLine(file, num);\r
- \r
- //send file positions to all processes\r
- MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs\r
- MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos \r
- }else{\r
- MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs\r
- positions.resize(num);\r
- MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions\r
- }\r
- \r
- //read file \r
- for(int i=0;i<num;i++){\r
- //read next sequence\r
- int length = positions[i+1] - positions[i];\r
- char* buf4 = new char[length];\r
-\r
- MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);\r
-\r
- string tempBuf = buf4;\r
- if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }\r
- delete buf4;\r
-\r
- istringstream iss (tempBuf,istringstream::in);\r
- iss >> name >> taxInfo;\r
- taxonomy[name] = taxInfo;\r
- phyloTree->addSeqToTree(name, taxInfo);\r
- }\r
- \r
- MPI_File_close(&inMPI);\r
-#else \r
- ifstream inTax;\r
- openInputFile(file, inTax);\r
- \r
- //read template seqs and save\r
- while (!inTax.eof()) {\r
- inTax >> name >> taxInfo;\r
- \r
- taxonomy[name] = taxInfo;\r
- \r
- phyloTree->addSeqToTree(name, taxInfo);\r
- \r
- gobble(inTax);\r
- }\r
- inTax.close();\r
-#endif \r
- \r
- phyloTree->assignHeirarchyIDs(0);\r
- \r
- m->mothurOut("DONE.");\r
- m->mothurOutEndLine(); cout.flush();\r
- \r
- }\r
- catch(exception& e) {\r
- m->errorOut(e, "Classify", "readTaxonomy");\r
- exit(1);\r
- }\r
-}\r
-/**************************************************************************************************/\r
-\r
-vector<string> Classify::parseTax(string tax) {\r
- try {\r
- vector<string> taxons;\r
- \r
- tax = tax.substr(0, tax.length()-1); //get rid of last ';'\r
- \r
- //parse taxonomy\r
- string individual;\r
- while (tax.find_first_of(';') != -1) {\r
- individual = tax.substr(0,tax.find_first_of(';'));\r
- tax = tax.substr(tax.find_first_of(';')+1, tax.length());\r
- taxons.push_back(individual);\r
- \r
- }\r
- //get last one\r
- taxons.push_back(tax);\r
- \r
- return taxons;\r
- }\r
- catch(exception& e) {\r
- m->errorOut(e, "Classify", "parseTax");\r
- exit(1);\r
- }\r
-}\r
-/**************************************************************************************************/\r
-\r
+/*
+ * classify.cpp
+ * Mothur
+ *
+ * Created by westcott on 11/3/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "classify.h"
+#include "sequence.hpp"
+#include "kmerdb.hpp"
+#include "suffixdb.hpp"
+#include "blastdb.hpp"
+#include "distancedb.hpp"
+
+/**************************************************************************************************/
+void Classify::generateDatabaseAndNames(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch) {
+ try {
+ taxFile = tfile;
+ readTaxonomy(taxFile);
+
+ templateFile = tempFile;
+
+ int start = time(NULL);
+ int numSeqs = 0;
+
+ m->mothurOut("Generating search database... "); cout.flush();
+#ifdef USE_MPI
+ int pid;
+ vector<long> positions;
+
+ MPI_Status status;
+ MPI_File inMPI;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+ //char* inFileName = new char[tempFile.length()];
+ //memcpy(inFileName, tempFile.c_str(), tempFile.length());
+
+ char inFileName[1024];
+ strcpy(inFileName, tempFile.c_str());
+
+ MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
+ //delete inFileName;
+
+ if (pid == 0) { //only one process needs to scan file
+ positions = setFilePosFasta(tempFile, numSeqs); //fills MPIPos, returns numSeqs
+
+ //send file positions to all processes
+ MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
+ MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
+ }else{
+ MPI_Bcast(&numSeqs, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+ positions.resize(numSeqs);
+ MPI_Bcast(&positions[0], (numSeqs+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+ }
+
+ //create database
+ if(method == "kmer") { database = new KmerDB(tempFile, kmerSize); }
+ else if(method == "suffix") { database = new SuffixDB(numSeqs); }
+ else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
+ else if(method == "distance") { database = new DistanceDB(); }
+ else {
+ m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); m->mothurOutEndLine();
+ database = new KmerDB(tempFile, 8);
+ }
+
+ //read file
+ for(int i=0;i<numSeqs;i++){
+ //read next sequence
+ int length = positions[i+1] - positions[i];
+ char* buf4 = new char[length];
+ MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+
+ string tempBuf = buf4;
+ if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+ delete buf4;
+ istringstream iss (tempBuf,istringstream::in);
+
+ Sequence temp(iss);
+ if (temp.getName() != "") {
+ names.push_back(temp.getName());
+ database->addSequence(temp);
+ }
+ }
+
+ database->generateDB();
+ MPI_File_close(&inMPI);
+ #else
+
+ //need to know number of template seqs for suffixdb
+ if (method == "suffix") {
+ ifstream inFASTA;
+ openInputFile(tempFile, inFASTA);
+ numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
+ inFASTA.close();
+ }
+
+ bool needToGenerate = true;
+ string kmerDBName;
+ if(method == "kmer") {
+ database = new KmerDB(tempFile, kmerSize);
+
+ kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+ ifstream kmerFileTest(kmerDBName.c_str());
+ if(kmerFileTest){ needToGenerate = false; }
+ }
+ else if(method == "suffix") { database = new SuffixDB(numSeqs); }
+ else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
+ else if(method == "distance") { database = new DistanceDB(); }
+ else {
+ m->mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
+ m->mothurOutEndLine();
+ database = new KmerDB(tempFile, 8);
+ }
+
+ if (needToGenerate) {
+ ifstream fastaFile;
+ openInputFile(tempFile, fastaFile);
+
+ while (!fastaFile.eof()) {
+ Sequence temp(fastaFile);
+ gobble(fastaFile);
+
+ names.push_back(temp.getName());
+
+ database->addSequence(temp);
+ }
+ fastaFile.close();
+
+ database->generateDB();
+
+ }else if ((method == "kmer") && (!needToGenerate)) {
+ ifstream kmerFileTest(kmerDBName.c_str());
+ database->readKmerDB(kmerFileTest);
+
+ ifstream fastaFile;
+ openInputFile(tempFile, fastaFile);
+
+ while (!fastaFile.eof()) {
+ Sequence temp(fastaFile);
+ gobble(fastaFile);
+
+ names.push_back(temp.getName());
+ }
+ fastaFile.close();
+ }
+#endif
+ database->setNumSeqs(names.size());
+
+ m->mothurOut("DONE."); m->mothurOutEndLine();
+ m->mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); m->mothurOutEndLine();
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Classify", "generateDatabaseAndNames");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+Classify::Classify() { m = MothurOut::getInstance(); database = NULL; }
+/**************************************************************************************************/
+
+int Classify::readTaxonomy(string file) {
+ try {
+
+ phyloTree = new PhyloTree();
+ string name, taxInfo;
+
+ m->mothurOutEndLine();
+ m->mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush();
+
+#ifdef USE_MPI
+ int pid, num;
+ vector<long> positions;
+
+ MPI_Status status;
+ MPI_File inMPI;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+ //char* inFileName = new char[file.length()];
+ //memcpy(inFileName, file.c_str(), file.length());
+
+ char inFileName[1024];
+ strcpy(inFileName, file.c_str());
+
+ MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
+ //delete inFileName;
+
+ if (pid == 0) {
+ positions = setFilePosEachLine(file, num);
+
+ //send file positions to all processes
+ MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
+ MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
+ }else{
+ MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+ positions.resize(num);
+ MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+ }
+
+ //read file
+ for(int i=0;i<num;i++){
+ //read next sequence
+ int length = positions[i+1] - positions[i];
+ char* buf4 = new char[length];
+
+ MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+
+ string tempBuf = buf4;
+ if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+ delete buf4;
+
+ istringstream iss (tempBuf,istringstream::in);
+ iss >> name >> taxInfo;
+ taxonomy[name] = taxInfo;
+ phyloTree->addSeqToTree(name, taxInfo);
+ }
+
+ MPI_File_close(&inMPI);
+#else
+ ifstream inTax;
+ openInputFile(file, inTax);
+
+ //read template seqs and save
+ while (!inTax.eof()) {
+ inTax >> name >> taxInfo;
+
+ taxonomy[name] = taxInfo;
+
+ phyloTree->addSeqToTree(name, taxInfo);
+
+ gobble(inTax);
+ }
+ inTax.close();
+#endif
+
+ phyloTree->assignHeirarchyIDs(0);
+
+ m->mothurOut("DONE.");
+ m->mothurOutEndLine(); cout.flush();
+
+ return phyloTree->getNumSeqs();
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Classify", "readTaxonomy");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+vector<string> Classify::parseTax(string tax) {
+ try {
+ vector<string> taxons;
+
+ tax = tax.substr(0, tax.length()-1); //get rid of last ';'
+
+ //parse taxonomy
+ string individual;
+ while (tax.find_first_of(';') != -1) {
+ individual = tax.substr(0,tax.find_first_of(';'));
+ tax = tax.substr(tax.find_first_of(';')+1, tax.length());
+ taxons.push_back(individual);
+
+ }
+ //get last one
+ taxons.push_back(tax);
+
+ return taxons;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Classify", "parseTax");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
class Classify {
public:
- Classify(string, string, string, int, float, float, float, float);
+ Classify();
- virtual ~Classify(){ delete phyloTree; delete database; };
+ virtual ~Classify(){ delete phyloTree; if (database != NULL) { delete database; } };
virtual string getTaxonomy(Sequence*) = 0;
//virtual map<string, int> getConfidenceScores() { return taxConfidenceScore; }
//virtual vector<string> parseTax(string);
virtual string getSimpleTax() { return simpleTax; }
+ virtual void generateDatabaseAndNames(string, string, string, int, float, float, float, float);
protected:
string taxFile, templateFile, simpleTax;
vector<string> names;
- void readTaxonomy(string);
+ int readTaxonomy(string);
vector<string> parseTax(string);
MothurOut* m;
+
};
/**************************************************************************************************/
if (pid == 0) { //this part does not need to be paralellized
#endif
+ m->mothurOutEndLine();
+ m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
+ start = time(NULL);
+
//make taxonomy tree from new taxonomy file
PhyloTree taxaBrowser;
m->mothurOutEndLine();
- m->mothurOutEndLine();
- m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
+ //m->mothurOutEndLine();
+ //m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
}
delete classify;
if (m->control_pressed) {
if (large) { inRow.close(); remove(distFile.c_str()); }
- delete groupMap; delete fasta; return 0;
+ delete fasta; return 0;
}
//if user gave a namesfile then use it
if (m->control_pressed) {
if (large) { inRow.close(); remove(distFile.c_str()); }
- delete groupMap; delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
+ delete fasta; delete read; delete input; delete list; globaldata->gListVector = NULL; return 0;
}
while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
/**************************************************************************************************/
Knn::Knn(string tfile, string tempFile, string method, int kmerSize, float gapOpen, float gapExtend, float match, float misMatch, int n)
-: Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch), num(n) {}
+: Classify(), num(n) {
+ try {
+ //create search database and names vector
+ generateDatabaseAndNames(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch);
+ }
+ catch(exception& e) {
+ m->errorOut(e, "Knn", "Knn");
+ exit(1);
+ }
+}
/**************************************************************************************************/
string Knn::getTaxonomy(Sequence* seq) {
try {
string tax;
//use database to find closest seq
-
vector<int> closest = database->findClosestSequences(seq, num);
if (m->control_pressed) { return tax; }
-L../readline-6.0\r
endif\r
\r
-USEMPI ?= yes\r
+USEMPI ?= no\r
\r
ifeq ($(strip $(USEMPI)),yes)\r
CC_OPTIONS += -DUSE_MPI\r
./parsesffcommand.o\\r
./classify.o\\r
./phylotree.o\\r
- ./bayesian.o\\r
+ ./bayesian.o\
+ ./rawtrainingdatamaker.o\\r
./alignmentdb.o\\r
./knn.o\\r
./distancedb.o\\r
./parsesffcommand.o\\r
./classify.o\\r
./phylotree.o\\r
- ./bayesian.o\\r
+ ./bayesian.o\
+ ./rawtrainingdatamaker.o\\r
./alignmentdb.o\\r
./knn.o\\r
./distancedb.o\\r
./parsesffcommand.o\\r
./classify.o\\r
./phylotree.o\\r
- ./bayesian.o\\r
+ ./bayesian.o\
+ ./rawtrainingdatamaker.o\\r
./alignmentdb.o\\r
./knn.o\\r
./distancedb.o\\r
# Item # 196 -- chimerabellerophoncommand --\r
./chimerabellerophoncommand.o : chimerabellerophoncommand.cpp\r
$(CC) $(CC_OPTIONS) chimerabellerophoncommand.cpp -c $(INCLUDE) -o ./chimerabellerophoncommand.o\r
+
+# Item # 171 -- rawtrainingdatamaker --\r
+./rawtrainingdatamaker.o : rawtrainingdatamaker.cpp\r
+ $(CC) $(CC_OPTIONS) rawtrainingdatamaker.cpp -c $(INCLUDE) -o ./rawtrainingdatamaker.o\r
+\r
\r
\r
##### END RUN ####\r
tree.push_back(TaxNode("Root"));
tree[0].heirarchyID = "0";
maxLevel = 0;
+ calcTotals = true;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PhyloTree", "PhyloTree");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+PhyloTree::PhyloTree(ifstream& in){
+ try {
+ m = MothurOut::getInstance();
+ calcTotals = false;
+
+ in >> numNodes; gobble(in);
+
+ tree.resize(numNodes);
+
+ for (int i = 0; i < tree.size(); i++) {
+ in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
+
+ }
+
+ //read genus nodes
+ int numGenus = 0;
+ in >> numGenus; gobble(in);
+
+ int gnode, gsize;
+ totals.clear();
+ for (int i = 0; i < numGenus; i++) {
+ in >> gnode >> gsize; gobble(in);
+
+ uniqueTaxonomies[gnode] = gnode;
+ totals.push_back(gsize);
+ }
+
+ in.close();
+
}
catch(exception& e) {
m->errorOut(e, "PhyloTree", "PhyloTree");
tree.push_back(TaxNode("Root"));
tree[0].heirarchyID = "0";
maxLevel = 0;
+ calcTotals = true;
ifstream in;
openInputFile(tfile, in);
}
}
/**************************************************************************************************/
+vector<int> PhyloTree::getGenusTotals() {
+ try {
+
+ if (calcTotals) {
+ totals.clear();
+ //reset counts because we are on a new word
+ for (int j = 0; j < genusIndex.size(); j++) {
+ totals.push_back(tree[genusIndex[j]].accessions.size());
+ }
+ return totals;
+ }else{
+ return totals;
+ }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PhyloTree", "getGenusNodes");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
void PhyloTree::assignHeirarchyIDs(int index){
try {
out <<tree[it->second].level << '\t' << tree[it->second].heirarchyID << '\t' << tree[it->second].name << '\t' << tree[it->second].children.size() << '\t' << tree[it->second].accessions.size() << endl;
print(it->second, out);
}
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PhyloTree", "print");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+void PhyloTree::printTreeNodes(string treefilename) {
+ try {
+ ofstream outTree;
+ openOutputFile(treefilename, outTree);
+
+ //print treenodes
+ outTree << tree.size() << endl;
+ for (int i = 0; i < tree.size(); i++) {
+ outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
+ }
+
+ //print genus nodes
+ outTree << endl << uniqueTaxonomies.size() << endl;
+ map<int, int>::iterator it2;
+ for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl; }
+ outTree << endl;
+ outTree.close();
+
}
catch(exception& e) {
- m->errorOut(e, "PhyloTree", "print");
+ m->errorOut(e, "PhyloTree", "printTreeNodes");
exit(1);
}
}
-
/**************************************************************************************************/
public:
PhyloTree();
PhyloTree(string); //pass it a taxonomy file and it makes the tree
+ PhyloTree(ifstream&); //pass it a taxonomy file and it makes the train.tree
~PhyloTree() {};
int addSeqToTree(string, string);
void assignHeirarchyIDs(int);
void print(ofstream&);
+ void printTreeNodes(string); //used by bayesian to save time
vector<int> getGenusNodes();
+ vector<int> getGenusTotals();
void binUnclassified();
TaxNode get(int i) { return tree[i]; }
string getName(int i) { return tree[i].name; }
string getFullTaxonomy(string); //pass a sequence name return taxonomy
int getMaxLevel() { return maxLevel; }
+ int getNumSeqs() { return numSeqs; }
private:
string getNextTaxon(string&);
vector<TaxNode> tree;
vector<int> genusIndex; //holds the indexes in tree where the genus level taxonomies are stored
+ vector<int> totals; //holds the numSeqs at each genus level taxonomy
map<string, int> name2Taxonomy; //maps name to index in tree
map<int, int> uniqueTaxonomies; //map of unique taxonomies
void print(int, ofstream&);
int numNodes;
int numSeqs;
int maxLevel;
+ bool calcTotals;
MothurOut* m;
};
--- /dev/null
+/*
+ * rawTrainingDataMaker.cpp
+ * Mothur
+ *
+ * Created by westcott on 4/21/10.
+ * Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "rawtrainingdatamaker.h"
+
+/**************************************************************************************************/
+
+RawTrainingDataMaker::RawTrainingDataMaker(){
+ try {
+ m = MothurOut::getInstance();
+ numNodes = 1;
+ numSeqs = 0;
+ tree.push_back(rawTaxNode("Root"));
+ tree[0].rank = "Root";
+ maxLevel = 0;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+RawTrainingDataMaker::RawTrainingDataMaker(string tfile){
+ try {
+ m = MothurOut::getInstance();
+ numNodes = 1;
+ numSeqs = 0;
+ tree.push_back(rawTaxNode("Root"));
+ tree[0].rank = "Root";
+ maxLevel = 0;
+
+ ifstream in;
+ openInputFile(tfile, in);
+
+ //read in users taxonomy file and add sequences to tree
+ string name, tax;
+ while(!in.eof()){
+ in >> name >> tax; gobble(in);
+
+ addSeqToTree(name, tax);
+ }
+ in.close();
+
+ assignRank(0);
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+string RawTrainingDataMaker::getNextTaxon(string& heirarchy){
+ try {
+ string currentLevel = "";
+ if(heirarchy != ""){
+ int pos = heirarchy.find_first_of(';');
+ currentLevel=heirarchy.substr(0,pos);
+ if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
+ else { heirarchy = ""; }
+ }
+ return currentLevel;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RawTrainingDataMaker", "getNextTaxon");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int RawTrainingDataMaker::addSeqToTree(string seqName, string seqTaxonomy){
+ try {
+ numSeqs++;
+
+ map<string, int>::iterator childPointer;
+
+ int currentNode = 0;
+ string taxon;
+
+ while(seqTaxonomy != ""){
+
+ if (m->control_pressed) { return 0; }
+
+ //somehow the parent is getting one too many accnos
+ //use print to reassign the taxa id
+ taxon = getNextTaxon(seqTaxonomy);
+
+ childPointer = tree[currentNode].children.find(taxon);
+
+ if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on
+ currentNode = childPointer->second;
+ }else{ //otherwise, create it
+ tree.push_back(rawTaxNode(taxon));
+ numNodes++;
+ tree[currentNode].children[taxon] = numNodes-1;
+ tree[numNodes-1].parent = currentNode;
+
+ currentNode = tree[currentNode].children[taxon];
+ }
+ }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RawTrainingDataMaker", "addSeqToTree");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+void RawTrainingDataMaker::assignRank(int index){
+ try {
+ map<string,int>::iterator it;
+
+ string ranks[9] = { "Root","Domain","Kingdom","Phylum","Class","Order","Family","Genus","Species" };
+
+ for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
+ tree[it->second].level = tree[index].level + 1;
+
+ if (tree[it->second].level > 8) {
+ tree[it->second].rank = ("unknown" + toString(tree[it->second].level));
+ }else {
+ tree[it->second].rank = ranks[tree[it->second].level];
+ }
+
+ //save maxLevel for binning the unclassified seqs
+ if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; }
+
+ assignRank(it->second);
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RawTrainingDataMaker", "assignRank");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+void RawTrainingDataMaker::print(ofstream& out){
+ try {
+ //string temp = tree[0].name +" " + tree[0].rank;
+ //sanityCheck[temp] = temp;
+
+ out << "0" << "*" << tree[0].name << "*" << tree[0].parent << "*" << tree[0].level << "*" << tree[0].rank << endl;
+ print(0, out);
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RawTrainingDataMaker", "print");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void RawTrainingDataMaker::print(int i, ofstream& out){
+ try {
+ map<string,int>::iterator it;
+ for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
+ //string temp = tree[it->second].name + " " + tree[it->second].rank;
+
+ //map<string, string>::iterator itSan;
+ //itSan = sanityCheck.find(temp);
+
+ //if (itSan == sanityCheck.end()) {
+ out << it->second << "*" << tree[it->second].name << "*" << tree[it->second].parent << "*" << tree[it->second].level << "*" << tree[it->second].rank << endl;
+ //sanityCheck[temp] = temp;
+ //}
+ print(it->second, out);
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "RawTrainingDataMaker", "print");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+
+
--- /dev/null
+#ifndef RAWTRAININGDATAMAKER_H
+#define RAWTRAININGDATAMAKER_H
+
+/*
+ * rawTrainingDataMaker.h
+ * Mothur
+ *
+ * Created by westcott on 4/21/10.
+ * Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "mothurout.h"
+
+/**************************************************************************************************/
+
+struct rawTaxNode {
+ map<string, int> children; //childs name to index in tree
+ int parent, level;
+ string name, rank;
+
+ rawTaxNode(string n) : name(n), level(0), parent(-1) { }
+ rawTaxNode(){}
+};
+
+/**************************************************************************************************/
+
+class RawTrainingDataMaker {
+
+public:
+ RawTrainingDataMaker();
+ RawTrainingDataMaker(string); //pass it a taxonomy file and it makes the tree
+ ~RawTrainingDataMaker() {};
+ int addSeqToTree(string, string);
+ void assignRank(int);
+ void print(ofstream&);
+
+private:
+ string getNextTaxon(string&);
+ vector<rawTaxNode> tree;
+ void print(int, ofstream&);
+ int numNodes;
+ int numSeqs;
+ int maxLevel;
+ //map<string, string> sanityCheck;
+ MothurOut* m;
+};
+
+/**************************************************************************************************/
+
+#endif
+
+
m->mothurOut("For this use the read.dist command should be in the following format: \n");
m->mothurOut("read.dist(phylip=yourDistFile, name=yourNameFile, cutoff=yourCutoff, precision=yourPrecision) \n");
m->mothurOut("The phylip or column parameter is required, but only one may be used. If you use a column file the name filename is required. \n");
- m->mothurOut("The sim parameter is used to indicate that your distance file contains similiarity values instead of distance values. The default is false, if sim=true then mothur will convert the similairity values to distances. \n");
+ m->mothurOut("The sim parameter is used to indicate that your distance file contains similarity values instead of distance values. The default is false, if sim=true then mothur will convert the similarity values to distances. \n");
m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
m->mothurOut("The second way to use the read.dist command is to read a phylip or column and a group, so you can use the libshuff command.\n");
m->mothurOut("For this use the read.dist command should be in the following format: \n");