A747E81C116365E000FB9042 /* chimeraslayercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayercommand.h; sourceTree = "<group>"; };
A747E81D116365E000FB9042 /* chimeraslayercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayercommand.cpp; sourceTree = "<group>"; };
A7639F8D1175DF35008F5578 /* makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = makefile; sourceTree = "<group>"; };
- A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rawtrainingdatamaker.h; sourceTree = SOURCE_ROOT; };
- A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rawtrainingdatamaker.cpp; sourceTree = SOURCE_ROOT; };
+ A76AAD02117F322B003D8DA1 /* phylosummary.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylosummary.h; sourceTree = "<group>"; };
+ A76AAD03117F322B003D8DA1 /* phylosummary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylosummary.cpp; sourceTree = "<group>"; };
A78254461164D7790002E2DD /* chimerapintailcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerapintailcommand.h; sourceTree = "<group>"; };
A78254471164D7790002E2DD /* chimerapintailcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerapintailcommand.cpp; sourceTree = "<group>"; };
A7825502116519F70002E2DD /* chimerabellerophoncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerabellerophoncommand.h; sourceTree = "<group>"; };
A7DA208A113FECD400BF472F /* knn.h */,
A7DA20C4113FECD400BF472F /* phylotree.cpp */,
A7DA20C5113FECD400BF472F /* phylotree.h */,
- A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */,
- A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */,
+ A76AAD02117F322B003D8DA1 /* phylosummary.h */,
+ A76AAD03117F322B003D8DA1 /* phylosummary.cpp */,
A7DA215C113FECD400BF472F /* taxonomyequalizer.cpp */,
A7DA215D113FECD400BF472F /* taxonomyequalizer.h */,
);
#include "bayesian.h"
#include "kmer.hpp"
-#include "rawtrainingdatamaker.h"
+#include "phylosummary.h"
/**************************************************************************************************/
Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) :
/************calculate the probablity that each word will be in a specific taxonomy*************/
string phyloTreeName = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.train";
- ifstream phyloTreeTest(phyloTreeName.c_str());
-
- ofstream out;
string probFileName = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.prob";
- ifstream probFileTest(probFileName.c_str());
+ string probFileName2 = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.numNonZero";
+ ofstream out;
ofstream out2;
- string probFileName2 = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.numNonZero";
+
+ ifstream phyloTreeTest(phyloTreeName.c_str());
ifstream probFileTest2(probFileName2.c_str());
+ ifstream probFileTest(probFileName.c_str());
int start = time(NULL);
if(probFileTest && probFileTest2 && phyloTreeTest){
m->mothurOut("Reading template taxonomy... "); cout.flush();
- phyloTree = new PhyloTree(phyloTreeTest);
-
+ phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
+
m->mothurOut("DONE."); m->mothurOutEndLine();
genusNodes = phyloTree->getGenusNodes();
genusTotals = phyloTree->getGenusTotals();
m->mothurOut("Reading template probabilities... "); cout.flush();
- readProbFile(probFileTest, probFileTest2);
+ readProbFile(probFileTest, probFileTest2, probFileName, probFileName2);
}else{
wordGenusProb.resize(numKmers);
for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); }
+
+
+ #ifdef USE_MPI
+ int pid;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+ if (pid == 0) {
+ #endif
ofstream out;
openOutputFile(probFileName, out);
ofstream out2;
openOutputFile(probFileName2, out2);
+ #ifdef USE_MPI
+ }
+ #endif
+
+
//for each word
for (int i = 0; i < numKmers; i++) {
if (m->control_pressed) { break; }
+ #ifdef USE_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+ if (pid == 0) {
+ #endif
+
out << i << '\t';
+ #ifdef USE_MPI
+ }
+ #endif
+
vector<int> seqsWithWordi = database->getSequencesWithKmer(i);
map<int, int> count;
for (int k = 0; k < genusNodes.size(); k++) {
//probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1));
- if (count[genusNodes[k]] != 0) { out << k << '\t' << wordGenusProb[i][k] << '\t'; numNotZero++; }
+ if (count[genusNodes[k]] != 0) {
+ #ifdef USE_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+ if (pid == 0) {
+ #endif
+
+ out << k << '\t' << wordGenusProb[i][k] << '\t';
+
+ #ifdef USE_MPI
+ }
+ #endif
+
+ numNotZero++;
+ }
}
+
+ #ifdef USE_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+ if (pid == 0) {
+ #endif
+
out << endl;
out2 << probabilityInTemplate << '\t' << numNotZero << endl;
+
+ #ifdef USE_MPI
+ }
+ #endif
}
+ #ifdef USE_MPI
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+ if (pid == 0) {
+ #endif
+
out.close();
out2.close();
+ #ifdef USE_MPI
+ }
+ #endif
+
//read in new phylotree with less info. - its faster
ifstream phyloTreeTest(phyloTreeName.c_str());
delete phyloTree;
- phyloTree = new PhyloTree(phyloTreeTest);
+ phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName);
}
m->mothurOut("DONE."); m->mothurOutEndLine();
}
}
/**************************************************************************************************/
-void Bayesian::readProbFile(ifstream& in, ifstream& inNum) {
+void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string inNumName) {
try{
- in >> numKmers; gobble(in);
-
- //initialze probabilities
- wordGenusProb.resize(numKmers);
-
- for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); }
+ #ifdef USE_MPI
+
+ int pid, num, num2;
+ vector<long> positions;
+ vector<long> positions2;
+
+ MPI_Status status;
+ MPI_File inMPI;
+ MPI_File inMPI2;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+ char inFileName[1024];
+ strcpy(inFileName, inNumName.c_str());
+
+ char inFileName2[1024];
+ strcpy(inFileName2, inName.c_str());
+
+ MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
+ MPI_File_open(MPI_COMM_WORLD, inFileName2, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI2); //comm, filename, mode, info, filepointer
+
+ if (pid == 0) {
+ positions = setFilePosEachLine(inNumName, num);
+
+ //send file positions to all processes
+ MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
+ MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
+
+ positions2 = setFilePosEachLine(inName, num2);
+
+ //send file positions to all processes
+ MPI_Bcast(&num2, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
+ MPI_Bcast(&positions2[0], (num2+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
+
+ }else{
+ MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+ positions.resize(num);
+ MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+
+ MPI_Bcast(&num2, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+ positions2.resize(num2);
+ MPI_Bcast(&positions2[0], (num2+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+
+ }
- int kmer, name, count; count = 0;
- vector<int> num; num.resize(numKmers);
- float prob;
- vector<float> zeroCountProb; zeroCountProb.resize(numKmers);
-
- while (inNum) {
- inNum >> zeroCountProb[count] >> num[count];
- count++;
- gobble(inNum);
- }
- inNum.close();
-
- while(in) {
- in >> kmer;
+ //read numKmers
+ int length = positions2[1] - positions2[0];
+ char* buf = new char[length];
+
+ MPI_File_read_at(inMPI2, positions2[0], buf, length, MPI_CHAR, &status);
+
+ string tempBuf = buf;
+ if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+ delete buf;
+
+ istringstream iss (tempBuf,istringstream::in);
+ iss >> numKmers;
+
+ //initialze probabilities
+ wordGenusProb.resize(numKmers);
+
+ for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); }
- //set them all to zero value
- for (int i = 0; i < genusNodes.size(); i++) {
- wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
+ int kmer, name;
+ vector<int> numbers; numbers.resize(numKmers);
+ float prob;
+ vector<float> zeroCountProb; zeroCountProb.resize(numKmers);
+
+ //read file
+ for(int i=0;i<num;i++){
+ //read next sequence
+ length = positions[i+1] - positions[i];
+ char* buf4 = new char[length];
+
+ MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+
+ tempBuf = buf4;
+ if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+ delete buf4;
+
+ istringstream iss (tempBuf,istringstream::in);
+ iss >> zeroCountProb[i] >> numbers[i];
}
- //get probs for nonzero values
- for (int i = 0; i < num[kmer]; i++) {
- in >> name >> prob;
- wordGenusProb[kmer][name] = prob;
+ MPI_File_close(&inMPI);
+
+ for(int i=1;i<num2;i++){
+ //read next sequence
+ length = positions2[i+1] - positions2[i];
+ char* buf4 = new char[length];
+
+ MPI_File_read_at(inMPI2, positions2[i], buf4, length, MPI_CHAR, &status);
+
+ tempBuf = buf4;
+ if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+ delete buf4;
+
+ istringstream iss (tempBuf,istringstream::in);
+
+ iss >> kmer;
+
+ //set them all to zero value
+ for (int i = 0; i < genusNodes.size(); i++) {
+ wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
+ }
+
+ //get probs for nonzero values
+ for (int i = 0; i < numbers[kmer]; i++) {
+ iss >> name >> prob;
+ wordGenusProb[kmer][name] = prob;
+ }
+
}
+ MPI_File_close(&inMPI2);
+ #else
+
+ in >> numKmers; gobble(in);
- gobble(in);
- }
- in.close();
+ //initialze probabilities
+ wordGenusProb.resize(numKmers);
+
+ for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); }
+
+ int kmer, name, count; count = 0;
+ vector<int> num; num.resize(numKmers);
+ float prob;
+ vector<float> zeroCountProb; zeroCountProb.resize(numKmers);
+
+ while (inNum) {
+ inNum >> zeroCountProb[count] >> num[count];
+ count++;
+ gobble(inNum);
+ }
+ inNum.close();
+
+ while(in) {
+ in >> kmer;
+
+ //set them all to zero value
+ for (int i = 0; i < genusNodes.size(); i++) {
+ wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
+ }
+
+ //get probs for nonzero values
+ for (int i = 0; i < num[kmer]; i++) {
+ in >> name >> prob;
+ wordGenusProb[kmer][name] = prob;
+ }
+
+ gobble(in);
+ }
+ in.close();
+
+ #endif
}
catch(exception& e) {
m->errorOut(e, "Bayesian", "readProbFile");
string bootstrapResults(vector<int>, int, int);
int getMostProbableTaxonomy(vector<int>);
- void readProbFile(ifstream&, ifstream&);
+ void readProbFile(ifstream&, ifstream&, string, string);
};
phyloTree->assignHeirarchyIDs(0);
+ phyloTree->setUp(file);
+
m->mothurOut("DONE.");
m->mothurOutEndLine(); cout.flush();
#include "sequence.hpp"
#include "bayesian.h"
#include "phylotree.h"
+#include "phylosummary.h"
#include "knn.h"
//**********************************************************************************************************************
else {
//valid paramters for this command
- string AlignArray[] = {"template","fasta","name","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters", "outputdir","inputdir"};
+ string AlignArray[] = {"template","fasta","name","group","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters", "outputdir","inputdir"};
vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
OptionParser parser(option);
//if the user has not given a path then, add inputdir. else leave path alone.
if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
}
+
+ it = parameters.find("group");
+ //user has given a template file
+ if(it != parameters.end()){
+ path = hasPath(it->second);
+ //if the user has not given a path then, add inputdir. else leave path alone.
+ if (path == "") { parameters["group"] = inputDir + it->second; }
+ }
}
//check for required parameters
}
else if (templateFileName == "not open") { abort = true; }
+ groupfile = validParameter.validFile(parameters, "group", true);
+ if (groupfile == "not open") { abort = true; }
+ else if (groupfile == "not found") { groupfile = ""; }
+
fastaFileName = validParameter.validFile(parameters, "fasta", false);
if (fastaFileName == "not found") { m->mothurOut("fasta is a required parameter for the classify.seqs command."); m->mothurOutEndLine(); abort = true; }
else {
m->mothurOut("The template, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n");
m->mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast and distance. The default is kmer.\n");
m->mothurOut("The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n");
+ m->mothurOut("The group parameter allows you add a group file so you can have the summary totals broken up by group.\n");
m->mothurOut("The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n");
m->mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n");
m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
start = time(NULL);
- //make taxonomy tree from new taxonomy file
- PhyloTree taxaBrowser;
+ PhyloSummary taxaSum(taxonomyFileName, groupfile);
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; }
- ifstream in;
- openInputFile(tempTaxonomyFile, in);
-
- //read in users taxonomy file and add sequences to tree
- string name, taxon;
- while(!in.eof()){
- in >> name >> taxon; gobble(in);
-
- if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } remove(tempTaxonomyFile.c_str()); delete classify; return 0; }
+ if (namefile == "") { taxaSum.summarize(tempTaxonomyFile); }
+ else {
+ ifstream in;
+ openInputFile(tempTaxonomyFile, in);
- if (namefile != "") {
+ //read in users taxonomy file and add sequences to tree
+ string name, taxon;
+ while(!in.eof()){
+ in >> name >> taxon; gobble(in);
+
itNames = nameMap.find(name);
if (itNames == nameMap.end()) {
m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1);
}else{
for (int i = 0; i < itNames->second; i++) {
- taxaBrowser.addSeqToTree(name+toString(i), taxon); //add it as many times as there are identical seqs
+ taxaSum.addSeqToTree(name+toString(i), taxon); //add it as many times as there are identical seqs
}
}
- }else { taxaBrowser.addSeqToTree(name, taxon); } //add it once
+ }
+ in.close();
}
- in.close();
-
- taxaBrowser.assignHeirarchyIDs(0);
-
- if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } remove(tempTaxonomyFile.c_str()); delete classify; return 0; }
-
- taxaBrowser.binUnclassified();
-
remove(tempTaxonomyFile.c_str());
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; }
-
//print summary file
ofstream outTaxTree;
openOutputFile(taxSummary, outTaxTree);
- taxaBrowser.print(outTaxTree);
+ taxaSum.print(outTaxTree);
outTaxTree.close();
//output taxonomy with the unclassified bins added
openOutputFile(unclass, outTax);
//get maxLevel from phylotree so you know how many 'unclassified's to add
- int maxLevel = taxaBrowser.getMaxLevel();
+ int maxLevel = taxaSum.getMaxLevel();
//read taxfile - this reading and rewriting is done to preserve the confidence scores.
+ string name, taxon;
while (!inTax.eof()) {
if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } remove(unclass.c_str()); delete classify; return 0; }
remove(newTaxonomyFile.c_str());
rename(unclass.c_str(), newTaxonomyFile.c_str());
+ m->mothurOutEndLine();
+ m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
+
#ifdef USE_MPI
}
#endif
m->mothurOut("Output File Names: "); m->mothurOutEndLine();
for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
m->mothurOutEndLine();
-
- //m->mothurOutEndLine();
- //m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine();
}
delete classify;
Classify* classify;
- string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir;
+ string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile;
int processors, kmerSize, numWanted, cutoff, iters;
float match, misMatch, gapOpen, gapExtend;
bool abort, probs;
delete buf;
int count = 0;
- while (count < fileSize) { //read 1000 characters at a time
- //send freqs
+ while (count < fileSize) {
char buf2[1];
MPI_File_read(inMPI, buf2, 1, MPI_CHAR, &status);
MPI_File_write(outMPI, buf2, 1, MPI_CHAR, &status);
./classify.o\\r
./phylotree.o\\r
./bayesian.o\
- ./rawtrainingdatamaker.o\\r
+ ./phylosummary.o\\r
./alignmentdb.o\\r
./knn.o\\r
./distancedb.o\\r
./classify.o\\r
./phylotree.o\\r
./bayesian.o\
- ./rawtrainingdatamaker.o\\r
+ ./phylosummary.o\\r
./alignmentdb.o\\r
./knn.o\\r
./distancedb.o\\r
./classify.o\\r
./phylotree.o\\r
./bayesian.o\
- ./rawtrainingdatamaker.o\\r
+ ./phylosummary.o\\r
./alignmentdb.o\\r
./knn.o\\r
./distancedb.o\\r
./chimerabellerophoncommand.o : chimerabellerophoncommand.cpp\r
$(CC) $(CC_OPTIONS) chimerabellerophoncommand.cpp -c $(INCLUDE) -o ./chimerabellerophoncommand.o\r
-# Item # 171 -- rawtrainingdatamaker --\r
-./rawtrainingdatamaker.o : rawtrainingdatamaker.cpp\r
- $(CC) $(CC_OPTIONS) rawtrainingdatamaker.cpp -c $(INCLUDE) -o ./rawtrainingdatamaker.o\r
+# Item # 171 -- phylosummary --\r
+./phylosummary.o : phylosummary.cpp\r
+ $(CC) $(CC_OPTIONS) phylosummary.cpp -c $(INCLUDE) -o ./phylosummary.o\r
\r
\r
\r
--- /dev/null
+/*
+ * rawTrainingDataMaker.cpp
+ * Mothur
+ *
+ * Created by westcott on 4/21/10.
+ * Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "phylosummary.h"
+
+/**************************************************************************************************/
+
+PhyloSummary::PhyloSummary(string refTfile, string groupFile){
+ try {
+ m = MothurOut::getInstance();
+ maxLevel = 0;
+
+ if (groupFile != "") {
+ groupmap = new GroupMap(groupFile);
+ groupmap->readMap();
+ }else{
+ groupmap = NULL;
+ }
+
+ //check for necessary files
+ string taxFileNameTest = refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum";
+ ifstream FileTest(taxFileNameTest.c_str());
+
+ if (!FileTest) {
+ m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1);
+ }else{
+ readTreeStruct(FileTest);
+ }
+
+ tree[0].rank = "0";
+ assignRank(0);
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PhyloSummary", "PhyloSummary");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+void PhyloSummary::summarize(string userTfile){
+ try {
+
+ ifstream in;
+ openInputFile(userTfile, in);
+
+ //read in users taxonomy file and add sequences to tree
+ string name, tax;
+ while(!in.eof()){
+ in >> name >> tax; gobble(in);
+
+ addSeqToTree(name, tax);
+
+ if (m->control_pressed) { break; }
+ }
+ in.close();
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PhyloSummary", "summarize");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+string PhyloSummary::getNextTaxon(string& heirarchy){
+ try {
+ string currentLevel = "";
+ if(heirarchy != ""){
+ int pos = heirarchy.find_first_of(';');
+ currentLevel=heirarchy.substr(0,pos);
+ if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
+ else { heirarchy = ""; }
+ }
+ return currentLevel;
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PhyloSummary", "getNextTaxon");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){
+ try {
+ numSeqs++;
+
+ map<string, int>::iterator childPointer;
+
+ int currentNode = 0;
+ string taxon;
+
+ int level = 0;
+
+ while (seqTaxonomy != "") {
+
+ if (m->control_pressed) { return 0; }
+
+ //somehow the parent is getting one too many accnos
+ //use print to reassign the taxa id
+ taxon = getNextTaxon(seqTaxonomy);
+
+ childPointer = tree[currentNode].children.find(taxon);
+
+ if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on
+ if (groupmap != NULL) {
+ //find out the sequences group
+ string group = groupmap->getGroup(seqName);
+
+ //do you have a count for this group?
+ map<string, int>::iterator itGroup = tree[currentNode].groupCount.find(group);
+
+ //if yes, increment it - there should not be a case where we can't find it since we load group in read
+ if (itGroup != tree[currentNode].groupCount.end()) {
+ tree[currentNode].groupCount[group]++;
+ }
+ }
+
+ tree[currentNode].total++;
+
+ currentNode = childPointer->second;
+ }else{ //otherwise, create it
+ m->mothurOut("Error: cannot find taxonomy in tree for " + seqName + "."); m->mothurOutEndLine();
+ seqTaxonomy = "";
+ }
+
+ level++;
+
+ if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not.
+ for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; }
+ }
+ }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PhyloSummary", "addSeqToTree");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+void PhyloSummary::assignRank(int index){
+ try {
+ map<string,int>::iterator it;
+ int counter = 1;
+
+ for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
+ tree[it->second].rank = tree[index].rank + '.' + toString(counter);
+ counter++;
+
+ assignRank(it->second);
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PhyloSummary", "assignRank");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+void PhyloSummary::print(ofstream& out){
+ try {
+ //print labels
+ out << "taxlevel\t rank ID\t label\t daughterlevels\t total\t";
+ if (groupmap != NULL) {
+ for (int i = 0; i < groupmap->namesOfGroups.size(); i++) {
+ out << groupmap->namesOfGroups[i] << '\t';
+ }
+ }
+
+ out << endl;
+
+ //print root
+ out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << tree[0].children.size() << "\t" << tree[0].total << "\t";
+
+ map<string, int>::iterator itGroup;
+ if (groupmap != NULL) {
+ for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) {
+ out << itGroup->second << '\t';
+ }
+ }
+ out << endl;
+
+ //print rest
+ print(0, out);
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PhyloSummary", "print");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+void PhyloSummary::print(int i, ofstream& out){
+ try {
+ map<string,int>::iterator it;
+ for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
+
+ if (tree[it->second].total != 0) {
+ out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << tree[it->second].children.size() << "\t" << tree[it->second].total << "\t";
+
+ map<string, int>::iterator itGroup;
+ if (groupmap != NULL) {
+ for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) {
+ out << itGroup->second << '\t';
+ }
+ }
+ out << endl;
+ }
+
+ print(it->second, out);
+ }
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PhyloSummary", "print");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+void PhyloSummary::readTreeStruct(ifstream& in){
+ try {
+ int num;
+
+ in >> num; gobble(in);
+
+ tree.resize(num);
+
+ //read the tree file
+ for (int i = 0; i < tree.size(); i++) {
+
+ in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has
+
+ //set children
+ string childName;
+ int childIndex;
+ for (int j = 0; j < num; j++) {
+ in >> childName >> childIndex;
+ tree[i].children[childName] = childIndex;
+ }
+
+ //initialize groupcounts
+ if (groupmap != NULL) {
+ for (int j = 0; j < groupmap->namesOfGroups.size(); j++) {
+ tree[i].groupCount[groupmap->namesOfGroups[j]] = 0;
+ }
+ }
+
+ tree[i].total = 0;
+
+ gobble(in);
+
+ if (tree[i].level > maxLevel) { maxLevel = tree[i].level; }
+ }
+
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PhyloSummary", "print");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
+
+
+
--- /dev/null
+#ifndef RAWTRAININGDATAMAKER_H
+#define RAWTRAININGDATAMAKER_H
+
+/*
+ * rawTrainingDataMaker.h
+ * Mothur
+ *
+ * Created by westcott on 4/21/10.
+ * Copyright 2010 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "mothurout.h"
+#include "groupmap.h"
+
+/**************************************************************************************************/
+
+struct rawTaxNode {
+ map<string, int> children; //childs name to index in tree
+ int parent, level;
+ string name, rank;
+ map<string, int> groupCount;
+ int total;
+
+ rawTaxNode(string n) : name(n), level(0), parent(-1), total(0) {}
+ rawTaxNode(){}
+};
+
+/**************************************************************************************************/
+//doesn't use MPI ifdefs since only pid 0 uses this class
+class PhyloSummary {
+
+public:
+ PhyloSummary(string, string);
+ ~PhyloSummary() { if (groupmap != NULL) { delete groupmap; } }
+
+ void summarize(string); //pass it a taxonomy file and a group file and it makes the tree
+ int addSeqToTree(string, string);
+ void print(ofstream&);
+ int getMaxLevel() { return maxLevel; }
+
+private:
+ string getNextTaxon(string&);
+ vector<rawTaxNode> tree;
+ void print(int, ofstream&);
+ void assignRank(int);
+ void readTreeStruct(ifstream&);
+ GroupMap* groupmap;
+
+ int numNodes;
+ int numSeqs;
+ int maxLevel;
+ MothurOut* m;
+};
+
+/**************************************************************************************************/
+
+#endif
+
+
}
/**************************************************************************************************/
-PhyloTree::PhyloTree(ifstream& in){
+PhyloTree::PhyloTree(ifstream& in, string filename){
try {
m = MothurOut::getInstance();
calcTotals = false;
- in >> numNodes; gobble(in);
-
- tree.resize(numNodes);
-
- for (int i = 0; i < tree.size(); i++) {
- in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
-
- }
-
- //read genus nodes
- int numGenus = 0;
- in >> numGenus; gobble(in);
-
- int gnode, gsize;
- totals.clear();
- for (int i = 0; i < numGenus; i++) {
- in >> gnode >> gsize; gobble(in);
+ #ifdef USE_MPI
+ MPI_File inMPI;
+ MPI_Offset size;
+ MPI_Status status;
+
+ char inFileName[1024];
+ strcpy(inFileName, filename.c_str());
+
+ MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI);
+ MPI_File_get_size(inMPI, &size);
- uniqueTaxonomies[gnode] = gnode;
- totals.push_back(gsize);
- }
+ char* buffer = new char[size];
+ MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status);
+
+ string tempBuf = buffer;
+ if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); }
+ istringstream iss (tempBuf,istringstream::in);
+ delete buffer;
+
+ iss >> numNodes; gobble(iss);
+
+ tree.resize(numNodes);
+
+ for (int i = 0; i < tree.size(); i++) {
+ iss >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(iss);
+ }
- in.close();
+ //read genus nodes
+ int numGenus = 0;
+ iss >> numGenus; gobble(iss);
+
+ int gnode, gsize;
+ totals.clear();
+ for (int i = 0; i < numGenus; i++) {
+ iss >> gnode >> gsize; gobble(iss);
+
+ uniqueTaxonomies[gnode] = gnode;
+ totals.push_back(gsize);
+ }
+
+ MPI_File_close(&inMPI);
+
+ #else
+ in >> numNodes; gobble(in);
+
+ tree.resize(numNodes);
+
+ for (int i = 0; i < tree.size(); i++) {
+ in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in);
+ }
+
+ //read genus nodes
+ int numGenus = 0;
+ in >> numGenus; gobble(in);
+
+ int gnode, gsize;
+ totals.clear();
+ for (int i = 0; i < numGenus; i++) {
+ in >> gnode >> gsize; gobble(in);
+
+ uniqueTaxonomies[gnode] = gnode;
+ totals.push_back(gsize);
+ }
+
+ in.close();
+
+ #endif
}
catch(exception& e) {
tree[0].heirarchyID = "0";
maxLevel = 0;
calcTotals = true;
+ string name, tax;
+
- ifstream in;
- openInputFile(tfile, in);
+ #ifdef USE_MPI
+ int pid, num;
+ vector<long> positions;
+
+ MPI_Status status;
+ MPI_File inMPI;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+ char inFileName[1024];
+ strcpy(inFileName, tfile.c_str());
+
+ MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer
+
+ if (pid == 0) {
+ positions = setFilePosEachLine(tfile, num);
+
+ //send file positions to all processes
+ MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs
+ MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos
+ }else{
+ MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs
+ positions.resize(num);
+ MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions
+ }
- //read in users taxonomy file and add sequences to tree
- string name, tax;
- while(!in.eof()){
- in >> name >> tax; gobble(in);
+ //read file
+ for(int i=0;i<num;i++){
+ //read next sequence
+ int length = positions[i+1] - positions[i];
+ char* buf4 = new char[length];
+
+ MPI_File_read_at(inMPI, positions[i], buf4, length, MPI_CHAR, &status);
+
+ string tempBuf = buf4;
+ if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); }
+ delete buf4;
+
+ istringstream iss (tempBuf,istringstream::in);
+ iss >> name >> tax;
+ addSeqToTree(name, tax);
+ }
- addSeqToTree(name, tax);
- }
- in.close();
-
+ MPI_File_close(&inMPI);
+
+ #else
+ ifstream in;
+ openInputFile(tfile, in);
+
+ //read in users taxonomy file and add sequences to tree
+ while(!in.eof()){
+ in >> name >> tax; gobble(in);
+
+ addSeqToTree(name, tax);
+ }
+ in.close();
+ #endif
+
assignHeirarchyIDs(0);
+
+ //create file for summary if needed
+ setUp(tfile);
}
catch(exception& e) {
m->errorOut(e, "PhyloTree", "PhyloTree");
}
}
/**************************************************************************************************/
-void PhyloTree::binUnclassified(){
+void PhyloTree::setUp(string tfile){
+ try{
+ string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum";
+
+ #ifdef USE_MPI
+ int pid;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+ if (pid == 0) { binUnclassified(taxFileNameTest); }
+
+ #else
+ //create file needed for summary if it doesn't exist
+ ifstream FileTest(taxFileNameTest.c_str());
+
+ if (!FileTest) {
+ binUnclassified(taxFileNameTest);
+ }
+ #endif
+ }
+ catch(exception& e) {
+ m->errorOut(e, "PhyloTree", "setUp");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+void PhyloTree::binUnclassified(string file){
try {
+
+ ofstream out;
+ openOutputFile(file, out);
+
map<string, int>::iterator itBin;
map<string, int>::iterator childPointer;
+ vector<TaxNode> copy = tree;
+ int copyNodes = numNodes;
+
//go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary
for (itBin = name2Taxonomy.begin(); itBin != name2Taxonomy.end(); itBin++) {
-
- int level = tree[itBin->second].level;
+
+ if (m->control_pressed) { out.close(); break; }
+
+ int level = copy[itBin->second].level;
int currentNode = itBin->second;
//this sequence is unclassified at some levels
string taxon = "unclassified";
//does the parent have a child names 'unclassified'?
- childPointer = tree[currentNode].children.find(taxon);
+ childPointer = copy[currentNode].children.find(taxon);
- if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on
+ if(childPointer != copy[currentNode].children.end()){ //if the node already exists, move on
currentNode = childPointer->second; //currentNode becomes 'unclassified'
- tree[currentNode].accessions.push_back(itBin->first); //add this seq
- name2Taxonomy[itBin->first] = currentNode;
+ copy[currentNode].accessions.push_back(itBin->first); //add this seq
}
else{ //otherwise, create it
- tree.push_back(TaxNode(taxon));
- numNodes++;
- tree[currentNode].children[taxon] = numNodes-1;
- tree[numNodes-1].parent = currentNode;
+ copy.push_back(TaxNode(taxon));
+ copyNodes++;
+ copy[currentNode].children[taxon] = copyNodes-1;
+ copy[copyNodes-1].parent = currentNode;
+ copy[copyNodes-1].level = copy[currentNode].level + 1;
- currentNode = tree[currentNode].children[taxon];
- tree[currentNode].accessions.push_back(itBin->first);
- name2Taxonomy[itBin->first] = currentNode;
+ currentNode = copy[currentNode].children[taxon];
+ copy[currentNode].accessions.push_back(itBin->first);
}
-
- if (level == maxLevel) { uniqueTaxonomies[currentNode] = currentNode; }
}
}
- //clear HeirarchyIDs and reset them
- for (int i = 1; i < tree.size(); i++) {
- tree[i].heirarchyID = "";
+ if (!m->control_pressed) {
+ //print copy tree
+ print(out, copy);
}
- assignHeirarchyIDs(0);
-
+
}
catch(exception& e) {
m->errorOut(e, "PhyloTree", "binUnclassified");
}
/**************************************************************************************************/
-void PhyloTree::print(ofstream& out){
- try {
- out << tree[0].level << '\t'<< tree[0].heirarchyID << '\t' << tree[0].name << '\t' << tree[0].children.size() << '\t' << tree[0].accessions.size() << endl;
- print(0, out);
- }
- catch(exception& e) {
- m->errorOut(e, "PhyloTree", "print");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void PhyloTree::print(int i, ofstream& out){
+void PhyloTree::print(ofstream& out, vector<TaxNode>& copy){
try {
- map<string,int>::iterator it;
- for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
- out <<tree[it->second].level << '\t' << tree[it->second].heirarchyID << '\t' << tree[it->second].name << '\t' << tree[it->second].children.size() << '\t' << tree[it->second].accessions.size() << endl;
- print(it->second, out);
+ out << copy.size() << endl;
+
+ for (int i = 0; i < copy.size(); i++) {
+
+ out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t';
+
+ map<string,int>::iterator it;
+ for(it=copy[i].children.begin();it!=copy[i].children.end();it++){
+ out << it->first << '\t' << it->second << '\t';
+ }
+ out << endl;
}
+
+ out.close();
}
catch(exception& e) {
m->errorOut(e, "PhyloTree", "print");
/**************************************************************************************************/
void PhyloTree::printTreeNodes(string treefilename) {
try {
- ofstream outTree;
- openOutputFile(treefilename, outTree);
-
- //print treenodes
- outTree << tree.size() << endl;
- for (int i = 0; i < tree.size(); i++) {
- outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
- }
-
- //print genus nodes
- outTree << endl << uniqueTaxonomies.size() << endl;
- map<int, int>::iterator it2;
- for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl; }
- outTree << endl;
+
+ #ifdef USE_MPI
+ int pid;
+ MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
+
+ if (pid == 0) {
+ #endif
+
+ ofstream outTree;
+ openOutputFile(treefilename, outTree);
+
+ //print treenodes
+ outTree << tree.size() << endl;
+ for (int i = 0; i < tree.size(); i++) {
+ outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl;
+ }
+
+ //print genus nodes
+ outTree << endl << uniqueTaxonomies.size() << endl;
+ map<int, int>::iterator it2;
+ for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl; }
+ outTree << endl;
+
+
+ outTree.close();
- outTree.close();
+ #ifdef USE_MPI
+ }
+ #endif
+
}
catch(exception& e) {
public:
PhyloTree();
PhyloTree(string); //pass it a taxonomy file and it makes the tree
- PhyloTree(ifstream&); //pass it a taxonomy file and it makes the train.tree
+ PhyloTree(ifstream&, string); //pass it a taxonomy file and it makes the train.tree
~PhyloTree() {};
int addSeqToTree(string, string);
void assignHeirarchyIDs(int);
- void print(ofstream&);
void printTreeNodes(string); //used by bayesian to save time
vector<int> getGenusNodes();
vector<int> getGenusTotals();
- void binUnclassified();
+ void setUp(string); //used to create file needed for summary file if you use () constructor and add seqs manually instead of passing taxonomyfile
TaxNode get(int i) { return tree[i]; }
TaxNode get(string seqName) { return tree[name2Taxonomy[seqName]]; }
private:
string getNextTaxon(string&);
+ void print(ofstream&, vector<TaxNode>&);
+ void binUnclassified(string);
+
vector<TaxNode> tree;
vector<int> genusIndex; //holds the indexes in tree where the genus level taxonomies are stored
vector<int> totals; //holds the numSeqs at each genus level taxonomy
map<string, int> name2Taxonomy; //maps name to index in tree
map<int, int> uniqueTaxonomies; //map of unique taxonomies
- void print(int, ofstream&);
+ //void print(int, ofstream&);
int numNodes;
int numSeqs;
int maxLevel;
+++ /dev/null
-/*
- * rawTrainingDataMaker.cpp
- * Mothur
- *
- * Created by westcott on 4/21/10.
- * Copyright 2010 Schloss Lab. All rights reserved.
- *
- */
-
-#include "rawtrainingdatamaker.h"
-
-/**************************************************************************************************/
-
-RawTrainingDataMaker::RawTrainingDataMaker(){
- try {
- m = MothurOut::getInstance();
- numNodes = 1;
- numSeqs = 0;
- tree.push_back(rawTaxNode("Root"));
- tree[0].rank = "Root";
- maxLevel = 0;
- }
- catch(exception& e) {
- m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker");
- exit(1);
- }
-}
-/**************************************************************************************************/
-
-RawTrainingDataMaker::RawTrainingDataMaker(string tfile){
- try {
- m = MothurOut::getInstance();
- numNodes = 1;
- numSeqs = 0;
- tree.push_back(rawTaxNode("Root"));
- tree[0].rank = "Root";
- maxLevel = 0;
-
- ifstream in;
- openInputFile(tfile, in);
-
- //read in users taxonomy file and add sequences to tree
- string name, tax;
- while(!in.eof()){
- in >> name >> tax; gobble(in);
-
- addSeqToTree(name, tax);
- }
- in.close();
-
- assignRank(0);
- }
- catch(exception& e) {
- m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-string RawTrainingDataMaker::getNextTaxon(string& heirarchy){
- try {
- string currentLevel = "";
- if(heirarchy != ""){
- int pos = heirarchy.find_first_of(';');
- currentLevel=heirarchy.substr(0,pos);
- if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); }
- else { heirarchy = ""; }
- }
- return currentLevel;
- }
- catch(exception& e) {
- m->errorOut(e, "RawTrainingDataMaker", "getNextTaxon");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-int RawTrainingDataMaker::addSeqToTree(string seqName, string seqTaxonomy){
- try {
- numSeqs++;
-
- map<string, int>::iterator childPointer;
-
- int currentNode = 0;
- string taxon;
-
- while(seqTaxonomy != ""){
-
- if (m->control_pressed) { return 0; }
-
- //somehow the parent is getting one too many accnos
- //use print to reassign the taxa id
- taxon = getNextTaxon(seqTaxonomy);
-
- childPointer = tree[currentNode].children.find(taxon);
-
- if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on
- currentNode = childPointer->second;
- }else{ //otherwise, create it
- tree.push_back(rawTaxNode(taxon));
- numNodes++;
- tree[currentNode].children[taxon] = numNodes-1;
- tree[numNodes-1].parent = currentNode;
-
- currentNode = tree[currentNode].children[taxon];
- }
- }
-
- }
- catch(exception& e) {
- m->errorOut(e, "RawTrainingDataMaker", "addSeqToTree");
- exit(1);
- }
-}
-/**************************************************************************************************/
-
-void RawTrainingDataMaker::assignRank(int index){
- try {
- map<string,int>::iterator it;
-
- string ranks[9] = { "Root","Domain","Kingdom","Phylum","Class","Order","Family","Genus","Species" };
-
- for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
- tree[it->second].level = tree[index].level + 1;
-
- if (tree[it->second].level > 8) {
- tree[it->second].rank = ("unknown" + toString(tree[it->second].level));
- }else {
- tree[it->second].rank = ranks[tree[it->second].level];
- }
-
- //save maxLevel for binning the unclassified seqs
- if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; }
-
- assignRank(it->second);
- }
- }
- catch(exception& e) {
- m->errorOut(e, "RawTrainingDataMaker", "assignRank");
- exit(1);
- }
-}
-/**************************************************************************************************/
-
-void RawTrainingDataMaker::print(ofstream& out){
- try {
- //string temp = tree[0].name +" " + tree[0].rank;
- //sanityCheck[temp] = temp;
-
- out << "0" << "*" << tree[0].name << "*" << tree[0].parent << "*" << tree[0].level << "*" << tree[0].rank << endl;
- print(0, out);
-
- }
- catch(exception& e) {
- m->errorOut(e, "RawTrainingDataMaker", "print");
- exit(1);
- }
-}
-
-/**************************************************************************************************/
-
-void RawTrainingDataMaker::print(int i, ofstream& out){
- try {
- map<string,int>::iterator it;
- for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
- //string temp = tree[it->second].name + " " + tree[it->second].rank;
-
- //map<string, string>::iterator itSan;
- //itSan = sanityCheck.find(temp);
-
- //if (itSan == sanityCheck.end()) {
- out << it->second << "*" << tree[it->second].name << "*" << tree[it->second].parent << "*" << tree[it->second].level << "*" << tree[it->second].rank << endl;
- //sanityCheck[temp] = temp;
- //}
- print(it->second, out);
- }
- }
- catch(exception& e) {
- m->errorOut(e, "RawTrainingDataMaker", "print");
- exit(1);
- }
-}
-/**************************************************************************************************/
-
-
-
+++ /dev/null
-#ifndef RAWTRAININGDATAMAKER_H
-#define RAWTRAININGDATAMAKER_H
-
-/*
- * rawTrainingDataMaker.h
- * Mothur
- *
- * Created by westcott on 4/21/10.
- * Copyright 2010 Schloss Lab. All rights reserved.
- *
- */
-
-#include "mothur.h"
-#include "mothurout.h"
-
-/**************************************************************************************************/
-
-struct rawTaxNode {
- map<string, int> children; //childs name to index in tree
- int parent, level;
- string name, rank;
-
- rawTaxNode(string n) : name(n), level(0), parent(-1) { }
- rawTaxNode(){}
-};
-
-/**************************************************************************************************/
-
-class RawTrainingDataMaker {
-
-public:
- RawTrainingDataMaker();
- RawTrainingDataMaker(string); //pass it a taxonomy file and it makes the tree
- ~RawTrainingDataMaker() {};
- int addSeqToTree(string, string);
- void assignRank(int);
- void print(ofstream&);
-
-private:
- string getNextTaxon(string&);
- vector<rawTaxNode> tree;
- void print(int, ofstream&);
- int numNodes;
- int numSeqs;
- int maxLevel;
- //map<string, string> sanityCheck;
- MothurOut* m;
-};
-
-/**************************************************************************************************/
-
-#endif
-
-