From cd9dbd8b53bbe32af3e9c6bead4aa6d796a278a5 Mon Sep 17 00:00:00 2001 From: westcott Date: Tue, 27 Apr 2010 17:54:59 +0000 Subject: [PATCH] reworked the classifiers summary file added groupfile option to classify.seqs, added mpi code to classifiers modification --- Mothur.xcodeproj/project.pbxproj | 8 +- bayesian.cpp | 256 +++++++++++++++---- bayesian.h | 2 +- classify.cpp | 2 + classifyseqscommand.cpp | 63 ++--- classifyseqscommand.h | 2 +- distancecommand.cpp | 3 +- makefile | 12 +- phylosummary.cpp | 274 ++++++++++++++++++++ rawtrainingdatamaker.h => phylosummary.h | 23 +- phylotree.cpp | 306 ++++++++++++++++------- phylotree.h | 10 +- rawtrainingdatamaker.cpp | 188 -------------- 13 files changed, 778 insertions(+), 371 deletions(-) create mode 100644 phylosummary.cpp rename rawtrainingdatamaker.h => phylosummary.h (65%) delete mode 100644 rawtrainingdatamaker.cpp diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 805bde0..cc3b4b6 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -12,8 +12,8 @@ A747E81C116365E000FB9042 /* chimeraslayercommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayercommand.h; sourceTree = ""; }; A747E81D116365E000FB9042 /* chimeraslayercommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayercommand.cpp; sourceTree = ""; }; A7639F8D1175DF35008F5578 /* makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = makefile; sourceTree = ""; }; - A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = rawtrainingdatamaker.h; sourceTree = SOURCE_ROOT; }; - A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = rawtrainingdatamaker.cpp; sourceTree = SOURCE_ROOT; }; + A76AAD02117F322B003D8DA1 /* phylosummary.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylosummary.h; sourceTree = ""; }; + A76AAD03117F322B003D8DA1 /* phylosummary.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylosummary.cpp; sourceTree = ""; }; A78254461164D7790002E2DD /* chimerapintailcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerapintailcommand.h; sourceTree = ""; }; A78254471164D7790002E2DD /* chimerapintailcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerapintailcommand.cpp; sourceTree = ""; }; A7825502116519F70002E2DD /* chimerabellerophoncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerabellerophoncommand.h; sourceTree = ""; }; @@ -831,8 +831,8 @@ A7DA208A113FECD400BF472F /* knn.h */, A7DA20C4113FECD400BF472F /* phylotree.cpp */, A7DA20C5113FECD400BF472F /* phylotree.h */, - A76AAD02117F322B003D8DA1 /* rawtrainingdatamaker.h */, - A76AAD03117F322B003D8DA1 /* rawtrainingdatamaker.cpp */, + A76AAD02117F322B003D8DA1 /* phylosummary.h */, + A76AAD03117F322B003D8DA1 /* phylosummary.cpp */, A7DA215C113FECD400BF472F /* taxonomyequalizer.cpp */, A7DA215D113FECD400BF472F /* taxonomyequalizer.h */, ); diff --git a/bayesian.cpp b/bayesian.cpp index 0929749..e5543cd 100644 --- a/bayesian.cpp +++ b/bayesian.cpp @@ -9,7 +9,7 @@ #include "bayesian.h" #include "kmer.hpp" -#include "rawtrainingdatamaker.h" +#include "phylosummary.h" /**************************************************************************************************/ Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff, int i) : @@ -18,30 +18,30 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { /************calculate the probablity that each word will be in a specific taxonomy*************/ string phyloTreeName = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.train"; - ifstream phyloTreeTest(phyloTreeName.c_str()); - - ofstream out; string probFileName = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.prob"; - ifstream probFileTest(probFileName.c_str()); + string probFileName2 = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.numNonZero"; + ofstream out; ofstream out2; - string probFileName2 = tfile.substr(0,tfile.find_last_of(".")+1) + tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.numNonZero"; + + ifstream phyloTreeTest(phyloTreeName.c_str()); ifstream probFileTest2(probFileName2.c_str()); + ifstream probFileTest(probFileName.c_str()); int start = time(NULL); if(probFileTest && probFileTest2 && phyloTreeTest){ m->mothurOut("Reading template taxonomy... "); cout.flush(); - phyloTree = new PhyloTree(phyloTreeTest); - + phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName); + m->mothurOut("DONE."); m->mothurOutEndLine(); genusNodes = phyloTree->getGenusNodes(); genusTotals = phyloTree->getGenusTotals(); m->mothurOut("Reading template probabilities... "); cout.flush(); - readProbFile(probFileTest, probFileTest2); + readProbFile(probFileTest, probFileTest2, probFileName, probFileName2); }else{ @@ -65,6 +65,14 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { wordGenusProb.resize(numKmers); for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); } + + + #ifdef USE_MPI + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid == 0) { + #endif ofstream out; openOutputFile(probFileName, out); @@ -74,12 +82,27 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { ofstream out2; openOutputFile(probFileName2, out2); + #ifdef USE_MPI + } + #endif + + //for each word for (int i = 0; i < numKmers; i++) { if (m->control_pressed) { break; } + #ifdef USE_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid == 0) { + #endif + out << i << '\t'; + #ifdef USE_MPI + } + #endif + vector seqsWithWordi = database->getSequencesWithKmer(i); map count; @@ -98,20 +121,52 @@ Classify(), kmerSize(ksize), confidenceThreshold(cutoff), iters(i) { for (int k = 0; k < genusNodes.size(); k++) { //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1); wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1)); - if (count[genusNodes[k]] != 0) { out << k << '\t' << wordGenusProb[i][k] << '\t'; numNotZero++; } + if (count[genusNodes[k]] != 0) { + #ifdef USE_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + if (pid == 0) { + #endif + + out << k << '\t' << wordGenusProb[i][k] << '\t'; + + #ifdef USE_MPI + } + #endif + + numNotZero++; + } } + + #ifdef USE_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + if (pid == 0) { + #endif + out << endl; out2 << probabilityInTemplate << '\t' << numNotZero << endl; + + #ifdef USE_MPI + } + #endif } + #ifdef USE_MPI + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + if (pid == 0) { + #endif + out.close(); out2.close(); + #ifdef USE_MPI + } + #endif + //read in new phylotree with less info. - its faster ifstream phyloTreeTest(phyloTreeName.c_str()); delete phyloTree; - phyloTree = new PhyloTree(phyloTreeTest); + phyloTree = new PhyloTree(phyloTreeTest, phyloTreeName); } m->mothurOut("DONE."); m->mothurOutEndLine(); @@ -292,45 +347,162 @@ map Bayesian::parseTaxMap(string newTax) { } } /**************************************************************************************************/ -void Bayesian::readProbFile(ifstream& in, ifstream& inNum) { +void Bayesian::readProbFile(ifstream& in, ifstream& inNum, string inName, string inNumName) { try{ - in >> numKmers; gobble(in); - - //initialze probabilities - wordGenusProb.resize(numKmers); - - for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); } + #ifdef USE_MPI + + int pid, num, num2; + vector positions; + vector positions2; + + MPI_Status status; + MPI_File inMPI; + MPI_File inMPI2; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + char inFileName[1024]; + strcpy(inFileName, inNumName.c_str()); + + char inFileName2[1024]; + strcpy(inFileName2, inName.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + MPI_File_open(MPI_COMM_WORLD, inFileName2, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI2); //comm, filename, mode, info, filepointer + + if (pid == 0) { + positions = setFilePosEachLine(inNumName, num); + + //send file positions to all processes + MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + + positions2 = setFilePosEachLine(inName, num2); + + //send file positions to all processes + MPI_Bcast(&num2, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&positions2[0], (num2+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + + }else{ + MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + positions.resize(num); + MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + + MPI_Bcast(&num2, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + positions2.resize(num2); + MPI_Bcast(&positions2[0], (num2+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + + } - int kmer, name, count; count = 0; - vector num; num.resize(numKmers); - float prob; - vector zeroCountProb; zeroCountProb.resize(numKmers); - - while (inNum) { - inNum >> zeroCountProb[count] >> num[count]; - count++; - gobble(inNum); - } - inNum.close(); - - while(in) { - in >> kmer; + //read numKmers + int length = positions2[1] - positions2[0]; + char* buf = new char[length]; + + MPI_File_read_at(inMPI2, positions2[0], buf, length, MPI_CHAR, &status); + + string tempBuf = buf; + if (tempBuf.length() > length) { tempBuf = tempBuf.substr(0, length); } + delete buf; + + istringstream iss (tempBuf,istringstream::in); + iss >> numKmers; + + //initialze probabilities + wordGenusProb.resize(numKmers); + + for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); } - //set them all to zero value - for (int i = 0; i < genusNodes.size(); i++) { - wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1)); + int kmer, name; + vector numbers; numbers.resize(numKmers); + float prob; + vector zeroCountProb; zeroCountProb.resize(numKmers); + + //read file + for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } + delete buf4; + + istringstream iss (tempBuf,istringstream::in); + iss >> zeroCountProb[i] >> numbers[i]; } - //get probs for nonzero values - for (int i = 0; i < num[kmer]; i++) { - in >> name >> prob; - wordGenusProb[kmer][name] = prob; + MPI_File_close(&inMPI); + + for(int i=1;i length) { tempBuf = tempBuf.substr(0, length); } + delete buf4; + + istringstream iss (tempBuf,istringstream::in); + + iss >> kmer; + + //set them all to zero value + for (int i = 0; i < genusNodes.size(); i++) { + wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1)); + } + + //get probs for nonzero values + for (int i = 0; i < numbers[kmer]; i++) { + iss >> name >> prob; + wordGenusProb[kmer][name] = prob; + } + } + MPI_File_close(&inMPI2); + #else + + in >> numKmers; gobble(in); - gobble(in); - } - in.close(); + //initialze probabilities + wordGenusProb.resize(numKmers); + + for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); } + + int kmer, name, count; count = 0; + vector num; num.resize(numKmers); + float prob; + vector zeroCountProb; zeroCountProb.resize(numKmers); + + while (inNum) { + inNum >> zeroCountProb[count] >> num[count]; + count++; + gobble(inNum); + } + inNum.close(); + + while(in) { + in >> kmer; + + //set them all to zero value + for (int i = 0; i < genusNodes.size(); i++) { + wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1)); + } + + //get probs for nonzero values + for (int i = 0; i < num[kmer]; i++) { + in >> name >> prob; + wordGenusProb[kmer][name] = prob; + } + + gobble(in); + } + in.close(); + + #endif } catch(exception& e) { m->errorOut(e, "Bayesian", "readProbFile"); diff --git a/bayesian.h b/bayesian.h index 6e35e9f..70757be 100644 --- a/bayesian.h +++ b/bayesian.h @@ -34,7 +34,7 @@ private: string bootstrapResults(vector, int, int); int getMostProbableTaxonomy(vector); - void readProbFile(ifstream&, ifstream&); + void readProbFile(ifstream&, ifstream&, string, string); }; diff --git a/classify.cpp b/classify.cpp index be287a0..147f499 100644 --- a/classify.cpp +++ b/classify.cpp @@ -237,6 +237,8 @@ int Classify::readTaxonomy(string file) { phyloTree->assignHeirarchyIDs(0); + phyloTree->setUp(file); + m->mothurOut("DONE."); m->mothurOutEndLine(); cout.flush(); diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp index a38bf8f..3837849 100644 --- a/classifyseqscommand.cpp +++ b/classifyseqscommand.cpp @@ -11,6 +11,7 @@ #include "sequence.hpp" #include "bayesian.h" #include "phylotree.h" +#include "phylosummary.h" #include "knn.h" //********************************************************************************************************************** @@ -25,7 +26,7 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { else { //valid paramters for this command - string AlignArray[] = {"template","fasta","name","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters", "outputdir","inputdir"}; + string AlignArray[] = {"template","fasta","name","group","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff","probs","iters", "outputdir","inputdir"}; vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); OptionParser parser(option); @@ -62,6 +63,14 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { //if the user has not given a path then, add inputdir. else leave path alone. if (path == "") { parameters["taxonomy"] = inputDir + it->second; } } + + it = parameters.find("group"); + //user has given a template file + if(it != parameters.end()){ + path = hasPath(it->second); + //if the user has not given a path then, add inputdir. else leave path alone. + if (path == "") { parameters["group"] = inputDir + it->second; } + } } //check for required parameters @@ -73,6 +82,10 @@ ClassifySeqsCommand::ClassifySeqsCommand(string option) { } else if (templateFileName == "not open") { abort = true; } + groupfile = validParameter.validFile(parameters, "group", true); + if (groupfile == "not open") { abort = true; } + else if (groupfile == "not found") { groupfile = ""; } + fastaFileName = validParameter.validFile(parameters, "fasta", false); if (fastaFileName == "not found") { m->mothurOut("fasta is a required parameter for the classify.seqs command."); m->mothurOutEndLine(); abort = true; } else { @@ -250,6 +263,7 @@ void ClassifySeqsCommand::help(){ m->mothurOut("The template, fasta and taxonomy parameters are required. You may enter multiple fasta files by separating their names with dashes. ie. fasta=abrecovery.fasta-amzon.fasta \n"); m->mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer, blast and distance. The default is kmer.\n"); m->mothurOut("The name parameter allows you add a names file with your fasta file, if you enter multiple fasta files, you must enter matching names files for them.\n"); + m->mothurOut("The group parameter allows you add a group file so you can have the summary totals broken up by group.\n"); m->mothurOut("The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n"); m->mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"); m->mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n"); @@ -491,50 +505,40 @@ int ClassifySeqsCommand::execute(){ m->mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); start = time(NULL); - //make taxonomy tree from new taxonomy file - PhyloTree taxaBrowser; + PhyloSummary taxaSum(taxonomyFileName, groupfile); if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } - ifstream in; - openInputFile(tempTaxonomyFile, in); - - //read in users taxonomy file and add sequences to tree - string name, taxon; - while(!in.eof()){ - in >> name >> taxon; gobble(in); - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } remove(tempTaxonomyFile.c_str()); delete classify; return 0; } + if (namefile == "") { taxaSum.summarize(tempTaxonomyFile); } + else { + ifstream in; + openInputFile(tempTaxonomyFile, in); - if (namefile != "") { + //read in users taxonomy file and add sequences to tree + string name, taxon; + while(!in.eof()){ + in >> name >> taxon; gobble(in); + itNames = nameMap.find(name); if (itNames == nameMap.end()) { m->mothurOut(name + " is not in your name file please correct."); m->mothurOutEndLine(); exit(1); }else{ for (int i = 0; i < itNames->second; i++) { - taxaBrowser.addSeqToTree(name+toString(i), taxon); //add it as many times as there are identical seqs + taxaSum.addSeqToTree(name+toString(i), taxon); //add it as many times as there are identical seqs } } - }else { taxaBrowser.addSeqToTree(name, taxon); } //add it once + } + in.close(); } - in.close(); - - taxaBrowser.assignHeirarchyIDs(0); - - if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } remove(tempTaxonomyFile.c_str()); delete classify; return 0; } - - taxaBrowser.binUnclassified(); - remove(tempTaxonomyFile.c_str()); if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } delete classify; return 0; } - //print summary file ofstream outTaxTree; openOutputFile(taxSummary, outTaxTree); - taxaBrowser.print(outTaxTree); + taxaSum.print(outTaxTree); outTaxTree.close(); //output taxonomy with the unclassified bins added @@ -546,9 +550,10 @@ int ClassifySeqsCommand::execute(){ openOutputFile(unclass, outTax); //get maxLevel from phylotree so you know how many 'unclassified's to add - int maxLevel = taxaBrowser.getMaxLevel(); + int maxLevel = taxaSum.getMaxLevel(); //read taxfile - this reading and rewriting is done to preserve the confidence scores. + string name, taxon; while (!inTax.eof()) { if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } remove(unclass.c_str()); delete classify; return 0; } @@ -564,6 +569,9 @@ int ClassifySeqsCommand::execute(){ remove(newTaxonomyFile.c_str()); rename(unclass.c_str(), newTaxonomyFile.c_str()); + m->mothurOutEndLine(); + m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); + #ifdef USE_MPI } #endif @@ -572,10 +580,7 @@ int ClassifySeqsCommand::execute(){ m->mothurOut("Output File Names: "); m->mothurOutEndLine(); for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); } m->mothurOutEndLine(); - - //m->mothurOutEndLine(); - //m->mothurOut("It took " + toString(time(NULL) - start) + " secs to create the summary file for " + toString(numFastaSeqs) + " sequences."); m->mothurOutEndLine(); m->mothurOutEndLine(); } delete classify; diff --git a/classifyseqscommand.h b/classifyseqscommand.h index 0ffd18c..d19f862 100644 --- a/classifyseqscommand.h +++ b/classifyseqscommand.h @@ -48,7 +48,7 @@ private: Classify* classify; - string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir; + string fastaFileName, templateFileName, distanceFileName, namefile, search, method, taxonomyFileName, outputDir, groupfile; int processors, kmerSize, numWanted, cutoff, iters; float match, misMatch, gapOpen, gapExtend; bool abort, probs; diff --git a/distancecommand.cpp b/distancecommand.cpp index 10a99dd..da5d0f1 100644 --- a/distancecommand.cpp +++ b/distancecommand.cpp @@ -278,8 +278,7 @@ int DistanceCommand::execute(){ delete buf; int count = 0; - while (count < fileSize) { //read 1000 characters at a time - //send freqs + while (count < fileSize) { char buf2[1]; MPI_File_read(inMPI, buf2, 1, MPI_CHAR, &status); MPI_File_write(outMPI, buf2, 1, MPI_CHAR, &status); diff --git a/makefile b/makefile index 5812d45..d4aac7f 100644 --- a/makefile +++ b/makefile @@ -220,7 +220,7 @@ mothur : \ ./classify.o\ ./phylotree.o\ ./bayesian.o\ - ./rawtrainingdatamaker.o\ + ./phylosummary.o\ ./alignmentdb.o\ ./knn.o\ ./distancedb.o\ @@ -418,7 +418,7 @@ mothur : \ ./classify.o\ ./phylotree.o\ ./bayesian.o\ - ./rawtrainingdatamaker.o\ + ./phylosummary.o\ ./alignmentdb.o\ ./knn.o\ ./distancedb.o\ @@ -619,7 +619,7 @@ clean : ./classify.o\ ./phylotree.o\ ./bayesian.o\ - ./rawtrainingdatamaker.o\ + ./phylosummary.o\ ./alignmentdb.o\ ./knn.o\ ./distancedb.o\ @@ -1623,9 +1623,9 @@ install : mothur ./chimerabellerophoncommand.o : chimerabellerophoncommand.cpp $(CC) $(CC_OPTIONS) chimerabellerophoncommand.cpp -c $(INCLUDE) -o ./chimerabellerophoncommand.o -# Item # 171 -- rawtrainingdatamaker -- -./rawtrainingdatamaker.o : rawtrainingdatamaker.cpp - $(CC) $(CC_OPTIONS) rawtrainingdatamaker.cpp -c $(INCLUDE) -o ./rawtrainingdatamaker.o +# Item # 171 -- phylosummary -- +./phylosummary.o : phylosummary.cpp + $(CC) $(CC_OPTIONS) phylosummary.cpp -c $(INCLUDE) -o ./phylosummary.o diff --git a/phylosummary.cpp b/phylosummary.cpp new file mode 100644 index 0000000..1e8d1bc --- /dev/null +++ b/phylosummary.cpp @@ -0,0 +1,274 @@ +/* + * rawTrainingDataMaker.cpp + * Mothur + * + * Created by westcott on 4/21/10. + * Copyright 2010 Schloss Lab. All rights reserved. + * + */ + +#include "phylosummary.h" + +/**************************************************************************************************/ + +PhyloSummary::PhyloSummary(string refTfile, string groupFile){ + try { + m = MothurOut::getInstance(); + maxLevel = 0; + + if (groupFile != "") { + groupmap = new GroupMap(groupFile); + groupmap->readMap(); + }else{ + groupmap = NULL; + } + + //check for necessary files + string taxFileNameTest = refTfile.substr(0,refTfile.find_last_of(".")+1) + "tree.sum"; + ifstream FileTest(taxFileNameTest.c_str()); + + if (!FileTest) { + m->mothurOut("Error: can't find " + taxFileNameTest + "."); m->mothurOutEndLine(); exit(1); + }else{ + readTreeStruct(FileTest); + } + + tree[0].rank = "0"; + assignRank(0); + + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "PhyloSummary"); + exit(1); + } +} +/**************************************************************************************************/ + +void PhyloSummary::summarize(string userTfile){ + try { + + ifstream in; + openInputFile(userTfile, in); + + //read in users taxonomy file and add sequences to tree + string name, tax; + while(!in.eof()){ + in >> name >> tax; gobble(in); + + addSeqToTree(name, tax); + + if (m->control_pressed) { break; } + } + in.close(); + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "summarize"); + exit(1); + } +} + +/**************************************************************************************************/ + +string PhyloSummary::getNextTaxon(string& heirarchy){ + try { + string currentLevel = ""; + if(heirarchy != ""){ + int pos = heirarchy.find_first_of(';'); + currentLevel=heirarchy.substr(0,pos); + if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); } + else { heirarchy = ""; } + } + return currentLevel; + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "getNextTaxon"); + exit(1); + } +} + +/**************************************************************************************************/ + +int PhyloSummary::addSeqToTree(string seqName, string seqTaxonomy){ + try { + numSeqs++; + + map::iterator childPointer; + + int currentNode = 0; + string taxon; + + int level = 0; + + while (seqTaxonomy != "") { + + if (m->control_pressed) { return 0; } + + //somehow the parent is getting one too many accnos + //use print to reassign the taxa id + taxon = getNextTaxon(seqTaxonomy); + + childPointer = tree[currentNode].children.find(taxon); + + if(childPointer != tree[currentNode].children.end()){ //if the node already exists, update count and move on + if (groupmap != NULL) { + //find out the sequences group + string group = groupmap->getGroup(seqName); + + //do you have a count for this group? + map::iterator itGroup = tree[currentNode].groupCount.find(group); + + //if yes, increment it - there should not be a case where we can't find it since we load group in read + if (itGroup != tree[currentNode].groupCount.end()) { + tree[currentNode].groupCount[group]++; + } + } + + tree[currentNode].total++; + + currentNode = childPointer->second; + }else{ //otherwise, create it + m->mothurOut("Error: cannot find taxonomy in tree for " + seqName + "."); m->mothurOutEndLine(); + seqTaxonomy = ""; + } + + level++; + + if ((seqTaxonomy == "") && (level < maxLevel)) { //if you think you are done and you are not. + for (int k = level; k < maxLevel; k++) { seqTaxonomy += "unclassified;"; } + } + } + + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "addSeqToTree"); + exit(1); + } +} +/**************************************************************************************************/ + +void PhyloSummary::assignRank(int index){ + try { + map::iterator it; + int counter = 1; + + for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ + tree[it->second].rank = tree[index].rank + '.' + toString(counter); + counter++; + + assignRank(it->second); + } + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "assignRank"); + exit(1); + } +} +/**************************************************************************************************/ + +void PhyloSummary::print(ofstream& out){ + try { + //print labels + out << "taxlevel\t rank ID\t label\t daughterlevels\t total\t"; + if (groupmap != NULL) { + for (int i = 0; i < groupmap->namesOfGroups.size(); i++) { + out << groupmap->namesOfGroups[i] << '\t'; + } + } + + out << endl; + + //print root + out << tree[0].level << "\t" << tree[0].rank << "\t" << tree[0].name << "\t" << tree[0].children.size() << "\t" << tree[0].total << "\t"; + + map::iterator itGroup; + if (groupmap != NULL) { + for (itGroup = tree[0].groupCount.begin(); itGroup != tree[0].groupCount.end(); itGroup++) { + out << itGroup->second << '\t'; + } + } + out << endl; + + //print rest + print(0, out); + + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "print"); + exit(1); + } +} + +/**************************************************************************************************/ + +void PhyloSummary::print(int i, ofstream& out){ + try { + map::iterator it; + for(it=tree[i].children.begin();it!=tree[i].children.end();it++){ + + if (tree[it->second].total != 0) { + out << tree[it->second].level << "\t" << tree[it->second].rank << "\t" << tree[it->second].name << "\t" << tree[it->second].children.size() << "\t" << tree[it->second].total << "\t"; + + map::iterator itGroup; + if (groupmap != NULL) { + for (itGroup = tree[it->second].groupCount.begin(); itGroup != tree[it->second].groupCount.end(); itGroup++) { + out << itGroup->second << '\t'; + } + } + out << endl; + } + + print(it->second, out); + } + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "print"); + exit(1); + } +} +/**************************************************************************************************/ +void PhyloSummary::readTreeStruct(ifstream& in){ + try { + int num; + + in >> num; gobble(in); + + tree.resize(num); + + //read the tree file + for (int i = 0; i < tree.size(); i++) { + + in >> tree[i].level >> tree[i].name >> num; //num contains the number of children tree[i] has + + //set children + string childName; + int childIndex; + for (int j = 0; j < num; j++) { + in >> childName >> childIndex; + tree[i].children[childName] = childIndex; + } + + //initialize groupcounts + if (groupmap != NULL) { + for (int j = 0; j < groupmap->namesOfGroups.size(); j++) { + tree[i].groupCount[groupmap->namesOfGroups[j]] = 0; + } + } + + tree[i].total = 0; + + gobble(in); + + if (tree[i].level > maxLevel) { maxLevel = tree[i].level; } + } + + } + catch(exception& e) { + m->errorOut(e, "PhyloSummary", "print"); + exit(1); + } +} + +/**************************************************************************************************/ + + + diff --git a/rawtrainingdatamaker.h b/phylosummary.h similarity index 65% rename from rawtrainingdatamaker.h rename to phylosummary.h index 912d781..9ebdf81 100644 --- a/rawtrainingdatamaker.h +++ b/phylosummary.h @@ -12,6 +12,7 @@ #include "mothur.h" #include "mothurout.h" +#include "groupmap.h" /**************************************************************************************************/ @@ -19,31 +20,37 @@ struct rawTaxNode { map children; //childs name to index in tree int parent, level; string name, rank; + map groupCount; + int total; - rawTaxNode(string n) : name(n), level(0), parent(-1) { } + rawTaxNode(string n) : name(n), level(0), parent(-1), total(0) {} rawTaxNode(){} }; /**************************************************************************************************/ - -class RawTrainingDataMaker { +//doesn't use MPI ifdefs since only pid 0 uses this class +class PhyloSummary { public: - RawTrainingDataMaker(); - RawTrainingDataMaker(string); //pass it a taxonomy file and it makes the tree - ~RawTrainingDataMaker() {}; + PhyloSummary(string, string); + ~PhyloSummary() { if (groupmap != NULL) { delete groupmap; } } + + void summarize(string); //pass it a taxonomy file and a group file and it makes the tree int addSeqToTree(string, string); - void assignRank(int); void print(ofstream&); + int getMaxLevel() { return maxLevel; } private: string getNextTaxon(string&); vector tree; void print(int, ofstream&); + void assignRank(int); + void readTreeStruct(ifstream&); + GroupMap* groupmap; + int numNodes; int numSeqs; int maxLevel; - //map sanityCheck; MothurOut* m; }; diff --git a/phylotree.cpp b/phylotree.cpp index 33aedc6..8c48e4f 100644 --- a/phylotree.cpp +++ b/phylotree.cpp @@ -28,34 +28,78 @@ PhyloTree::PhyloTree(){ } /**************************************************************************************************/ -PhyloTree::PhyloTree(ifstream& in){ +PhyloTree::PhyloTree(ifstream& in, string filename){ try { m = MothurOut::getInstance(); calcTotals = false; - in >> numNodes; gobble(in); - - tree.resize(numNodes); - - for (int i = 0; i < tree.size(); i++) { - in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in); - - } - - //read genus nodes - int numGenus = 0; - in >> numGenus; gobble(in); - - int gnode, gsize; - totals.clear(); - for (int i = 0; i < numGenus; i++) { - in >> gnode >> gsize; gobble(in); + #ifdef USE_MPI + MPI_File inMPI; + MPI_Offset size; + MPI_Status status; + + char inFileName[1024]; + strcpy(inFileName, filename.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); + MPI_File_get_size(inMPI, &size); - uniqueTaxonomies[gnode] = gnode; - totals.push_back(gsize); - } + char* buffer = new char[size]; + MPI_File_read(inMPI, buffer, size, MPI_CHAR, &status); + + string tempBuf = buffer; + if (tempBuf.length() > size) { tempBuf = tempBuf.substr(0, size); } + istringstream iss (tempBuf,istringstream::in); + delete buffer; + + iss >> numNodes; gobble(iss); + + tree.resize(numNodes); + + for (int i = 0; i < tree.size(); i++) { + iss >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(iss); + } - in.close(); + //read genus nodes + int numGenus = 0; + iss >> numGenus; gobble(iss); + + int gnode, gsize; + totals.clear(); + for (int i = 0; i < numGenus; i++) { + iss >> gnode >> gsize; gobble(iss); + + uniqueTaxonomies[gnode] = gnode; + totals.push_back(gsize); + } + + MPI_File_close(&inMPI); + + #else + in >> numNodes; gobble(in); + + tree.resize(numNodes); + + for (int i = 0; i < tree.size(); i++) { + in >> tree[i].name >> tree[i].level >> tree[i].parent; gobble(in); + } + + //read genus nodes + int numGenus = 0; + in >> numGenus; gobble(in); + + int gnode, gsize; + totals.clear(); + for (int i = 0; i < numGenus; i++) { + in >> gnode >> gsize; gobble(in); + + uniqueTaxonomies[gnode] = gnode; + totals.push_back(gsize); + } + + in.close(); + + #endif } catch(exception& e) { @@ -74,20 +118,70 @@ PhyloTree::PhyloTree(string tfile){ tree[0].heirarchyID = "0"; maxLevel = 0; calcTotals = true; + string name, tax; + - ifstream in; - openInputFile(tfile, in); + #ifdef USE_MPI + int pid, num; + vector positions; + + MPI_Status status; + MPI_File inMPI; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + char inFileName[1024]; + strcpy(inFileName, tfile.c_str()); + + MPI_File_open(MPI_COMM_WORLD, inFileName, MPI_MODE_RDONLY, MPI_INFO_NULL, &inMPI); //comm, filename, mode, info, filepointer + + if (pid == 0) { + positions = setFilePosEachLine(tfile, num); + + //send file positions to all processes + MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //send numSeqs + MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //send file pos + }else{ + MPI_Bcast(&num, 1, MPI_INT, 0, MPI_COMM_WORLD); //get numSeqs + positions.resize(num); + MPI_Bcast(&positions[0], (num+1), MPI_LONG, 0, MPI_COMM_WORLD); //get file positions + } - //read in users taxonomy file and add sequences to tree - string name, tax; - while(!in.eof()){ - in >> name >> tax; gobble(in); + //read file + for(int i=0;i length) { tempBuf = tempBuf.substr(0, length); } + delete buf4; + + istringstream iss (tempBuf,istringstream::in); + iss >> name >> tax; + addSeqToTree(name, tax); + } - addSeqToTree(name, tax); - } - in.close(); - + MPI_File_close(&inMPI); + + #else + ifstream in; + openInputFile(tfile, in); + + //read in users taxonomy file and add sequences to tree + while(!in.eof()){ + in >> name >> tax; gobble(in); + + addSeqToTree(name, tax); + } + in.close(); + #endif + assignHeirarchyIDs(0); + + //create file for summary if needed + setUp(tfile); } catch(exception& e) { m->errorOut(e, "PhyloTree", "PhyloTree"); @@ -232,15 +326,49 @@ void PhyloTree::assignHeirarchyIDs(int index){ } } /**************************************************************************************************/ -void PhyloTree::binUnclassified(){ +void PhyloTree::setUp(string tfile){ + try{ + string taxFileNameTest = tfile.substr(0,tfile.find_last_of(".")+1) + "tree.sum"; + + #ifdef USE_MPI + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid == 0) { binUnclassified(taxFileNameTest); } + + #else + //create file needed for summary if it doesn't exist + ifstream FileTest(taxFileNameTest.c_str()); + + if (!FileTest) { + binUnclassified(taxFileNameTest); + } + #endif + } + catch(exception& e) { + m->errorOut(e, "PhyloTree", "setUp"); + exit(1); + } +} +/**************************************************************************************************/ +void PhyloTree::binUnclassified(string file){ try { + + ofstream out; + openOutputFile(file, out); + map::iterator itBin; map::iterator childPointer; + vector copy = tree; + int copyNodes = numNodes; + //go through the seqs and if a sequence finest taxon is not the same level as the most finely defined taxon then classify it as unclassified where necessary for (itBin = name2Taxonomy.begin(); itBin != name2Taxonomy.end(); itBin++) { - - int level = tree[itBin->second].level; + + if (m->control_pressed) { out.close(); break; } + + int level = copy[itBin->second].level; int currentNode = itBin->second; //this sequence is unclassified at some levels @@ -251,34 +379,30 @@ void PhyloTree::binUnclassified(){ string taxon = "unclassified"; //does the parent have a child names 'unclassified'? - childPointer = tree[currentNode].children.find(taxon); + childPointer = copy[currentNode].children.find(taxon); - if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on + if(childPointer != copy[currentNode].children.end()){ //if the node already exists, move on currentNode = childPointer->second; //currentNode becomes 'unclassified' - tree[currentNode].accessions.push_back(itBin->first); //add this seq - name2Taxonomy[itBin->first] = currentNode; + copy[currentNode].accessions.push_back(itBin->first); //add this seq } else{ //otherwise, create it - tree.push_back(TaxNode(taxon)); - numNodes++; - tree[currentNode].children[taxon] = numNodes-1; - tree[numNodes-1].parent = currentNode; + copy.push_back(TaxNode(taxon)); + copyNodes++; + copy[currentNode].children[taxon] = copyNodes-1; + copy[copyNodes-1].parent = currentNode; + copy[copyNodes-1].level = copy[currentNode].level + 1; - currentNode = tree[currentNode].children[taxon]; - tree[currentNode].accessions.push_back(itBin->first); - name2Taxonomy[itBin->first] = currentNode; + currentNode = copy[currentNode].children[taxon]; + copy[currentNode].accessions.push_back(itBin->first); } - - if (level == maxLevel) { uniqueTaxonomies[currentNode] = currentNode; } } } - //clear HeirarchyIDs and reset them - for (int i = 1; i < tree.size(); i++) { - tree[i].heirarchyID = ""; + if (!m->control_pressed) { + //print copy tree + print(out, copy); } - assignHeirarchyIDs(0); - + } catch(exception& e) { m->errorOut(e, "PhyloTree", "binUnclassified"); @@ -306,26 +430,22 @@ string PhyloTree::getFullTaxonomy(string seqName) { } /**************************************************************************************************/ -void PhyloTree::print(ofstream& out){ - try { - out << tree[0].level << '\t'<< tree[0].heirarchyID << '\t' << tree[0].name << '\t' << tree[0].children.size() << '\t' << tree[0].accessions.size() << endl; - print(0, out); - } - catch(exception& e) { - m->errorOut(e, "PhyloTree", "print"); - exit(1); - } -} - -/**************************************************************************************************/ - -void PhyloTree::print(int i, ofstream& out){ +void PhyloTree::print(ofstream& out, vector& copy){ try { - map::iterator it; - for(it=tree[i].children.begin();it!=tree[i].children.end();it++){ - out <second].level << '\t' << tree[it->second].heirarchyID << '\t' << tree[it->second].name << '\t' << tree[it->second].children.size() << '\t' << tree[it->second].accessions.size() << endl; - print(it->second, out); + out << copy.size() << endl; + + for (int i = 0; i < copy.size(); i++) { + + out << copy[i].level << '\t'<< copy[i].name << '\t' << copy[i].children.size() << '\t'; + + map::iterator it; + for(it=copy[i].children.begin();it!=copy[i].children.end();it++){ + out << it->first << '\t' << it->second << '\t'; + } + out << endl; } + + out.close(); } catch(exception& e) { m->errorOut(e, "PhyloTree", "print"); @@ -335,23 +455,37 @@ void PhyloTree::print(int i, ofstream& out){ /**************************************************************************************************/ void PhyloTree::printTreeNodes(string treefilename) { try { - ofstream outTree; - openOutputFile(treefilename, outTree); - - //print treenodes - outTree << tree.size() << endl; - for (int i = 0; i < tree.size(); i++) { - outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl; - } - - //print genus nodes - outTree << endl << uniqueTaxonomies.size() << endl; - map::iterator it2; - for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl; } - outTree << endl; + + #ifdef USE_MPI + int pid; + MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are + + if (pid == 0) { + #endif + + ofstream outTree; + openOutputFile(treefilename, outTree); + + //print treenodes + outTree << tree.size() << endl; + for (int i = 0; i < tree.size(); i++) { + outTree << tree[i].name << '\t' << tree[i].level << '\t' << tree[i].parent << endl; + } + + //print genus nodes + outTree << endl << uniqueTaxonomies.size() << endl; + map::iterator it2; + for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { outTree << it2->first << '\t' << tree[it2->first].accessions.size() << endl; } + outTree << endl; + + + outTree.close(); - outTree.close(); + #ifdef USE_MPI + } + #endif + } catch(exception& e) { diff --git a/phylotree.h b/phylotree.h index b65a651..a57cf25 100644 --- a/phylotree.h +++ b/phylotree.h @@ -32,15 +32,14 @@ class PhyloTree { public: PhyloTree(); PhyloTree(string); //pass it a taxonomy file and it makes the tree - PhyloTree(ifstream&); //pass it a taxonomy file and it makes the train.tree + PhyloTree(ifstream&, string); //pass it a taxonomy file and it makes the train.tree ~PhyloTree() {}; int addSeqToTree(string, string); void assignHeirarchyIDs(int); - void print(ofstream&); void printTreeNodes(string); //used by bayesian to save time vector getGenusNodes(); vector getGenusTotals(); - void binUnclassified(); + void setUp(string); //used to create file needed for summary file if you use () constructor and add seqs manually instead of passing taxonomyfile TaxNode get(int i) { return tree[i]; } TaxNode get(string seqName) { return tree[name2Taxonomy[seqName]]; } @@ -52,12 +51,15 @@ public: private: string getNextTaxon(string&); + void print(ofstream&, vector&); + void binUnclassified(string); + vector tree; vector genusIndex; //holds the indexes in tree where the genus level taxonomies are stored vector totals; //holds the numSeqs at each genus level taxonomy map name2Taxonomy; //maps name to index in tree map uniqueTaxonomies; //map of unique taxonomies - void print(int, ofstream&); + //void print(int, ofstream&); int numNodes; int numSeqs; int maxLevel; diff --git a/rawtrainingdatamaker.cpp b/rawtrainingdatamaker.cpp deleted file mode 100644 index 362cdb7..0000000 --- a/rawtrainingdatamaker.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * rawTrainingDataMaker.cpp - * Mothur - * - * Created by westcott on 4/21/10. - * Copyright 2010 Schloss Lab. All rights reserved. - * - */ - -#include "rawtrainingdatamaker.h" - -/**************************************************************************************************/ - -RawTrainingDataMaker::RawTrainingDataMaker(){ - try { - m = MothurOut::getInstance(); - numNodes = 1; - numSeqs = 0; - tree.push_back(rawTaxNode("Root")); - tree[0].rank = "Root"; - maxLevel = 0; - } - catch(exception& e) { - m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker"); - exit(1); - } -} -/**************************************************************************************************/ - -RawTrainingDataMaker::RawTrainingDataMaker(string tfile){ - try { - m = MothurOut::getInstance(); - numNodes = 1; - numSeqs = 0; - tree.push_back(rawTaxNode("Root")); - tree[0].rank = "Root"; - maxLevel = 0; - - ifstream in; - openInputFile(tfile, in); - - //read in users taxonomy file and add sequences to tree - string name, tax; - while(!in.eof()){ - in >> name >> tax; gobble(in); - - addSeqToTree(name, tax); - } - in.close(); - - assignRank(0); - } - catch(exception& e) { - m->errorOut(e, "RawTrainingDataMaker", "RawTrainingDataMaker"); - exit(1); - } -} - -/**************************************************************************************************/ - -string RawTrainingDataMaker::getNextTaxon(string& heirarchy){ - try { - string currentLevel = ""; - if(heirarchy != ""){ - int pos = heirarchy.find_first_of(';'); - currentLevel=heirarchy.substr(0,pos); - if (pos != (heirarchy.length()-1)) { heirarchy=heirarchy.substr(pos+1); } - else { heirarchy = ""; } - } - return currentLevel; - } - catch(exception& e) { - m->errorOut(e, "RawTrainingDataMaker", "getNextTaxon"); - exit(1); - } -} - -/**************************************************************************************************/ - -int RawTrainingDataMaker::addSeqToTree(string seqName, string seqTaxonomy){ - try { - numSeqs++; - - map::iterator childPointer; - - int currentNode = 0; - string taxon; - - while(seqTaxonomy != ""){ - - if (m->control_pressed) { return 0; } - - //somehow the parent is getting one too many accnos - //use print to reassign the taxa id - taxon = getNextTaxon(seqTaxonomy); - - childPointer = tree[currentNode].children.find(taxon); - - if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on - currentNode = childPointer->second; - }else{ //otherwise, create it - tree.push_back(rawTaxNode(taxon)); - numNodes++; - tree[currentNode].children[taxon] = numNodes-1; - tree[numNodes-1].parent = currentNode; - - currentNode = tree[currentNode].children[taxon]; - } - } - - } - catch(exception& e) { - m->errorOut(e, "RawTrainingDataMaker", "addSeqToTree"); - exit(1); - } -} -/**************************************************************************************************/ - -void RawTrainingDataMaker::assignRank(int index){ - try { - map::iterator it; - - string ranks[9] = { "Root","Domain","Kingdom","Phylum","Class","Order","Family","Genus","Species" }; - - for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ - tree[it->second].level = tree[index].level + 1; - - if (tree[it->second].level > 8) { - tree[it->second].rank = ("unknown" + toString(tree[it->second].level)); - }else { - tree[it->second].rank = ranks[tree[it->second].level]; - } - - //save maxLevel for binning the unclassified seqs - if (tree[it->second].level > maxLevel) { maxLevel = tree[it->second].level; } - - assignRank(it->second); - } - } - catch(exception& e) { - m->errorOut(e, "RawTrainingDataMaker", "assignRank"); - exit(1); - } -} -/**************************************************************************************************/ - -void RawTrainingDataMaker::print(ofstream& out){ - try { - //string temp = tree[0].name +" " + tree[0].rank; - //sanityCheck[temp] = temp; - - out << "0" << "*" << tree[0].name << "*" << tree[0].parent << "*" << tree[0].level << "*" << tree[0].rank << endl; - print(0, out); - - } - catch(exception& e) { - m->errorOut(e, "RawTrainingDataMaker", "print"); - exit(1); - } -} - -/**************************************************************************************************/ - -void RawTrainingDataMaker::print(int i, ofstream& out){ - try { - map::iterator it; - for(it=tree[i].children.begin();it!=tree[i].children.end();it++){ - //string temp = tree[it->second].name + " " + tree[it->second].rank; - - //map::iterator itSan; - //itSan = sanityCheck.find(temp); - - //if (itSan == sanityCheck.end()) { - out << it->second << "*" << tree[it->second].name << "*" << tree[it->second].parent << "*" << tree[it->second].level << "*" << tree[it->second].rank << endl; - //sanityCheck[temp] = temp; - //} - print(it->second, out); - } - } - catch(exception& e) { - m->errorOut(e, "RawTrainingDataMaker", "print"); - exit(1); - } -} -/**************************************************************************************************/ - - - -- 2.39.2