A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; };
A7861478109F5FA00010AAAF /* classifyseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */; };
A787810310A06D5D0086103D /* classify.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787810210A06D5D0086103D /* classify.cpp */; };
- A787811510A074330086103D /* phylotype.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787811410A074330086103D /* phylotype.cpp */; };
A78781BA10A0813E0086103D /* doTaxonomy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78781B810A0813D0086103D /* doTaxonomy.cpp */; };
A787821110A0AAE70086103D /* bayesian.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787821010A0AAE70086103D /* bayesian.cpp */; };
A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78782AA10A1B1CB0086103D /* alignmentdb.cpp */; };
A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classifyseqscommand.cpp; sourceTree = "<group>"; };
A78780FE10A06B8B0086103D /* classify.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classify.h; sourceTree = SOURCE_ROOT; };
A787810210A06D5D0086103D /* classify.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classify.cpp; sourceTree = SOURCE_ROOT; };
- A787811310A074330086103D /* phylotype.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylotype.h; sourceTree = SOURCE_ROOT; };
- A787811410A074330086103D /* phylotype.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylotype.cpp; sourceTree = SOURCE_ROOT; };
A78781B810A0813D0086103D /* doTaxonomy.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = doTaxonomy.cpp; sourceTree = "<group>"; };
A78781B910A0813D0086103D /* doTaxonomy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = doTaxonomy.h; sourceTree = "<group>"; };
A787820F10A0AAE70086103D /* bayesian.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bayesian.h; sourceTree = "<group>"; };
A78781B810A0813D0086103D /* doTaxonomy.cpp */,
A787844310A1EBDD0086103D /* knn.h */,
A787844410A1EBDD0086103D /* knn.cpp */,
- A787811310A074330086103D /* phylotype.h */,
- A787811410A074330086103D /* phylotype.cpp */,
);
name = classifiers;
sourceTree = "<group>";
A751ACEA1098B283003D0911 /* readcluster.cpp in Sources */,
A7861478109F5FA00010AAAF /* classifyseqscommand.cpp in Sources */,
A787810310A06D5D0086103D /* classify.cpp in Sources */,
- A787811510A074330086103D /* phylotype.cpp in Sources */,
A78781BA10A0813E0086103D /* doTaxonomy.cpp in Sources */,
A787821110A0AAE70086103D /* bayesian.cpp in Sources */,
A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */,
*/
#include "bayesian.h"
+#include "kmer.hpp"
/**************************************************************************************************/
-Bayesian::Bayesian(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) :
-Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch) {}
+Bayesian::Bayesian(string tfile, string tempFile, string method, int ksize, int cutoff) :
+Classify(tfile, tempFile, method, ksize, 0, 0, 0, 0), kmerSize(ksize), confidenceThreshold(cutoff) {
+ try {
+
+ numKmers = database->getMaxKmer() + 1;
+
+ //initialze probabilities
+ wordGenusProb.resize(numKmers);
+
+ genusNodes = phyloTree->getGenusNodes();
+
+ for (int j = 0; j < wordGenusProb.size(); j++) { wordGenusProb[j].resize(genusNodes.size()); }
+
+ //reset counts because we are on a new word
+ for (int j = 0; j < genusNodes.size(); j++) {
+ TaxNode temp = phyloTree->get(genusNodes[j]);
+ genusTotals.push_back(temp.accessions.size());
+ }
+
+
+ /************calculate the probablity that each word will be in a specific taxonomy*************/
+ ofstream out;
+ string probFileName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.prob";
+ ifstream probFileTest(probFileName.c_str());
+
+ ofstream out2;
+ string probFileName2 = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer.numNonZero";
+ ifstream probFileTest2(probFileName2.c_str());
+
+ int start = time(NULL);
+
+ if(probFileTest && probFileTest2){
+ mothurOut("Reading template probabilities... "); cout.flush();
+ readProbFile(probFileTest, probFileTest2);
+ }else{
+ mothurOut("Calculating template probabilities... "); cout.flush();
+
+ ofstream out;
+ openOutputFile(probFileName, out);
+
+ ofstream out2;
+ openOutputFile(probFileName2, out2);
+
+ //for each word
+ for (int i = 0; i < numKmers; i++) {
+
+ out << i << '\t';
+
+ vector<int> seqsWithWordi = database->getSequencesWithKmer(i);
+
+ map<int, int> count;
+ for (int k = 0; k < genusNodes.size(); k++) { count[genusNodes[k]] = 0; }
+
+ //for each sequence with that word
+ for (int j = 0; j < seqsWithWordi.size(); j++) {
+ int temp = phyloTree->getIndex(names[seqsWithWordi[j]]);
+ count[temp]++; //increment count of seq in this genus who have this word
+ }
+
+ //probabilityInTemplate = (# of seqs with that word in template + 0.05) / (total number of seqs in template + 1);
+ float probabilityInTemplate = (seqsWithWordi.size() + 0.50) / (float) (names.size() + 1);
+
+ int numNotZero = 0;
+ for (int k = 0; k < genusNodes.size(); k++) {
+ //probabilityInThisTaxonomy = (# of seqs with that word in this taxonomy + probabilityInTemplate) / (total number of seqs in this taxonomy + 1);
+ wordGenusProb[i][k] = log((count[genusNodes[k]] + probabilityInTemplate) / (float) (genusTotals[k] + 1));
+ if (count[genusNodes[k]] != 0) { out << k << '\t' << wordGenusProb[i][k] << '\t'; numNotZero++; }
+ }
+ out << endl;
+ out2 << probabilityInTemplate << '\t' << numNotZero << endl;
+ }
+
+ out.close();
+ out2.close();
+ }
+
+
+ mothurOut("DONE."); mothurOutEndLine();
+ mothurOut("It took " + toString(time(NULL) - start) + " seconds get probabilities. "); mothurOutEndLine();
+ }
+ catch(exception& e) {
+ errorOut(e, "Bayesian", "getTaxonomy");
+ exit(1);
+ }
+}
/**************************************************************************************************/
string Bayesian::getTaxonomy(Sequence* seq) {
try {
string tax;
+ Kmer kmer(kmerSize);
+ //get words contained in query
+ //getKmerString returns a string where the index in the string is hte kmer number
+ //and the character at that index can be converted to be the number of times that kmer was seen
+ string queryKmerString = kmer.getKmerString(seq->getUnaligned());
+ vector<int> queryKmers;
+ for (int i = 0; i < queryKmerString.length(); i++) {
+ if (queryKmerString[i] != '!') { //this kmer is in the query
+ queryKmers.push_back(i);
+ }
+ }
+
+ int index = getMostProbableTaxonomy(queryKmers);
+
+ //bootstrap - to set confidenceScore
+ int numToSelect = queryKmers.size() / 8;
+ tax = bootstrapResults(queryKmers, index, numToSelect);
return tax;
}
}
}
/**************************************************************************************************/
+string Bayesian::bootstrapResults(vector<int> kmers, int tax, int numToSelect) {
+ try {
+
+ //taxConfidenceScore.clear(); //clear out previous seqs scores
+
+ vector< map<string, int> > confidenceScores; //you need the added vector level of confusion to account for the level that name is seen since they can be the same
+ //map of classification to confidence for all areas seen
+ //ie. Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea;
+ //ie. Bacteria -> 100, Alphaproteobacteria -> 100, Rhizobiales -> 87, Azorhizobium_et_rel. -> 78, Methylobacterium_et_rel. -> 70, Bosea -> 50
+ confidenceScores.resize(100); //if you have more than 100 levels of classification...
+
+ map<string, int>::iterator itBoot;
+ map<string, int>::iterator itBoot2;
+ map<int, int>::iterator itConvert;
+
+ for (int i = 0; i < 100; i++) {
+ vector<int> temp;
+
+ for (int j = 0; j < numToSelect; j++) {
+ int index = int(rand() % kmers.size());
+
+ //add word to temp
+ temp.push_back(kmers[index]);
+ }
+
+ //get taxonomy
+ int newTax = getMostProbableTaxonomy(temp);
+ TaxNode taxonomy = phyloTree->get(newTax);
+
+ //add to confidence results
+ while (taxonomy.level != 0) { //while you are not at the root
+
+ itBoot2 = confidenceScores[taxonomy.level].find(taxonomy.name); //is this a classification we already have a count on
+
+ if (itBoot2 == confidenceScores[taxonomy.level].end()) { //not already in confidence scores
+ confidenceScores[taxonomy.level][taxonomy.name] = 1;
+ }else{
+ confidenceScores[taxonomy.level][taxonomy.name]++;
+ }
+
+ taxonomy = phyloTree->get(taxonomy.parent);
+ }
+ }
+
+ string confidenceTax = "";
+ simpleTax = "";
+ TaxNode seqTax = phyloTree->get(tax);
+
+ while (seqTax.level != 0) { //while you are not at the root
+
+ itBoot2 = confidenceScores[seqTax.level].find(seqTax.name); //is this a classification we already have a count on
+
+ int confidence = 0;
+ if (itBoot2 != confidenceScores[seqTax.level].end()) { //not already in confidence scores
+ confidence = confidenceScores[seqTax.level][seqTax.name];
+ }
+
+ if (confidence >= confidenceThreshold) {
+ confidenceTax = seqTax.name + "(" + toString(confidence) + ");" + confidenceTax;
+ simpleTax = seqTax.name + ";" + simpleTax;
+ }
+
+ seqTax = phyloTree->get(seqTax.parent);
+ }
+
+ return confidenceTax;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Bayesian", "bootstrapResults");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+int Bayesian::getMostProbableTaxonomy(vector<int> queryKmer) {
+ try {
+ int indexofGenus;
+
+ double maxProbability = -1000000.0;
+ //find taxonomy with highest probability that this sequence is from it
+ for (int k = 0; k < genusNodes.size(); k++) {
+
+ //for each taxonomy calc its probability
+ double prob = 1.0;
+ for (int i = 0; i < queryKmer.size(); i++) {
+ prob += wordGenusProb[queryKmer[i]][k];
+ }
+
+ //is this the taxonomy with the greatest probability?
+ if (prob > maxProbability) {
+ indexofGenus = genusNodes[k];
+ maxProbability = prob;
+ }
+ }
+
+ return indexofGenus;
+ }
+ catch(exception& e) {
+ errorOut(e, "Bayesian", "getMostProbableTaxonomy");
+ exit(1);
+ }
+}
+/*************************************************************************************************
+map<string, int> Bayesian::parseTaxMap(string newTax) {
+ try{
+
+ map<string, int> parsed;
+
+ newTax = newTax.substr(0, newTax.length()-1); //get rid of last ';'
+
+ //parse taxonomy
+ string individual;
+ while (newTax.find_first_of(';') != -1) {
+ individual = newTax.substr(0,newTax.find_first_of(';'));
+ newTax = newTax.substr(newTax.find_first_of(';')+1, newTax.length());
+ parsed[individual] = 1;
+ }
+
+ //get last one
+ parsed[newTax] = 1;
+
+ return parsed;
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Bayesian", "parseTax");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+void Bayesian::readProbFile(ifstream& in, ifstream& inNum) {
+ try{
+
+ int kmer, name, count; count = 0;
+ vector<int> num; num.resize(numKmers);
+ float prob;
+ vector<float> zeroCountProb; zeroCountProb.resize(numKmers);
+
+ while (inNum) {
+ inNum >> zeroCountProb[count] >> num[count];
+ count++;
+ gobble(inNum);
+ }
+ inNum.close();
+
+ while(in) {
+ in >> kmer;
+
+ //set them all to zero value
+ for (int i = 0; i < genusNodes.size(); i++) {
+ wordGenusProb[kmer][i] = log(zeroCountProb[kmer] / (float) (genusTotals[i]+1));
+ }
+
+ //get probs for nonzero values
+ for (int i = 0; i < num[kmer]; i++) {
+ in >> name >> prob;
+ wordGenusProb[kmer][name] = prob;
+ }
+
+ gobble(in);
+ }
+ in.close();
+ }
+ catch(exception& e) {
+ errorOut(e, "Bayesian", "readProbFile");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+
+
+
+
class Bayesian : public Classify {
public:
- Bayesian(string, string, string, int, int, int, int, int);
+ Bayesian(string, string, string, int, int);
~Bayesian() {};
string getTaxonomy(Sequence*);
-
+ //map<string, int> getConfidenceScores() { return taxConfidenceScore; }
private:
+ //map<string, vector<double> > taxonomyProbability; //probability of a word being in a given taxonomy.
+ //taxonomyProbability[bacteria;][0] = probabtility that a sequence within bacteria; would contain kmer 0;
+ //taxonomyProbability[bacteria;][1] = probabtility that a sequence within bacteria; would contain kmer 1...
+
+ vector< vector<float> > wordGenusProb; //vector of maps from genus to probability
+ //wordGenusProb[0][392] = probability that a sequence within genus that's index in the tree is 392 would contain kmer 0;
+
+ vector<int> genusTotals;
+ vector<int> genusNodes; //indexes in phyloTree where genus' are located
+
+ int kmerSize, numKmers, confidenceThreshold;
+
+ string bootstrapResults(vector<int>, int, int);
+ int getMostProbableTaxonomy(vector<int>);
+ void readProbFile(ifstream&, ifstream&);
+ //map<string, int> parseTaxMap(string);
};
Classify::Classify(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) : taxFile(tfile), templateFile(tempFile) {
try {
+ readTaxonomy(taxFile);
- readTaxonomy(taxFile);
-
+ int start = time(NULL);
int numSeqs = 0;
//need to know number of template seqs for suffixdb
if (method == "suffix") {
if (needToGenerate) {
ifstream fastaFile;
openInputFile(tempFile, fastaFile);
-
+
while (!fastaFile.eof()) {
Sequence temp(fastaFile);
gobble(fastaFile);
names.push_back(temp.getName());
+
database->addSequence(temp);
}
fastaFile.close();
ifstream fastaFile;
openInputFile(tempFile, fastaFile);
-
+
while (!fastaFile.eof()) {
Sequence temp(fastaFile);
gobble(fastaFile);
database->setNumSeqs(names.size());
mothurOut("DONE."); mothurOutEndLine();
+ mothurOut("It took " + toString(time(NULL) - start) + " seconds generate search database. "); mothurOutEndLine();
}
catch(exception& e) {
void Classify::readTaxonomy(string file) {
try {
-
+
+ phyloTree = new PhyloTree();
+
ifstream inTax;
openInputFile(file, inTax);
inTax >> name >> taxInfo;
taxonomy[name] = taxInfo;
+
+ //itTax = taxList.find(taxInfo);
+ //if (itTax == taxList.end()) { //this is new taxonomy
+ //taxList[taxInfo] = 1;
+ //}else { taxList[taxInfo]++; }
+ phyloTree->addSeqToTree(name, taxInfo);
gobble(inTax);
}
+
+ phyloTree->assignHeirarchyIDs(0);
inTax.close();
mothurOut("DONE.");
}
/**************************************************************************************************/
+vector<string> Classify::parseTax(string tax) {
+ try {
+ vector<string> taxons;
+
+ tax = tax.substr(0, tax.length()-1); //get rid of last ';'
+
+ //parse taxonomy
+ string individual;
+ while (tax.find_first_of(';') != -1) {
+ individual = tax.substr(0,tax.find_first_of(';'));
+ tax = tax.substr(tax.find_first_of(';')+1, tax.length());
+ taxons.push_back(individual);
+
+ }
+ //get last one
+ taxons.push_back(tax);
+
+ return taxons;
+ }
+ catch(exception& e) {
+ errorOut(e, "Classify", "parseTax");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
#include "mothur.h"
#include "database.hpp"
-
+#include "doTaxonomy.h"
class Sequence;
+
/**************************************************************************************************/
class Classify {
public:
Classify(string, string, string, int, int, int, int, int);
- Classify(){};
+ Classify(){ delete phyloTree; }
virtual ~Classify(){};
virtual string getTaxonomy(Sequence*) = 0;
+ //virtual map<string, int> getConfidenceScores() { return taxConfidenceScore; }
+ //virtual vector<string> parseTax(string);
+ virtual string getSimpleTax() { return simpleTax; }
protected:
map<string, string> taxonomy; //name maps to taxonomy
+ //map<string, int> genusCount; //maps genus to count - in essence a list of how many seqs are in each taxonomy
+ map<string, int>::iterator itTax;
map<string, string>::iterator it;
Database* database;
+ PhyloTree* phyloTree;
- string taxFile, templateFile;
+ string taxFile, templateFile, simpleTax;
vector<string> names;
+ //map<string, int> taxConfidenceScore;
void readTaxonomy(string);
-
+ vector<string> parseTax(string);
};
/**************************************************************************************************/
#include "classifyseqscommand.h"
#include "sequence.hpp"
-#include "phylotype.h"
#include "bayesian.h"
#include "doTaxonomy.h"
#include "knn.h"
else {
//valid paramters for this command
- string AlignArray[] = {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted"};
+ string AlignArray[] = {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted","cutoff"};
vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
OptionParser parser(option);
search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; }
- method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "phylotype"; }
+ method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "bayesian"; }
temp = validParameter.validFile(parameters, "match", false); if (temp == "not found"){ temp = "1.0"; }
convert(temp, match);
temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found"){ temp = "10"; }
convert(temp, numWanted);
+
+ temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found"){ temp = "0"; }
+ convert(temp, cutoff);
+
+ if ((method == "bayesian") && (search != "kmer")) {
+ mothurOut("The bayesian method requires the kmer search." + search + "will be disregarded." ); mothurOutEndLine();
+ search = "kmer";
+ }
}
}
mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend and numwanted.\n");
mothurOut("The template, fasta and taxonomy parameters are required.\n");
mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n");
- mothurOut("The method parameter allows you to specify classification method to use. Your options are: phylotype, bayesian and knn. The default is phylotype.\n");
+ mothurOut("The method parameter allows you to specify classification method to use. Your options are: bayesian and knn. The default is bayesian.\n");
mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n");
mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0.\n");
mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method. The default is 10.\n");
+ mothurOut("The cutoff parameter allows you to specify a bootstrap confidence threshold for your taxonomy. The default is 0.\n");
mothurOut("The classify.seqs command should be in the following format: \n");
mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
- mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=phylotype, search=gotoh, ksize=8, processors=2)\n");
+ mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=knn, search=gotoh, ksize=8, processors=2)\n");
mothurOut("The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n");
mothurOut("The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n");
mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
try {
if (abort == true) { return 0; }
- int start = time(NULL);
- if(method == "phylotype"){ classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch); }
- //else if(method == "bayesian") { classify = new Bayesian(taxonomyFileName, templateFileName, search); }
+
+ if(method == "bayesian") { classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff); }
else if(method == "knn") { classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted); }
else {
- mothurOut(search + " is not a valid method option. I will run the command using phylotype.");
+ mothurOut(search + " is not a valid method option. I will run the command using bayesian.");
mothurOutEndLine();
- classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch);
+ classify = new Bayesian(taxonomyFileName, templateFileName, search, kmerSize, cutoff);
}
int numFastaSeqs = 0;
string newTaxonomyFile = getRootName(fastaFileName) + "taxonomy";
+ string tempTaxonomyFile = getRootName(fastaFileName) + "taxonomy.temp";
string taxSummary = getRootName(fastaFileName) + "tax.summary";
+ int start = time(NULL);
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
if(processors == 1){
ifstream inFASTA;
lines.push_back(new linePair(0, numFastaSeqs));
- driver(lines[0], newTaxonomyFile);
+ driver(lines[0], newTaxonomyFile, tempTaxonomyFile);
}
else{
}
lines.push_back(new linePair(startPos, numSeqsPerProcessor));
}
- createProcesses(newTaxonomyFile);
+ createProcesses(newTaxonomyFile, tempTaxonomyFile);
rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str());
+ rename((tempTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), tempTaxonomyFile.c_str());
for(int i=1;i<processors;i++){
appendTaxFiles((newTaxonomyFile + toString(processIDS[i]) + ".temp"), newTaxonomyFile);
+ appendTaxFiles((tempTaxonomyFile + toString(processIDS[i]) + ".temp"), tempTaxonomyFile);
remove((newTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
+ remove((tempTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
}
}
lines.push_back(new linePair(0, numFastaSeqs));
- driver(lines[0], newTaxonomyFile);
+ driver(lines[0], newTaxonomyFile, tempTaxonomyFile);
#endif
delete classify;
//make taxonomy tree from new taxonomy file
ifstream inTaxonomy;
- openInputFile(newTaxonomyFile, inTaxonomy);
+ openInputFile(tempTaxonomyFile, inTaxonomy);
string accession, taxaList;
PhyloTree taxaBrowser;
gobble(inTaxonomy);
}
inTaxonomy.close();
+ remove(tempTaxonomyFile.c_str());
taxaBrowser.assignHeirarchyIDs(0);
}
/**************************************************************************************************/
-void ClassifySeqsCommand::createProcesses(string taxFileName) {
+void ClassifySeqsCommand::createProcesses(string taxFileName, string tempTaxFile) {
try {
#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
int process = 0;
processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
process++;
}else if (pid == 0){
- driver(lines[process], taxFileName + toString(getpid()) + ".temp");
+ driver(lines[process], taxFileName + toString(getpid()) + ".temp", tempTaxFile + toString(getpid()) + ".temp");
exit(0);
}else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
}
//**********************************************************************************************************************
-int ClassifySeqsCommand::driver(linePair* line, string taxFName){
+int ClassifySeqsCommand::driver(linePair* line, string taxFName, string tempTFName){
try {
ofstream outTax;
openOutputFile(taxFName, outTax);
+
+ ofstream outTaxSimple;
+ openOutputFile(tempTFName, outTaxSimple);
ifstream inFASTA;
openInputFile(fastaFileName, inFASTA);
taxonomy = classify->getTaxonomy(candidateSeq);
if (taxonomy != "bad seq") {
- outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+ //if (method != "bayesian") {
+ outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+ outTaxSimple << candidateSeq->getName() << '\t' << classify->getSimpleTax() << endl;
+ //}else{
+ //vector<string> pTax = classify->parseTax(taxonomy);
+ //map<string, int> confidence = classify->getConfidenceScores();
+
+ //outTax << candidateSeq->getName() << '\t';
+ //for (int j = 0; j < pTax.size(); j++) {
+ //if (confidence[pTax[j]] > cutoff) {
+ // outTax << pTax[j] << "(" << confidence[pTax[j]] << ");";
+ //}else{ break; }
+ //}
+ //outTax << endl;
+ //}
}
delete candidateSeq;
+
+ if(i % 100 == 0){
+ mothurOut("Classifying sequence " + toString(i)); mothurOutEndLine();
+ }
}
inFASTA.close();
outTax.close();
+ outTaxSimple.close();
return 1;
}
Classify* classify;
string fastaFileName, templateFileName, distanceFileName, search, method, taxonomyFileName;
- int processors, kmerSize, numWanted;
+ int processors, kmerSize, numWanted, cutoff;
float match, misMatch, gapOpen, gapExtend;
bool abort;
- int driver(linePair*, string);
+ int driver(linePair*, string, string);
void appendTaxFiles(string, string);
- void createProcesses(string);
+ void createProcesses(string, string);
};
#endif
for(int i=0;i<displays.size();i++){
ccd->registerDisplay(displays[i]); //adds a display[i] to cdd
- if (displays[i]->isCalcMultiple() == true) { displays[i]->init(groupLabelAll); }
+ if ((displays[i]->isCalcMultiple() == true) && (displays[i]->getAll() == true)) { displays[i]->init(groupLabelAll); }
else { displays[i]->init(groupLabel); }
}
public:
CollectDisplay(Calculator* calc, FileOutput* file) : estimate(calc), output(file) {timesCalled = 0;};
~CollectDisplay() { delete estimate; delete output; }
+
+
void update(SAbundVector* rank){
nSeqs=rank->getNumSeqs();
data = estimate->getValues(rank);
n++;
}
- if (estimate->getMultiple() == true) {
+ if ((estimate->getMultiple() == true) && all) {
numGroupComb++;
groupData.resize((numGroupComb*data.size()), 0);
//is this the time its called with all values
void init(string s) { output->initFile(s); };
void reset() { output->resetFile(); };
void close() { output->resetFile(); };
+ void setAll(bool a) { all = a; }
+ bool getAll() { return all; }
+
bool isCalcMultiple() { return estimate->getMultiple(); }
string getName() { return estimate->getName(); }
-
+
+
private:
Calculator* estimate;
int nSeqs, timesCalled, numGroupComb;
vector<double> data;
vector<double> groupData;
+ bool all;
+
};
/***********************************************************************/
void notifyDisplays(){
for(set<Display*>::iterator pos=displays.begin();pos!=displays.end();pos++){
//cout << (*pos)->getName() << endl;
- if ( ((*pos)->isCalcMultiple() == true) || ( ((*pos)->isCalcMultiple() == false) && (shared.size() == 2) ) ) {
+ if ( ( ((*pos)->isCalcMultiple() == true) && ((*pos)->getAll() == true) ) || (shared.size() == 2) ) {
(*pos)->update(shared, NumSeqs, NumGroupComb);
}
}
else {
//valid paramters for this command
- string Array[] = {"freq","label","calc","groups"};
+ string Array[] = {"freq","label","calc","groups","all"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
string temp;
temp = validParameter.validFile(parameters, "freq", false); if (temp == "not found") { temp = "100"; }
convert(temp, freq);
+
+ temp = validParameter.validFile(parameters, "all", false); if (temp == "not found") { temp = "true"; }
+ all = isTrue(temp);
if (abort == false) {
mothurOut("The default value for groups is all the groups in your groupfile.\n");
validCalculator->printCalc("shared", cout);
mothurOut("The label parameter is used to analyze specific labels in your input.\n");
+ mothurOut("The all parameter is used to specify if you want the estimate of all your groups together. This estimate can only be made for sharedsobs and sharedchao calculators. The default is true.\n");
+ mothurOut("If you use sharedchao and run into memory issues, set all to false. \n");
mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n");
mothurOut("Note: No spaces between parameter labels (i.e. list), '=' and parameters (i.e.yourListfile).\n\n");
//if the users entered no valid calculators don't execute command
if (cDisplays.size() == 0) { return 0; }
+ for(int i=0;i<cDisplays.size();i++){ cDisplays[i]->setAll(all); }
read = new ReadOTUFile(globaldata->inputFileName);
read->read(&*globaldata);
int freq;
string format;
- bool abort, allLines;
+ bool abort, allLines, all;
set<string> labels; //holds labels to be used
string label, calc, groups;
vector<string> Estimators, Groups;
virtual int getLongestBase();
virtual void readKmerDB(ifstream&){};
virtual void setNumSeqs(int i) { numSeqs = i; }
+ virtual vector<int> getSequencesWithKmer(int){ vector<int> filler; return filler; };
+ virtual int getMaxKmer(){ return 1; };
+
protected:
int numSeqs, longest;
virtual void reset() = 0;
virtual void close() = 0;
virtual bool isCalcMultiple() = 0;
+ virtual void setAll(bool){}
+ virtual bool getAll() { bool a; return a; }
virtual string getName() { return ""; };
virtual ~Display() {}
if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on
currentNode = childPointer->second;
tree[currentNode].accessions.push_back(seqName);
+ name2Taxonomy[seqName] = currentNode;
}
else{ //otherwise, create it
tree.push_back(TaxNode(taxon));
numNodes++;
tree[currentNode].children[taxon] = numNodes-1;
+ tree[numNodes-1].parent = currentNode;
// int numChildren = tree[currentNode].children.size();
// string heirarchyID = tree[currentNode].heirarchyID;
currentNode = tree[currentNode].children[taxon];
tree[currentNode].accessions.push_back(seqName);
-
+ name2Taxonomy[seqName] = currentNode;
// tree[currentNode].level = level;
// tree[currentNode].childNumber = numChildren;
// tree[currentNode].heirarchyID = heirarchyID + '.' + toString(tree[currentNode].childNumber);
}
-
+
+ if (seqTaxonomy == "") { uniqueTaxonomies[currentNode] = currentNode; }
+ }
+}
+/**************************************************************************************************/
+vector<int> PhyloTree::getGenusNodes() {
+ try {
+ genusIndex.clear();
+ //generate genusIndexes
+ map<int, int>::iterator it2;
+ for (it2=uniqueTaxonomies.begin(); it2!=uniqueTaxonomies.end(); it2++) { genusIndex.push_back(it2->first); }
+
+ return genusIndex;
+ }
+ catch(exception& e) {
+ errorOut(e, "PhyloTree", "getGenusNodes");
+ exit(1);
}
}
-
/**************************************************************************************************/
void PhyloTree::assignHeirarchyIDs(int index){
counter++;
tree[it->second].level = tree[index].level + 1;
assignHeirarchyIDs(it->second);
-
}
-
}
/**************************************************************************************************/
+#ifndef DOTAXONOMY_H
+#define DOTAXONOMY_H
+
/*
* doTaxonomy.h
*
/**************************************************************************************************/
struct TaxNode {
- vector<string> accessions;
- map<string, int> children;
+ vector<string> accessions; //names of seqs in this branch of tree
+ map<string, int> children; //childs name to index in tree
int parent, childNumber, level;
string name, heirarchyID;
void addSeqToTree(string, string);
void assignHeirarchyIDs(int);
void print(ofstream&);
+ vector<int> getGenusNodes();
+ TaxNode get(int i) { return tree[i]; }
+ TaxNode get(string seqName) { return tree[name2Taxonomy[seqName]]; }
+ int getIndex(string seqName) { return name2Taxonomy[seqName]; }
+ string getName(int i) { return tree[i].name; }
private:
string getNextTaxon(string&);
vector<TaxNode> tree;
+ vector<int> genusIndex; //holds the indexes in tree where the genus level taxonomies are stored
+ map<string, int> name2Taxonomy; //maps name to index in tree
+ map<int, int> uniqueTaxonomies; //map of unique taxonomies
void print(int, ofstream&);
int numNodes;
int numSeqs;
/**************************************************************************************************/
+#endif
+
+
if (namefile == "not open") { abort = true; }
else if (namefile == "not found") { namefile = ""; }
- if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a cluster command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
- else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a cluster command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
+ if ((phylipfile == "") && (columnfile == "")) { mothurOut("When executing a hcluster command you must enter a phylip or a column."); mothurOutEndLine(); abort = true; }
+ else if ((phylipfile != "") && (columnfile != "")) { mothurOut("When executing a hcluster command you must enter ONLY ONE of the following: phylip or column."); mothurOutEndLine(); abort = true; }
if (columnfile != "") {
if (namefile == "") { cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }
void HClusterCommand::help(){
try {
- mothurOut("The cluster command can only be executed after a successful read.dist command.\n");
- mothurOut("The cluster command parameter options are method, cuttoff, precision, showabund and timing. No parameters are required.\n");
- mothurOut("The cluster command should be in the following format: \n");
- mothurOut("cluster(method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
- mothurOut("The acceptable cluster methods are furthest, nearest and average. If no method is provided then furthest is assumed.\n\n");
+ mothurOut("The hcluster command parameter options are cutoff, precision, method, showabund, timing, phylip, column, name and sorted. Phylip or column and name are required.\n");
+ mothurOut("The phylip and column parameter allow you to enter your distance file, and sorted indicates whether your column distance file is already sorted. \n");
+ mothurOut("The name parameter allows you to enter your name file and is required if your distance file is in column format. \n");
+ mothurOut("The hcluster command should be in the following format: \n");
+ mothurOut("hcluster(column=youDistanceFile, name=yourNameFile, method=yourMethod, cutoff=yourCutoff, precision=yourPrecision) \n");
+ mothurOut("The acceptable hcluster methods is furthest, but we hope to add nearest and average in the future.\n\n");
}
catch(exception& e) {
errorOut(e, "HClusterCommand", "help");
}
/**************************************************************************************************/
+int KmerDB::getCount(int kmer) {
+ try {
+ if (kmer < 0) { return 0; } //if user gives negative number
+ else if (kmer > maxKmer) { return 0; } //or a kmer that is bigger than maxkmer
+ else { return kmerLocations[kmer].size(); } // kmer is in vector range
+ }
+ catch(exception& e) {
+ errorOut(e, "KmerDB", "getCount");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+vector<int> KmerDB::getSequencesWithKmer(int kmer) {
+ try {
+
+ vector<int> seqs;
+
+ if (kmer < 0) { } //if user gives negative number
+ else if (kmer > maxKmer) { } //or a kmer that is bigger than maxkmer
+ else { seqs = kmerLocations[kmer]; }
+
+ return seqs;
+ }
+ catch(exception& e) {
+ errorOut(e, "KmerDB", "getSequencesWithKmer");
+ exit(1);
+ }
+}
+
+
+/**************************************************************************************************/
+
+
void addSequence(Sequence);
vector<int> findClosestSequences(Sequence*, int);
void readKmerDB(ifstream&);
+ int getCount(int); //returns number of sequences with that kmer number
+ vector<int> getSequencesWithKmer(int); //returns vector of sequences that contain kmer passed in
+ int getMaxKmer() { return maxKmer; }
private:
string tax = taxonomy[closest[i]]; //we know its there since we checked in getTaxonomy
- tax = tax.substr(0, tax.length()-1); //get rid of last ';'
-
- //parse taxonomy
- string individual;
- while (tax.find_first_of(';') != -1) {
- individual = tax.substr(0,tax.find_first_of(';'));
- tax = tax.substr(tax.find_first_of(';')+1, tax.length());
- taxons[i].push_back(individual);
-
- }
- //get last one
- taxons[i].push_back(tax);
+ taxons[i] = parseTax(tax);
//figure out who has the shortest taxonomy info. so you can start comparing there
if (taxons[i].size() < smallest) {
int rcoef;
IntNode* left;
IntNode* right;
+
+ IntNode(int lv, int rv, IntNode* l, IntNode* r) : lvalue(lv), rvalue(rv), left(l), right(r) {};
+ IntNode() {};
};
struct ThreadNode {
+++ /dev/null
-/*
- * phylotype.cpp
- * Mothur
- *
- * Created by westcott on 11/3/09.
- * Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-#include "phylotype.h"
-
-/**************************************************************************************************/
-PhyloType::PhyloType(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch)
-: Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch) {}
-/**************************************************************************************************/
-string PhyloType::getTaxonomy(Sequence* seq) {
- try {
- string tax;
-
- //use database to find closest seq
- vector<int> closest = database->findClosestSequences(seq, 1);
-
- //find that sequences taxonomy in map
- it = taxonomy.find(names[closest[0]]);
-
- //is this sequence in the taxonomy file
- if (it == taxonomy.end()) { //error not in file
- mothurOut("Error: sequence " + names[closest[0]] + " is not in the taxonomy file. It is the closest match to sequence " + seq->getName() + ". " + seq->getName() + " will be disregarded."); mothurOutEndLine();
- tax = "bad seq";
- }else {
- tax = it->second;
- }
-
- return tax;
- }
- catch(exception& e) {
- errorOut(e, "PhyloType", "getTaxonomy");
- exit(1);
- }
-}
-/**************************************************************************************************/
-
+++ /dev/null
-#ifndef PHYLOTYPE_H
-#define PHYLOTYPE_H
-
-/*
- * phylotype.h
- * Mothur
- *
- * Created by westcott on 11/3/09.
- * Copyright 2009 Schloss Lab. All rights reserved.
- *
- */
-
-#include "mothur.h"
-#include "classify.h"
-
-/**************************************************************************************************/
-
-class PhyloType : public Classify {
-
-public:
- PhyloType(string, string, string, int, int, int, int, int);
- ~PhyloType() {};
-
- string getTaxonomy(Sequence*);
-
-
-private:
-
-};
-
-/**************************************************************************************************/
-
-#endif
-
int ptile97_5 = int(numSeqs * 0.975);
int ptile100 = numSeqs - 1;
+ //to compensate for blank sequences that would result in startPosition and endPostion equalling -1
+ if (startPosition[0] == -1) { startPosition[0] = 0; }
+ if (endPosition[0] == -1) { endPosition[0] = 0; }
+
mothurOutEndLine();
mothurOut("\t\tStart\tEnd\tNBases\tAmbigs\tPolymer"); mothurOutEndLine();
mothurOut("Minimum:\t" + toString(startPosition[0]) + "\t" + toString(endPosition[0]) + "\t" + toString(seqLength[0]) + "\t" + toString(ambigBases[0]) + "\t" + toString(longHomoPolymer[0])); mothurOutEndLine();
// the coeffient is 2. Note: we only set the coeffient in f2 values.
//create and initialize trees to 0.
- initialTree(numGroups);
-
+ initialTree(numGroups);
for (int i = 0; i < shared[0]->size(); i++) {
//get bin values and calc shared
}
}
-
+
//calculate chao1, (numleaves-1) because numleaves contains the ++ values.
bool bias = false;
for(int i=0;i<numLeaves;i++){
f1leaves.resize(numNodes);
f2leaves.resize(numNodes);
-
+
//initialize leaf values
for (int i = 0; i < numLeaves; i++) {
- f1leaves[i] = new IntNode;
- f1leaves[i]->lvalue = 0;
- f1leaves[i]->rvalue = 0;
- f1leaves[i]->left = NULL;
- f1leaves[i]->right = NULL;
-
- f2leaves[i] = new IntNode;
- f2leaves[i]->lvalue = 0;
- f2leaves[i]->rvalue = 0;
- f2leaves[i]->left = NULL;
- f2leaves[i]->right = NULL;
+ f1leaves[i] = new IntNode(0, 0, NULL, NULL);
+ f2leaves[i] = new IntNode(0, 0, NULL, NULL);
}
//set pointers to children
for (int j = numLeaves; j < numNodes; j++) {
- f1leaves[j] = new IntNode;
+ f1leaves[j] = new IntNode();
f1leaves[j]->left = f1leaves[countleft];
f1leaves[j]->right = f1leaves[countright];
- f2leaves[j] = new IntNode;
+ f2leaves[j] = new IntNode();
f2leaves[j]->left = f2leaves[countleft];
f2leaves[j]->right =f2leaves[countright];
-
+
countleft = countleft + 2;
countright = countright + 2;
}
//point to root
f1root = f1leaves[numNodes-1];
-
+
//point to root
f2root = f2leaves[numNodes-1];
setCoef(f2root, 0);
}
catch(exception& e) {
+ if ((toString(e.what()) == "vector::_M_fill_insert") || (toString(e.what()) == "St9bad_alloc")) { mothurOut("You are using " + toString(n) + " groups which creates 2^" + toString(n+1) + " nodes. Try reducing the number of groups you selected. "); mothurOutEndLine(); exit(1); }
errorOut(e, "SharedChao1", "initialTree");
exit(1);
}
else {
//valid paramters for this command
- string Array[] = {"label","calc","groups"};
+ string Array[] = {"label","calc","groups","all"};
vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
OptionParser parser(option);
globaldata->Groups = Groups;
}
+ string temp = validParameter.validFile(parameters, "all", false); if (temp == "not found") { temp = "true"; }
+ all = isTrue(temp);
+
if (abort == false) {
validCalculator = new ValidCalculators();
void SummarySharedCommand::help(){
try {
mothurOut("The summary.shared command can only be executed after a successful read.otu command.\n");
- mothurOut("The summary.shared command parameters are label and calc. No parameters are required.\n");
+ mothurOut("The summary.shared command parameters are label, calc and all. No parameters are required.\n");
mothurOut("The summary.shared command should be in the following format: \n");
mothurOut("summary.shared(label=yourLabel, calc=yourEstimators, groups=yourGroups).\n");
mothurOut("Example summary.shared(label=unique-.01-.03, groups=B-C, calc=sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan).\n");
mothurOut("The default value for calc is sharedsobs-sharedchao-sharedace-jabund-sorensonabund-jclass-sorclass-jest-sorest-thetayc-thetan\n");
mothurOut("The default value for groups is all the groups in your groupfile.\n");
mothurOut("The label parameter is used to analyze specific labels in your input.\n");
+ mothurOut("The all parameter is used to specify if you want the estimate of all your groups together. This estimate can only be made for sharedsobs and sharedchao calculators. The default is true.\n");
+ mothurOut("If you use sharedchao and run into memory issues, set all to false. \n");
mothurOut("The groups parameter allows you to specify which of the groups in your groupfile you would like analyzed. You must enter at least 2 valid groups.\n");
mothurOut("Note: No spaces between parameter labels (i.e. label), '=' and parameters (i.e.yourLabel).\n\n");
}
if (sumCalculators.size() == 0) { return 0; }
//check if any calcs can do multiples
else{
- for (int i = 0; i < sumCalculators.size(); i++) {
- if (sumCalculators[i]->getMultiple() == true) { mult = true; }
+ if (all){
+ for (int i = 0; i < sumCalculators.size(); i++) {
+ if (sumCalculators[i]->getMultiple() == true) { mult = true; }
+ }
}
}
InputData* input;
ValidCalculators* validCalculator;
- bool abort, allLines, mult;
+ bool abort, allLines, mult, all;
set<string> labels; //holds labels to be used
string label, calc, groups;
vector<string> Estimators, Groups;