From: westcott Date: Thu, 5 Nov 2009 14:40:15 +0000 (+0000) Subject: started work on classify.seqs command. changed the database class so that it does... X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=63e089e0b3aad1741bab60119ed7ccc784dce347 started work on classify.seqs command. changed the database class so that it does not store the template sequences. created alignmentDB class that stores template seqs and uses database class for searching. made changes to aligncommand and chimeracheckrdp to reflect the changes to database class. --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 252c02e..9b064e0 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -72,7 +72,6 @@ 379293C30F2DE73400B9034A /* treemap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379293C20F2DE73400B9034A /* treemap.cpp */; }; 379294700F2E191800B9034A /* parsimonycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3792946F0F2E191800B9034A /* parsimonycommand.cpp */; }; 3792948A0F2E258500B9034A /* parsimony.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379294890F2E258500B9034A /* parsimony.cpp */; }; - 3799A9500FD6A58C00E33EDE /* distancedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3799A94C0FD6A58C00E33EDE /* distancedb.cpp */; }; 3799A9510FD6A58C00E33EDE /* seqsummarycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3799A94E0FD6A58C00E33EDE /* seqsummarycommand.cpp */; }; 379D3D510FF90E090068C1C0 /* chimeraseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */; }; 37AD4CE40F28AEA300AA2D49 /* sharedlistvector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37AD4CE30F28AEA300AA2D49 /* sharedlistvector.cpp */; }; @@ -169,6 +168,13 @@ A73329CF1083B3B3003B10C5 /* getlistcountcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73329CE1083B3B3003B10C5 /* getlistcountcommand.cpp */; }; A751ACEA1098B283003D0911 /* readcluster.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A751ACE91098B283003D0911 /* readcluster.cpp */; }; A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; }; + A7861478109F5FA00010AAAF /* classifyseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */; }; + A787810310A06D5D0086103D /* classify.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787810210A06D5D0086103D /* classify.cpp */; }; + A787811510A074330086103D /* phylotype.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787811410A074330086103D /* phylotype.cpp */; }; + A78781BA10A0813E0086103D /* doTaxonomy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78781B810A0813D0086103D /* doTaxonomy.cpp */; }; + A787821110A0AAE70086103D /* bayesian.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787821010A0AAE70086103D /* bayesian.cpp */; }; + A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78782AA10A1B1CB0086103D /* alignmentdb.cpp */; }; + A787844510A1EBDD0086103D /* knn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787844410A1EBDD0086103D /* knn.cpp */; }; A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; }; A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; }; A7E4A783106913F900688F62 /* getsharedotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A782106913F900688F62 /* getsharedotucommand.cpp */; }; @@ -328,8 +334,6 @@ 379294880F2E258500B9034A /* parsimony.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsimony.h; sourceTree = SOURCE_ROOT; }; 379294890F2E258500B9034A /* parsimony.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = parsimony.cpp; sourceTree = SOURCE_ROOT; }; 3792948D0F2E271100B9034A /* treecalculator.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treecalculator.h; sourceTree = SOURCE_ROOT; }; - 3799A94C0FD6A58C00E33EDE /* distancedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = distancedb.cpp; sourceTree = SOURCE_ROOT; }; - 3799A94D0FD6A58C00E33EDE /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = SOURCE_ROOT; }; 3799A94E0FD6A58C00E33EDE /* seqsummarycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqsummarycommand.cpp; sourceTree = SOURCE_ROOT; }; 3799A94F0FD6A58C00E33EDE /* seqsummarycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqsummarycommand.h; sourceTree = SOURCE_ROOT; }; 379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraseqscommand.h; sourceTree = SOURCE_ROOT; }; @@ -538,6 +542,20 @@ A751ACE91098B283003D0911 /* readcluster.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readcluster.cpp; sourceTree = SOURCE_ROOT; }; A75B887B104C16860083C454 /* ccode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccode.cpp; sourceTree = SOURCE_ROOT; }; A75B887C104C16860083C454 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ccode.h; sourceTree = SOURCE_ROOT; }; + A7861476109F5FA00010AAAF /* classifyseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classifyseqscommand.h; sourceTree = ""; }; + A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classifyseqscommand.cpp; sourceTree = ""; }; + A78780FE10A06B8B0086103D /* classify.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classify.h; sourceTree = SOURCE_ROOT; }; + A787810210A06D5D0086103D /* classify.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classify.cpp; sourceTree = SOURCE_ROOT; }; + A787811310A074330086103D /* phylotype.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylotype.h; sourceTree = SOURCE_ROOT; }; + A787811410A074330086103D /* phylotype.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylotype.cpp; sourceTree = SOURCE_ROOT; }; + A78781B810A0813D0086103D /* doTaxonomy.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = doTaxonomy.cpp; sourceTree = ""; }; + A78781B910A0813D0086103D /* doTaxonomy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = doTaxonomy.h; sourceTree = ""; }; + A787820F10A0AAE70086103D /* bayesian.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bayesian.h; sourceTree = ""; }; + A787821010A0AAE70086103D /* bayesian.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bayesian.cpp; sourceTree = ""; }; + A78782A910A1B1CB0086103D /* alignmentdb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignmentdb.h; sourceTree = ""; }; + A78782AA10A1B1CB0086103D /* alignmentdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignmentdb.cpp; sourceTree = ""; }; + A787844310A1EBDD0086103D /* knn.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = knn.h; sourceTree = ""; }; + A787844410A1EBDD0086103D /* knn.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = knn.cpp; sourceTree = ""; }; A7B04491106CC3E60046FC83 /* chimeraslayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayer.h; sourceTree = SOURCE_ROOT; }; A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayer.cpp; sourceTree = SOURCE_ROOT; }; A7B0450C106CEEC90046FC83 /* slayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = slayer.h; sourceTree = SOURCE_ROOT; }; @@ -592,10 +610,9 @@ 373C68DB0FC1C38D00137ACD /* blastalign.cpp */, 37D928A60F2133C0001D4494 /* calculators */, 374F2FE81006346600E97C66 /* chimera */, + A787811210A073F40086103D /* classifiers */, 37D927C20F21331F001D4494 /* cluster.hpp */, 37D927C10F21331F001D4494 /* cluster.cpp */, - A729ACE210849BBE00139801 /* hcluster.h */, - A729ACE310849BBE00139801 /* hcluster.cpp */, 37D928A90F2133E5001D4494 /* commands */, 37D927C60F21331F001D4494 /* collect.h */, 37D927C50F21331F001D4494 /* collect.cpp */, @@ -619,6 +636,8 @@ 37D927DF0F21331F001D4494 /* globaldata.cpp */, 373C68BB0FC1C1A600137ACD /* gotohoverlap.hpp */, 373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */, + A729ACE210849BBE00139801 /* hcluster.h */, + A729ACE310849BBE00139801 /* hcluster.cpp */, 375873ED0F7D646F0040F377 /* heatmap.h */, 375873EE0F7D646F0040F377 /* heatmap.cpp */, 378598720FDD4C1500EF9D03 /* heatmapsim.h */, @@ -847,6 +866,8 @@ 377326630FAF16E0007ABB8B /* consensuscommand.cpp */, 379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */, 379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */, + A7861476109F5FA00010AAAF /* classifyseqscommand.h */, + A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */, 37B28F660F27590100808A62 /* deconvolutecommand.h */, 37B28F670F27590100808A62 /* deconvolutecommand.cpp */, 37C753CC0FB3415200DBD02E /* distancecommand.h */, @@ -942,13 +963,13 @@ 373C68A00FC1C07D00137ACD /* alignment.cpp */, 373C68A30FC1C07D00137ACD /* alignmentcell.hpp */, 373C68A20FC1C07D00137ACD /* alignmentcell.cpp */, + A78782A910A1B1CB0086103D /* alignmentdb.h */, + A78782AA10A1B1CB0086103D /* alignmentdb.cpp */, 373C69170FC1C8AF00137ACD /* blastdb.hpp */, 373C69160FC1C8AF00137ACD /* blastdb.cpp */, 37D927D40F21331F001D4494 /* database.hpp */, 37D927D30F21331F001D4494 /* database.cpp */, 37D927D50F21331F001D4494 /* datavector.hpp */, - 3799A94D0FD6A58C00E33EDE /* distancedb.hpp */, - 3799A94C0FD6A58C00E33EDE /* distancedb.cpp */, 37D927DC0F21331F001D4494 /* fastamap.h */, 37D927DB0F21331F001D4494 /* fastamap.cpp */, 375873EA0F7D64520040F377 /* fullmatrix.h */, @@ -1010,6 +1031,23 @@ name = errorcheckor; sourceTree = ""; }; + A787811210A073F40086103D /* classifiers */ = { + isa = PBXGroup; + children = ( + A78780FE10A06B8B0086103D /* classify.h */, + A787810210A06D5D0086103D /* classify.cpp */, + A787820F10A0AAE70086103D /* bayesian.h */, + A787821010A0AAE70086103D /* bayesian.cpp */, + A78781B910A0813D0086103D /* doTaxonomy.h */, + A78781B810A0813D0086103D /* doTaxonomy.cpp */, + A787844310A1EBDD0086103D /* knn.h */, + A787844410A1EBDD0086103D /* knn.cpp */, + A787811310A074330086103D /* phylotype.h */, + A787811410A074330086103D /* phylotype.cpp */, + ); + name = classifiers; + sourceTree = ""; + }; C6859E8C029090F304C91782 /* Documentation */ = { isa = PBXGroup; children = ( @@ -1204,7 +1242,6 @@ 21E859D80FC4632E005E1A48 /* matrixoutputcommand.cpp in Sources */, 3749271D0FD58C840031C06B /* getsabundcommand.cpp in Sources */, 3749273F0FD5956B0031C06B /* getrabundcommand.cpp in Sources */, - 3799A9500FD6A58C00E33EDE /* distancedb.cpp in Sources */, 3799A9510FD6A58C00E33EDE /* seqsummarycommand.cpp in Sources */, 371B30B40FD7EE67000414CA /* screenseqscommand.cpp in Sources */, 7E09C5140FDA79C5002ECAE5 /* reversecommand.cpp in Sources */, @@ -1233,6 +1270,13 @@ A729ACD010848E6100139801 /* hclustercommand.cpp in Sources */, A729ACE410849BBE00139801 /* hcluster.cpp in Sources */, A751ACEA1098B283003D0911 /* readcluster.cpp in Sources */, + A7861478109F5FA00010AAAF /* classifyseqscommand.cpp in Sources */, + A787810310A06D5D0086103D /* classify.cpp in Sources */, + A787811510A074330086103D /* phylotype.cpp in Sources */, + A78781BA10A0813E0086103D /* doTaxonomy.cpp in Sources */, + A787821110A0AAE70086103D /* bayesian.cpp in Sources */, + A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */, + A787844510A1EBDD0086103D /* knn.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/aligncommand.cpp b/aligncommand.cpp index 48c67ec..2ca450a 100644 --- a/aligncommand.cpp +++ b/aligncommand.cpp @@ -22,10 +22,6 @@ #include "blastalign.hpp" #include "noalign.hpp" -#include "kmerdb.hpp" -#include "suffixdb.hpp" -#include "blastdb.hpp" - #include "nast.hpp" #include "nastreport.hpp" @@ -126,7 +122,7 @@ void AlignCommand::help(){ mothurOut("The template and candidate parameters are required.\n"); mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n"); mothurOut("The align parameter allows you to specify the alignment method to use. Your options are: gotoh, needleman, blast and noalign. The default is needleman.\n"); - mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 7.\n"); + mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"); mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n"); mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"); mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n"); @@ -149,19 +145,9 @@ int AlignCommand::execute(){ try { if (abort == true) { return 0; } - if(search == "kmer") { templateDB = new KmerDB(templateFileName, kmerSize); } - else if(search == "suffix") { templateDB = new SuffixDB(templateFileName); } - else if(search == "blast") { templateDB = new BlastDB(templateFileName, gapOpen, gapExtend, match, misMatch); } - else { - mothurOut(search + " is not a valid search option. I will run the command using kmer, ksize=8."); - mothurOutEndLine(); - kmerSize = 8; - - templateDB = new KmerDB(templateFileName, kmerSize); - } - + templateDB = new AlignmentDB(templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch); int longestBase = templateDB->getLongestBase(); - + if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); } else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); } else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } @@ -281,7 +267,7 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName){ Sequence temp = templateDB->findClosestSequence(candidateSeq); Sequence* templateSeq = &temp; - + report.setTemplate(templateSeq); report.setSearchParameters(search, templateDB->getSearchScore()); @@ -290,13 +276,13 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName){ report.setAlignmentParameters(align, alignment); report.setNastParameters(nast); - + alignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl; report.print(); delete candidateSeq; - + } alignmentFile.close(); diff --git a/aligncommand.h b/aligncommand.h index 50e9bff..c3c2441 100644 --- a/aligncommand.h +++ b/aligncommand.h @@ -14,6 +14,7 @@ #include "command.hpp" #include "database.hpp" #include "alignment.hpp" +#include "alignmentdb.h" class AlignCommand : public Command { @@ -32,7 +33,7 @@ private: vector processIDS; //processid vector lines; - Database* templateDB; + AlignmentDB* templateDB; Alignment* alignment; int driver(linePair*, string, string); diff --git a/alignmentdb.cpp b/alignmentdb.cpp new file mode 100644 index 0000000..493521e --- /dev/null +++ b/alignmentdb.cpp @@ -0,0 +1,111 @@ +/* + * alignmentdb.cpp + * Mothur + * + * Created by westcott on 11/4/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "alignmentdb.h" +#include "kmerdb.hpp" +#include "suffixdb.hpp" +#include "blastdb.hpp" + + +/**************************************************************************************************/ +AlignmentDB::AlignmentDB(string fastaFileName, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch){ // This assumes that the template database is in fasta format, may + try { // need to alter this in the future? + longest = 0; + + ifstream fastaFile; + openInputFile(fastaFileName, fastaFile); + + mothurOutEndLine(); + mothurOut("Reading in the " + fastaFileName + " template sequences...\t"); cout.flush(); + + while (!fastaFile.eof()) { + Sequence temp(fastaFile); + + templateSequences.push_back(temp); + + //save longest base + if (temp.getUnaligned().length() > longest) { longest = temp.getUnaligned().length(); } + + gobble(fastaFile); + } + + numSeqs = templateSequences.size(); + + fastaFile.close(); + //all of this is elsewhere already! + + mothurOut("DONE."); + mothurOutEndLine(); cout.flush(); + + //in case you delete the seqs and then ask for them + emptySequence = Sequence(); + emptySequence.setName("no_match"); + emptySequence.setUnaligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + emptySequence.setAligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + + bool needToGenerate = true; + string kmerDBName; + if(method == "kmer") { + search = new KmerDB(fastaFileName, kmerSize); + + kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + ifstream kmerFileTest(kmerDBName.c_str()); + + if(kmerFileTest){ needToGenerate = false; } + } + else if(method == "suffix") { search = new SuffixDB(numSeqs); } + else if(method == "blast") { search = new BlastDB(gapOpen, gapExtend, match, misMatch); } + else { + mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); + mothurOutEndLine(); + search = new KmerDB(fastaFileName, 8); + } + + if (needToGenerate) { + + //add sequences to search + for (int i = 0; i < templateSequences.size(); i++) { + search->addSequence(templateSequences[i]); + } + search->generateDB(); + + }else if ((method == "kmer") && (!needToGenerate)) { + ifstream kmerFileTest(kmerDBName.c_str()); + search->readKmerDB(kmerFileTest); + } + + search->setNumSeqs(numSeqs); + } + catch(exception& e) { + errorOut(e, "AlignmentDB", "AlignmentDB"); + exit(1); + } +} +/**************************************************************************************************/ +AlignmentDB::~AlignmentDB() { delete search; } +/**************************************************************************************************/ +Sequence AlignmentDB::findClosestSequence(Sequence* seq) { + try{ + + vector spot = search->findClosestSequences(seq, 1); + + if (spot.size() != 0) { return templateSequences[spot[0]]; } + else { return emptySequence; } + + } + catch(exception& e) { + errorOut(e, "AlignmentDB", "findClosestSequence"); + exit(1); + } +} +/**************************************************************************************************/ + + + + diff --git a/alignmentdb.h b/alignmentdb.h new file mode 100644 index 0000000..339f46f --- /dev/null +++ b/alignmentdb.h @@ -0,0 +1,42 @@ +#ifndef ALIGNMENTDB_H +#define ALIGNMENTDB_H + +/* + * alignmentdb.h + * Mothur + * + * Created by westcott on 11/4/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "sequence.hpp" +#include "database.hpp" + +/**************************************************************************************************/ + +class AlignmentDB { + +public: + + AlignmentDB(string, string, int, int, int, int, int); //reads fastafile passed in and stores sequences + ~AlignmentDB(); + + Sequence findClosestSequence(Sequence*); + float getSearchScore() { return search->getSearchScore(); } + int getLongestBase() { return longest; } + +private: + int numSeqs, longest; + float searchScore; + + Database* search; + vector templateSequences; + Sequence emptySequence; +}; + +/**************************************************************************************************/ + +#endif + diff --git a/bayesian.cpp b/bayesian.cpp new file mode 100644 index 0000000..caae212 --- /dev/null +++ b/bayesian.cpp @@ -0,0 +1,29 @@ +/* + * bayesian.cpp + * Mothur + * + * Created by westcott on 11/3/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "bayesian.h" + +/**************************************************************************************************/ +Bayesian::Bayesian(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) : +Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch) {} +/**************************************************************************************************/ +string Bayesian::getTaxonomy(Sequence* seq) { + try { + string tax; + + + return tax; + } + catch(exception& e) { + errorOut(e, "Bayesian", "getTaxonomy"); + exit(1); + } +} +/**************************************************************************************************/ + diff --git a/bayesian.h b/bayesian.h new file mode 100644 index 0000000..cbe949f --- /dev/null +++ b/bayesian.h @@ -0,0 +1,34 @@ +#ifndef BAYESIAN_H +#define BAYESIAN_H + +/* + * bayesian.h + * Mothur + * + * Created by westcott on 11/3/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "classify.h" + +/**************************************************************************************************/ + +class Bayesian : public Classify { + +public: + Bayesian(string, string, string, int, int, int, int, int); + ~Bayesian() {}; + + string getTaxonomy(Sequence*); + + +private: + +}; + +/**************************************************************************************************/ + +#endif + diff --git a/blastdb.cpp b/blastdb.cpp index 0cba833..6b45a48 100644 --- a/blastdb.cpp +++ b/blastdb.cpp @@ -14,90 +14,110 @@ /**************************************************************************************************/ -BlastDB::BlastDB(string fastaFileName, float gO, float gE, float m, float mM) : Database(fastaFileName), +BlastDB::BlastDB(float gO, float gE, float m, float mM) : Database(), gapOpen(gO), gapExtend(gE), match(m), misMatch(mM) { globaldata = GlobalData::getInstance(); - - mothurOut("Generating the temporary BLAST database...\t"); cout.flush(); + count = 0; int randNumber = rand(); dbFileName = toString(randNumber) + ".template.unaligned.fasta"; queryFileName = toString(randNumber) + ".candidate.unaligned.fasta"; blastFileName = toString(randNumber) + ".blast"; - - ofstream unalignedFastaFile; - openOutputFile(dbFileName, unalignedFastaFile); - - for(int i=0;i' << i << endl; // sequences, which will be input to formatdb - unalignedFastaFile << templateSequences[i].getUnaligned() << endl; - } - unalignedFastaFile.close(); - - path = globaldata->argv; - path = path.substr(0, (path.find_last_of('m'))); - - string formatdbCommand = path + "blast/bin/formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability - system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F - // option tells formatdb that seqs are DNA, not prot - mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush(); - emptySequence = Sequence(); - emptySequence.setName("no_match"); - emptySequence.setUnaligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); - emptySequence.setAligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); - - } /**************************************************************************************************/ BlastDB::~BlastDB(){ - //for (int i = 0; i < templateSequences.size(); i++) { delete templateSequences[i]; } - //templateSequences.clear(); - //delete emptySequence; - remove(queryFileName.c_str()); // let's clean stuff up and remove the temp files remove(dbFileName.c_str()); // let's clean stuff up and remove the temp files remove(blastFileName.c_str()); // let's clean stuff up and remove the temp files } - /**************************************************************************************************/ +//assumes you have added all the template sequences using the addSequence function and run generateDB. +vector BlastDB::findClosestSequences(Sequence* seq, int n) { + try{ + vector topMatches; + + ofstream queryFile; + openOutputFile(queryFileName, queryFile); + queryFile << '>' << seq->getName() << endl; + queryFile << seq->getUnaligned() << endl; + queryFile.close(); + + // the goal here is to quickly survey the database to find the closest match. To do this we are using the default + // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too + // long. With this setting, it seems comparable in speed to the suffix tree approach. + + string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28 -v " + toString(n); + blastCommand += (" -i " + queryFileName + " -o " + blastFileName); + system(blastCommand.c_str()); + + ifstream m8FileHandle; + openInputFile(blastFileName, m8FileHandle); + + string dummy; + int templateAccession; + gobble(m8FileHandle); + + while(!m8FileHandle.eof()){ + m8FileHandle >> dummy >> templateAccession >> searchScore; + + //get rest of junk in line + while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; } } + + gobble(m8FileHandle); + topMatches.push_back(templateAccession); + } + m8FileHandle.close(); + + return topMatches; + } + catch(exception& e) { + errorOut(e, "BlastDB", "findClosestSequences"); + exit(1); + } -Sequence BlastDB::findClosestSequence(Sequence* candidate){ - - ofstream queryFile; - openOutputFile(queryFileName, queryFile); - queryFile << '>' << candidate->getName() << endl; - queryFile << candidate->getUnaligned() << endl; - queryFile.close(); +} +/**************************************************************************************************/ +void BlastDB::addSequence(Sequence seq) { + try { + ofstream unalignedFastaFile; + openOutputFileAppend(dbFileName, unalignedFastaFile); -// the goal here is to quickly survey the database to find the closest match. To do this we are using the default -// wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too -// long. With this setting, it seems comparable in speed to the suffix tree approach. - - string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28"; - blastCommand += (" -i " + queryFileName + " -o " + blastFileName); - system(blastCommand.c_str()); + // generating a fasta file with unaligned template + unalignedFastaFile << '>' << count << endl; // sequences, which will be input to formatdb + unalignedFastaFile << seq.getUnaligned() << endl; + unalignedFastaFile.close(); + + count++; + } + catch(exception& e) { + errorOut(e, "BlastDB", "addSequence"); + exit(1); + } +} +/**************************************************************************************************/ +void BlastDB::generateDB() { + try { - ifstream m8FileHandle; - openInputFile(blastFileName, m8FileHandle); + mothurOut("Generating the temporary BLAST database...\t"); cout.flush(); + + path = globaldata->argv; + path = path.substr(0, (path.find_last_of('m'))); - string dummy; - int templateAccession; - gobble(m8FileHandle); - if(!m8FileHandle.eof()){ - m8FileHandle >> dummy >> templateAccession >> searchScore; - m8FileHandle.close(); - return templateSequences[templateAccession]; + string formatdbCommand = path + "blast/bin/formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability + system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F + // option tells formatdb that seqs are DNA, not prot + mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush(); } - else{ - searchScore = 0.00; - return emptySequence; + catch(exception& e) { + errorOut(e, "BlastDB", "generateDB"); + exit(1); } - m8FileHandle.close(); } /**************************************************************************************************/ + diff --git a/blastdb.hpp b/blastdb.hpp index f0f17f9..c718bb4 100644 --- a/blastdb.hpp +++ b/blastdb.hpp @@ -15,10 +15,14 @@ #include "globaldata.hpp" class BlastDB : public Database { + public: - BlastDB(string, float, float, float, float); + BlastDB(float, float, float, float); ~BlastDB(); - Sequence findClosestSequence(Sequence*); + + void generateDB(); + void addSequence(Sequence); + vector findClosestSequences(Sequence*, int); private: string dbFileName; @@ -26,11 +30,11 @@ private: string blastFileName; string path; + int count; float gapOpen; float gapExtend; float match; float misMatch; - Sequence emptySequence; GlobalData* globaldata; }; diff --git a/chimeracheckrdp.cpp b/chimeracheckrdp.cpp index acc3aa2..07fb8ab 100644 --- a/chimeracheckrdp.cpp +++ b/chimeracheckrdp.cpp @@ -77,11 +77,13 @@ void ChimeraCheckRDP::getChimeras() { try { //read in query sequences and subject sequences - mothurOut("Reading sequences and template file... "); cout.flush(); + mothurOutEndLine(); + mothurOut("Reading query sequences... "); cout.flush(); querySeqs = readSeqs(fastafile); + mothurOut("Done."); //templateSeqs = readSeqs(templateFile); - templateDB = new KmerDB(templateFile, kmerSize); - mothurOut("Done."); mothurOutEndLine(); + templateDB = new AlignmentDB(templateFile, "kmer", kmerSize, 0,0,0,0); + mothurOutEndLine(); int numSeqs = querySeqs.size(); diff --git a/chimeracheckrdp.h b/chimeracheckrdp.h index f436718..fe46206 100644 --- a/chimeracheckrdp.h +++ b/chimeracheckrdp.h @@ -14,7 +14,7 @@ #include "chimera.h" #include "kmer.hpp" #include "kmerdb.hpp" -#include "database.hpp" +#include "alignmentdb.h" /***********************************************************/ //This class was created using the algorythms described in @@ -39,7 +39,7 @@ class ChimeraCheckRDP : public Chimera { vector lines; vector querySeqs; - Database* templateDB; + AlignmentDB* templateDB; Kmer* kmer; vector< vector > IS; //IS[0] is the vector of IS values for each window for querySeqs[0] diff --git a/classify.cpp b/classify.cpp new file mode 100644 index 0000000..605c1f8 --- /dev/null +++ b/classify.cpp @@ -0,0 +1,124 @@ +/* + * classify.cpp + * Mothur + * + * Created by westcott on 11/3/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "classify.h" +#include "sequence.hpp" +#include "kmerdb.hpp" +#include "suffixdb.hpp" +#include "blastdb.hpp" + +/**************************************************************************************************/ + +Classify::Classify(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) : taxFile(tfile), templateFile(tempFile) { + try { + + readTaxonomy(taxFile); + + int numSeqs = 0; + //need to know number of template seqs for suffixdb + if (method == "suffix") { + ifstream inFASTA; + openInputFile(tempFile, inFASTA); + numSeqs = count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + } + + mothurOut("Generating search database... "); cout.flush(); + + bool needToGenerate = true; + string kmerDBName; + if(method == "kmer") { + database = new KmerDB(tempFile, kmerSize); + + kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; + ifstream kmerFileTest(kmerDBName.c_str()); + if(kmerFileTest){ needToGenerate = false; } + } + else if(method == "suffix") { database = new SuffixDB(numSeqs); } + else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); } + else { + mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8."); + mothurOutEndLine(); + database = new KmerDB(tempFile, 8); + } + + if (needToGenerate) { + ifstream fastaFile; + openInputFile(tempFile, fastaFile); + + while (!fastaFile.eof()) { + Sequence temp(fastaFile); + gobble(fastaFile); + + names.push_back(temp.getName()); + database->addSequence(temp); + } + fastaFile.close(); + + database->generateDB(); + + }else if ((method == "kmer") && (!needToGenerate)) { + ifstream kmerFileTest(kmerDBName.c_str()); + database->readKmerDB(kmerFileTest); + + ifstream fastaFile; + openInputFile(tempFile, fastaFile); + + while (!fastaFile.eof()) { + Sequence temp(fastaFile); + gobble(fastaFile); + + names.push_back(temp.getName()); + } + fastaFile.close(); + } + + database->setNumSeqs(names.size()); + + mothurOut("DONE."); mothurOutEndLine(); + + } + catch(exception& e) { + errorOut(e, "Classify", "Classify"); + exit(1); + } +} +/**************************************************************************************************/ + +void Classify::readTaxonomy(string file) { + try { + + ifstream inTax; + openInputFile(file, inTax); + + mothurOutEndLine(); + mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush(); + + string name, taxInfo; + //read template seqs and save + while (!inTax.eof()) { + inTax >> name >> taxInfo; + + taxonomy[name] = taxInfo; + + gobble(inTax); + } + inTax.close(); + + mothurOut("DONE."); + mothurOutEndLine(); cout.flush(); + + } + catch(exception& e) { + errorOut(e, "Classify", "readTaxonomy"); + exit(1); + } +} +/**************************************************************************************************/ + diff --git a/classify.h b/classify.h new file mode 100644 index 0000000..2926800 --- /dev/null +++ b/classify.h @@ -0,0 +1,50 @@ +#ifndef CLASSIFY_H +#define CLASSIFY_H + +/* + * classify.h + * Mothur + * + * Created by westcott on 11/3/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + + +/* This class is a parent to phylotyp, bayesian, knn. */ + +#include "mothur.h" +#include "database.hpp" + + + +class Sequence; + +/**************************************************************************************************/ + +class Classify { + +public: + Classify(string, string, string, int, int, int, int, int); + Classify(){}; + + virtual ~Classify(){}; + virtual string getTaxonomy(Sequence*) = 0; + +protected: + + map taxonomy; //name maps to taxonomy + map::iterator it; + Database* database; + + string taxFile, templateFile; + vector names; + + void readTaxonomy(string); + +}; + +/**************************************************************************************************/ + +#endif + diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp new file mode 100644 index 0000000..69ef712 --- /dev/null +++ b/classifyseqscommand.cpp @@ -0,0 +1,360 @@ +/* + * classifyseqscommand.cpp + * Mothur + * + * Created by westcott on 11/2/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "classifyseqscommand.h" +#include "sequence.hpp" +#include "phylotype.h" +#include "bayesian.h" +#include "doTaxonomy.h" +#include "knn.h" + +//********************************************************************************************************************** + +ClassifySeqsCommand::ClassifySeqsCommand(string option){ + try { + // globaldata = GlobalData::getInstance(); + abort = false; + + //allow user to run help + if(option == "help") { help(); abort = true; } + + else { + + //valid paramters for this command + string AlignArray[] = {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted"}; + vector myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string))); + + OptionParser parser(option); + map parameters = parser.getParameters(); + + ValidParameters validParameter; + + //check to make sure all parameters are valid for command + for (map::iterator it = parameters.begin(); it != parameters.end(); it++) { + if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; } + } + + //check for required parameters + templateFileName = validParameter.validFile(parameters, "template", true); + if (templateFileName == "not found") { + mothurOut("template is a required parameter for the classify.seqs command."); + mothurOutEndLine(); + abort = true; + } + else if (templateFileName == "not open") { abort = true; } + + fastaFileName = validParameter.validFile(parameters, "fasta", true); + if (fastaFileName == "not found") { + mothurOut("fasta is a required parameter for the classify.seqs command."); + mothurOutEndLine(); + abort = true; + } + else if (fastaFileName == "not open") { abort = true; } + + taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true); + if (taxonomyFileName == "not found") { + mothurOut("taxonomy is a required parameter for the classify.seqs command."); + mothurOutEndLine(); + abort = true; + } + else if (taxonomyFileName == "not open") { abort = true; } + + + //check for optional parameter and set defaults + // ...at some point should added some additional type checking... + string temp; + temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ temp = "8"; } + convert(temp, kmerSize); + + temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; } + convert(temp, processors); + + search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; } + + method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "phylotype"; } + + temp = validParameter.validFile(parameters, "match", false); if (temp == "not found"){ temp = "1.0"; } + convert(temp, match); + + temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found"){ temp = "-1.0"; } + convert(temp, misMatch); + + temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-2.0"; } + convert(temp, gapOpen); + + temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-1.0"; } + convert(temp, gapExtend); + + temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found"){ temp = "10"; } + convert(temp, numWanted); + + } + + } + catch(exception& e) { + errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand"); + exit(1); + } +} + +//********************************************************************************************************************** + +ClassifySeqsCommand::~ClassifySeqsCommand(){ + + if (abort == false) { + for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear(); + } +} + +//********************************************************************************************************************** + +void ClassifySeqsCommand::help(){ + try { + mothurOut("The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n"); + mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend and numwanted.\n"); + mothurOut("The template, fasta and taxonomy parameters are required.\n"); + mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n"); + mothurOut("The method parameter allows you to specify classification method to use. Your options are: phylotype, bayesian and knn. The default is phylotype.\n"); + mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n"); + mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n"); + mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n"); + mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n"); + mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n"); + mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0.\n"); + mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method. The default is 10.\n"); + mothurOut("The classify.seqs command should be in the following format: \n"); + mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n"); + mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=phylotype, search=gotoh, ksize=8, processors=2)\n"); + mothurOut("The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n"); + mothurOut("The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n"); + mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n"); + } + catch(exception& e) { + errorOut(e, "ClassifySeqsCommand", "help"); + exit(1); + } +} + + +//********************************************************************************************************************** + +int ClassifySeqsCommand::execute(){ + try { + if (abort == true) { return 0; } + + int start = time(NULL); + + if(method == "phylotype"){ classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch); } + //else if(method == "bayesian") { classify = new Bayesian(taxonomyFileName, templateFileName, search); } + else if(method == "knn") { classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted); } + else { + mothurOut(search + " is not a valid method option. I will run the command using phylotype."); + mothurOutEndLine(); + classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch); + } + + int numFastaSeqs = 0; + + string newTaxonomyFile = getRootName(fastaFileName) + "taxonomy"; + string taxSummary = getRootName(fastaFileName) + "tax.summary"; + +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + if(processors == 1){ + ifstream inFASTA; + openInputFile(fastaFileName, inFASTA); + numFastaSeqs=count(istreambuf_iterator(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + + lines.push_back(new linePair(0, numFastaSeqs)); + + driver(lines[0], newTaxonomyFile); + + } + else{ + vector positions; + processIDS.resize(0); + + ifstream inFASTA; + openInputFile(fastaFileName, inFASTA); + + string input; + while(!inFASTA.eof()){ + input = getline(inFASTA); + if (input.length() != 0) { + if(input[0] == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); } + } + } + inFASTA.close(); + + numFastaSeqs = positions.size(); + + int numSeqsPerProcessor = numFastaSeqs / processors; + + for (int i = 0; i < processors; i++) { + int startPos = positions[ i * numSeqsPerProcessor ]; + if(i == processors - 1){ + numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; + } + lines.push_back(new linePair(startPos, numSeqsPerProcessor)); + } + createProcesses(newTaxonomyFile); + + rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str()); + + for(int i=1;i(inFASTA),istreambuf_iterator(), '>'); + inFASTA.close(); + + lines.push_back(new linePair(0, numFastaSeqs)); + + driver(lines[0], newTaxonomyFile); +#endif + + delete classify; + + //make taxonomy tree from new taxonomy file + ifstream inTaxonomy; + openInputFile(newTaxonomyFile, inTaxonomy); + + string accession, taxaList; + PhyloTree taxaBrowser; + + //read in users taxonomy file and add sequences to tree + while(!inTaxonomy.eof()){ + inTaxonomy >> accession >> taxaList; + + taxaBrowser.addSeqToTree(accession, taxaList); + + gobble(inTaxonomy); + } + inTaxonomy.close(); + + taxaBrowser.assignHeirarchyIDs(0); + + ofstream outTaxTree; + openOutputFile(taxSummary, outTaxTree); + + taxaBrowser.print(outTaxTree); + + mothurOutEndLine(); + mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences."); + mothurOutEndLine(); + mothurOutEndLine(); + + return 0; + } + catch(exception& e) { + errorOut(e, "ClassifySeqsCommand", "execute"); + exit(1); + } +} +/**************************************************************************************************/ + +void ClassifySeqsCommand::createProcesses(string taxFileName) { + try { +#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + int process = 0; + // processIDS.resize(0); + + //loop through and create all the processes you want + while (process != processors) { + int pid = fork(); + + if (pid > 0) { + processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later + process++; + }else if (pid == 0){ + driver(lines[process], taxFileName + toString(getpid()) + ".temp"); + exit(0); + }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); } + } + + //force parent to wait until all the processes are done + for (int i=0;istart); + + string taxonomy; + + for(int i=0;inumSeqs;i++){ + + Sequence* candidateSeq = new Sequence(inFASTA); + + taxonomy = classify->getTaxonomy(candidateSeq); + + if (taxonomy != "bad seq") { + outTax << candidateSeq->getName() << '\t' << taxonomy << endl; + } + + delete candidateSeq; + } + + inFASTA.close(); + outTax.close(); + + return 1; + } + catch(exception& e) { + errorOut(e, "ClassifySeqsCommand", "driver"); + exit(1); + } +} + +/**************************************************************************************************/ diff --git a/classifyseqscommand.h b/classifyseqscommand.h new file mode 100644 index 0000000..a77b7d1 --- /dev/null +++ b/classifyseqscommand.h @@ -0,0 +1,48 @@ +#ifndef CLASSIFYSEQSCOMMAND_H +#define CLASSIFYSEQSCOMMAND_H + +/* + * classifyseqscommand.h + * Mothur + * + * Created by westcott on 11/2/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "command.hpp" +#include "alignment.hpp" +#include "classify.h" + +class ClassifySeqsCommand : public Command { + +public: + ClassifySeqsCommand(string); + ~ClassifySeqsCommand(); + int execute(); + void help(); + +private: + struct linePair { + int start; + int numSeqs; + linePair(int i, int j) : start(i), numSeqs(j) {} + }; + vector processIDS; //processid + vector lines; + + Classify* classify; + + string fastaFileName, templateFileName, distanceFileName, search, method, taxonomyFileName; + int processors, kmerSize, numWanted; + float match, misMatch, gapOpen, gapExtend; + bool abort; + + int driver(linePair*, string); + void appendTaxFiles(string, string); + void createProcesses(string); +}; + +#endif + diff --git a/commandfactory.cpp b/commandfactory.cpp index 5f07abb..030079e 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -58,6 +58,7 @@ #include "getsharedotucommand.h" #include "getlistcountcommand.h" #include "hclustercommand.h" +#include "classifyseqscommand.h" /***********************************************************/ @@ -114,8 +115,8 @@ CommandFactory::CommandFactory(){ commands["get.sharedotu"] = "get.sharedotu"; commands["get.listcount"] = "get.listcount"; commands["quit"] = "quit"; - commands["hcluster"] = "hcluster"; + commands["classify.seqs"] = "classify.seqs"; } /***********************************************************/ @@ -180,8 +181,8 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "align.check") { command = new AlignCheckCommand(optionString); } else if(commandName == "get.sharedotu") { command = new GetSharedOTUCommand(optionString); } else if(commandName == "get.listcount") { command = new GetListCountCommand(optionString); } - else if(commandName == "hcluster") { command = new HClusterCommand(optionString); } + else if(commandName == "classify.seqs") { command = new ClassifySeqsCommand(optionString); } else { command = new NoCommand(optionString); } return command; diff --git a/database.cpp b/database.cpp index f84c9e1..5f82bc1 100644 --- a/database.cpp +++ b/database.cpp @@ -13,73 +13,13 @@ /**************************************************************************************************/ -Database::Database(string fastaFileName){ // This assumes that the template database is in fasta format, may - // need to alter this in the future? +Database::Database(){ longest = 0; - - ifstream fastaFile; - openInputFile(fastaFileName, fastaFile); - - mothurOutEndLine(); - mothurOut("Reading in the " + fastaFileName + " template sequences...\t"); cout.flush(); - - //all of this is elsewhere already! - //numSeqs=count(istreambuf_iterator(fastaFile),istreambuf_iterator(), '>'); // count the number of - //fastaFile.seekg(0); // sequences - - //templateSequences.resize(numSeqs); - - /*string seqName, sequence; - for(int i=0;i> seqName; - seqName = seqName.substr(1); - char letter; - string aligned; - - while(fastaFile && (letter=fastaFile.get()) != '>'){ - if(isprint(letter)){ - letter = toupper(letter); - if(letter == 'U'){letter = 'T';} - aligned += letter; - } - } - templateSequences[i] = Sequence(seqName, aligned); - - //necessary because Sequence constructor by default sets whatever it reads to unaligned - //the setUnaligned function remove gap char. - templateSequences[i].setUnaligned(templateSequences[i].getUnaligned()); - - if (templateSequences[i].getUnaligned().length() > longest) { longest = templateSequences[i].getUnaligned().length(); } - - fastaFile.putback(letter); - }*/ - - while (!fastaFile.eof()) { - Sequence temp(fastaFile); - - templateSequences.push_back(temp); - - //save longest base - if (temp.getUnaligned().length() > longest) { longest = temp.getUnaligned().length(); } - - gobble(fastaFile); - } - - numSeqs = templateSequences.size(); - - fastaFile.close(); - //all of this is elsewhere already! - - mothurOut("DONE."); - mothurOutEndLine(); cout.flush(); - + numSeqs = 0; } /**************************************************************************************************/ -Database::~Database(){ - - templateSequences.clear(); -} +Database::~Database(){} /**************************************************************************************************/ @@ -91,5 +31,3 @@ float Database::getSearchScore() { return searchScore; } // we're assuming that int Database::getLongestBase() { return longest+1; } /**************************************************************************************************/ - - diff --git a/database.hpp b/database.hpp index 01393c1..01916d1 100644 --- a/database.hpp +++ b/database.hpp @@ -11,24 +11,51 @@ */ -/* This class is a parent to blastdb, distancedb, kmerdb, suffixdb. Which are used to convert a squencedb object into that form. */ +/* This class is a parent to blastdb, kmerdb, suffixdb. */ #include "mothur.h" +#include "sequence.hpp" -class Sequence; +/**************************************************************************************************/ +struct seqMatch { //used to select top n matches + int seq; + int match; + seqMatch(int s, int m) : seq(s), match(m) {} +}; +/**************************************************************************************************/ +inline bool compareSeqMatches (seqMatch member, seqMatch member2){ //sorts largest to smallest + if(member.match > member2.match){ + return true; } + else{ + return false; + } +} +/**************************************************************************************************/ +inline bool compareSeqMatchesReverse (seqMatch member, seqMatch member2){ //sorts largest to smallest + if(member.match < member2.match){ + return true; } + else{ + return false; + } +} +/**************************************************************************************************/ class Database { + public: - Database(string); + Database(); virtual ~Database(); - virtual Sequence findClosestSequence(Sequence*) = 0; + virtual void generateDB() = 0; + virtual void addSequence(Sequence) = 0; //add sequence to search engine + virtual vector findClosestSequences(Sequence*, int) = 0; // returns indexes of n closest sequences to query virtual float getSearchScore(); virtual int getLongestBase(); + virtual void readKmerDB(ifstream&){}; + virtual void setNumSeqs(int i) { numSeqs = i; } protected: int numSeqs, longest; float searchScore; - vector templateSequences; }; - +/**************************************************************************************************/ #endif diff --git a/distancedb.cpp b/distancedb.cpp index 30f9359..d6ec258 100644 --- a/distancedb.cpp +++ b/distancedb.cpp @@ -54,7 +54,8 @@ Sequence DistanceDB::findClosestSequence(Sequence* candidateSeq){ searchScore = 100. * simAccession.simScore; searchIndex++; // return templateSequences[closestMatchIndexNumber]; - return templateSequences[simAccession.indexNumber]; + if (templateValid) { return templateSequences[simAccession.indexNumber]; } + else { return emptySequence; } } diff --git a/distancedb.hpp b/distancedb.hpp index 8da1d10..b0d46c6 100644 --- a/distancedb.hpp +++ b/distancedb.hpp @@ -16,8 +16,9 @@ class DistanceDB : public Database { public: + DistanceDB(string, string); - Sequence findClosestSequence(Sequence*); + vector findClosestSequences(Sequence*, int); private: @@ -26,8 +27,8 @@ private: int indexNumber; float simScore; }; + vector mostSimSequenceVector; -// ifstream inputData; int searchIndex; }; diff --git a/doTaxonomy.cpp b/doTaxonomy.cpp new file mode 100644 index 0000000..e664764 --- /dev/null +++ b/doTaxonomy.cpp @@ -0,0 +1,125 @@ +/* + * doTaxonomy.cpp + * + * + * Created by Pat Schloss on 6/17/09. + * Copyright 2009 Patrick D. Schloss. All rights reserved. + * + */ + +#include "doTaxonomy.h" + +/**************************************************************************************************/ + +PhyloTree::PhyloTree(){ + + numNodes = 1; + numSeqs = 0; + tree.push_back(TaxNode("Root")); + tree[0].heirarchyID = "0"; +} + +/**************************************************************************************************/ + +string PhyloTree::getNextTaxon(string& heirarchy){ + + string currentLevel = ""; + if(heirarchy != ""){ + currentLevel=heirarchy.substr(0,heirarchy.find_first_of(';')); + heirarchy=heirarchy.substr(heirarchy.find_first_of(';')+1); + } + return currentLevel; + +} + +/**************************************************************************************************/ + +void PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){ + + numSeqs++; + + map::iterator childPointer; + + int currentNode = 0; + int level = 1; + + tree[0].accessions.push_back(seqName); + string taxon;// = getNextTaxon(seqTaxonomy); + + while(seqTaxonomy != ""){ + + level++; + +//somehow the parent is getting one too many accnos +//use print to reassign the taxa id + taxon = getNextTaxon(seqTaxonomy); + + childPointer = tree[currentNode].children.find(taxon); + + if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on + currentNode = childPointer->second; + tree[currentNode].accessions.push_back(seqName); + } + else{ //otherwise, create it + tree.push_back(TaxNode(taxon)); + numNodes++; + tree[currentNode].children[taxon] = numNodes-1; + +// int numChildren = tree[currentNode].children.size(); +// string heirarchyID = tree[currentNode].heirarchyID; +// tree[currentNode].accessions.push_back(seqName); + + currentNode = tree[currentNode].children[taxon]; + tree[currentNode].accessions.push_back(seqName); + +// tree[currentNode].level = level; +// tree[currentNode].childNumber = numChildren; +// tree[currentNode].heirarchyID = heirarchyID + '.' + toString(tree[currentNode].childNumber); + } + + } +} + +/**************************************************************************************************/ + +void PhyloTree::assignHeirarchyIDs(int index){ + + map::iterator it; + int counter = 1; + + for(it=tree[index].children.begin();it!=tree[index].children.end();it++){ + tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter); + counter++; + tree[it->second].level = tree[index].level + 1; + assignHeirarchyIDs(it->second); + + } + +} + +/**************************************************************************************************/ + +void PhyloTree::print(ofstream& out){ + + out << tree[0].level << '\t'<< tree[0].heirarchyID << '\t' << tree[0].name << '\t' << tree[0].children.size() << '\t' << tree[0].accessions.size() << endl; + print(0, out); + + +} + +/**************************************************************************************************/ + +void PhyloTree::print(int i, ofstream& out){ + + map::iterator it; + for(it=tree[i].children.begin();it!=tree[i].children.end();it++){ + out <second].level << '\t' << tree[it->second].heirarchyID << '\t' << tree[it->second].name << '\t' << tree[it->second].children.size() << '\t' << tree[it->second].accessions.size() << endl; + print(it->second, out); + } + +} + +/**************************************************************************************************/ + + + diff --git a/doTaxonomy.h b/doTaxonomy.h new file mode 100644 index 0000000..c1ec3ad --- /dev/null +++ b/doTaxonomy.h @@ -0,0 +1,42 @@ +/* + * doTaxonomy.h + * + * + * Created by Pat Schloss on 6/17/09. + * Copyright 2009 Patrick D. Schloss. All rights reserved. + * + */ + +#include "mothur.h" + +/**************************************************************************************************/ + +struct TaxNode { + vector accessions; + map children; + int parent, childNumber, level; + string name, heirarchyID; + + TaxNode(string n) : name(n), level(0), parent(-1) { } + TaxNode(){} +}; + +/**************************************************************************************************/ + +class PhyloTree { + +public: + PhyloTree(); + void addSeqToTree(string, string); + void assignHeirarchyIDs(int); + void print(ofstream&); +private: + string getNextTaxon(string&); + vector tree; + void print(int, ofstream&); + int numNodes; + int numSeqs; +}; + +/**************************************************************************************************/ + diff --git a/hcluster.cpp b/hcluster.cpp index 2d980d7..7613ecb 100644 --- a/hcluster.cpp +++ b/hcluster.cpp @@ -235,7 +235,7 @@ void HCluster::updateArrayandLinkTable() { } //cout << "here3" << endl; linkTable[colSpot].erase(size); - //linkTable.erase(linkTable.begin()+rowSpot); //delete row + linkTable.erase(linkTable.begin()+rowSpot); //delete row //printInfo(); //update activerows activeLinks.erase(smallRow); @@ -243,9 +243,9 @@ void HCluster::updateArrayandLinkTable() { activeLinks[size] = colSpot; //adjust everybody elses spot since you deleted - time vs. space - //for (it = activeLinks.begin(); it != activeLinks.end(); it++) { - //if (it->second > rowSpot) { activeLinks[it->first]--; } - //} + for (it = activeLinks.begin(); it != activeLinks.end(); it++) { + if (it->second > rowSpot) { activeLinks[it->first]--; } + } //cout << "here4" << endl; diff --git a/kmerdb.cpp b/kmerdb.cpp index 86dba09..bf8679e 100644 --- a/kmerdb.cpp +++ b/kmerdb.cpp @@ -29,27 +29,17 @@ /**************************************************************************************************/ -KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerSize(kSize) { +KmerDB::KmerDB(string fastaFileName, int kSize) : Database(), kmerSize(kSize) { try { - string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; - ifstream kmerFileTest(kmerDBName.c_str()); + kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + count = 0; maxKmer = power4s[kmerSize]; kmerLocations.resize(maxKmer+1); - if(!kmerFileTest){ // if we can open the kmer db file, then read it in... - mothurOut("Generating the " + kmerDBName + " database...\t"); cout.flush(); - generateKmerDB(kmerDBName); - } - else{ // ...otherwise generate it. - mothurOut("Reading in the " + kmerDBName + " database...\t"); cout.flush(); - readKmerDB(kmerDBName, kmerFileTest); - } - mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush(); - } catch(exception& e) { errorOut(e, "KmerDB", "KmerDB"); @@ -59,27 +49,21 @@ KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerS } /**************************************************************************************************/ -KmerDB::~KmerDB(){ - - //for (int i = 0; i < templateSequences.size(); i++) { delete templateSequences[i]; } - // templateSequences.clear(); -} +KmerDB::~KmerDB(){} /**************************************************************************************************/ -Sequence KmerDB::findClosestSequence(Sequence* candidateSeq){ +vector KmerDB::findClosestSequences(Sequence* candidateSeq, int num){ try { - + vector topMatches; Kmer kmer(kmerSize); - searchScore = 0; - int maxSequence = 0; vector matches(numSeqs, 0); // a record of the sequences with shared kmers vector timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found int numKmers = candidateSeq->getNumBases() - kmerSize + 1; - + for(int i=0;igetUnaligned(), i); // go through the query sequence and get a kmer number if(timesKmerFound[kmerNumber] == 0){ // if we haven't seen it before... @@ -89,46 +73,36 @@ Sequence KmerDB::findClosestSequence(Sequence* candidateSeq){ } timesKmerFound[kmerNumber] = 1; // ok, we've seen the kmer now } - - for(int i=0;i searchScore){ // the query sequence - searchScore = matches[i]; - maxSequence = i; - } + + vector seqMatches; + for(int i=0;i seenBefore(maxKmer+1,0); - for(int j=0;j seenBefore(maxKmer+1,0); + for(int j=0;j> seqName >> numValues; for(int j=0;j findClosestSequences(Sequence*, int); + void readKmerDB(ifstream&); + private: - void generateKmerDB(string); - void readKmerDB(string, ifstream&); + int kmerSize; - int maxKmer; + int maxKmer, count; + string kmerDBName; vector > kmerLocations; }; diff --git a/knn.cpp b/knn.cpp new file mode 100644 index 0000000..6d57fb1 --- /dev/null +++ b/knn.cpp @@ -0,0 +1,109 @@ +/* + * knn.cpp + * Mothur + * + * Created by westcott on 11/4/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "knn.h" + +/**************************************************************************************************/ +Knn::Knn(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch, int n) +: Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch), num(n) {} +/**************************************************************************************************/ +string Knn::getTaxonomy(Sequence* seq) { + try { + string tax; + + //use database to find closest seq + vector closest = database->findClosestSequences(seq, num); + + vector closestNames; + for (int i = 0; i < closest.size(); i++) { + //find that sequences taxonomy in map + it = taxonomy.find(names[closest[i]]); + + //is this sequence in the taxonomy file + if (it == taxonomy.end()) { //error not in file + mothurOut("Error: sequence " + names[closest[i]] + " is not in the taxonomy file. It will be eliminated as a match to sequence " + seq->getName() + "."); mothurOutEndLine(); + }else{ closestNames.push_back(it->first); } + } + + if (closestNames.size() == 0) { + mothurOut("Error: All the matches for sequence " + seq->getName() + " have been eliminated. " + seq->getName() + " will be disregarded."); mothurOutEndLine(); + tax = "bad seq"; + }else{ + tax = findCommonTaxonomy(closestNames); + if (tax == "") { mothurOut("There are no common levels for sequence " + seq->getName() + ". " + seq->getName() + " will be disregarded."); mothurOutEndLine(); tax = "bad seq"; } + } + + return tax; + } + catch(exception& e) { + errorOut(e, "Knn", "getTaxonomy"); + exit(1); + } +} +/**************************************************************************************************/ +string Knn::findCommonTaxonomy(vector closest) { + try { + vector< vector > taxons; //taxon[0] = vector of taxonomy info for closest[0]. + //so if closest[0] taxonomy is Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea; + //taxon[0][0] = Bacteria, taxon[0][1] = Alphaproteobacteria.... + + taxons.resize(closest.size()); + int smallest = 100; + + for (int i = 0; i < closest.size(); i++) { + + string tax = taxonomy[closest[i]]; //we know its there since we checked in getTaxonomy + + tax = tax.substr(0, tax.length()-1); //get rid of last ';' + + //parse taxonomy + string individual; + while (tax.find_first_of(';') != -1) { + individual = tax.substr(0,tax.find_first_of(';')); + tax = tax.substr(tax.find_first_of(';')+1, tax.length()); + taxons[i].push_back(individual); + + } + //get last one + taxons[i].push_back(tax); + + //figure out who has the shortest taxonomy info. so you can start comparing there + if (taxons[i].size() < smallest) { + smallest = taxons[i].size(); + } + } + + //start at the highest level all the closest seqs have + string common = ""; + for (int i = (smallest-1); i >= 0; i--) { + + string thistax = taxons[0][i]; + int num = 0; + for (int j = 1; j < taxons.size(); j++) { + if (taxons[j][i] != thistax) { break; } + num = j; + } + + if (num == (taxons.size()-1)) { //they all match at this level + for (int k = 0; k <= i; k++) { + common += taxons[0][k] + ';'; + } + break; + } + } + + return common; + } + catch(exception& e) { + errorOut(e, "Knn", "findCommonTaxonomy"); + exit(1); + } +} +/**************************************************************************************************/ + diff --git a/knn.h b/knn.h new file mode 100644 index 0000000..15a33b0 --- /dev/null +++ b/knn.h @@ -0,0 +1,35 @@ +#ifndef KNN_H +#define KNN_H + +/* + * knn.h + * Mothur + * + * Created by westcott on 11/4/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "classify.h" + +/**************************************************************************************************/ + +class Knn : public Classify { + +public: + Knn(string, string, string, int, int, int, int, int, int); + ~Knn() {}; + + string getTaxonomy(Sequence*); + +private: + int num; + string findCommonTaxonomy(vector); +}; + +/**************************************************************************************************/ + +#endif + + diff --git a/nast.cpp b/nast.cpp index f247037..4848eb2 100644 --- a/nast.cpp +++ b/nast.cpp @@ -40,12 +40,12 @@ Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method void Nast::pairwiseAlignSeqs(){ // Here we call one of the pairwise alignment methods to align our unaligned candidate // and template sequences try { - + alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned()); - + string candAln = alignment->getSeqAAln(); string tempAln = alignment->getSeqBAln(); - + if(candAln == ""){ candidateSeq->setPairwise(""); diff --git a/phylotype.cpp b/phylotype.cpp new file mode 100644 index 0000000..f7af97a --- /dev/null +++ b/phylotype.cpp @@ -0,0 +1,42 @@ +/* + * phylotype.cpp + * Mothur + * + * Created by westcott on 11/3/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "phylotype.h" + +/**************************************************************************************************/ +PhyloType::PhyloType(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) +: Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch) {} +/**************************************************************************************************/ +string PhyloType::getTaxonomy(Sequence* seq) { + try { + string tax; + + //use database to find closest seq + vector closest = database->findClosestSequences(seq, 1); + + //find that sequences taxonomy in map + it = taxonomy.find(names[closest[0]]); + + //is this sequence in the taxonomy file + if (it == taxonomy.end()) { //error not in file + mothurOut("Error: sequence " + names[closest[0]] + " is not in the taxonomy file. It is the closest match to sequence " + seq->getName() + ". " + seq->getName() + " will be disregarded."); mothurOutEndLine(); + tax = "bad seq"; + }else { + tax = it->second; + } + + return tax; + } + catch(exception& e) { + errorOut(e, "PhyloType", "getTaxonomy"); + exit(1); + } +} +/**************************************************************************************************/ + diff --git a/phylotype.h b/phylotype.h new file mode 100644 index 0000000..ee649e3 --- /dev/null +++ b/phylotype.h @@ -0,0 +1,34 @@ +#ifndef PHYLOTYPE_H +#define PHYLOTYPE_H + +/* + * phylotype.h + * Mothur + * + * Created by westcott on 11/3/09. + * Copyright 2009 Schloss Lab. All rights reserved. + * + */ + +#include "mothur.h" +#include "classify.h" + +/**************************************************************************************************/ + +class PhyloType : public Classify { + +public: + PhyloType(string, string, string, int, int, int, int, int); + ~PhyloType() {}; + + string getTaxonomy(Sequence*); + + +private: + +}; + +/**************************************************************************************************/ + +#endif + diff --git a/suffixdb.cpp b/suffixdb.cpp index 5bcd8e9..39a9b54 100644 --- a/suffixdb.cpp +++ b/suffixdb.cpp @@ -22,41 +22,58 @@ /**************************************************************************************************/ -SuffixDB::SuffixDB(string fastaFileName) : Database(fastaFileName) { - +SuffixDB::SuffixDB(int numSeqs) : Database() { suffixForest.resize(numSeqs); - mothurOut("Generating the suffix tree database...\t"); cout.flush(); - for(int i=0;iconvert2ints(); // the candidate sequence needs to be a string of ints - for(int i=0;i SuffixDB::findClosestSequences(Sequence* candidateSeq, int num){ + try { + vector topMatches; + string processedSeq = candidateSeq->convert2ints(); // the candidate sequence needs to be a string of ints + + vector seqMatches; + for(int i=0;i= 0; i--) { suffixForest.pop_back(); } - // templateSequences.clear(); - } /**************************************************************************************************/ diff --git a/suffixdb.hpp b/suffixdb.hpp index f211aa7..1baa99e 100644 --- a/suffixdb.hpp +++ b/suffixdb.hpp @@ -26,12 +26,16 @@ class SuffixTree; class SuffixDB : public Database { public: - SuffixDB(string); + SuffixDB(int); ~SuffixDB(); - Sequence findClosestSequence(Sequence*); + + void generateDB() {}; //adding sequences generates the db + void addSequence(Sequence); + vector findClosestSequences(Sequence*, int); private: vector suffixForest; + int count; }; #endif diff --git a/suffixtree.cpp b/suffixtree.cpp index 53a6d3f..2842b36 100644 --- a/suffixtree.cpp +++ b/suffixtree.cpp @@ -125,7 +125,46 @@ int SuffixTree::countSuffixes(string compareSequence, int& minValue){ // here we return numSuffixes; // change the value and return the number of suffixes } +//******************************************************************************************************************** +int SuffixTree::countSuffixes(string compareSequence){ // here we count the number of suffix parts + // we need to rewrite a user supplied sequence. if the + int numSuffixes = 0; // count exceeds the supplied minValue, bail out. The + int seqLength = compareSequence.length(); // time complexity should be O(L) + int position = 0; + + int presentNode = 0; + + while(position < seqLength){ // while the position in the query sequence isn't at the end... + + int newNode = nodeVector[presentNode]->getChild(compareSequence[position]); // see if the current node has a + // child that matches the next character in the query + if(newNode == -1){ + if(presentNode == 0){ position++; } // if not, go back to the root and increase the count + numSuffixes++; // by one. + presentNode = 0; + } + else{ // if there is, move to that node and see how far down + presentNode = newNode; // it we can get + + for(int i=nodeVector[newNode]->getStartCharPos(); i<=nodeVector[newNode]->getEndCharPos(); i++){ + if(compareSequence[position] == sequence[i]){ + position++; // as long as the query and branch agree, keep going + } + else{ + numSuffixes++; // if there is a mismatch, increase the number of + presentNode = 0; // suffixes and go back to the root + break; + } + } + } + // if we get all the way through the node we'll go to the top of the while loop and find the child node + // that corresponds to what we are interested in + } + numSuffixes--; // the method puts an extra count on numSuffixes + + return numSuffixes; // change the value and return the number of suffixes +} //******************************************************************************************************************** void SuffixTree::canonize(){ // if you have to ask how this works, you don't really want to know and this really diff --git a/suffixtree.hpp b/suffixtree.hpp index b85cee9..ab262ba 100644 --- a/suffixtree.hpp +++ b/suffixtree.hpp @@ -42,6 +42,7 @@ public: string getSeqName(); void print(); int countSuffixes(string, int&); + int countSuffixes(string); private: void addPrefix(int);