379293C30F2DE73400B9034A /* treemap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379293C20F2DE73400B9034A /* treemap.cpp */; };
379294700F2E191800B9034A /* parsimonycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3792946F0F2E191800B9034A /* parsimonycommand.cpp */; };
3792948A0F2E258500B9034A /* parsimony.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379294890F2E258500B9034A /* parsimony.cpp */; };
- 3799A9500FD6A58C00E33EDE /* distancedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3799A94C0FD6A58C00E33EDE /* distancedb.cpp */; };
3799A9510FD6A58C00E33EDE /* seqsummarycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3799A94E0FD6A58C00E33EDE /* seqsummarycommand.cpp */; };
379D3D510FF90E090068C1C0 /* chimeraseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */; };
37AD4CE40F28AEA300AA2D49 /* sharedlistvector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37AD4CE30F28AEA300AA2D49 /* sharedlistvector.cpp */; };
A73329CF1083B3B3003B10C5 /* getlistcountcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73329CE1083B3B3003B10C5 /* getlistcountcommand.cpp */; };
A751ACEA1098B283003D0911 /* readcluster.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A751ACE91098B283003D0911 /* readcluster.cpp */; };
A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; };
+ A7861478109F5FA00010AAAF /* classifyseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */; };
+ A787810310A06D5D0086103D /* classify.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787810210A06D5D0086103D /* classify.cpp */; };
+ A787811510A074330086103D /* phylotype.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787811410A074330086103D /* phylotype.cpp */; };
+ A78781BA10A0813E0086103D /* doTaxonomy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78781B810A0813D0086103D /* doTaxonomy.cpp */; };
+ A787821110A0AAE70086103D /* bayesian.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787821010A0AAE70086103D /* bayesian.cpp */; };
+ A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78782AA10A1B1CB0086103D /* alignmentdb.cpp */; };
+ A787844510A1EBDD0086103D /* knn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787844410A1EBDD0086103D /* knn.cpp */; };
A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; };
A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; };
A7E4A783106913F900688F62 /* getsharedotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A782106913F900688F62 /* getsharedotucommand.cpp */; };
379294880F2E258500B9034A /* parsimony.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsimony.h; sourceTree = SOURCE_ROOT; };
379294890F2E258500B9034A /* parsimony.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = parsimony.cpp; sourceTree = SOURCE_ROOT; };
3792948D0F2E271100B9034A /* treecalculator.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treecalculator.h; sourceTree = SOURCE_ROOT; };
- 3799A94C0FD6A58C00E33EDE /* distancedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = distancedb.cpp; sourceTree = SOURCE_ROOT; };
- 3799A94D0FD6A58C00E33EDE /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = SOURCE_ROOT; };
3799A94E0FD6A58C00E33EDE /* seqsummarycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqsummarycommand.cpp; sourceTree = SOURCE_ROOT; };
3799A94F0FD6A58C00E33EDE /* seqsummarycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqsummarycommand.h; sourceTree = SOURCE_ROOT; };
379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraseqscommand.h; sourceTree = SOURCE_ROOT; };
A751ACE91098B283003D0911 /* readcluster.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readcluster.cpp; sourceTree = SOURCE_ROOT; };
A75B887B104C16860083C454 /* ccode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccode.cpp; sourceTree = SOURCE_ROOT; };
A75B887C104C16860083C454 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ccode.h; sourceTree = SOURCE_ROOT; };
+ A7861476109F5FA00010AAAF /* classifyseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classifyseqscommand.h; sourceTree = "<group>"; };
+ A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classifyseqscommand.cpp; sourceTree = "<group>"; };
+ A78780FE10A06B8B0086103D /* classify.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classify.h; sourceTree = SOURCE_ROOT; };
+ A787810210A06D5D0086103D /* classify.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classify.cpp; sourceTree = SOURCE_ROOT; };
+ A787811310A074330086103D /* phylotype.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylotype.h; sourceTree = SOURCE_ROOT; };
+ A787811410A074330086103D /* phylotype.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylotype.cpp; sourceTree = SOURCE_ROOT; };
+ A78781B810A0813D0086103D /* doTaxonomy.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = doTaxonomy.cpp; sourceTree = "<group>"; };
+ A78781B910A0813D0086103D /* doTaxonomy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = doTaxonomy.h; sourceTree = "<group>"; };
+ A787820F10A0AAE70086103D /* bayesian.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bayesian.h; sourceTree = "<group>"; };
+ A787821010A0AAE70086103D /* bayesian.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bayesian.cpp; sourceTree = "<group>"; };
+ A78782A910A1B1CB0086103D /* alignmentdb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignmentdb.h; sourceTree = "<group>"; };
+ A78782AA10A1B1CB0086103D /* alignmentdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignmentdb.cpp; sourceTree = "<group>"; };
+ A787844310A1EBDD0086103D /* knn.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = knn.h; sourceTree = "<group>"; };
+ A787844410A1EBDD0086103D /* knn.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = knn.cpp; sourceTree = "<group>"; };
A7B04491106CC3E60046FC83 /* chimeraslayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayer.h; sourceTree = SOURCE_ROOT; };
A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayer.cpp; sourceTree = SOURCE_ROOT; };
A7B0450C106CEEC90046FC83 /* slayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = slayer.h; sourceTree = SOURCE_ROOT; };
373C68DB0FC1C38D00137ACD /* blastalign.cpp */,
37D928A60F2133C0001D4494 /* calculators */,
374F2FE81006346600E97C66 /* chimera */,
+ A787811210A073F40086103D /* classifiers */,
37D927C20F21331F001D4494 /* cluster.hpp */,
37D927C10F21331F001D4494 /* cluster.cpp */,
- A729ACE210849BBE00139801 /* hcluster.h */,
- A729ACE310849BBE00139801 /* hcluster.cpp */,
37D928A90F2133E5001D4494 /* commands */,
37D927C60F21331F001D4494 /* collect.h */,
37D927C50F21331F001D4494 /* collect.cpp */,
37D927DF0F21331F001D4494 /* globaldata.cpp */,
373C68BB0FC1C1A600137ACD /* gotohoverlap.hpp */,
373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */,
+ A729ACE210849BBE00139801 /* hcluster.h */,
+ A729ACE310849BBE00139801 /* hcluster.cpp */,
375873ED0F7D646F0040F377 /* heatmap.h */,
375873EE0F7D646F0040F377 /* heatmap.cpp */,
378598720FDD4C1500EF9D03 /* heatmapsim.h */,
377326630FAF16E0007ABB8B /* consensuscommand.cpp */,
379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */,
379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */,
+ A7861476109F5FA00010AAAF /* classifyseqscommand.h */,
+ A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */,
37B28F660F27590100808A62 /* deconvolutecommand.h */,
37B28F670F27590100808A62 /* deconvolutecommand.cpp */,
37C753CC0FB3415200DBD02E /* distancecommand.h */,
373C68A00FC1C07D00137ACD /* alignment.cpp */,
373C68A30FC1C07D00137ACD /* alignmentcell.hpp */,
373C68A20FC1C07D00137ACD /* alignmentcell.cpp */,
+ A78782A910A1B1CB0086103D /* alignmentdb.h */,
+ A78782AA10A1B1CB0086103D /* alignmentdb.cpp */,
373C69170FC1C8AF00137ACD /* blastdb.hpp */,
373C69160FC1C8AF00137ACD /* blastdb.cpp */,
37D927D40F21331F001D4494 /* database.hpp */,
37D927D30F21331F001D4494 /* database.cpp */,
37D927D50F21331F001D4494 /* datavector.hpp */,
- 3799A94D0FD6A58C00E33EDE /* distancedb.hpp */,
- 3799A94C0FD6A58C00E33EDE /* distancedb.cpp */,
37D927DC0F21331F001D4494 /* fastamap.h */,
37D927DB0F21331F001D4494 /* fastamap.cpp */,
375873EA0F7D64520040F377 /* fullmatrix.h */,
name = errorcheckor;
sourceTree = "<group>";
};
+ A787811210A073F40086103D /* classifiers */ = {
+ isa = PBXGroup;
+ children = (
+ A78780FE10A06B8B0086103D /* classify.h */,
+ A787810210A06D5D0086103D /* classify.cpp */,
+ A787820F10A0AAE70086103D /* bayesian.h */,
+ A787821010A0AAE70086103D /* bayesian.cpp */,
+ A78781B910A0813D0086103D /* doTaxonomy.h */,
+ A78781B810A0813D0086103D /* doTaxonomy.cpp */,
+ A787844310A1EBDD0086103D /* knn.h */,
+ A787844410A1EBDD0086103D /* knn.cpp */,
+ A787811310A074330086103D /* phylotype.h */,
+ A787811410A074330086103D /* phylotype.cpp */,
+ );
+ name = classifiers;
+ sourceTree = "<group>";
+ };
C6859E8C029090F304C91782 /* Documentation */ = {
isa = PBXGroup;
children = (
21E859D80FC4632E005E1A48 /* matrixoutputcommand.cpp in Sources */,
3749271D0FD58C840031C06B /* getsabundcommand.cpp in Sources */,
3749273F0FD5956B0031C06B /* getrabundcommand.cpp in Sources */,
- 3799A9500FD6A58C00E33EDE /* distancedb.cpp in Sources */,
3799A9510FD6A58C00E33EDE /* seqsummarycommand.cpp in Sources */,
371B30B40FD7EE67000414CA /* screenseqscommand.cpp in Sources */,
7E09C5140FDA79C5002ECAE5 /* reversecommand.cpp in Sources */,
A729ACD010848E6100139801 /* hclustercommand.cpp in Sources */,
A729ACE410849BBE00139801 /* hcluster.cpp in Sources */,
A751ACEA1098B283003D0911 /* readcluster.cpp in Sources */,
+ A7861478109F5FA00010AAAF /* classifyseqscommand.cpp in Sources */,
+ A787810310A06D5D0086103D /* classify.cpp in Sources */,
+ A787811510A074330086103D /* phylotype.cpp in Sources */,
+ A78781BA10A0813E0086103D /* doTaxonomy.cpp in Sources */,
+ A787821110A0AAE70086103D /* bayesian.cpp in Sources */,
+ A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */,
+ A787844510A1EBDD0086103D /* knn.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
#include "blastalign.hpp"
#include "noalign.hpp"
-#include "kmerdb.hpp"
-#include "suffixdb.hpp"
-#include "blastdb.hpp"
-
#include "nast.hpp"
#include "nastreport.hpp"
mothurOut("The template and candidate parameters are required.\n");
mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n");
mothurOut("The align parameter allows you to specify the alignment method to use. Your options are: gotoh, needleman, blast and noalign. The default is needleman.\n");
- mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 7.\n");
+ mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n");
mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n");
mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
try {
if (abort == true) { return 0; }
- if(search == "kmer") { templateDB = new KmerDB(templateFileName, kmerSize); }
- else if(search == "suffix") { templateDB = new SuffixDB(templateFileName); }
- else if(search == "blast") { templateDB = new BlastDB(templateFileName, gapOpen, gapExtend, match, misMatch); }
- else {
- mothurOut(search + " is not a valid search option. I will run the command using kmer, ksize=8.");
- mothurOutEndLine();
- kmerSize = 8;
-
- templateDB = new KmerDB(templateFileName, kmerSize);
- }
-
+ templateDB = new AlignmentDB(templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch);
int longestBase = templateDB->getLongestBase();
-
+
if(align == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase); }
else if(align == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase); }
else if(align == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); }
Sequence temp = templateDB->findClosestSequence(candidateSeq);
Sequence* templateSeq = &temp;
-
+
report.setTemplate(templateSeq);
report.setSearchParameters(search, templateDB->getSearchScore());
report.setAlignmentParameters(align, alignment);
report.setNastParameters(nast);
-
+
alignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl;
report.print();
delete candidateSeq;
-
+
}
alignmentFile.close();
#include "command.hpp"
#include "database.hpp"
#include "alignment.hpp"
+#include "alignmentdb.h"
class AlignCommand : public Command {
vector<int> processIDS; //processid
vector<linePair*> lines;
- Database* templateDB;
+ AlignmentDB* templateDB;
Alignment* alignment;
int driver(linePair*, string, string);
--- /dev/null
+/*
+ * alignmentdb.cpp
+ * Mothur
+ *
+ * Created by westcott on 11/4/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "alignmentdb.h"
+#include "kmerdb.hpp"
+#include "suffixdb.hpp"
+#include "blastdb.hpp"
+
+
+/**************************************************************************************************/
+AlignmentDB::AlignmentDB(string fastaFileName, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch){ // This assumes that the template database is in fasta format, may
+ try { // need to alter this in the future?
+ longest = 0;
+
+ ifstream fastaFile;
+ openInputFile(fastaFileName, fastaFile);
+
+ mothurOutEndLine();
+ mothurOut("Reading in the " + fastaFileName + " template sequences...\t"); cout.flush();
+
+ while (!fastaFile.eof()) {
+ Sequence temp(fastaFile);
+
+ templateSequences.push_back(temp);
+
+ //save longest base
+ if (temp.getUnaligned().length() > longest) { longest = temp.getUnaligned().length(); }
+
+ gobble(fastaFile);
+ }
+
+ numSeqs = templateSequences.size();
+
+ fastaFile.close();
+ //all of this is elsewhere already!
+
+ mothurOut("DONE.");
+ mothurOutEndLine(); cout.flush();
+
+ //in case you delete the seqs and then ask for them
+ emptySequence = Sequence();
+ emptySequence.setName("no_match");
+ emptySequence.setUnaligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
+ emptySequence.setAligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
+
+ bool needToGenerate = true;
+ string kmerDBName;
+ if(method == "kmer") {
+ search = new KmerDB(fastaFileName, kmerSize);
+
+ kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+ ifstream kmerFileTest(kmerDBName.c_str());
+
+ if(kmerFileTest){ needToGenerate = false; }
+ }
+ else if(method == "suffix") { search = new SuffixDB(numSeqs); }
+ else if(method == "blast") { search = new BlastDB(gapOpen, gapExtend, match, misMatch); }
+ else {
+ mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
+ mothurOutEndLine();
+ search = new KmerDB(fastaFileName, 8);
+ }
+
+ if (needToGenerate) {
+
+ //add sequences to search
+ for (int i = 0; i < templateSequences.size(); i++) {
+ search->addSequence(templateSequences[i]);
+ }
+ search->generateDB();
+
+ }else if ((method == "kmer") && (!needToGenerate)) {
+ ifstream kmerFileTest(kmerDBName.c_str());
+ search->readKmerDB(kmerFileTest);
+ }
+
+ search->setNumSeqs(numSeqs);
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignmentDB", "AlignmentDB");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+AlignmentDB::~AlignmentDB() { delete search; }
+/**************************************************************************************************/
+Sequence AlignmentDB::findClosestSequence(Sequence* seq) {
+ try{
+
+ vector<int> spot = search->findClosestSequences(seq, 1);
+
+ if (spot.size() != 0) { return templateSequences[spot[0]]; }
+ else { return emptySequence; }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "AlignmentDB", "findClosestSequence");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+
+
+
--- /dev/null
+#ifndef ALIGNMENTDB_H
+#define ALIGNMENTDB_H
+
+/*
+ * alignmentdb.h
+ * Mothur
+ *
+ * Created by westcott on 11/4/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "sequence.hpp"
+#include "database.hpp"
+
+/**************************************************************************************************/
+
+class AlignmentDB {
+
+public:
+
+ AlignmentDB(string, string, int, int, int, int, int); //reads fastafile passed in and stores sequences
+ ~AlignmentDB();
+
+ Sequence findClosestSequence(Sequence*);
+ float getSearchScore() { return search->getSearchScore(); }
+ int getLongestBase() { return longest; }
+
+private:
+ int numSeqs, longest;
+ float searchScore;
+
+ Database* search;
+ vector<Sequence> templateSequences;
+ Sequence emptySequence;
+};
+
+/**************************************************************************************************/
+
+#endif
+
--- /dev/null
+/*
+ * bayesian.cpp
+ * Mothur
+ *
+ * Created by westcott on 11/3/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "bayesian.h"
+
+/**************************************************************************************************/
+Bayesian::Bayesian(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) :
+Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch) {}
+/**************************************************************************************************/
+string Bayesian::getTaxonomy(Sequence* seq) {
+ try {
+ string tax;
+
+
+ return tax;
+ }
+ catch(exception& e) {
+ errorOut(e, "Bayesian", "getTaxonomy");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
--- /dev/null
+#ifndef BAYESIAN_H
+#define BAYESIAN_H
+
+/*
+ * bayesian.h
+ * Mothur
+ *
+ * Created by westcott on 11/3/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "classify.h"
+
+/**************************************************************************************************/
+
+class Bayesian : public Classify {
+
+public:
+ Bayesian(string, string, string, int, int, int, int, int);
+ ~Bayesian() {};
+
+ string getTaxonomy(Sequence*);
+
+
+private:
+
+};
+
+/**************************************************************************************************/
+
+#endif
+
/**************************************************************************************************/
-BlastDB::BlastDB(string fastaFileName, float gO, float gE, float m, float mM) : Database(fastaFileName),
+BlastDB::BlastDB(float gO, float gE, float m, float mM) : Database(),
gapOpen(gO), gapExtend(gE), match(m), misMatch(mM) {
globaldata = GlobalData::getInstance();
-
- mothurOut("Generating the temporary BLAST database...\t"); cout.flush();
+ count = 0;
int randNumber = rand();
dbFileName = toString(randNumber) + ".template.unaligned.fasta";
queryFileName = toString(randNumber) + ".candidate.unaligned.fasta";
blastFileName = toString(randNumber) + ".blast";
-
- ofstream unalignedFastaFile;
- openOutputFile(dbFileName, unalignedFastaFile);
-
- for(int i=0;i<numSeqs;i++){ // generating a fasta file with unaligned template
- unalignedFastaFile << '>' << i << endl; // sequences, which will be input to formatdb
- unalignedFastaFile << templateSequences[i].getUnaligned() << endl;
- }
- unalignedFastaFile.close();
-
- path = globaldata->argv;
- path = path.substr(0, (path.find_last_of('m')));
-
- string formatdbCommand = path + "blast/bin/formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
- system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F
- // option tells formatdb that seqs are DNA, not prot
- mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush();
- emptySequence = Sequence();
- emptySequence.setName("no_match");
- emptySequence.setUnaligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
- emptySequence.setAligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
-
-
}
/**************************************************************************************************/
BlastDB::~BlastDB(){
- //for (int i = 0; i < templateSequences.size(); i++) { delete templateSequences[i]; }
- //templateSequences.clear();
- //delete emptySequence;
-
remove(queryFileName.c_str()); // let's clean stuff up and remove the temp files
remove(dbFileName.c_str()); // let's clean stuff up and remove the temp files
remove(blastFileName.c_str()); // let's clean stuff up and remove the temp files
}
-
/**************************************************************************************************/
+//assumes you have added all the template sequences using the addSequence function and run generateDB.
+vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
+ try{
+ vector<int> topMatches;
+
+ ofstream queryFile;
+ openOutputFile(queryFileName, queryFile);
+ queryFile << '>' << seq->getName() << endl;
+ queryFile << seq->getUnaligned() << endl;
+ queryFile.close();
+
+ // the goal here is to quickly survey the database to find the closest match. To do this we are using the default
+ // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
+ // long. With this setting, it seems comparable in speed to the suffix tree approach.
+
+ string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28 -v " + toString(n);
+ blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
+ system(blastCommand.c_str());
+
+ ifstream m8FileHandle;
+ openInputFile(blastFileName, m8FileHandle);
+
+ string dummy;
+ int templateAccession;
+ gobble(m8FileHandle);
+
+ while(!m8FileHandle.eof()){
+ m8FileHandle >> dummy >> templateAccession >> searchScore;
+
+ //get rest of junk in line
+ while (!m8FileHandle.eof()) { char c = m8FileHandle.get(); if (c == 10 || c == 13){ break; } }
+
+ gobble(m8FileHandle);
+ topMatches.push_back(templateAccession);
+ }
+ m8FileHandle.close();
+
+ return topMatches;
+ }
+ catch(exception& e) {
+ errorOut(e, "BlastDB", "findClosestSequences");
+ exit(1);
+ }
-Sequence BlastDB::findClosestSequence(Sequence* candidate){
-
- ofstream queryFile;
- openOutputFile(queryFileName, queryFile);
- queryFile << '>' << candidate->getName() << endl;
- queryFile << candidate->getUnaligned() << endl;
- queryFile.close();
+}
+/**************************************************************************************************/
+void BlastDB::addSequence(Sequence seq) {
+ try {
+ ofstream unalignedFastaFile;
+ openOutputFileAppend(dbFileName, unalignedFastaFile);
-// the goal here is to quickly survey the database to find the closest match. To do this we are using the default
-// wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
-// long. With this setting, it seems comparable in speed to the suffix tree approach.
-
- string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28";
- blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
- system(blastCommand.c_str());
+ // generating a fasta file with unaligned template
+ unalignedFastaFile << '>' << count << endl; // sequences, which will be input to formatdb
+ unalignedFastaFile << seq.getUnaligned() << endl;
+ unalignedFastaFile.close();
+
+ count++;
+ }
+ catch(exception& e) {
+ errorOut(e, "BlastDB", "addSequence");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+void BlastDB::generateDB() {
+ try {
- ifstream m8FileHandle;
- openInputFile(blastFileName, m8FileHandle);
+ mothurOut("Generating the temporary BLAST database...\t"); cout.flush();
+
+ path = globaldata->argv;
+ path = path.substr(0, (path.find_last_of('m')));
- string dummy;
- int templateAccession;
- gobble(m8FileHandle);
- if(!m8FileHandle.eof()){
- m8FileHandle >> dummy >> templateAccession >> searchScore;
- m8FileHandle.close();
- return templateSequences[templateAccession];
+ string formatdbCommand = path + "blast/bin/formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
+ system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F
+ // option tells formatdb that seqs are DNA, not prot
+ mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush();
}
- else{
- searchScore = 0.00;
- return emptySequence;
+ catch(exception& e) {
+ errorOut(e, "BlastDB", "generateDB");
+ exit(1);
}
- m8FileHandle.close();
}
/**************************************************************************************************/
+
#include "globaldata.hpp"
class BlastDB : public Database {
+
public:
- BlastDB(string, float, float, float, float);
+ BlastDB(float, float, float, float);
~BlastDB();
- Sequence findClosestSequence(Sequence*);
+
+ void generateDB();
+ void addSequence(Sequence);
+ vector<int> findClosestSequences(Sequence*, int);
private:
string dbFileName;
string blastFileName;
string path;
+ int count;
float gapOpen;
float gapExtend;
float match;
float misMatch;
- Sequence emptySequence;
GlobalData* globaldata;
};
try {
//read in query sequences and subject sequences
- mothurOut("Reading sequences and template file... "); cout.flush();
+ mothurOutEndLine();
+ mothurOut("Reading query sequences... "); cout.flush();
querySeqs = readSeqs(fastafile);
+ mothurOut("Done.");
//templateSeqs = readSeqs(templateFile);
- templateDB = new KmerDB(templateFile, kmerSize);
- mothurOut("Done."); mothurOutEndLine();
+ templateDB = new AlignmentDB(templateFile, "kmer", kmerSize, 0,0,0,0);
+ mothurOutEndLine();
int numSeqs = querySeqs.size();
#include "chimera.h"
#include "kmer.hpp"
#include "kmerdb.hpp"
-#include "database.hpp"
+#include "alignmentdb.h"
/***********************************************************/
//This class was created using the algorythms described in
vector<linePair*> lines;
vector<Sequence*> querySeqs;
- Database* templateDB;
+ AlignmentDB* templateDB;
Kmer* kmer;
vector< vector<sim> > IS; //IS[0] is the vector of IS values for each window for querySeqs[0]
--- /dev/null
+/*
+ * classify.cpp
+ * Mothur
+ *
+ * Created by westcott on 11/3/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "classify.h"
+#include "sequence.hpp"
+#include "kmerdb.hpp"
+#include "suffixdb.hpp"
+#include "blastdb.hpp"
+
+/**************************************************************************************************/
+
+Classify::Classify(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) : taxFile(tfile), templateFile(tempFile) {
+ try {
+
+ readTaxonomy(taxFile);
+
+ int numSeqs = 0;
+ //need to know number of template seqs for suffixdb
+ if (method == "suffix") {
+ ifstream inFASTA;
+ openInputFile(tempFile, inFASTA);
+ numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
+ inFASTA.close();
+ }
+
+ mothurOut("Generating search database... "); cout.flush();
+
+ bool needToGenerate = true;
+ string kmerDBName;
+ if(method == "kmer") {
+ database = new KmerDB(tempFile, kmerSize);
+
+ kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+ ifstream kmerFileTest(kmerDBName.c_str());
+ if(kmerFileTest){ needToGenerate = false; }
+ }
+ else if(method == "suffix") { database = new SuffixDB(numSeqs); }
+ else if(method == "blast") { database = new BlastDB(gapOpen, gapExtend, match, misMatch); }
+ else {
+ mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
+ mothurOutEndLine();
+ database = new KmerDB(tempFile, 8);
+ }
+
+ if (needToGenerate) {
+ ifstream fastaFile;
+ openInputFile(tempFile, fastaFile);
+
+ while (!fastaFile.eof()) {
+ Sequence temp(fastaFile);
+ gobble(fastaFile);
+
+ names.push_back(temp.getName());
+ database->addSequence(temp);
+ }
+ fastaFile.close();
+
+ database->generateDB();
+
+ }else if ((method == "kmer") && (!needToGenerate)) {
+ ifstream kmerFileTest(kmerDBName.c_str());
+ database->readKmerDB(kmerFileTest);
+
+ ifstream fastaFile;
+ openInputFile(tempFile, fastaFile);
+
+ while (!fastaFile.eof()) {
+ Sequence temp(fastaFile);
+ gobble(fastaFile);
+
+ names.push_back(temp.getName());
+ }
+ fastaFile.close();
+ }
+
+ database->setNumSeqs(names.size());
+
+ mothurOut("DONE."); mothurOutEndLine();
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Classify", "Classify");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+void Classify::readTaxonomy(string file) {
+ try {
+
+ ifstream inTax;
+ openInputFile(file, inTax);
+
+ mothurOutEndLine();
+ mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush();
+
+ string name, taxInfo;
+ //read template seqs and save
+ while (!inTax.eof()) {
+ inTax >> name >> taxInfo;
+
+ taxonomy[name] = taxInfo;
+
+ gobble(inTax);
+ }
+ inTax.close();
+
+ mothurOut("DONE.");
+ mothurOutEndLine(); cout.flush();
+
+ }
+ catch(exception& e) {
+ errorOut(e, "Classify", "readTaxonomy");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
--- /dev/null
+#ifndef CLASSIFY_H
+#define CLASSIFY_H
+
+/*
+ * classify.h
+ * Mothur
+ *
+ * Created by westcott on 11/3/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+/* This class is a parent to phylotyp, bayesian, knn. */
+
+#include "mothur.h"
+#include "database.hpp"
+
+
+
+class Sequence;
+
+/**************************************************************************************************/
+
+class Classify {
+
+public:
+ Classify(string, string, string, int, int, int, int, int);
+ Classify(){};
+
+ virtual ~Classify(){};
+ virtual string getTaxonomy(Sequence*) = 0;
+
+protected:
+
+ map<string, string> taxonomy; //name maps to taxonomy
+ map<string, string>::iterator it;
+ Database* database;
+
+ string taxFile, templateFile;
+ vector<string> names;
+
+ void readTaxonomy(string);
+
+};
+
+/**************************************************************************************************/
+
+#endif
+
--- /dev/null
+/*
+ * classifyseqscommand.cpp
+ * Mothur
+ *
+ * Created by westcott on 11/2/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "classifyseqscommand.h"
+#include "sequence.hpp"
+#include "phylotype.h"
+#include "bayesian.h"
+#include "doTaxonomy.h"
+#include "knn.h"
+
+//**********************************************************************************************************************
+
+ClassifySeqsCommand::ClassifySeqsCommand(string option){
+ try {
+ // globaldata = GlobalData::getInstance();
+ abort = false;
+
+ //allow user to run help
+ if(option == "help") { help(); abort = true; }
+
+ else {
+
+ //valid paramters for this command
+ string AlignArray[] = {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted"};
+ vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
+
+ OptionParser parser(option);
+ map<string, string> parameters = parser.getParameters();
+
+ ValidParameters validParameter;
+
+ //check to make sure all parameters are valid for command
+ for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) {
+ if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
+ }
+
+ //check for required parameters
+ templateFileName = validParameter.validFile(parameters, "template", true);
+ if (templateFileName == "not found") {
+ mothurOut("template is a required parameter for the classify.seqs command.");
+ mothurOutEndLine();
+ abort = true;
+ }
+ else if (templateFileName == "not open") { abort = true; }
+
+ fastaFileName = validParameter.validFile(parameters, "fasta", true);
+ if (fastaFileName == "not found") {
+ mothurOut("fasta is a required parameter for the classify.seqs command.");
+ mothurOutEndLine();
+ abort = true;
+ }
+ else if (fastaFileName == "not open") { abort = true; }
+
+ taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true);
+ if (taxonomyFileName == "not found") {
+ mothurOut("taxonomy is a required parameter for the classify.seqs command.");
+ mothurOutEndLine();
+ abort = true;
+ }
+ else if (taxonomyFileName == "not open") { abort = true; }
+
+
+ //check for optional parameter and set defaults
+ // ...at some point should added some additional type checking...
+ string temp;
+ temp = validParameter.validFile(parameters, "ksize", false); if (temp == "not found"){ temp = "8"; }
+ convert(temp, kmerSize);
+
+ temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = "1"; }
+ convert(temp, processors);
+
+ search = validParameter.validFile(parameters, "search", false); if (search == "not found"){ search = "kmer"; }
+
+ method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "phylotype"; }
+
+ temp = validParameter.validFile(parameters, "match", false); if (temp == "not found"){ temp = "1.0"; }
+ convert(temp, match);
+
+ temp = validParameter.validFile(parameters, "mismatch", false); if (temp == "not found"){ temp = "-1.0"; }
+ convert(temp, misMatch);
+
+ temp = validParameter.validFile(parameters, "gapopen", false); if (temp == "not found"){ temp = "-2.0"; }
+ convert(temp, gapOpen);
+
+ temp = validParameter.validFile(parameters, "gapextend", false); if (temp == "not found"){ temp = "-1.0"; }
+ convert(temp, gapExtend);
+
+ temp = validParameter.validFile(parameters, "numwanted", false); if (temp == "not found"){ temp = "10"; }
+ convert(temp, numWanted);
+
+ }
+
+ }
+ catch(exception& e) {
+ errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+
+ClassifySeqsCommand::~ClassifySeqsCommand(){
+
+ if (abort == false) {
+ for (int i = 0; i < lines.size(); i++) { delete lines[i]; } lines.clear();
+ }
+}
+
+//**********************************************************************************************************************
+
+void ClassifySeqsCommand::help(){
+ try {
+ mothurOut("The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n");
+ mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend and numwanted.\n");
+ mothurOut("The template, fasta and taxonomy parameters are required.\n");
+ mothurOut("The search parameter allows you to specify the method to find most similar template. Your options are: suffix, kmer and blast. The default is kmer.\n");
+ mothurOut("The method parameter allows you to specify classification method to use. Your options are: phylotype, bayesian and knn. The default is phylotype.\n");
+ mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate. The default is 8.\n");
+ mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
+ mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
+ mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases. The default is -1.0.\n");
+ mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
+ mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment. The default is -2.0.\n");
+ mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method. The default is 10.\n");
+ mothurOut("The classify.seqs command should be in the following format: \n");
+ mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
+ mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=phylotype, search=gotoh, ksize=8, processors=2)\n");
+ mothurOut("The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n");
+ mothurOut("The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n");
+ mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
+ }
+ catch(exception& e) {
+ errorOut(e, "ClassifySeqsCommand", "help");
+ exit(1);
+ }
+}
+
+
+//**********************************************************************************************************************
+
+int ClassifySeqsCommand::execute(){
+ try {
+ if (abort == true) { return 0; }
+
+ int start = time(NULL);
+
+ if(method == "phylotype"){ classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch); }
+ //else if(method == "bayesian") { classify = new Bayesian(taxonomyFileName, templateFileName, search); }
+ else if(method == "knn") { classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted); }
+ else {
+ mothurOut(search + " is not a valid method option. I will run the command using phylotype.");
+ mothurOutEndLine();
+ classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch);
+ }
+
+ int numFastaSeqs = 0;
+
+ string newTaxonomyFile = getRootName(fastaFileName) + "taxonomy";
+ string taxSummary = getRootName(fastaFileName) + "tax.summary";
+
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ if(processors == 1){
+ ifstream inFASTA;
+ openInputFile(fastaFileName, inFASTA);
+ numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
+ inFASTA.close();
+
+ lines.push_back(new linePair(0, numFastaSeqs));
+
+ driver(lines[0], newTaxonomyFile);
+
+ }
+ else{
+ vector<int> positions;
+ processIDS.resize(0);
+
+ ifstream inFASTA;
+ openInputFile(fastaFileName, inFASTA);
+
+ string input;
+ while(!inFASTA.eof()){
+ input = getline(inFASTA);
+ if (input.length() != 0) {
+ if(input[0] == '>'){ int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1); }
+ }
+ }
+ inFASTA.close();
+
+ numFastaSeqs = positions.size();
+
+ int numSeqsPerProcessor = numFastaSeqs / processors;
+
+ for (int i = 0; i < processors; i++) {
+ int startPos = positions[ i * numSeqsPerProcessor ];
+ if(i == processors - 1){
+ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
+ }
+ lines.push_back(new linePair(startPos, numSeqsPerProcessor));
+ }
+ createProcesses(newTaxonomyFile);
+
+ rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str());
+
+ for(int i=1;i<processors;i++){
+ appendTaxFiles((newTaxonomyFile + toString(processIDS[i]) + ".temp"), newTaxonomyFile);
+ remove((newTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
+ }
+
+ }
+#else
+ ifstream inFASTA;
+ openInputFile(candidateFileName, inFASTA);
+ numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
+ inFASTA.close();
+
+ lines.push_back(new linePair(0, numFastaSeqs));
+
+ driver(lines[0], newTaxonomyFile);
+#endif
+
+ delete classify;
+
+ //make taxonomy tree from new taxonomy file
+ ifstream inTaxonomy;
+ openInputFile(newTaxonomyFile, inTaxonomy);
+
+ string accession, taxaList;
+ PhyloTree taxaBrowser;
+
+ //read in users taxonomy file and add sequences to tree
+ while(!inTaxonomy.eof()){
+ inTaxonomy >> accession >> taxaList;
+
+ taxaBrowser.addSeqToTree(accession, taxaList);
+
+ gobble(inTaxonomy);
+ }
+ inTaxonomy.close();
+
+ taxaBrowser.assignHeirarchyIDs(0);
+
+ ofstream outTaxTree;
+ openOutputFile(taxSummary, outTaxTree);
+
+ taxaBrowser.print(outTaxTree);
+
+ mothurOutEndLine();
+ mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences.");
+ mothurOutEndLine();
+ mothurOutEndLine();
+
+ return 0;
+ }
+ catch(exception& e) {
+ errorOut(e, "ClassifySeqsCommand", "execute");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+void ClassifySeqsCommand::createProcesses(string taxFileName) {
+ try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+ int process = 0;
+ // processIDS.resize(0);
+
+ //loop through and create all the processes you want
+ while (process != processors) {
+ int pid = fork();
+
+ if (pid > 0) {
+ processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
+ process++;
+ }else if (pid == 0){
+ driver(lines[process], taxFileName + toString(getpid()) + ".temp");
+ exit(0);
+ }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+ }
+
+ //force parent to wait until all the processes are done
+ for (int i=0;i<processors;i++) {
+ int temp = processIDS[i];
+ wait(&temp);
+ }
+#endif
+ }
+ catch(exception& e) {
+ errorOut(e, "ClassifySeqsCommand", "createProcesses");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
+ try{
+
+ ofstream output;
+ ifstream input;
+ openOutputFileAppend(filename, output);
+ openInputFile(temp, input);
+
+ while(char c = input.get()){
+ if(input.eof()) { break; }
+ else { output << c; }
+ }
+
+ input.close();
+ output.close();
+ }
+ catch(exception& e) {
+ errorOut(e, "ClassifySeqsCommand", "appendTaxFiles");
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+
+int ClassifySeqsCommand::driver(linePair* line, string taxFName){
+ try {
+ ofstream outTax;
+ openOutputFile(taxFName, outTax);
+
+ ifstream inFASTA;
+ openInputFile(fastaFileName, inFASTA);
+
+ inFASTA.seekg(line->start);
+
+ string taxonomy;
+
+ for(int i=0;i<line->numSeqs;i++){
+
+ Sequence* candidateSeq = new Sequence(inFASTA);
+
+ taxonomy = classify->getTaxonomy(candidateSeq);
+
+ if (taxonomy != "bad seq") {
+ outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+ }
+
+ delete candidateSeq;
+ }
+
+ inFASTA.close();
+ outTax.close();
+
+ return 1;
+ }
+ catch(exception& e) {
+ errorOut(e, "ClassifySeqsCommand", "driver");
+ exit(1);
+ }
+}
+
+/**************************************************************************************************/
--- /dev/null
+#ifndef CLASSIFYSEQSCOMMAND_H
+#define CLASSIFYSEQSCOMMAND_H
+
+/*
+ * classifyseqscommand.h
+ * Mothur
+ *
+ * Created by westcott on 11/2/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "command.hpp"
+#include "alignment.hpp"
+#include "classify.h"
+
+class ClassifySeqsCommand : public Command {
+
+public:
+ ClassifySeqsCommand(string);
+ ~ClassifySeqsCommand();
+ int execute();
+ void help();
+
+private:
+ struct linePair {
+ int start;
+ int numSeqs;
+ linePair(int i, int j) : start(i), numSeqs(j) {}
+ };
+ vector<int> processIDS; //processid
+ vector<linePair*> lines;
+
+ Classify* classify;
+
+ string fastaFileName, templateFileName, distanceFileName, search, method, taxonomyFileName;
+ int processors, kmerSize, numWanted;
+ float match, misMatch, gapOpen, gapExtend;
+ bool abort;
+
+ int driver(linePair*, string);
+ void appendTaxFiles(string, string);
+ void createProcesses(string);
+};
+
+#endif
+
#include "getsharedotucommand.h"
#include "getlistcountcommand.h"
#include "hclustercommand.h"
+#include "classifyseqscommand.h"
/***********************************************************/
commands["get.sharedotu"] = "get.sharedotu";
commands["get.listcount"] = "get.listcount";
commands["quit"] = "quit";
-
commands["hcluster"] = "hcluster";
+ commands["classify.seqs"] = "classify.seqs";
}
/***********************************************************/
else if(commandName == "align.check") { command = new AlignCheckCommand(optionString); }
else if(commandName == "get.sharedotu") { command = new GetSharedOTUCommand(optionString); }
else if(commandName == "get.listcount") { command = new GetListCountCommand(optionString); }
-
else if(commandName == "hcluster") { command = new HClusterCommand(optionString); }
+ else if(commandName == "classify.seqs") { command = new ClassifySeqsCommand(optionString); }
else { command = new NoCommand(optionString); }
return command;
/**************************************************************************************************/
-Database::Database(string fastaFileName){ // This assumes that the template database is in fasta format, may
- // need to alter this in the future?
+Database::Database(){
longest = 0;
-
- ifstream fastaFile;
- openInputFile(fastaFileName, fastaFile);
-
- mothurOutEndLine();
- mothurOut("Reading in the " + fastaFileName + " template sequences...\t"); cout.flush();
-
- //all of this is elsewhere already!
- //numSeqs=count(istreambuf_iterator<char>(fastaFile),istreambuf_iterator<char>(), '>'); // count the number of
- //fastaFile.seekg(0); // sequences
-
- //templateSequences.resize(numSeqs);
-
- /*string seqName, sequence;
- for(int i=0;i<numSeqs;i++){
- fastaFile >> seqName;
- seqName = seqName.substr(1);
- char letter;
- string aligned;
-
- while(fastaFile && (letter=fastaFile.get()) != '>'){
- if(isprint(letter)){
- letter = toupper(letter);
- if(letter == 'U'){letter = 'T';}
- aligned += letter;
- }
- }
- templateSequences[i] = Sequence(seqName, aligned);
-
- //necessary because Sequence constructor by default sets whatever it reads to unaligned
- //the setUnaligned function remove gap char.
- templateSequences[i].setUnaligned(templateSequences[i].getUnaligned());
-
- if (templateSequences[i].getUnaligned().length() > longest) { longest = templateSequences[i].getUnaligned().length(); }
-
- fastaFile.putback(letter);
- }*/
-
- while (!fastaFile.eof()) {
- Sequence temp(fastaFile);
-
- templateSequences.push_back(temp);
-
- //save longest base
- if (temp.getUnaligned().length() > longest) { longest = temp.getUnaligned().length(); }
-
- gobble(fastaFile);
- }
-
- numSeqs = templateSequences.size();
-
- fastaFile.close();
- //all of this is elsewhere already!
-
- mothurOut("DONE.");
- mothurOutEndLine(); cout.flush();
-
+ numSeqs = 0;
}
/**************************************************************************************************/
-Database::~Database(){
-
- templateSequences.clear();
-}
+Database::~Database(){}
/**************************************************************************************************/
int Database::getLongestBase() { return longest+1; }
/**************************************************************************************************/
-
-
*/
-/* This class is a parent to blastdb, distancedb, kmerdb, suffixdb. Which are used to convert a squencedb object into that form. */
+/* This class is a parent to blastdb, kmerdb, suffixdb. */
#include "mothur.h"
+#include "sequence.hpp"
-class Sequence;
+/**************************************************************************************************/
+struct seqMatch { //used to select top n matches
+ int seq;
+ int match;
+ seqMatch(int s, int m) : seq(s), match(m) {}
+};
+/**************************************************************************************************/
+inline bool compareSeqMatches (seqMatch member, seqMatch member2){ //sorts largest to smallest
+ if(member.match > member2.match){
+ return true; }
+ else{
+ return false;
+ }
+}
+/**************************************************************************************************/
+inline bool compareSeqMatchesReverse (seqMatch member, seqMatch member2){ //sorts largest to smallest
+ if(member.match < member2.match){
+ return true; }
+ else{
+ return false;
+ }
+}
+/**************************************************************************************************/
class Database {
+
public:
- Database(string);
+ Database();
virtual ~Database();
- virtual Sequence findClosestSequence(Sequence*) = 0;
+ virtual void generateDB() = 0;
+ virtual void addSequence(Sequence) = 0; //add sequence to search engine
+ virtual vector<int> findClosestSequences(Sequence*, int) = 0; // returns indexes of n closest sequences to query
virtual float getSearchScore();
virtual int getLongestBase();
+ virtual void readKmerDB(ifstream&){};
+ virtual void setNumSeqs(int i) { numSeqs = i; }
protected:
int numSeqs, longest;
float searchScore;
- vector<Sequence> templateSequences;
};
-
+/**************************************************************************************************/
#endif
searchScore = 100. * simAccession.simScore;
searchIndex++;
// return templateSequences[closestMatchIndexNumber];
- return templateSequences[simAccession.indexNumber];
+ if (templateValid) { return templateSequences[simAccession.indexNumber]; }
+ else { return emptySequence; }
}
class DistanceDB : public Database {
public:
+
DistanceDB(string, string);
- Sequence findClosestSequence(Sequence*);
+ vector<int> findClosestSequences(Sequence*, int);
private:
int indexNumber;
float simScore;
};
+
vector<hit> mostSimSequenceVector;
-// ifstream inputData;
int searchIndex;
};
--- /dev/null
+/*
+ * doTaxonomy.cpp
+ *
+ *
+ * Created by Pat Schloss on 6/17/09.
+ * Copyright 2009 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "doTaxonomy.h"
+
+/**************************************************************************************************/
+
+PhyloTree::PhyloTree(){
+
+ numNodes = 1;
+ numSeqs = 0;
+ tree.push_back(TaxNode("Root"));
+ tree[0].heirarchyID = "0";
+}
+
+/**************************************************************************************************/
+
+string PhyloTree::getNextTaxon(string& heirarchy){
+
+ string currentLevel = "";
+ if(heirarchy != ""){
+ currentLevel=heirarchy.substr(0,heirarchy.find_first_of(';'));
+ heirarchy=heirarchy.substr(heirarchy.find_first_of(';')+1);
+ }
+ return currentLevel;
+
+}
+
+/**************************************************************************************************/
+
+void PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
+
+ numSeqs++;
+
+ map<string, int>::iterator childPointer;
+
+ int currentNode = 0;
+ int level = 1;
+
+ tree[0].accessions.push_back(seqName);
+ string taxon;// = getNextTaxon(seqTaxonomy);
+
+ while(seqTaxonomy != ""){
+
+ level++;
+
+//somehow the parent is getting one too many accnos
+//use print to reassign the taxa id
+ taxon = getNextTaxon(seqTaxonomy);
+
+ childPointer = tree[currentNode].children.find(taxon);
+
+ if(childPointer != tree[currentNode].children.end()){ //if the node already exists, move on
+ currentNode = childPointer->second;
+ tree[currentNode].accessions.push_back(seqName);
+ }
+ else{ //otherwise, create it
+ tree.push_back(TaxNode(taxon));
+ numNodes++;
+ tree[currentNode].children[taxon] = numNodes-1;
+
+// int numChildren = tree[currentNode].children.size();
+// string heirarchyID = tree[currentNode].heirarchyID;
+// tree[currentNode].accessions.push_back(seqName);
+
+ currentNode = tree[currentNode].children[taxon];
+ tree[currentNode].accessions.push_back(seqName);
+
+// tree[currentNode].level = level;
+// tree[currentNode].childNumber = numChildren;
+// tree[currentNode].heirarchyID = heirarchyID + '.' + toString(tree[currentNode].childNumber);
+ }
+
+ }
+}
+
+/**************************************************************************************************/
+
+void PhyloTree::assignHeirarchyIDs(int index){
+
+ map<string,int>::iterator it;
+ int counter = 1;
+
+ for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
+ tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
+ counter++;
+ tree[it->second].level = tree[index].level + 1;
+ assignHeirarchyIDs(it->second);
+
+ }
+
+}
+
+/**************************************************************************************************/
+
+void PhyloTree::print(ofstream& out){
+
+ out << tree[0].level << '\t'<< tree[0].heirarchyID << '\t' << tree[0].name << '\t' << tree[0].children.size() << '\t' << tree[0].accessions.size() << endl;
+ print(0, out);
+
+
+}
+
+/**************************************************************************************************/
+
+void PhyloTree::print(int i, ofstream& out){
+
+ map<string,int>::iterator it;
+ for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
+ out <<tree[it->second].level << '\t' << tree[it->second].heirarchyID << '\t' << tree[it->second].name << '\t' << tree[it->second].children.size() << '\t' << tree[it->second].accessions.size() << endl;
+ print(it->second, out);
+ }
+
+}
+
+/**************************************************************************************************/
+
+
+
--- /dev/null
+/*
+ * doTaxonomy.h
+ *
+ *
+ * Created by Pat Schloss on 6/17/09.
+ * Copyright 2009 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+
+/**************************************************************************************************/
+
+struct TaxNode {
+ vector<string> accessions;
+ map<string, int> children;
+ int parent, childNumber, level;
+ string name, heirarchyID;
+
+ TaxNode(string n) : name(n), level(0), parent(-1) { }
+ TaxNode(){}
+};
+
+/**************************************************************************************************/
+
+class PhyloTree {
+
+public:
+ PhyloTree();
+ void addSeqToTree(string, string);
+ void assignHeirarchyIDs(int);
+ void print(ofstream&);
+private:
+ string getNextTaxon(string&);
+ vector<TaxNode> tree;
+ void print(int, ofstream&);
+ int numNodes;
+ int numSeqs;
+};
+
+/**************************************************************************************************/
+
}
//cout << "here3" << endl;
linkTable[colSpot].erase(size);
- //linkTable.erase(linkTable.begin()+rowSpot); //delete row
+ linkTable.erase(linkTable.begin()+rowSpot); //delete row
//printInfo();
//update activerows
activeLinks.erase(smallRow);
activeLinks[size] = colSpot;
//adjust everybody elses spot since you deleted - time vs. space
- //for (it = activeLinks.begin(); it != activeLinks.end(); it++) {
- //if (it->second > rowSpot) { activeLinks[it->first]--; }
- //}
+ for (it = activeLinks.begin(); it != activeLinks.end(); it++) {
+ if (it->second > rowSpot) { activeLinks[it->first]--; }
+ }
//cout << "here4" << endl;
/**************************************************************************************************/
-KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerSize(kSize) {
+KmerDB::KmerDB(string fastaFileName, int kSize) : Database(), kmerSize(kSize) {
try {
- string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
- ifstream kmerFileTest(kmerDBName.c_str());
+ kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+ count = 0;
maxKmer = power4s[kmerSize];
kmerLocations.resize(maxKmer+1);
- if(!kmerFileTest){ // if we can open the kmer db file, then read it in...
- mothurOut("Generating the " + kmerDBName + " database...\t"); cout.flush();
- generateKmerDB(kmerDBName);
- }
- else{ // ...otherwise generate it.
- mothurOut("Reading in the " + kmerDBName + " database...\t"); cout.flush();
- readKmerDB(kmerDBName, kmerFileTest);
- }
- mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush();
-
}
catch(exception& e) {
errorOut(e, "KmerDB", "KmerDB");
}
/**************************************************************************************************/
-KmerDB::~KmerDB(){
-
- //for (int i = 0; i < templateSequences.size(); i++) { delete templateSequences[i]; }
- // templateSequences.clear();
-}
+KmerDB::~KmerDB(){}
/**************************************************************************************************/
-Sequence KmerDB::findClosestSequence(Sequence* candidateSeq){
+vector<int> KmerDB::findClosestSequences(Sequence* candidateSeq, int num){
try {
-
+ vector<int> topMatches;
Kmer kmer(kmerSize);
-
searchScore = 0;
- int maxSequence = 0;
vector<int> matches(numSeqs, 0); // a record of the sequences with shared kmers
vector<int> timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found
int numKmers = candidateSeq->getNumBases() - kmerSize + 1;
-
+
for(int i=0;i<numKmers;i++){
int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i); // go through the query sequence and get a kmer number
if(timesKmerFound[kmerNumber] == 0){ // if we haven't seen it before...
}
timesKmerFound[kmerNumber] = 1; // ok, we've seen the kmer now
}
-
- for(int i=0;i<numSeqs;i++){ // find the db sequence that shares the most kmers with
- if(matches[i] > searchScore){ // the query sequence
- searchScore = matches[i];
- maxSequence = i;
- }
+
+ vector<seqMatch> seqMatches;
+ for(int i=0;i<numSeqs;i++){
+ seqMatch temp(i, matches[i]);
+ seqMatches.push_back(temp);
}
- searchScore = 100 * searchScore / (float) numKmers; // return the Sequence object corresponding to the db
+ //sorts putting largest matches first
+ sort(seqMatches.begin(), seqMatches.end(), compareSeqMatches);
- return templateSequences[maxSequence]; // sequence with the most shared kmers.
+ searchScore = seqMatches[0].match;
+ searchScore = 100 * searchScore / (float) numKmers; // return the Sequence object corresponding to the db
+
+ //save top matches
+ for (int i = 0; i < num; i++) {
+ topMatches.push_back(seqMatches[i].seq);
+ }
+ return topMatches;
}
catch(exception& e) {
- errorOut(e, "KmerDB", "findClosestSequence");
+ errorOut(e, "KmerDB", "findClosestSequences");
exit(1);
}
}
/**************************************************************************************************/
-void KmerDB::generateKmerDB(string kmerDBName){
+void KmerDB::generateDB(){
try {
-
- Kmer kmer(kmerSize);
-
- for(int i=0;i<numSeqs;i++){ // for all of the template sequences...
-
- string seq = templateSequences[i].getUnaligned(); // ...take the unaligned sequence...
- int numKmers = seq.length() - kmerSize + 1;
-
- vector<int> seenBefore(maxKmer+1,0);
- for(int j=0;j<numKmers;j++){ // ...step though the sequence and get each kmer...
- int kmerNumber = kmer.getKmerNumber(seq, j);
- if(seenBefore[kmerNumber] == 0){
- kmerLocations[kmerNumber].push_back(i); // ...insert the sequence index into kmerLocations for
- } // the appropriate kmer number
- seenBefore[kmerNumber] = 1;
- }
- }
ofstream kmerFile; // once we have the kmerLocations folder print it out
openOutputFile(kmerDBName, kmerFile); // to a file
}
catch(exception& e) {
- errorOut(e, "KmerDB", "generateKmerDB");
+ errorOut(e, "KmerDB", "generateDB");
exit(1);
}
}
-
/**************************************************************************************************/
-
-void KmerDB::readKmerDB(string kmerDBName, ifstream& kmerDBFile){
+void KmerDB::addSequence(Sequence seq) {
try {
+ Kmer kmer(kmerSize);
+
+ string unaligned = seq.getUnaligned(); // ...take the unaligned sequence...
+ int numKmers = unaligned.length() - kmerSize + 1;
+
+ vector<int> seenBefore(maxKmer+1,0);
+ for(int j=0;j<numKmers;j++){ // ...step though the sequence and get each kmer...
+ int kmerNumber = kmer.getKmerNumber(unaligned, j);
+ if(seenBefore[kmerNumber] == 0){
+ kmerLocations[kmerNumber].push_back(count); // ...insert the sequence index into kmerLocations for
+ } // the appropriate kmer number
+ seenBefore[kmerNumber] = 1;
+ }
+ count++;
+ }
+ catch(exception& e) {
+ errorOut(e, "KmerDB", "addSequence");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
+void KmerDB::readKmerDB(ifstream& kmerDBFile){
+ try {
+
kmerDBFile.seekg(0); // start at the beginning of the file
string seqName;
int seqNumber;
-
+
for(int i=0;i<maxKmer;i++){
- int numValues;
+ int numValues = 0;
kmerDBFile >> seqName >> numValues;
for(int j=0;j<numValues;j++){ // for each kmer number get the...
public:
KmerDB(string, int);
~KmerDB();
- Sequence findClosestSequence(Sequence*);
-
+
+ void generateDB();
+ void addSequence(Sequence);
+ vector<int> findClosestSequences(Sequence*, int);
+ void readKmerDB(ifstream&);
+
private:
- void generateKmerDB(string);
- void readKmerDB(string, ifstream&);
+
int kmerSize;
- int maxKmer;
+ int maxKmer, count;
+ string kmerDBName;
vector<vector<int> > kmerLocations;
};
--- /dev/null
+/*
+ * knn.cpp
+ * Mothur
+ *
+ * Created by westcott on 11/4/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "knn.h"
+
+/**************************************************************************************************/
+Knn::Knn(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch, int n)
+: Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch), num(n) {}
+/**************************************************************************************************/
+string Knn::getTaxonomy(Sequence* seq) {
+ try {
+ string tax;
+
+ //use database to find closest seq
+ vector<int> closest = database->findClosestSequences(seq, num);
+
+ vector<string> closestNames;
+ for (int i = 0; i < closest.size(); i++) {
+ //find that sequences taxonomy in map
+ it = taxonomy.find(names[closest[i]]);
+
+ //is this sequence in the taxonomy file
+ if (it == taxonomy.end()) { //error not in file
+ mothurOut("Error: sequence " + names[closest[i]] + " is not in the taxonomy file. It will be eliminated as a match to sequence " + seq->getName() + "."); mothurOutEndLine();
+ }else{ closestNames.push_back(it->first); }
+ }
+
+ if (closestNames.size() == 0) {
+ mothurOut("Error: All the matches for sequence " + seq->getName() + " have been eliminated. " + seq->getName() + " will be disregarded."); mothurOutEndLine();
+ tax = "bad seq";
+ }else{
+ tax = findCommonTaxonomy(closestNames);
+ if (tax == "") { mothurOut("There are no common levels for sequence " + seq->getName() + ". " + seq->getName() + " will be disregarded."); mothurOutEndLine(); tax = "bad seq"; }
+ }
+
+ return tax;
+ }
+ catch(exception& e) {
+ errorOut(e, "Knn", "getTaxonomy");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+string Knn::findCommonTaxonomy(vector<string> closest) {
+ try {
+ vector< vector<string> > taxons; //taxon[0] = vector of taxonomy info for closest[0].
+ //so if closest[0] taxonomy is Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea;
+ //taxon[0][0] = Bacteria, taxon[0][1] = Alphaproteobacteria....
+
+ taxons.resize(closest.size());
+ int smallest = 100;
+
+ for (int i = 0; i < closest.size(); i++) {
+
+ string tax = taxonomy[closest[i]]; //we know its there since we checked in getTaxonomy
+
+ tax = tax.substr(0, tax.length()-1); //get rid of last ';'
+
+ //parse taxonomy
+ string individual;
+ while (tax.find_first_of(';') != -1) {
+ individual = tax.substr(0,tax.find_first_of(';'));
+ tax = tax.substr(tax.find_first_of(';')+1, tax.length());
+ taxons[i].push_back(individual);
+
+ }
+ //get last one
+ taxons[i].push_back(tax);
+
+ //figure out who has the shortest taxonomy info. so you can start comparing there
+ if (taxons[i].size() < smallest) {
+ smallest = taxons[i].size();
+ }
+ }
+
+ //start at the highest level all the closest seqs have
+ string common = "";
+ for (int i = (smallest-1); i >= 0; i--) {
+
+ string thistax = taxons[0][i];
+ int num = 0;
+ for (int j = 1; j < taxons.size(); j++) {
+ if (taxons[j][i] != thistax) { break; }
+ num = j;
+ }
+
+ if (num == (taxons.size()-1)) { //they all match at this level
+ for (int k = 0; k <= i; k++) {
+ common += taxons[0][k] + ';';
+ }
+ break;
+ }
+ }
+
+ return common;
+ }
+ catch(exception& e) {
+ errorOut(e, "Knn", "findCommonTaxonomy");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
--- /dev/null
+#ifndef KNN_H
+#define KNN_H
+
+/*
+ * knn.h
+ * Mothur
+ *
+ * Created by westcott on 11/4/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "classify.h"
+
+/**************************************************************************************************/
+
+class Knn : public Classify {
+
+public:
+ Knn(string, string, string, int, int, int, int, int, int);
+ ~Knn() {};
+
+ string getTaxonomy(Sequence*);
+
+private:
+ int num;
+ string findCommonTaxonomy(vector<string>);
+};
+
+/**************************************************************************************************/
+
+#endif
+
+
void Nast::pairwiseAlignSeqs(){ // Here we call one of the pairwise alignment methods to align our unaligned candidate
// and template sequences
try {
-
+
alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned());
-
+
string candAln = alignment->getSeqAAln();
string tempAln = alignment->getSeqBAln();
-
+
if(candAln == ""){
candidateSeq->setPairwise("");
--- /dev/null
+/*
+ * phylotype.cpp
+ * Mothur
+ *
+ * Created by westcott on 11/3/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "phylotype.h"
+
+/**************************************************************************************************/
+PhyloType::PhyloType(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch)
+: Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch) {}
+/**************************************************************************************************/
+string PhyloType::getTaxonomy(Sequence* seq) {
+ try {
+ string tax;
+
+ //use database to find closest seq
+ vector<int> closest = database->findClosestSequences(seq, 1);
+
+ //find that sequences taxonomy in map
+ it = taxonomy.find(names[closest[0]]);
+
+ //is this sequence in the taxonomy file
+ if (it == taxonomy.end()) { //error not in file
+ mothurOut("Error: sequence " + names[closest[0]] + " is not in the taxonomy file. It is the closest match to sequence " + seq->getName() + ". " + seq->getName() + " will be disregarded."); mothurOutEndLine();
+ tax = "bad seq";
+ }else {
+ tax = it->second;
+ }
+
+ return tax;
+ }
+ catch(exception& e) {
+ errorOut(e, "PhyloType", "getTaxonomy");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+
--- /dev/null
+#ifndef PHYLOTYPE_H
+#define PHYLOTYPE_H
+
+/*
+ * phylotype.h
+ * Mothur
+ *
+ * Created by westcott on 11/3/09.
+ * Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "classify.h"
+
+/**************************************************************************************************/
+
+class PhyloType : public Classify {
+
+public:
+ PhyloType(string, string, string, int, int, int, int, int);
+ ~PhyloType() {};
+
+ string getTaxonomy(Sequence*);
+
+
+private:
+
+};
+
+/**************************************************************************************************/
+
+#endif
+
/**************************************************************************************************/
-SuffixDB::SuffixDB(string fastaFileName) : Database(fastaFileName) {
-
+SuffixDB::SuffixDB(int numSeqs) : Database() {
suffixForest.resize(numSeqs);
- mothurOut("Generating the suffix tree database...\t"); cout.flush();
- for(int i=0;i<numSeqs;i++){ // The parent class' constructor generates the vector of
- suffixForest[i].loadSequence(templateSequences[i]); // template Sequence objects. Here each of these objects
- } // is used to generate a suffix tree, aka the suffix forest
- mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush();
-
+ count = 0;
}
-
/**************************************************************************************************/
-
-Sequence SuffixDB::findClosestSequence(Sequence* candidateSeq){
-
- int minValue = 2000;
- int closestSequenceNo = 0;
- string processedSeq = candidateSeq->convert2ints(); // the candidate sequence needs to be a string of ints
- for(int i=0;i<suffixForest.size();i++){ // scan through the forest and see what the minimum
- int count = suffixForest[i].countSuffixes(processedSeq, minValue); // return score is and keep track of the
- if(count == minValue){ // template sequence index that corresponds to that score
- closestSequenceNo = i;
+//assumes sequences have been added using addSequence
+vector<int> SuffixDB::findClosestSequences(Sequence* candidateSeq, int num){
+ try {
+ vector<int> topMatches;
+ string processedSeq = candidateSeq->convert2ints(); // the candidate sequence needs to be a string of ints
+
+ vector<seqMatch> seqMatches;
+ for(int i=0;i<suffixForest.size();i++){ // scan through the forest and see what the minimum
+ int count = suffixForest[i].countSuffixes(processedSeq); // return score is and keep track of the
+ seqMatch temp(i, count);
+ seqMatches.push_back(temp);
+ }
+
+ //sorts putting smallest matches first
+ sort(seqMatches.begin(), seqMatches.end(), compareSeqMatchesReverse);
+
+ searchScore = seqMatches[0].match;
+ searchScore = 100 * (1. - searchScore / (float)processedSeq.length());
+
+ //save top matches
+ for (int i = 0; i < num; i++) {
+ topMatches.push_back(seqMatches[i].seq);
}
+
+ // return the Sequence object that has the minimum score
+ return topMatches;
+ }
+ catch(exception& e) {
+ errorOut(e, "SuffixDB", "findClosestSequences");
+ exit(1);
+ }
+}
+/**************************************************************************************************/
+//adding the sequences generates the db
+void SuffixDB::addSequence(Sequence seq) {
+ try {
+ suffixForest[count].loadSequence(seq);
+ count++;
+ }
+ catch(exception& e) {
+ errorOut(e, "SuffixDB", "addSequence");
+ exit(1);
}
- searchScore = 100 * (1. - minValue / (float)processedSeq.length());
- return templateSequences[closestSequenceNo]; // return the Sequence object that has the minimum score
-
}
-
/**************************************************************************************************/
SuffixDB::~SuffixDB(){
-
for (int i = (suffixForest.size()-1); i >= 0; i--) { suffixForest.pop_back(); }
- // templateSequences.clear();
-
}
/**************************************************************************************************/
class SuffixDB : public Database {
public:
- SuffixDB(string);
+ SuffixDB(int);
~SuffixDB();
- Sequence findClosestSequence(Sequence*);
+
+ void generateDB() {}; //adding sequences generates the db
+ void addSequence(Sequence);
+ vector<int> findClosestSequences(Sequence*, int);
private:
vector<SuffixTree> suffixForest;
+ int count;
};
#endif
return numSuffixes; // change the value and return the number of suffixes
}
+//********************************************************************************************************************
+int SuffixTree::countSuffixes(string compareSequence){ // here we count the number of suffix parts
+ // we need to rewrite a user supplied sequence. if the
+ int numSuffixes = 0; // count exceeds the supplied minValue, bail out. The
+ int seqLength = compareSequence.length(); // time complexity should be O(L)
+ int position = 0;
+
+ int presentNode = 0;
+
+ while(position < seqLength){ // while the position in the query sequence isn't at the end...
+
+ int newNode = nodeVector[presentNode]->getChild(compareSequence[position]); // see if the current node has a
+ // child that matches the next character in the query
+ if(newNode == -1){
+ if(presentNode == 0){ position++; } // if not, go back to the root and increase the count
+ numSuffixes++; // by one.
+ presentNode = 0;
+ }
+ else{ // if there is, move to that node and see how far down
+ presentNode = newNode; // it we can get
+
+ for(int i=nodeVector[newNode]->getStartCharPos(); i<=nodeVector[newNode]->getEndCharPos(); i++){
+ if(compareSequence[position] == sequence[i]){
+ position++; // as long as the query and branch agree, keep going
+ }
+ else{
+ numSuffixes++; // if there is a mismatch, increase the number of
+ presentNode = 0; // suffixes and go back to the root
+ break;
+ }
+ }
+ }
+ // if we get all the way through the node we'll go to the top of the while loop and find the child node
+ // that corresponds to what we are interested in
+ }
+ numSuffixes--; // the method puts an extra count on numSuffixes
+
+ return numSuffixes; // change the value and return the number of suffixes
+}
//********************************************************************************************************************
void SuffixTree::canonize(){ // if you have to ask how this works, you don't really want to know and this really
string getSeqName();
void print();
int countSuffixes(string, int&);
+ int countSuffixes(string);
private:
void addPrefix(int);