]> git.donarmstrong.com Git - mothur.git/commitdiff
started work on classify.seqs command. changed the database class so that it does...
authorwestcott <westcott>
Thu, 5 Nov 2009 14:40:15 +0000 (14:40 +0000)
committerwestcott <westcott>
Thu, 5 Nov 2009 14:40:15 +0000 (14:40 +0000)
34 files changed:
Mothur.xcodeproj/project.pbxproj
aligncommand.cpp
aligncommand.h
alignmentdb.cpp [new file with mode: 0644]
alignmentdb.h [new file with mode: 0644]
bayesian.cpp [new file with mode: 0644]
bayesian.h [new file with mode: 0644]
blastdb.cpp
blastdb.hpp
chimeracheckrdp.cpp
chimeracheckrdp.h
classify.cpp [new file with mode: 0644]
classify.h [new file with mode: 0644]
classifyseqscommand.cpp [new file with mode: 0644]
classifyseqscommand.h [new file with mode: 0644]
commandfactory.cpp
database.cpp
database.hpp
distancedb.cpp
distancedb.hpp
doTaxonomy.cpp [new file with mode: 0644]
doTaxonomy.h [new file with mode: 0644]
hcluster.cpp
kmerdb.cpp
kmerdb.hpp
knn.cpp [new file with mode: 0644]
knn.h [new file with mode: 0644]
nast.cpp
phylotype.cpp [new file with mode: 0644]
phylotype.h [new file with mode: 0644]
suffixdb.cpp
suffixdb.hpp
suffixtree.cpp
suffixtree.hpp

index 252c02eef75714e8610af6e8a5a7403a0b07bdc4..9b064e08786aa39aa48f0c32bfc218b2d4e9e62c 100644 (file)
@@ -72,7 +72,6 @@
                379293C30F2DE73400B9034A /* treemap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379293C20F2DE73400B9034A /* treemap.cpp */; };
                379294700F2E191800B9034A /* parsimonycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3792946F0F2E191800B9034A /* parsimonycommand.cpp */; };
                3792948A0F2E258500B9034A /* parsimony.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379294890F2E258500B9034A /* parsimony.cpp */; };
-               3799A9500FD6A58C00E33EDE /* distancedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3799A94C0FD6A58C00E33EDE /* distancedb.cpp */; };
                3799A9510FD6A58C00E33EDE /* seqsummarycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3799A94E0FD6A58C00E33EDE /* seqsummarycommand.cpp */; };
                379D3D510FF90E090068C1C0 /* chimeraseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */; };
                37AD4CE40F28AEA300AA2D49 /* sharedlistvector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 37AD4CE30F28AEA300AA2D49 /* sharedlistvector.cpp */; };
                A73329CF1083B3B3003B10C5 /* getlistcountcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A73329CE1083B3B3003B10C5 /* getlistcountcommand.cpp */; };
                A751ACEA1098B283003D0911 /* readcluster.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A751ACE91098B283003D0911 /* readcluster.cpp */; };
                A75B887E104C16860083C454 /* ccode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75B887B104C16860083C454 /* ccode.cpp */; };
+               A7861478109F5FA00010AAAF /* classifyseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */; };
+               A787810310A06D5D0086103D /* classify.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787810210A06D5D0086103D /* classify.cpp */; };
+               A787811510A074330086103D /* phylotype.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787811410A074330086103D /* phylotype.cpp */; };
+               A78781BA10A0813E0086103D /* doTaxonomy.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78781B810A0813D0086103D /* doTaxonomy.cpp */; };
+               A787821110A0AAE70086103D /* bayesian.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787821010A0AAE70086103D /* bayesian.cpp */; };
+               A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A78782AA10A1B1CB0086103D /* alignmentdb.cpp */; };
+               A787844510A1EBDD0086103D /* knn.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A787844410A1EBDD0086103D /* knn.cpp */; };
                A7B04493106CC3E60046FC83 /* chimeraslayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */; };
                A7B0450E106CEEC90046FC83 /* slayer.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0450D106CEEC90046FC83 /* slayer.cpp */; };
                A7E4A783106913F900688F62 /* getsharedotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E4A782106913F900688F62 /* getsharedotucommand.cpp */; };
                379294880F2E258500B9034A /* parsimony.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsimony.h; sourceTree = SOURCE_ROOT; };
                379294890F2E258500B9034A /* parsimony.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = parsimony.cpp; sourceTree = SOURCE_ROOT; };
                3792948D0F2E271100B9034A /* treecalculator.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treecalculator.h; sourceTree = SOURCE_ROOT; };
-               3799A94C0FD6A58C00E33EDE /* distancedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = distancedb.cpp; sourceTree = SOURCE_ROOT; };
-               3799A94D0FD6A58C00E33EDE /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = SOURCE_ROOT; };
                3799A94E0FD6A58C00E33EDE /* seqsummarycommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqsummarycommand.cpp; sourceTree = SOURCE_ROOT; };
                3799A94F0FD6A58C00E33EDE /* seqsummarycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqsummarycommand.h; sourceTree = SOURCE_ROOT; };
                379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraseqscommand.h; sourceTree = SOURCE_ROOT; };
                A751ACE91098B283003D0911 /* readcluster.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readcluster.cpp; sourceTree = SOURCE_ROOT; };
                A75B887B104C16860083C454 /* ccode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ccode.cpp; sourceTree = SOURCE_ROOT; };
                A75B887C104C16860083C454 /* ccode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ccode.h; sourceTree = SOURCE_ROOT; };
+               A7861476109F5FA00010AAAF /* classifyseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classifyseqscommand.h; sourceTree = "<group>"; };
+               A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classifyseqscommand.cpp; sourceTree = "<group>"; };
+               A78780FE10A06B8B0086103D /* classify.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = classify.h; sourceTree = SOURCE_ROOT; };
+               A787810210A06D5D0086103D /* classify.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = classify.cpp; sourceTree = SOURCE_ROOT; };
+               A787811310A074330086103D /* phylotype.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = phylotype.h; sourceTree = SOURCE_ROOT; };
+               A787811410A074330086103D /* phylotype.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = phylotype.cpp; sourceTree = SOURCE_ROOT; };
+               A78781B810A0813D0086103D /* doTaxonomy.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = doTaxonomy.cpp; sourceTree = "<group>"; };
+               A78781B910A0813D0086103D /* doTaxonomy.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = doTaxonomy.h; sourceTree = "<group>"; };
+               A787820F10A0AAE70086103D /* bayesian.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = bayesian.h; sourceTree = "<group>"; };
+               A787821010A0AAE70086103D /* bayesian.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = bayesian.cpp; sourceTree = "<group>"; };
+               A78782A910A1B1CB0086103D /* alignmentdb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alignmentdb.h; sourceTree = "<group>"; };
+               A78782AA10A1B1CB0086103D /* alignmentdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignmentdb.cpp; sourceTree = "<group>"; };
+               A787844310A1EBDD0086103D /* knn.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = knn.h; sourceTree = "<group>"; };
+               A787844410A1EBDD0086103D /* knn.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = knn.cpp; sourceTree = "<group>"; };
                A7B04491106CC3E60046FC83 /* chimeraslayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraslayer.h; sourceTree = SOURCE_ROOT; };
                A7B04492106CC3E60046FC83 /* chimeraslayer.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimeraslayer.cpp; sourceTree = SOURCE_ROOT; };
                A7B0450C106CEEC90046FC83 /* slayer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = slayer.h; sourceTree = SOURCE_ROOT; };
                                373C68DB0FC1C38D00137ACD /* blastalign.cpp */,
                                37D928A60F2133C0001D4494 /* calculators */,
                                374F2FE81006346600E97C66 /* chimera */,
+                               A787811210A073F40086103D /* classifiers */,
                                37D927C20F21331F001D4494 /* cluster.hpp */,
                                37D927C10F21331F001D4494 /* cluster.cpp */,
-                               A729ACE210849BBE00139801 /* hcluster.h */,
-                               A729ACE310849BBE00139801 /* hcluster.cpp */,
                                37D928A90F2133E5001D4494 /* commands */,
                                37D927C60F21331F001D4494 /* collect.h */,
                                37D927C50F21331F001D4494 /* collect.cpp */,
                                37D927DF0F21331F001D4494 /* globaldata.cpp */,
                                373C68BB0FC1C1A600137ACD /* gotohoverlap.hpp */,
                                373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */,
+                               A729ACE210849BBE00139801 /* hcluster.h */,
+                               A729ACE310849BBE00139801 /* hcluster.cpp */,
                                375873ED0F7D646F0040F377 /* heatmap.h */,
                                375873EE0F7D646F0040F377 /* heatmap.cpp */,
                                378598720FDD4C1500EF9D03 /* heatmapsim.h */,
                                377326630FAF16E0007ABB8B /* consensuscommand.cpp */,
                                379D3D4F0FF90E090068C1C0 /* chimeraseqscommand.h */,
                                379D3D500FF90E090068C1C0 /* chimeraseqscommand.cpp */,
+                               A7861476109F5FA00010AAAF /* classifyseqscommand.h */,
+                               A7861477109F5FA00010AAAF /* classifyseqscommand.cpp */,
                                37B28F660F27590100808A62 /* deconvolutecommand.h */,
                                37B28F670F27590100808A62 /* deconvolutecommand.cpp */,
                                37C753CC0FB3415200DBD02E /* distancecommand.h */,
                                373C68A00FC1C07D00137ACD /* alignment.cpp */,
                                373C68A30FC1C07D00137ACD /* alignmentcell.hpp */,
                                373C68A20FC1C07D00137ACD /* alignmentcell.cpp */,
+                               A78782A910A1B1CB0086103D /* alignmentdb.h */,
+                               A78782AA10A1B1CB0086103D /* alignmentdb.cpp */,
                                373C69170FC1C8AF00137ACD /* blastdb.hpp */,
                                373C69160FC1C8AF00137ACD /* blastdb.cpp */,
                                37D927D40F21331F001D4494 /* database.hpp */,
                                37D927D30F21331F001D4494 /* database.cpp */,
                                37D927D50F21331F001D4494 /* datavector.hpp */,
-                               3799A94D0FD6A58C00E33EDE /* distancedb.hpp */,
-                               3799A94C0FD6A58C00E33EDE /* distancedb.cpp */,
                                37D927DC0F21331F001D4494 /* fastamap.h */,
                                37D927DB0F21331F001D4494 /* fastamap.cpp */,
                                375873EA0F7D64520040F377 /* fullmatrix.h */,
                        name = errorcheckor;
                        sourceTree = "<group>";
                };
+               A787811210A073F40086103D /* classifiers */ = {
+                       isa = PBXGroup;
+                       children = (
+                               A78780FE10A06B8B0086103D /* classify.h */,
+                               A787810210A06D5D0086103D /* classify.cpp */,
+                               A787820F10A0AAE70086103D /* bayesian.h */,
+                               A787821010A0AAE70086103D /* bayesian.cpp */,
+                               A78781B910A0813D0086103D /* doTaxonomy.h */,
+                               A78781B810A0813D0086103D /* doTaxonomy.cpp */,
+                               A787844310A1EBDD0086103D /* knn.h */,
+                               A787844410A1EBDD0086103D /* knn.cpp */,
+                               A787811310A074330086103D /* phylotype.h */,
+                               A787811410A074330086103D /* phylotype.cpp */,
+                       );
+                       name = classifiers;
+                       sourceTree = "<group>";
+               };
                C6859E8C029090F304C91782 /* Documentation */ = {
                        isa = PBXGroup;
                        children = (
                                21E859D80FC4632E005E1A48 /* matrixoutputcommand.cpp in Sources */,
                                3749271D0FD58C840031C06B /* getsabundcommand.cpp in Sources */,
                                3749273F0FD5956B0031C06B /* getrabundcommand.cpp in Sources */,
-                               3799A9500FD6A58C00E33EDE /* distancedb.cpp in Sources */,
                                3799A9510FD6A58C00E33EDE /* seqsummarycommand.cpp in Sources */,
                                371B30B40FD7EE67000414CA /* screenseqscommand.cpp in Sources */,
                                7E09C5140FDA79C5002ECAE5 /* reversecommand.cpp in Sources */,
                                A729ACD010848E6100139801 /* hclustercommand.cpp in Sources */,
                                A729ACE410849BBE00139801 /* hcluster.cpp in Sources */,
                                A751ACEA1098B283003D0911 /* readcluster.cpp in Sources */,
+                               A7861478109F5FA00010AAAF /* classifyseqscommand.cpp in Sources */,
+                               A787810310A06D5D0086103D /* classify.cpp in Sources */,
+                               A787811510A074330086103D /* phylotype.cpp in Sources */,
+                               A78781BA10A0813E0086103D /* doTaxonomy.cpp in Sources */,
+                               A787821110A0AAE70086103D /* bayesian.cpp in Sources */,
+                               A78782AB10A1B1CB0086103D /* alignmentdb.cpp in Sources */,
+                               A787844510A1EBDD0086103D /* knn.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
index 48c67ec4569ef65381b087c02323545302097bfe..2ca450aa4f4292feb322dec9dee1108095340684 100644 (file)
 #include "blastalign.hpp"
 #include "noalign.hpp"
 
-#include "kmerdb.hpp"
-#include "suffixdb.hpp"
-#include "blastdb.hpp"
-
 #include "nast.hpp"
 #include "nastreport.hpp"
 
@@ -126,7 +122,7 @@ void AlignCommand::help(){
                mothurOut("The template and candidate parameters are required.\n");
                mothurOut("The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer and blast. The default is kmer.\n");
                mothurOut("The align parameter allows you to specify the alignment method to use.  Your options are: gotoh, needleman, blast and noalign. The default is needleman.\n");
-               mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 7.\n");
+               mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n");
                mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
                mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n");
                mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
@@ -149,19 +145,9 @@ int AlignCommand::execute(){
        try {
                if (abort == true) {    return 0;       }
                
-               if(search == "kmer")                    {       templateDB = new KmerDB(templateFileName, kmerSize);    }
-               else if(search == "suffix")             {       templateDB = new SuffixDB(templateFileName);                    }
-               else if(search == "blast")              {       templateDB = new BlastDB(templateFileName, gapOpen, gapExtend, match, misMatch);        }
-               else {
-                       mothurOut(search + " is not a valid search option. I will run the command using kmer, ksize=8.");
-                       mothurOutEndLine();
-                       kmerSize = 8;
-                       
-                       templateDB = new KmerDB(templateFileName, kmerSize);
-               }
-               
+               templateDB = new AlignmentDB(templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch);
                int longestBase = templateDB->getLongestBase();
-               
+       
                if(align == "gotoh")                    {       alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, longestBase);                 }
                else if(align == "needleman")   {       alignment = new NeedlemanOverlap(gapOpen, match, misMatch, longestBase);                                }
                else if(align == "blast")               {       alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch);            }
@@ -281,7 +267,7 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName){
        
                        Sequence temp = templateDB->findClosestSequence(candidateSeq);
                        Sequence* templateSeq = &temp;
-       
+
                        report.setTemplate(templateSeq);
                        report.setSearchParameters(search, templateDB->getSearchScore());
                                
@@ -290,13 +276,13 @@ int AlignCommand::driver(linePair* line, string alignFName, string reportFName){
                        report.setAlignmentParameters(align, alignment);
        
                        report.setNastParameters(nast);
-
+       
                        alignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl;
                                
                        report.print();
                        
                        delete candidateSeq;
-       
+               
                }
                
                alignmentFile.close();
index 50e9bffb3ba26790d4b0531605d34cf99f173dbe..c3c244138a7b2457dd20ba5bedc152b88f0e0e2b 100644 (file)
@@ -14,6 +14,7 @@
 #include "command.hpp"
 #include "database.hpp"
 #include "alignment.hpp"
+#include "alignmentdb.h"
 
 class AlignCommand : public Command {
        
@@ -32,7 +33,7 @@ private:
        vector<int> processIDS;   //processid
        vector<linePair*> lines;
        
-       Database* templateDB;
+       AlignmentDB* templateDB;
        Alignment* alignment;
        
        int driver(linePair*, string, string);
diff --git a/alignmentdb.cpp b/alignmentdb.cpp
new file mode 100644 (file)
index 0000000..493521e
--- /dev/null
@@ -0,0 +1,111 @@
+/*
+ *  alignmentdb.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 11/4/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "alignmentdb.h"
+#include "kmerdb.hpp"
+#include "suffixdb.hpp"
+#include "blastdb.hpp"
+
+
+/**************************************************************************************************/
+AlignmentDB::AlignmentDB(string fastaFileName, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch){              //      This assumes that the template database is in fasta format, may 
+       try {                                                                                   //      need to alter this in the future?
+               longest = 0;
+               
+               ifstream fastaFile;
+               openInputFile(fastaFileName, fastaFile);
+               
+               mothurOutEndLine();
+               mothurOut("Reading in the " + fastaFileName + " template sequences...\t");      cout.flush();
+               
+               while (!fastaFile.eof()) {
+                       Sequence temp(fastaFile);
+                       
+                       templateSequences.push_back(temp);
+                       
+                       //save longest base
+                       if (temp.getUnaligned().length() > longest)  { longest = temp.getUnaligned().length(); }
+                       
+                       gobble(fastaFile);
+               }
+               
+               numSeqs = templateSequences.size();
+               
+               fastaFile.close();
+               //all of this is elsewhere already!
+               
+               mothurOut("DONE.");
+               mothurOutEndLine();     cout.flush();
+               
+               //in case you delete the seqs and then ask for them
+               emptySequence = Sequence();
+               emptySequence.setName("no_match");
+               emptySequence.setUnaligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
+               emptySequence.setAligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
+               
+               bool needToGenerate = true;
+               string kmerDBName;
+               if(method == "kmer")                    {       
+                       search = new KmerDB(fastaFileName, kmerSize);                   
+                       
+                       kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+                       ifstream kmerFileTest(kmerDBName.c_str());
+                       
+                       if(kmerFileTest){       needToGenerate = false;         }
+               }
+               else if(method == "suffix")             {       search = new SuffixDB(numSeqs);                                                         }
+               else if(method == "blast")              {       search = new BlastDB(gapOpen, gapExtend, match, misMatch);      }
+               else {
+                       mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
+                       mothurOutEndLine();
+                       search = new KmerDB(fastaFileName, 8);
+               }
+               
+               if (needToGenerate) {
+               
+                       //add sequences to search 
+                       for (int i = 0; i < templateSequences.size(); i++) {
+                               search->addSequence(templateSequences[i]);
+                       }
+                       search->generateDB();
+                       
+               }else if ((method == "kmer") && (!needToGenerate)) {
+                       ifstream kmerFileTest(kmerDBName.c_str());
+                       search->readKmerDB(kmerFileTest);       
+               }
+               
+               search->setNumSeqs(numSeqs);
+       }
+       catch(exception& e) {
+               errorOut(e, "AlignmentDB", "AlignmentDB");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+AlignmentDB::~AlignmentDB() {  delete search;  }
+/**************************************************************************************************/
+Sequence AlignmentDB::findClosestSequence(Sequence* seq) {
+       try{
+       
+               vector<int> spot = search->findClosestSequences(seq, 1);
+
+               if (spot.size() != 0)   {               return templateSequences[spot[0]];      }
+               else                                    {               return emptySequence;                           }
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "AlignmentDB", "findClosestSequence");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+
+
+
diff --git a/alignmentdb.h b/alignmentdb.h
new file mode 100644 (file)
index 0000000..339f46f
--- /dev/null
@@ -0,0 +1,42 @@
+#ifndef ALIGNMENTDB_H
+#define ALIGNMENTDB_H
+
+/*
+ *  alignmentdb.h
+ *  Mothur
+ *
+ *  Created by westcott on 11/4/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "sequence.hpp"
+#include "database.hpp"
+
+/**************************************************************************************************/
+
+class AlignmentDB {
+
+public:
+
+       AlignmentDB(string, string, int, int, int, int, int);  //reads fastafile passed in and stores sequences
+       ~AlignmentDB();
+       
+       Sequence findClosestSequence(Sequence*);
+       float getSearchScore()  {  return search->getSearchScore();  }
+       int getLongestBase()    {  return longest;  }
+       
+private:
+       int numSeqs, longest;
+       float searchScore;
+       
+       Database* search;
+       vector<Sequence> templateSequences;
+       Sequence emptySequence;
+};
+
+/**************************************************************************************************/
+
+#endif
+
diff --git a/bayesian.cpp b/bayesian.cpp
new file mode 100644 (file)
index 0000000..caae212
--- /dev/null
@@ -0,0 +1,29 @@
+/*
+ *  bayesian.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 11/3/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "bayesian.h"
+
+/**************************************************************************************************/
+Bayesian::Bayesian(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) : 
+Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch) {}
+/**************************************************************************************************/
+string Bayesian::getTaxonomy(Sequence* seq) {
+       try {
+               string tax;
+               
+                               
+               return tax;     
+       }
+       catch(exception& e) {
+               errorOut(e, "Bayesian", "getTaxonomy");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
diff --git a/bayesian.h b/bayesian.h
new file mode 100644 (file)
index 0000000..cbe949f
--- /dev/null
@@ -0,0 +1,34 @@
+#ifndef BAYESIAN_H
+#define BAYESIAN_H
+
+/*
+ *  bayesian.h
+ *  Mothur
+ *
+ *  Created by westcott on 11/3/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "classify.h"
+
+/**************************************************************************************************/
+
+class Bayesian : public Classify {
+       
+public:
+       Bayesian(string, string, string, int, int, int, int, int);
+       ~Bayesian() {};
+       
+       string getTaxonomy(Sequence*);
+       
+       
+private:
+       
+};
+
+/**************************************************************************************************/
+
+#endif
+
index 0cba83323db70f0748e4e6a5ec0d902ee8294945..6b45a48ce379168e82300209a7668c3e07e4d0ab 100644 (file)
 
 /**************************************************************************************************/
 
-BlastDB::BlastDB(string fastaFileName, float gO, float gE, float m, float mM) : Database(fastaFileName), 
+BlastDB::BlastDB(float gO, float gE, float m, float mM) : Database(), 
 gapOpen(gO), gapExtend(gE), match(m), misMatch(mM) {
        
        globaldata = GlobalData::getInstance();
-       
-       mothurOut("Generating the temporary BLAST database...\t");      cout.flush();
+       count = 0;
 
        int randNumber = rand();
        dbFileName = toString(randNumber) + ".template.unaligned.fasta";
        queryFileName = toString(randNumber) + ".candidate.unaligned.fasta";
        blastFileName = toString(randNumber) + ".blast";
 
-
-       ofstream unalignedFastaFile;
-       openOutputFile(dbFileName, unalignedFastaFile);                         
-       
-       for(int i=0;i<numSeqs;i++){                                                                     //      generating a fasta file with unaligned template
-               unalignedFastaFile << '>' << i << endl;                                 //      sequences, which will be input to formatdb
-               unalignedFastaFile << templateSequences[i].getUnaligned() << endl;
-       }
-       unalignedFastaFile.close();
-       
-       path = globaldata->argv;
-       path = path.substr(0, (path.find_last_of('m')));
-       
-       string formatdbCommand = path + "blast/bin/formatdb -p F -o T -i " + dbFileName;        //      format the database, -o option gives us the ability
-       system(formatdbCommand.c_str());                                                                //      to get the right sequence names, i think. -p F
-                                                                                                                                       //      option tells formatdb that seqs are DNA, not prot
-       mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush();
-       emptySequence = Sequence();
-       emptySequence.setName("no_match");
-       emptySequence.setUnaligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
-       emptySequence.setAligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
-
-
 }
 
 /**************************************************************************************************/
 
 BlastDB::~BlastDB(){
-       //for (int i = 0; i < templateSequences.size(); i++) {  delete templateSequences[i];   }
-       //templateSequences.clear();
-       //delete emptySequence;
-       
        remove(queryFileName.c_str());                          //      let's clean stuff up and remove the temp files
        remove(dbFileName.c_str());                                     //      let's clean stuff up and remove the temp files
        remove(blastFileName.c_str());                          //      let's clean stuff up and remove the temp files
 }
-
 /**************************************************************************************************/
+//assumes you have added all the template sequences using the addSequence function and run generateDB.
+vector<int> BlastDB::findClosestSequences(Sequence* seq, int n) {
+       try{
+               vector<int> topMatches;
+               
+               ofstream queryFile;
+               openOutputFile(queryFileName, queryFile);
+               queryFile << '>' << seq->getName() << endl;
+               queryFile << seq->getUnaligned() << endl;
+               queryFile.close();
+                               
+               //      the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
+               //      wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
+               //      long.  With this setting, it seems comparable in speed to the suffix tree approach.
+               
+               string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28 -v " + toString(n);
+               blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
+               system(blastCommand.c_str());
+               
+               ifstream m8FileHandle;
+               openInputFile(blastFileName, m8FileHandle);
+               
+               string dummy;
+               int templateAccession;
+               gobble(m8FileHandle);
+               
+               while(!m8FileHandle.eof()){
+                       m8FileHandle >> dummy >> templateAccession >> searchScore;
+                       
+                       //get rest of junk in line
+                       while (!m8FileHandle.eof())     {       char c = m8FileHandle.get(); if (c == 10 || c == 13){   break;  }       } 
+                       
+                       gobble(m8FileHandle);
+                       topMatches.push_back(templateAccession);
+               }
+               m8FileHandle.close();
+               
+               return topMatches;
+       }
+       catch(exception& e) {
+               errorOut(e, "BlastDB", "findClosestSequences");
+               exit(1);
+       }
 
-Sequence BlastDB::findClosestSequence(Sequence* candidate){
-
-       ofstream queryFile;
-       openOutputFile(queryFileName, queryFile);
-       queryFile << '>' << candidate->getName() << endl;
-       queryFile << candidate->getUnaligned() << endl;
-       queryFile.close();
+}
+/**************************************************************************************************/
+void BlastDB::addSequence(Sequence seq) {
+       try {
        
+               ofstream unalignedFastaFile;
+               openOutputFileAppend(dbFileName, unalignedFastaFile);                           
        
-//     the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
-//     wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
-//     long.  With this setting, it seems comparable in speed to the suffix tree approach.
-
-       string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28";
-       blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
-       system(blastCommand.c_str());
+               //      generating a fasta file with unaligned template
+               unalignedFastaFile << '>' << count << endl;                                     //      sequences, which will be input to formatdb
+               unalignedFastaFile << seq.getUnaligned() << endl;
+               unalignedFastaFile.close();
+               
+               count++;
+       }
+       catch(exception& e) {
+               errorOut(e, "BlastDB", "addSequence");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+void BlastDB::generateDB() {
+       try {
        
-       ifstream m8FileHandle;
-       openInputFile(blastFileName, m8FileHandle);
+               mothurOut("Generating the temporary BLAST database...\t");      cout.flush();
+               
+               path = globaldata->argv;
+               path = path.substr(0, (path.find_last_of('m')));
        
-       string dummy;
-       int templateAccession;
-       gobble(m8FileHandle);
-       if(!m8FileHandle.eof()){
-               m8FileHandle >> dummy >> templateAccession >> searchScore;
-               m8FileHandle.close();
-               return templateSequences[templateAccession];
+               string formatdbCommand = path + "blast/bin/formatdb -p F -o T -i " + dbFileName;        //      format the database, -o option gives us the ability
+               system(formatdbCommand.c_str());                                                                //      to get the right sequence names, i think. -p F
+                                                                                                                                       //      option tells formatdb that seqs are DNA, not prot
+               mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush();
        }
-       else{
-               searchScore = 0.00;
-               return emptySequence;
+       catch(exception& e) {
+               errorOut(e, "BlastDB", "generateDB");
+               exit(1);
        }
-       m8FileHandle.close();
 }
 
 /**************************************************************************************************/
+
index f0f17f9ce58bc8ee2b466cc9be6b95ddf53e207a..c718bb43fb29562b755cdd44e37fa893c7b57f25 100644 (file)
 #include "globaldata.hpp"
 
 class BlastDB : public Database {
+
 public:
-       BlastDB(string, float, float, float, float);
+       BlastDB(float, float, float, float);
        ~BlastDB();
-       Sequence findClosestSequence(Sequence*);
+       
+       void generateDB();
+       void addSequence(Sequence);
+       vector<int> findClosestSequences(Sequence*, int);
 
 private:
        string dbFileName;
@@ -26,11 +30,11 @@ private:
        string blastFileName;
        string path;
        
+       int count;
        float gapOpen;
        float gapExtend;
        float match;
        float misMatch;
-       Sequence emptySequence;
        GlobalData* globaldata;
 };
 
index acc3aa2eb62554743105b2847e2a682389d21b40..07fb8ab65019b9a45d7556fbd75d50310c5c0b98 100644 (file)
@@ -77,11 +77,13 @@ void ChimeraCheckRDP::getChimeras() {
        try {
                
                //read in query sequences and subject sequences
-               mothurOut("Reading sequences and template file... "); cout.flush();
+               mothurOutEndLine();
+               mothurOut("Reading query sequences... "); cout.flush();
                querySeqs = readSeqs(fastafile);
+               mothurOut("Done."); 
                //templateSeqs = readSeqs(templateFile);
-               templateDB = new KmerDB(templateFile, kmerSize);
-               mothurOut("Done."); mothurOutEndLine();
+               templateDB = new AlignmentDB(templateFile, "kmer", kmerSize, 0,0,0,0);
+               mothurOutEndLine();
                
                int numSeqs = querySeqs.size();
                
index f436718ef8cc7ae5d63451edadd0eee960d3f9fc..fe4620614b8ae2d78d985599c594185313b40c4c 100644 (file)
@@ -14,7 +14,7 @@
 #include "chimera.h"
 #include "kmer.hpp"
 #include "kmerdb.hpp"
-#include "database.hpp"
+#include "alignmentdb.h"
 
 /***********************************************************/
 //This class was created using the algorythms described in 
@@ -39,7 +39,7 @@ class ChimeraCheckRDP : public Chimera {
                
                vector<linePair*> lines;
                vector<Sequence*> querySeqs;
-               Database* templateDB;
+               AlignmentDB* templateDB;
                Kmer* kmer;
                
                vector< vector<sim> > IS;  //IS[0] is the vector of IS values for each window for querySeqs[0]
diff --git a/classify.cpp b/classify.cpp
new file mode 100644 (file)
index 0000000..605c1f8
--- /dev/null
@@ -0,0 +1,124 @@
+/*
+ *  classify.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 11/3/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "classify.h"
+#include "sequence.hpp"
+#include "kmerdb.hpp"
+#include "suffixdb.hpp"
+#include "blastdb.hpp"
+
+/**************************************************************************************************/
+
+Classify::Classify(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) : taxFile(tfile), templateFile(tempFile) {         
+       try {                                                                                   
+               
+               readTaxonomy(taxFile);
+               
+               int numSeqs = 0;
+               //need to know number of template seqs for suffixdb
+               if (method == "suffix") {
+                       ifstream inFASTA;
+                       openInputFile(tempFile, inFASTA);
+                       numSeqs = count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
+                       inFASTA.close();
+               }
+
+               mothurOut("Generating search database...    "); cout.flush();
+                               
+               bool needToGenerate = true;
+               string kmerDBName;
+               if(method == "kmer")                    {       
+                       database = new KmerDB(tempFile, kmerSize);                      
+                       
+                       kmerDBName = tempFile.substr(0,tempFile.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
+                       ifstream kmerFileTest(kmerDBName.c_str());
+                       if(kmerFileTest){       needToGenerate = false;         }
+               }
+               else if(method == "suffix")             {       database = new SuffixDB(numSeqs);                                                               }
+               else if(method == "blast")              {       database = new BlastDB(gapOpen, gapExtend, match, misMatch);    }
+               else {
+                       mothurOut(method + " is not a valid search option. I will run the command using kmer, ksize=8.");
+                       mothurOutEndLine();
+                       database = new KmerDB(tempFile, 8);
+               }
+               
+               if (needToGenerate) {
+                       ifstream fastaFile;
+                       openInputFile(tempFile, fastaFile);
+               
+                       while (!fastaFile.eof()) {
+                               Sequence temp(fastaFile);
+                               gobble(fastaFile);
+                       
+                               names.push_back(temp.getName());
+                               database->addSequence(temp);    
+                       }
+                       fastaFile.close();
+
+                       database->generateDB();
+                       
+               }else if ((method == "kmer") && (!needToGenerate)) {    
+                       ifstream kmerFileTest(kmerDBName.c_str());
+                       database->readKmerDB(kmerFileTest);     
+                       
+                       ifstream fastaFile;
+                       openInputFile(tempFile, fastaFile);
+               
+                       while (!fastaFile.eof()) {
+                               Sequence temp(fastaFile);
+                               gobble(fastaFile);
+                       
+                               names.push_back(temp.getName());
+                       }
+                       fastaFile.close();
+               }
+               
+               database->setNumSeqs(names.size());
+               
+               mothurOut("DONE."); mothurOutEndLine();
+
+       }
+       catch(exception& e) {
+               errorOut(e, "Classify", "Classify");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+void Classify::readTaxonomy(string file) {
+       try {
+       
+               ifstream inTax;
+               openInputFile(file, inTax);
+       
+               mothurOutEndLine();
+               mothurOut("Reading in the " + file + " taxonomy...\t"); cout.flush();
+               
+               string name, taxInfo;
+               //read template seqs and save
+               while (!inTax.eof()) {
+                       inTax >> name >> taxInfo;
+                       
+                       taxonomy[name] = taxInfo;
+               
+                       gobble(inTax);
+               }
+               inTax.close();
+       
+               mothurOut("DONE.");
+               mothurOutEndLine();     cout.flush();
+       
+       }
+       catch(exception& e) {
+               errorOut(e, "Classify", "readTaxonomy");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
diff --git a/classify.h b/classify.h
new file mode 100644 (file)
index 0000000..2926800
--- /dev/null
@@ -0,0 +1,50 @@
+#ifndef CLASSIFY_H
+#define CLASSIFY_H
+
+/*
+ *  classify.h
+ *  Mothur
+ *
+ *  Created by westcott on 11/3/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+
+/* This class is a parent to phylotyp, bayesian, knn.  */
+
+#include "mothur.h"
+#include "database.hpp"
+
+
+
+class Sequence;
+
+/**************************************************************************************************/
+
+class Classify {
+
+public:
+       Classify(string, string, string, int, int, int, int, int);
+       Classify(){};
+       
+       virtual ~Classify(){};
+       virtual string getTaxonomy(Sequence*) = 0;
+       
+protected:
+
+       map<string, string> taxonomy;  //name maps to taxonomy
+       map<string, string>::iterator it;
+       Database* database;
+       
+       string taxFile, templateFile;
+       vector<string> names;
+       
+       void readTaxonomy(string);
+               
+};
+
+/**************************************************************************************************/
+
+#endif
+
diff --git a/classifyseqscommand.cpp b/classifyseqscommand.cpp
new file mode 100644 (file)
index 0000000..69ef712
--- /dev/null
@@ -0,0 +1,360 @@
+/*
+ *  classifyseqscommand.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 11/2/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "classifyseqscommand.h"
+#include "sequence.hpp"
+#include "phylotype.h"
+#include "bayesian.h"
+#include "doTaxonomy.h"
+#include "knn.h"
+
+//**********************************************************************************************************************
+
+ClassifySeqsCommand::ClassifySeqsCommand(string option){
+       try {
+               //              globaldata = GlobalData::getInstance();
+               abort = false;
+               
+               //allow user to run help
+               if(option == "help") { help(); abort = true; }
+               
+               else {
+                       
+                       //valid paramters for this command
+                       string AlignArray[] =  {"template","fasta","search","ksize","method","processors","taxonomy","match","mismatch","gapopen","gapextend","numwanted"};
+                       vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
+                       
+                       OptionParser parser(option);
+                       map<string, string> parameters = parser.getParameters(); 
+                       
+                       ValidParameters validParameter;
+                       
+                       //check to make sure all parameters are valid for command
+                       for (map<string, string>::iterator it = parameters.begin(); it != parameters.end(); it++) { 
+                               if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
+                       }
+                       
+                       //check for required parameters
+                       templateFileName = validParameter.validFile(parameters, "template", true);
+                       if (templateFileName == "not found") { 
+                               mothurOut("template is a required parameter for the classify.seqs command."); 
+                               mothurOutEndLine();
+                               abort = true; 
+                       }
+                       else if (templateFileName == "not open") { abort = true; }      
+                       
+                       fastaFileName = validParameter.validFile(parameters, "fasta", true);
+                       if (fastaFileName == "not found") { 
+                               mothurOut("fasta is a required parameter for the classify.seqs command."); 
+                               mothurOutEndLine();
+                               abort = true; 
+                       }
+                       else if (fastaFileName == "not open") { abort = true; } 
+                       
+                       taxonomyFileName = validParameter.validFile(parameters, "taxonomy", true);
+                       if (taxonomyFileName == "not found") { 
+                               mothurOut("taxonomy is a required parameter for the classify.seqs command."); 
+                               mothurOutEndLine();
+                               abort = true; 
+                       }
+                       else if (taxonomyFileName == "not open") { abort = true; }      
+
+                       
+                       //check for optional parameter and set defaults
+                       // ...at some point should added some additional type checking...
+                       string temp;
+                       temp = validParameter.validFile(parameters, "ksize", false);            if (temp == "not found"){       temp = "8";                             }
+                       convert(temp, kmerSize); 
+                       
+                       temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = "1";                             }
+                       convert(temp, processors); 
+                       
+                       search = validParameter.validFile(parameters, "search", false);         if (search == "not found"){     search = "kmer";                }
+                       
+                       method = validParameter.validFile(parameters, "method", false);         if (method == "not found"){     method = "phylotype";   }
+                       
+                       temp = validParameter.validFile(parameters, "match", false);            if (temp == "not found"){       temp = "1.0";                   }
+                       convert(temp, match);  
+                       
+                       temp = validParameter.validFile(parameters, "mismatch", false);         if (temp == "not found"){       temp = "-1.0";                  }
+                       convert(temp, misMatch);  
+                       
+                       temp = validParameter.validFile(parameters, "gapopen", false);          if (temp == "not found"){       temp = "-2.0";                  }
+                       convert(temp, gapOpen);  
+                       
+                       temp = validParameter.validFile(parameters, "gapextend", false);        if (temp == "not found"){       temp = "-1.0";                  }
+                       convert(temp, gapExtend); 
+                       
+                       temp = validParameter.validFile(parameters, "numwanted", false);        if (temp == "not found"){       temp = "10";                    }
+                       convert(temp, numWanted);
+
+               }
+               
+       }
+       catch(exception& e) {
+               errorOut(e, "ClassifySeqsCommand", "ClassifySeqsCommand");
+               exit(1);
+       }
+}
+
+//**********************************************************************************************************************
+
+ClassifySeqsCommand::~ClassifySeqsCommand(){   
+
+       if (abort == false) {
+               for (int i = 0; i < lines.size(); i++) {  delete lines[i];  }  lines.clear();
+       }
+}
+
+//**********************************************************************************************************************
+
+void ClassifySeqsCommand::help(){
+       try {
+               mothurOut("The classify.seqs command reads a fasta file containing sequences and creates a .taxonomy file and a .tax.summary file.\n");
+               mothurOut("The classify.seqs command parameters are template, fasta, search, ksize, method, taxonomy, processors, match, mismatch, gapopen, gapextend and numwanted.\n");
+               mothurOut("The template, fasta and taxonomy parameters are required.\n");
+               mothurOut("The search parameter allows you to specify the method to find most similar template.  Your options are: suffix, kmer and blast. The default is kmer.\n");
+               mothurOut("The method parameter allows you to specify classification method to use.  Your options are: phylotype, bayesian and knn. The default is phylotype.\n");
+               mothurOut("The ksize parameter allows you to specify the kmer size for finding most similar template to candidate.  The default is 8.\n");
+               mothurOut("The processors parameter allows you to specify the number of processors to use. The default is 1.\n");
+               mothurOut("The match parameter allows you to specify the bonus for having the same base. The default is 1.0.\n");
+               mothurOut("The mistmatch parameter allows you to specify the penalty for having different bases.  The default is -1.0.\n");
+               mothurOut("The gapopen parameter allows you to specify the penalty for opening a gap in an alignment. The default is -1.0.\n");
+               mothurOut("The gapextend parameter allows you to specify the penalty for extending a gap in an alignment.  The default is -2.0.\n");
+               mothurOut("The numwanted parameter allows you to specify the number of sequence matches you want with the knn method.  The default is 10.\n");
+               mothurOut("The classify.seqs command should be in the following format: \n");
+               mothurOut("classify.seqs(template=yourTemplateFile, fasta=yourFastaFile, method=yourClassificationMethod, search=yourSearchmethod, ksize=yourKmerSize, taxonomy=yourTaxonomyFile, processors=yourProcessors) \n");
+               mothurOut("Example classify.seqs(fasta=amazon.fasta, template=core.filtered, method=phylotype, search=gotoh, ksize=8, processors=2)\n");
+               mothurOut("The .taxonomy file consists of 2 columns: 1 = your sequence name, 2 = the taxonomy for your sequence. \n");
+               mothurOut("The .tax.summary is a summary of the different taxonomies represented in your fasta file. \n");
+               mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
+       }
+       catch(exception& e) {
+               errorOut(e, "ClassifySeqsCommand", "help");
+               exit(1);
+       }
+}
+
+
+//**********************************************************************************************************************
+
+int ClassifySeqsCommand::execute(){
+       try {
+               if (abort == true) {    return 0;       }
+               
+               int start = time(NULL);
+               
+               if(method == "phylotype"){      classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch);                                    }
+               //else if(method == "bayesian")         {       classify = new Bayesian(taxonomyFileName, templateFileName, search);            }
+               else if(method == "knn")                        {       classify = new Knn(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch, numWanted);                               }
+               else {
+                       mothurOut(search + " is not a valid method option. I will run the command using phylotype.");
+                       mothurOutEndLine();
+                       classify = new PhyloType(taxonomyFileName, templateFileName, search, kmerSize, gapOpen, gapExtend, match, misMatch);
+               }
+
+               int numFastaSeqs = 0;
+               
+               string newTaxonomyFile = getRootName(fastaFileName) + "taxonomy";
+               string taxSummary = getRootName(fastaFileName) + "tax.summary";
+               
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+               if(processors == 1){
+                       ifstream inFASTA;
+                       openInputFile(fastaFileName, inFASTA);
+                       numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
+                       inFASTA.close();
+                       
+                       lines.push_back(new linePair(0, numFastaSeqs));
+                       
+                       driver(lines[0], newTaxonomyFile);
+                       
+               }
+               else{
+                       vector<int> positions;
+                       processIDS.resize(0);
+                       
+                       ifstream inFASTA;
+                       openInputFile(fastaFileName, inFASTA);
+                       
+                       string input;
+                       while(!inFASTA.eof()){
+                               input = getline(inFASTA);
+                               if (input.length() != 0) {
+                                       if(input[0] == '>'){    int pos = inFASTA.tellg(); positions.push_back(pos - input.length() - 1);       }
+                               }
+                       }
+                       inFASTA.close();
+                       
+                       numFastaSeqs = positions.size();
+                       
+                       int numSeqsPerProcessor = numFastaSeqs / processors;
+                       
+                       for (int i = 0; i < processors; i++) {
+                               int startPos = positions[ i * numSeqsPerProcessor ];
+                               if(i == processors - 1){
+                                       numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor;
+                               }
+                               lines.push_back(new linePair(startPos, numSeqsPerProcessor));
+                       }
+                       createProcesses(newTaxonomyFile); 
+                       
+                       rename((newTaxonomyFile + toString(processIDS[0]) + ".temp").c_str(), newTaxonomyFile.c_str());
+                       
+                       for(int i=1;i<processors;i++){
+                               appendTaxFiles((newTaxonomyFile + toString(processIDS[i]) + ".temp"), newTaxonomyFile);
+                               remove((newTaxonomyFile + toString(processIDS[i]) + ".temp").c_str());
+                       }
+                       
+               }
+#else
+               ifstream inFASTA;
+               openInputFile(candidateFileName, inFASTA);
+               numFastaSeqs=count(istreambuf_iterator<char>(inFASTA),istreambuf_iterator<char>(), '>');
+               inFASTA.close();
+               
+               lines.push_back(new linePair(0, numFastaSeqs));
+               
+               driver(lines[0], newTaxonomyFile);
+#endif
+               
+               delete classify;
+               
+               //make taxonomy tree from new taxonomy file 
+               ifstream inTaxonomy;
+               openInputFile(newTaxonomyFile, inTaxonomy);
+               
+               string accession, taxaList;
+               PhyloTree taxaBrowser;
+               
+               //read in users taxonomy file and add sequences to tree
+               while(!inTaxonomy.eof()){
+                       inTaxonomy >> accession >> taxaList;
+                       
+                       taxaBrowser.addSeqToTree(accession, taxaList);
+                       
+                       gobble(inTaxonomy);
+               }
+               inTaxonomy.close();
+               
+               taxaBrowser.assignHeirarchyIDs(0);
+               
+               ofstream outTaxTree;
+               openOutputFile(taxSummary, outTaxTree);
+               
+               taxaBrowser.print(outTaxTree);
+               
+               mothurOutEndLine();
+               mothurOut("It took " + toString(time(NULL) - start) + " secs to classify " + toString(numFastaSeqs) + " sequences.");
+               mothurOutEndLine();
+               mothurOutEndLine();
+               
+               return 0;
+       }
+       catch(exception& e) {
+               errorOut(e, "ClassifySeqsCommand", "execute");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+void ClassifySeqsCommand::createProcesses(string taxFileName) {
+       try {
+#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
+               int process = 0;
+               //              processIDS.resize(0);
+               
+               //loop through and create all the processes you want
+               while (process != processors) {
+                       int pid = fork();
+                       
+                       if (pid > 0) {
+                               processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
+                               process++;
+                       }else if (pid == 0){
+                               driver(lines[process], taxFileName + toString(getpid()) + ".temp");
+                               exit(0);
+                       }else { mothurOut("unable to spawn the necessary processes."); mothurOutEndLine(); exit(0); }
+               }
+               
+               //force parent to wait until all the processes are done
+               for (int i=0;i<processors;i++) { 
+                       int temp = processIDS[i];
+                       wait(&temp);
+               }
+#endif         
+       }
+       catch(exception& e) {
+               errorOut(e, "ClassifySeqsCommand", "createProcesses");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
+void ClassifySeqsCommand::appendTaxFiles(string temp, string filename) {
+       try{
+               
+               ofstream output;
+               ifstream input;
+               openOutputFileAppend(filename, output);
+               openInputFile(temp, input);
+               
+               while(char c = input.get()){
+                       if(input.eof())         {       break;                  }
+                       else                            {       output << c;    }
+               }
+               
+               input.close();
+               output.close();
+       }
+       catch(exception& e) {
+               errorOut(e, "ClassifySeqsCommand", "appendTaxFiles");
+               exit(1);
+       }
+}
+
+//**********************************************************************************************************************
+
+int ClassifySeqsCommand::driver(linePair* line, string taxFName){
+       try {
+               ofstream outTax;
+               openOutputFile(taxFName, outTax);
+       
+               ifstream inFASTA;
+               openInputFile(fastaFileName, inFASTA);
+
+               inFASTA.seekg(line->start);
+               
+               string taxonomy;
+
+               for(int i=0;i<line->numSeqs;i++){
+                       
+                       Sequence* candidateSeq = new Sequence(inFASTA);
+
+                       taxonomy = classify->getTaxonomy(candidateSeq);
+                       
+                       if (taxonomy != "bad seq") {
+                               outTax << candidateSeq->getName() << '\t' << taxonomy << endl;
+                       }
+                                                       
+                       delete candidateSeq;
+               }
+               
+               inFASTA.close();
+               outTax.close();
+               
+               return 1;
+       }
+       catch(exception& e) {
+               errorOut(e, "ClassifySeqsCommand", "driver");
+               exit(1);
+       }
+}
+
+/**************************************************************************************************/
diff --git a/classifyseqscommand.h b/classifyseqscommand.h
new file mode 100644 (file)
index 0000000..a77b7d1
--- /dev/null
@@ -0,0 +1,48 @@
+#ifndef CLASSIFYSEQSCOMMAND_H
+#define CLASSIFYSEQSCOMMAND_H
+
+/*
+ *  classifyseqscommand.h
+ *  Mothur
+ *
+ *  Created by westcott on 11/2/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "command.hpp"
+#include "alignment.hpp"
+#include "classify.h"
+
+class ClassifySeqsCommand : public Command {
+       
+public:
+       ClassifySeqsCommand(string);    
+       ~ClassifySeqsCommand();
+       int execute(); 
+       void help();    
+       
+private:
+       struct linePair {
+               int start;
+               int numSeqs;
+               linePair(int i, int j) : start(i), numSeqs(j) {}
+       };
+       vector<int> processIDS;   //processid
+       vector<linePair*> lines;
+       
+       Classify* classify;
+       
+       string fastaFileName, templateFileName, distanceFileName, search, method, taxonomyFileName;
+       int processors, kmerSize, numWanted;
+       float match, misMatch, gapOpen, gapExtend;
+       bool abort;
+       
+       int driver(linePair*, string);
+       void appendTaxFiles(string, string);
+       void createProcesses(string); 
+};
+
+#endif
+
index 5f07abb59e03e161bd46bc60065eb15e37107a06..030079ed21e3a6d4f35263f45d46a67878715e5b 100644 (file)
@@ -58,6 +58,7 @@
 #include "getsharedotucommand.h"
 #include "getlistcountcommand.h"
 #include "hclustercommand.h"
+#include "classifyseqscommand.h"
 
 /***********************************************************/
 
@@ -114,8 +115,8 @@ CommandFactory::CommandFactory(){
        commands["get.sharedotu"]               = "get.sharedotu";
        commands["get.listcount"]               = "get.listcount";
        commands["quit"]                                = "quit"; 
-       
        commands["hcluster"]                    = "hcluster"; 
+       commands["classify.seqs"]               = "classify.seqs"; 
 
 }
 /***********************************************************/
@@ -180,8 +181,8 @@ Command* CommandFactory::getCommand(string commandName, string optionString){
                else if(commandName == "align.check")                   {       command = new AlignCheckCommand(optionString);                  }
                else if(commandName == "get.sharedotu")                 {       command = new GetSharedOTUCommand(optionString);                }
                else if(commandName == "get.listcount")                 {       command = new GetListCountCommand(optionString);                }
-               
                else if(commandName == "hcluster")                              {       command = new HClusterCommand(optionString);                    }
+               else if(commandName == "classify.seqs")                 {       command = new ClassifySeqsCommand(optionString);                }
                else                                                                                    {       command = new NoCommand(optionString);                                  }
 
                return command;
index f84c9e1bd8f3c3f2111cde09657966f5c787b1eb..5f82bc194a7f6d174ca0a670a0b802b1b18cb715 100644 (file)
 
 /**************************************************************************************************/
 
-Database::Database(string fastaFileName){              //      This assumes that the template database is in fasta format, may 
-                                                                                               //      need to alter this in the future?
+Database::Database(){          
        longest = 0;
-       
-       ifstream fastaFile;
-       openInputFile(fastaFileName, fastaFile);
-       
-       mothurOutEndLine();
-       mothurOut("Reading in the " + fastaFileName + " template sequences...\t");      cout.flush();
-
-       //all of this is elsewhere already!
-       //numSeqs=count(istreambuf_iterator<char>(fastaFile),istreambuf_iterator<char>(), '>'); //      count the number of
-       //fastaFile.seekg(0);                                                                                                                                           //      sequences
-       
-       //templateSequences.resize(numSeqs);
-       
-       /*string seqName, sequence;
-       for(int i=0;i<numSeqs;i++){
-               fastaFile >> seqName;
-               seqName = seqName.substr(1);
-               char letter;
-               string aligned;
-               
-               while(fastaFile && (letter=fastaFile.get()) != '>'){
-                       if(isprint(letter)){
-                               letter = toupper(letter);
-                               if(letter == 'U'){letter = 'T';}
-                               aligned += letter;
-                       }
-               }
-               templateSequences[i] = Sequence(seqName, aligned);
-               
-               //necessary because Sequence constructor by default sets whatever it reads to unaligned
-               //the setUnaligned function remove gap char.
-               templateSequences[i].setUnaligned(templateSequences[i].getUnaligned());
-               
-               if (templateSequences[i].getUnaligned().length() > longest)  { longest = templateSequences[i].getUnaligned().length(); }
-               
-               fastaFile.putback(letter);
-       }*/
-       
-       while (!fastaFile.eof()) {
-               Sequence temp(fastaFile);
-               
-               templateSequences.push_back(temp);
-               
-               //save longest base
-               if (temp.getUnaligned().length() > longest)  { longest = temp.getUnaligned().length(); }
-               
-               gobble(fastaFile);
-       }
-       
-       numSeqs = templateSequences.size();
-       
-       fastaFile.close();
-       //all of this is elsewhere already!
-       
-       mothurOut("DONE.");
-       mothurOutEndLine();     cout.flush();
-
+       numSeqs = 0;
 }
 /**************************************************************************************************/
 
-Database::~Database(){                                                                                                         
-
-               templateSequences.clear();
-}
+Database::~Database(){}
 
 /**************************************************************************************************/
 
@@ -91,5 +31,3 @@ float Database::getSearchScore()      {       return searchScore;             }       //      we're assuming that
 int Database::getLongestBase() {       return longest+1;               }       
 
 /**************************************************************************************************/
-
-
index 01393c130adb3250ffaf5c9ed97fd8a2d4043c1e..01916d1924bb86466798dc2667f83aaeda174c3d 100644 (file)
  */
 
 
-/* This class is a parent to blastdb, distancedb, kmerdb, suffixdb.  Which are used to convert a squencedb object into that form. */
+/* This class is a parent to blastdb, kmerdb, suffixdb.  */
 
 #include "mothur.h"
+#include "sequence.hpp"
 
-class Sequence;
+/**************************************************************************************************/
+struct seqMatch {  //used to select top n matches
+               int seq;
+               int match;
+               seqMatch(int s, int m) : seq(s), match(m) {}
+};
+/**************************************************************************************************/
+inline bool compareSeqMatches (seqMatch member, seqMatch member2){ //sorts largest to smallest
+       if(member.match > member2.match){
+               return true;   }   
+       else{
+               return false; 
+       }
+}
+/**************************************************************************************************/
+inline bool compareSeqMatchesReverse (seqMatch member, seqMatch member2){ //sorts largest to smallest
+       if(member.match < member2.match){
+               return true;   }   
+       else{
+               return false; 
+       }
+}
 
+/**************************************************************************************************/
 class Database {
+
 public:
-       Database(string);
+       Database();
        virtual ~Database();
-       virtual Sequence findClosestSequence(Sequence*) = 0;
+       virtual void generateDB() = 0; 
+       virtual void addSequence(Sequence) = 0;  //add sequence to search engine
+       virtual vector<int> findClosestSequences(Sequence*, int) = 0;  // returns indexes of n closest sequences to query
        virtual float getSearchScore();
        virtual int getLongestBase(); 
+       virtual void readKmerDB(ifstream&){};
+       virtual void setNumSeqs(int i) {        numSeqs = i;    }
        
 protected:
        int numSeqs, longest;
        float searchScore;
-       vector<Sequence> templateSequences;
 };
-
+/**************************************************************************************************/
 #endif
index 30f93597983e9c83fdc1ffcf54fb1998dc6112d0..d6ec258702207ca1312e004f0f275095f506f326 100644 (file)
@@ -54,7 +54,8 @@ Sequence DistanceDB::findClosestSequence(Sequence* candidateSeq){
        searchScore = 100. * simAccession.simScore;
        searchIndex++;
 //     return templateSequences[closestMatchIndexNumber];
-       return templateSequences[simAccession.indexNumber];
+       if (templateValid) {  return templateSequences[simAccession.indexNumber];       }
+       else {  return emptySequence;   }
        
 }
 
index 8da1d102ac8db9c9e485a2e9c774672fdcd0cc98..b0d46c6629ef1b19ce935f6f93608d2d89c8acd8 100644 (file)
@@ -16,8 +16,9 @@
 class DistanceDB : public Database {
        
 public:
+
        DistanceDB(string, string);
-       Sequence findClosestSequence(Sequence*);
+       vector<int> findClosestSequences(Sequence*, int);
        
 private:
 
@@ -26,8 +27,8 @@ private:
                int indexNumber;
                float simScore;
        };
+       
        vector<hit> mostSimSequenceVector;
-//     ifstream inputData;
        int searchIndex;
 };
 
diff --git a/doTaxonomy.cpp b/doTaxonomy.cpp
new file mode 100644 (file)
index 0000000..e664764
--- /dev/null
@@ -0,0 +1,125 @@
+/*
+ *  doTaxonomy.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 6/17/09.
+ *  Copyright 2009 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "doTaxonomy.h"
+
+/**************************************************************************************************/
+
+PhyloTree::PhyloTree(){
+       
+       numNodes = 1;
+       numSeqs = 0;
+       tree.push_back(TaxNode("Root"));
+       tree[0].heirarchyID = "0";
+}
+
+/**************************************************************************************************/
+
+string PhyloTree::getNextTaxon(string& heirarchy){
+       
+       string currentLevel = "";
+       if(heirarchy != ""){
+               currentLevel=heirarchy.substr(0,heirarchy.find_first_of(';'));
+               heirarchy=heirarchy.substr(heirarchy.find_first_of(';')+1);
+       }
+       return currentLevel;
+       
+}
+
+/**************************************************************************************************/
+
+void PhyloTree::addSeqToTree(string seqName, string seqTaxonomy){
+
+       numSeqs++;
+
+       map<string, int>::iterator childPointer;
+
+       int currentNode = 0;
+       int level = 1;
+       
+       tree[0].accessions.push_back(seqName);
+       string taxon;// = getNextTaxon(seqTaxonomy);
+
+       while(seqTaxonomy != ""){
+
+               level++;
+
+//somehow the parent is getting one too many accnos
+//use print to reassign the taxa id
+               taxon = getNextTaxon(seqTaxonomy);
+
+               childPointer = tree[currentNode].children.find(taxon);
+
+               if(childPointer != tree[currentNode].children.end()){   //if the node already exists, move on
+                       currentNode = childPointer->second;
+                       tree[currentNode].accessions.push_back(seqName);
+               }
+               else{                                                                                   //otherwise, create it
+                       tree.push_back(TaxNode(taxon));
+                       numNodes++;
+                       tree[currentNode].children[taxon] = numNodes-1;
+
+//                     int numChildren = tree[currentNode].children.size();
+//                     string heirarchyID = tree[currentNode].heirarchyID;
+//                     tree[currentNode].accessions.push_back(seqName);
+                       
+                       currentNode = tree[currentNode].children[taxon];
+                       tree[currentNode].accessions.push_back(seqName);
+
+//                     tree[currentNode].level = level;
+//                     tree[currentNode].childNumber = numChildren;
+//                     tree[currentNode].heirarchyID = heirarchyID + '.' + toString(tree[currentNode].childNumber);
+               }
+
+       }
+}
+
+/**************************************************************************************************/
+
+void PhyloTree::assignHeirarchyIDs(int index){
+       
+       map<string,int>::iterator it;
+       int counter = 1;
+       
+       for(it=tree[index].children.begin();it!=tree[index].children.end();it++){
+               tree[it->second].heirarchyID = tree[index].heirarchyID + '.' + toString(counter);
+               counter++;
+               tree[it->second].level = tree[index].level + 1;
+               assignHeirarchyIDs(it->second);
+
+       }
+       
+}
+
+/**************************************************************************************************/
+
+void PhyloTree::print(ofstream& out){
+       
+       out << tree[0].level << '\t'<< tree[0].heirarchyID << '\t' << tree[0].name << '\t' << tree[0].children.size() << '\t' << tree[0].accessions.size() << endl;
+       print(0, out);
+
+       
+}
+
+/**************************************************************************************************/
+
+void PhyloTree::print(int i, ofstream& out){
+       
+       map<string,int>::iterator it;
+       for(it=tree[i].children.begin();it!=tree[i].children.end();it++){
+               out <<tree[it->second].level << '\t' << tree[it->second].heirarchyID << '\t' << tree[it->second].name << '\t' << tree[it->second].children.size() << '\t' << tree[it->second].accessions.size() << endl;
+               print(it->second, out);
+       }
+
+}
+
+/**************************************************************************************************/
+
+
+       
diff --git a/doTaxonomy.h b/doTaxonomy.h
new file mode 100644 (file)
index 0000000..c1ec3ad
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ *  doTaxonomy.h
+ *  
+ *
+ *  Created by Pat Schloss on 6/17/09.
+ *  Copyright 2009 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+
+/**************************************************************************************************/
+
+struct TaxNode {
+       vector<string> accessions;
+       map<string, int> children;
+       int parent, childNumber, level;
+       string name, heirarchyID;
+       
+       TaxNode(string n) : name(n), level(0), parent(-1) {             }
+       TaxNode(){}
+};
+
+/**************************************************************************************************/
+
+class PhyloTree {
+
+public:
+       PhyloTree();
+       void addSeqToTree(string, string);
+       void assignHeirarchyIDs(int);
+       void print(ofstream&);
+private:
+       string getNextTaxon(string&);
+       vector<TaxNode> tree;
+       void print(int, ofstream&);
+       int numNodes;
+       int numSeqs;
+};
+
+/**************************************************************************************************/
+
index 2d980d73723dba0b712fc0025d975520061e9180..7613ecbf43e245713880a26444b3e68baaad808c 100644 (file)
@@ -235,7 +235,7 @@ void HCluster::updateArrayandLinkTable() {
                        }
 //cout << "here3" << endl;                     
                        linkTable[colSpot].erase(size);
-                       //linkTable.erase(linkTable.begin()+rowSpot);  //delete row
+                       linkTable.erase(linkTable.begin()+rowSpot);  //delete row
        //printInfo();          
                        //update activerows
                        activeLinks.erase(smallRow);
@@ -243,9 +243,9 @@ void HCluster::updateArrayandLinkTable() {
                        activeLinks[size] = colSpot;
                        
                        //adjust everybody elses spot since you deleted - time vs. space
-                       //for (it = activeLinks.begin(); it != activeLinks.end(); it++) {
-                               //if (it->second > rowSpot) {  activeLinks[it->first]--;        }
-                       //}
+                       for (it = activeLinks.begin(); it != activeLinks.end(); it++) {
+                               if (it->second > rowSpot) {  activeLinks[it->first]--;  }
+                       }
                        
 //cout << "here4" << endl;
        
index 86dba09e64b42bb6480b4e5a2d5b99ab88df8303..bf8679e57a932af0567df8a456f038b76fbf0fae 100644 (file)
 
 /**************************************************************************************************/
 
-KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerSize(kSize) {
+KmerDB::KmerDB(string fastaFileName, int kSize) : Database(), kmerSize(kSize) {
        try { 
        
-               string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
-               ifstream kmerFileTest(kmerDBName.c_str());
+               kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
                
                int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+               count = 0;
                
                maxKmer = power4s[kmerSize];
                kmerLocations.resize(maxKmer+1);
                
-               if(!kmerFileTest){              //      if we can open the kmer db file, then read it in...
-                       mothurOut("Generating the " + kmerDBName + " database...\t");   cout.flush();
-                       generateKmerDB(kmerDBName);     
-               }
-               else{                                   //      ...otherwise generate it.
-                       mothurOut("Reading in the " + kmerDBName + " database...\t");   cout.flush();
-                       readKmerDB(kmerDBName, kmerFileTest);
-               }
-               mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush();
-               
        }
        catch(exception& e) {
                errorOut(e, "KmerDB", "KmerDB");
@@ -59,27 +49,21 @@ KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerS
 }
 /**************************************************************************************************/
 
-KmerDB::~KmerDB(){                                                                                                             
-       
-               //for (int i = 0; i < templateSequences.size(); i++) {  delete templateSequences[i]; }
-               // templateSequences.clear(); 
-}
+KmerDB::~KmerDB(){}
 
 /**************************************************************************************************/
 
-Sequence KmerDB::findClosestSequence(Sequence* candidateSeq){
+vector<int> KmerDB::findClosestSequences(Sequence* candidateSeq, int num){
        try {
-       
+               vector<int> topMatches;
                Kmer kmer(kmerSize);
-               
                searchScore = 0;
-               int maxSequence = 0;
                
                vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
                vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
                
                int numKmers = candidateSeq->getNumBases() - kmerSize + 1;      
-               
+       
                for(int i=0;i<numKmers;i++){
                        int kmerNumber = kmer.getKmerNumber(candidateSeq->getUnaligned(), i);           //      go through the query sequence and get a kmer number
                        if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
@@ -89,46 +73,36 @@ Sequence KmerDB::findClosestSequence(Sequence* candidateSeq){
                        }
                        timesKmerFound[kmerNumber] = 1;                                         //      ok, we've seen the kmer now
                }
-               
-               for(int i=0;i<numSeqs;i++){                                                             //      find the db sequence that shares the most kmers with
-                       if(matches[i] > searchScore){                                   //      the query sequence
-                               searchScore = matches[i];
-                               maxSequence = i;
-                       }
+       
+               vector<seqMatch> seqMatches;
+               for(int i=0;i<numSeqs;i++){             
+                       seqMatch temp(i, matches[i]);
+                       seqMatches.push_back(temp);
                }
                
-               searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
+               //sorts putting largest matches first
+               sort(seqMatches.begin(), seqMatches.end(), compareSeqMatches);
                
-               return templateSequences[maxSequence];                                  //      sequence with the most shared kmers.
+               searchScore = seqMatches[0].match;
+               searchScore = 100 * searchScore / (float) numKmers;             //      return the Sequence object corresponding to the db
+       
+               //save top matches
+               for (int i = 0; i < num; i++) {
+                       topMatches.push_back(seqMatches[i].seq);
+               }
                
+               return topMatches;              
        }
        catch(exception& e) {
-               errorOut(e, "KmerDB", "findClosestSequence");
+               errorOut(e, "KmerDB", "findClosestSequences");
                exit(1);
        }       
 }
 
 /**************************************************************************************************/
 
-void KmerDB::generateKmerDB(string kmerDBName){
+void KmerDB::generateDB(){
        try {
-       
-               Kmer kmer(kmerSize);
-               
-               for(int i=0;i<numSeqs;i++){                                                             //      for all of the template sequences...
-                       
-                       string seq = templateSequences[i].getUnaligned();       //      ...take the unaligned sequence...
-                       int numKmers = seq.length() - kmerSize + 1;
-                       
-                       vector<int> seenBefore(maxKmer+1,0);
-                       for(int j=0;j<numKmers;j++){                                            //      ...step though the sequence and get each kmer...
-                               int kmerNumber = kmer.getKmerNumber(seq, j);
-                               if(seenBefore[kmerNumber] == 0){
-                                       kmerLocations[kmerNumber].push_back(i);         //      ...insert the sequence index into kmerLocations for
-                               }                                                                                               //      the appropriate kmer number
-                               seenBefore[kmerNumber] = 1;
-                       }                                                                                                       
-               }
                
                ofstream kmerFile;                                                                              //      once we have the kmerLocations folder print it out
                openOutputFile(kmerDBName, kmerFile);                                   //      to a file
@@ -144,24 +118,47 @@ void KmerDB::generateKmerDB(string kmerDBName){
                
        }
        catch(exception& e) {
-               errorOut(e, "KmerDB", "generateKmerDB");
+               errorOut(e, "KmerDB", "generateDB");
                exit(1);
        }       
        
 }
-
 /**************************************************************************************************/
-
-void KmerDB::readKmerDB(string kmerDBName, ifstream& kmerDBFile){
+void KmerDB::addSequence(Sequence seq) {
        try {
+               Kmer kmer(kmerSize);
+               
+               string unaligned = seq.getUnaligned();  //      ...take the unaligned sequence...
+               int numKmers = unaligned.length() - kmerSize + 1;
+                       
+               vector<int> seenBefore(maxKmer+1,0);
+               for(int j=0;j<numKmers;j++){                                            //      ...step though the sequence and get each kmer...
+                       int kmerNumber = kmer.getKmerNumber(unaligned, j);
+                       if(seenBefore[kmerNumber] == 0){
+                               kmerLocations[kmerNumber].push_back(count);             //      ...insert the sequence index into kmerLocations for
+                       }                                                                                               //      the appropriate kmer number
+                       seenBefore[kmerNumber] = 1;
+               }                                                                                                       
        
+               count++;
+       }
+       catch(exception& e) {
+               errorOut(e, "KmerDB", "addSequence");
+               exit(1);
+       }       
+}
+/**************************************************************************************************/
+
+void KmerDB::readKmerDB(ifstream& kmerDBFile){
+       try {
+                                       
                kmerDBFile.seekg(0);                                                                    //      start at the beginning of the file
                
                string seqName;
                int seqNumber;
-               
+
                for(int i=0;i<maxKmer;i++){
-                       int numValues;  
+                       int numValues = 0;      
                        kmerDBFile >> seqName >> numValues;
                        
                        for(int j=0;j<numValues;j++){                                           //      for each kmer number get the...
index d50c8b2607347aa337b8cc57fc0566b40ccbb01b..b64412d259a85870c57253249a4e355193476c9a 100644 (file)
@@ -27,13 +27,17 @@ class KmerDB : public Database {
 public:
        KmerDB(string, int);
        ~KmerDB();
-       Sequence findClosestSequence(Sequence*);
-
+       
+       void generateDB();
+       void addSequence(Sequence);
+       vector<int> findClosestSequences(Sequence*, int);
+       void readKmerDB(ifstream&);
+       
 private:
-       void generateKmerDB(string);
-       void readKmerDB(string, ifstream&);
+       
        int kmerSize;
-       int maxKmer;
+       int maxKmer, count;
+       string kmerDBName;
        vector<vector<int> > kmerLocations;
 };
 
diff --git a/knn.cpp b/knn.cpp
new file mode 100644 (file)
index 0000000..6d57fb1
--- /dev/null
+++ b/knn.cpp
@@ -0,0 +1,109 @@
+/*
+ *  knn.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 11/4/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "knn.h"
+
+/**************************************************************************************************/
+Knn::Knn(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch, int n) 
+: Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch), num(n)  {}
+/**************************************************************************************************/
+string Knn::getTaxonomy(Sequence* seq) {
+       try {
+               string tax;
+               
+               //use database to find closest seq
+               vector<int> closest = database->findClosestSequences(seq, num);
+               
+               vector<string> closestNames;
+               for (int i = 0; i < closest.size(); i++) {
+                       //find that sequences taxonomy in map
+                       it = taxonomy.find(names[closest[i]]);
+               
+                       //is this sequence in the taxonomy file
+                       if (it == taxonomy.end()) { //error not in file
+                               mothurOut("Error: sequence " + names[closest[i]] + " is not in the taxonomy file.  It will be eliminated as a match to sequence " + seq->getName() + "."); mothurOutEndLine();
+                       }else{   closestNames.push_back(it->first);     }
+               }
+               
+               if (closestNames.size() == 0) {
+                       mothurOut("Error: All the matches for sequence " + seq->getName() + " have been eliminated. " + seq->getName() + " will be disregarded."); mothurOutEndLine();
+                       tax = "bad seq";
+               }else{
+                       tax = findCommonTaxonomy(closestNames);
+                       if (tax == "") { mothurOut("There are no common levels for sequence " + seq->getName() + ". " + seq->getName() + " will be disregarded."); mothurOutEndLine(); tax = "bad seq"; }
+               }
+               
+               return tax;     
+       }
+       catch(exception& e) {
+               errorOut(e, "Knn", "getTaxonomy");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+string Knn::findCommonTaxonomy(vector<string> closest)  {
+       try {
+               vector< vector<string> > taxons;  //taxon[0] = vector of taxonomy info for closest[0].
+                                                                               //so if closest[0] taxonomy is Bacteria;Alphaproteobacteria;Rhizobiales;Azorhizobium_et_rel.;Methylobacterium_et_rel.;Bosea;
+                                                                               //taxon[0][0] = Bacteria, taxon[0][1] = Alphaproteobacteria....
+                                                                               
+               taxons.resize(closest.size());
+               int smallest = 100;
+               
+               for (int i = 0; i < closest.size(); i++) {
+               
+                       string tax = taxonomy[closest[i]];  //we know its there since we checked in getTaxonomy
+               
+                       tax = tax.substr(0, tax.length()-1);  //get rid of last ';'
+       
+                       //parse taxonomy
+                       string individual;
+                       while (tax.find_first_of(';') != -1) {
+                               individual = tax.substr(0,tax.find_first_of(';'));
+                               tax = tax.substr(tax.find_first_of(';')+1, tax.length());
+                               taxons[i].push_back(individual);
+                       
+                       }
+                       //get last one
+                       taxons[i].push_back(tax);
+               
+                       //figure out who has the shortest taxonomy info. so you can start comparing there
+                       if (taxons[i].size() < smallest) {
+                               smallest = taxons[i].size();
+                       }
+               }
+       
+               //start at the highest level all the closest seqs have
+               string common = "";
+               for (int i = (smallest-1); i >= 0; i--) {
+
+                       string thistax = taxons[0][i];
+                       int num = 0;
+                       for (int j = 1; j < taxons.size(); j++) {
+                               if (taxons[j][i] != thistax) { break; }
+                               num = j;
+                       }
+               
+                       if (num == (taxons.size()-1)) { //they all match at this level
+                               for (int k = 0; k <= i; k++) {
+                                       common += taxons[0][k] + ';';
+                               }
+                               break;
+                       }
+               }
+       
+               return common;
+       }
+       catch(exception& e) {
+               errorOut(e, "Knn", "findCommonTaxonomy");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
diff --git a/knn.h b/knn.h
new file mode 100644 (file)
index 0000000..15a33b0
--- /dev/null
+++ b/knn.h
@@ -0,0 +1,35 @@
+#ifndef KNN_H
+#define KNN_H
+
+/*
+ *  knn.h
+ *  Mothur
+ *
+ *  Created by westcott on 11/4/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+#include "mothur.h"
+#include "classify.h"
+
+/**************************************************************************************************/
+
+class Knn : public Classify {
+       
+public:
+       Knn(string, string, string, int, int, int, int, int, int);
+       ~Knn() {};
+       
+       string getTaxonomy(Sequence*);
+       
+private:
+       int num;
+       string findCommonTaxonomy(vector<string>);
+};
+
+/**************************************************************************************************/
+
+#endif
+
+
index f247037cc915fd1d54d9d1e050abde8f73711fba..4848eb20be0b5cfdb43715b5ba1589d253ba4571 100644 (file)
--- a/nast.cpp
+++ b/nast.cpp
@@ -40,12 +40,12 @@ Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method
 void Nast::pairwiseAlignSeqs(){        //      Here we call one of the pairwise alignment methods to align our unaligned candidate
                                                                //      and template sequences
        try {
-               
+
                alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned());
-       
+               
                string candAln = alignment->getSeqAAln();
                string tempAln = alignment->getSeqBAln();
-
+               
                if(candAln == ""){
 
                        candidateSeq->setPairwise("");
diff --git a/phylotype.cpp b/phylotype.cpp
new file mode 100644 (file)
index 0000000..f7af97a
--- /dev/null
@@ -0,0 +1,42 @@
+/*
+ *  phylotype.cpp
+ *  Mothur
+ *
+ *  Created by westcott on 11/3/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "phylotype.h"
+
+/**************************************************************************************************/
+PhyloType::PhyloType(string tfile, string tempFile, string method, int kmerSize, int gapOpen, int gapExtend, int match, int misMatch) 
+: Classify(tfile, tempFile, method, kmerSize, gapOpen, gapExtend, match, misMatch)  {}
+/**************************************************************************************************/
+string PhyloType::getTaxonomy(Sequence* seq) {
+       try {
+               string tax;
+               
+               //use database to find closest seq
+               vector<int> closest = database->findClosestSequences(seq, 1);
+               
+               //find that sequences taxonomy in map
+               it = taxonomy.find(names[closest[0]]);
+               
+               //is this sequence in the taxonomy file
+               if (it == taxonomy.end()) { //error not in file
+                       mothurOut("Error: sequence " + names[closest[0]] + " is not in the taxonomy file.  It is the closest match to sequence " + seq->getName() + ". " + seq->getName() + " will be disregarded."); mothurOutEndLine();
+                       tax = "bad seq";
+               }else {
+                       tax = it->second;
+               }
+               
+               return tax;     
+       }
+       catch(exception& e) {
+               errorOut(e, "PhyloType", "getTaxonomy");
+               exit(1);
+       }
+}
+/**************************************************************************************************/
+
diff --git a/phylotype.h b/phylotype.h
new file mode 100644 (file)
index 0000000..ee649e3
--- /dev/null
@@ -0,0 +1,34 @@
+#ifndef PHYLOTYPE_H
+#define PHYLOTYPE_H
+
+/*
+ *  phylotype.h
+ *  Mothur
+ *
+ *  Created by westcott on 11/3/09.
+ *  Copyright 2009 Schloss Lab. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+#include "classify.h"
+
+/**************************************************************************************************/
+
+class PhyloType : public Classify {
+       
+public:
+       PhyloType(string, string, string, int, int, int, int, int);
+       ~PhyloType() {};
+       
+       string getTaxonomy(Sequence*);
+       
+       
+private:
+       
+};
+
+/**************************************************************************************************/
+
+#endif
+
index 5bcd8e96c0be725ee978ba74a4d32bb36b3aa4e0..39a9b54945cf1fb0a7d4f8f29ecc323381784d1d 100644 (file)
 
 /**************************************************************************************************/
 
-SuffixDB::SuffixDB(string fastaFileName) : Database(fastaFileName) {
-
+SuffixDB::SuffixDB(int numSeqs) : Database() {
        suffixForest.resize(numSeqs);
-       mothurOut("Generating the suffix tree database...\t");  cout.flush();
-       for(int i=0;i<numSeqs;i++){                                                             //      The parent class' constructor generates the vector of
-               suffixForest[i].loadSequence(templateSequences[i]);     //      template Sequence objects.  Here each of these objects
-       }                                                                                                               //      is used to generate a suffix tree, aka the suffix forest
-       mothurOut("DONE."); mothurOutEndLine(); mothurOutEndLine(); cout.flush();
-
+       count = 0;
 }
-
 /**************************************************************************************************/
-
-Sequence SuffixDB::findClosestSequence(Sequence* candidateSeq){
-
-       int minValue = 2000;
-       int closestSequenceNo = 0;
-       string processedSeq = candidateSeq->convert2ints();             //      the candidate sequence needs to be a string of ints
-       for(int i=0;i<suffixForest.size();i++){                                 //      scan through the forest and see what the minimum
-               int count = suffixForest[i].countSuffixes(processedSeq, minValue);      //      return score is and keep track of the
-               if(count == minValue){                                                          //      template sequence index that corresponds to that score
-                       closestSequenceNo = i;
+//assumes sequences have been added using addSequence
+vector<int> SuffixDB::findClosestSequences(Sequence* candidateSeq, int num){
+       try {
+               vector<int> topMatches;
+               string processedSeq = candidateSeq->convert2ints();             //      the candidate sequence needs to be a string of ints
+               
+               vector<seqMatch> seqMatches;
+               for(int i=0;i<suffixForest.size();i++){                                 //      scan through the forest and see what the minimum
+                       int count = suffixForest[i].countSuffixes(processedSeq);        //      return score is and keep track of the
+                       seqMatch temp(i, count);
+                       seqMatches.push_back(temp);
+               }
+               
+               //sorts putting smallest matches first
+               sort(seqMatches.begin(), seqMatches.end(), compareSeqMatchesReverse);
+               
+               searchScore = seqMatches[0].match;
+               searchScore = 100 * (1. - searchScore / (float)processedSeq.length());
+               
+               //save top matches
+               for (int i = 0; i < num; i++) {
+                       topMatches.push_back(seqMatches[i].seq);
                }
+
+               //      return the Sequence object that has the minimum score
+               return topMatches;
+       }
+       catch(exception& e) {
+               errorOut(e, "SuffixDB", "findClosestSequences");
+               exit(1);
+       }       
+}
+/**************************************************************************************************/
+//adding the sequences generates the db
+void SuffixDB::addSequence(Sequence seq) {
+       try {
+               suffixForest[count].loadSequence(seq);          
+               count++;
+       }
+       catch(exception& e) {
+               errorOut(e, "SuffixDB", "addSequence");
+               exit(1);
        }
-       searchScore = 100 * (1. - minValue / (float)processedSeq.length());
-       return templateSequences[closestSequenceNo];                    //      return the Sequence object that has the minimum score
-       
 }
-
 /**************************************************************************************************/
 
 SuffixDB::~SuffixDB(){                                                                                                         
-       
        for (int i = (suffixForest.size()-1); i >= 0; i--) {  suffixForest.pop_back();  }
-       // templateSequences.clear();
-
 }
 /**************************************************************************************************/
index f211aa7ba7f612613419c54e551cb0755d2dffc7..1baa99e8d09fbcaa3a52abefb03811d6c3a3ed5f 100644 (file)
@@ -26,12 +26,16 @@ class SuffixTree;
 class SuffixDB : public Database {
        
 public:
-       SuffixDB(string);
+       SuffixDB(int);
        ~SuffixDB();
-       Sequence findClosestSequence(Sequence*);
+       
+       void generateDB() {}; //adding sequences generates the db
+       void addSequence(Sequence);
+       vector<int> findClosestSequences(Sequence*, int);
 
 private:
        vector<SuffixTree> suffixForest;
+       int count;
 };
 
 #endif
index 53a6d3f8bc690ffec892d31eec5cb47a59ecd0e2..2842b366b3360f4b43ad32dd28603073266dcdfa 100644 (file)
@@ -125,7 +125,46 @@ int SuffixTree::countSuffixes(string compareSequence, int& minValue){      //      here we
        return numSuffixes;                                                                                     //      change the value and return the number of suffixes
        
 }
+//********************************************************************************************************************
 
+int SuffixTree::countSuffixes(string compareSequence){ //      here we count the number of suffix parts 
+                                                                                                                       //      we need to rewrite a user supplied sequence.  if the 
+       int numSuffixes = 0;                                                                    //      count exceeds the supplied minValue, bail out.  The
+       int seqLength = compareSequence.length();                               //      time complexity should be O(L)
+       int position = 0;
+       
+       int presentNode = 0;
+       
+       while(position < seqLength){            //      while the position in the query sequence isn't at the end...
+               
+               int newNode = nodeVector[presentNode]->getChild(compareSequence[position]);     //      see if the current node has a
+                                                                                                                               //      child that matches the next character in the query
+               if(newNode == -1){                                                                              
+                       if(presentNode == 0){   position++;             }                       //      if not, go back to the root and increase the count
+                       numSuffixes++;                                                                          //      by one.
+                       presentNode = 0;
+               }
+               else{                                                                                                   //      if there is, move to that node and see how far down
+                       presentNode = newNode;                                                          //      it we can get
+                       
+                       for(int i=nodeVector[newNode]->getStartCharPos(); i<=nodeVector[newNode]->getEndCharPos(); i++){
+                               if(compareSequence[position] == sequence[i]){
+                                       position++;                                                                     //      as long as the query and branch agree, keep going
+                               }
+                               else{
+                                       numSuffixes++;                                                          //      if there is a mismatch, increase the number of 
+                                       presentNode = 0;                                                        //      suffixes and go back to the root
+                                       break;
+                               }
+                       }
+               }
+               //      if we get all the way through the node we'll go to the top of the while loop and find the child node
+               //      that corresponds to what we are interested in           
+       }
+       numSuffixes--;                                                                                          //      the method puts an extra count on numSuffixes
+       
+       return numSuffixes;                                                                                     //      change the value and return the number of suffixes
+}
 //********************************************************************************************************************
 
 void SuffixTree::canonize(){   //      if you have to ask how this works, you don't really want to know and this really
index b85cee90f93cd59cc34a604840de9d8d6c613a61..ab262ba86295ea964d3857e40d387961466e7d11 100644 (file)
@@ -42,6 +42,7 @@ public:
        string getSeqName();
        void print();   
        int countSuffixes(string, int&);
+       int countSuffixes(string);      
 
 private:       
        void addPrefix(int);