]> git.donarmstrong.com Git - mothur.git/commitdiff
added alignment code
authorwestcott <westcott>
Mon, 18 May 2009 17:37:53 +0000 (17:37 +0000)
committerwestcott <westcott>
Mon, 18 May 2009 17:37:53 +0000 (17:37 +0000)
48 files changed:
Mothur.xcodeproj/project.pbxproj
aligncommand.cpp [new file with mode: 0644]
aligncommand.h [new file with mode: 0644]
alignment.cpp [new file with mode: 0644]
alignment.hpp [new file with mode: 0644]
alignmentcell.cpp [new file with mode: 0644]
alignmentcell.hpp [new file with mode: 0644]
blastalign.cpp [new file with mode: 0644]
blastalign.hpp [new file with mode: 0644]
blastdb.cpp [new file with mode: 0644]
blastdb.hpp [new file with mode: 0644]
commandfactory.cpp
database.cpp
database.hpp
distancecommand.cpp
distancedb.cpp [new file with mode: 0644]
distancedb.hpp [new file with mode: 0644]
errorchecking.cpp
errorchecking.h
globaldata.cpp
globaldata.hpp
gotohoverlap.cpp [new file with mode: 0644]
gotohoverlap.hpp [new file with mode: 0644]
kmer.cpp
kmer.hpp
kmerdb.cpp
kmerdb.hpp
nast.cpp [new file with mode: 0644]
nast.hpp [new file with mode: 0644]
nastreport.cpp [new file with mode: 0644]
nastreport.hpp [new file with mode: 0644]
needlemanoverlap.cpp [new file with mode: 0644]
needlemanoverlap.hpp [new file with mode: 0644]
noalign.cpp [new file with mode: 0644]
noalign.hpp [new file with mode: 0644]
overlap.cpp [new file with mode: 0644]
overlap.hpp [new file with mode: 0644]
rarefact.cpp
sequence.cpp
sequence.hpp
suffixdb.cpp [new file with mode: 0644]
suffixdb.hpp [new file with mode: 0644]
suffixnodes.cpp [new file with mode: 0644]
suffixnodes.hpp [new file with mode: 0644]
suffixtree.cpp [new file with mode: 0644]
suffixtree.hpp [new file with mode: 0644]
validcommands.cpp
validparameter.cpp

index 7beb6cbfe80f60a8db0c7a74dbcb02e5762e4f45..7bd433f6ba83e52aa7d9f842ca4b9adca4a5a304 100644 (file)
                372E12700F26365B0095CF7E /* readotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E126F0F26365B0095CF7E /* readotucommand.cpp */; };
                372E12960F263D5A0095CF7E /* readdistcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E12950F263D5A0095CF7E /* readdistcommand.cpp */; };
                372E12ED0F264D320095CF7E /* commandfactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E12EC0F264D320095CF7E /* commandfactory.cpp */; };
+               373C68A40FC1C07D00137ACD /* alignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68A00FC1C07D00137ACD /* alignment.cpp */; };
+               373C68A50FC1C07D00137ACD /* alignmentcell.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68A20FC1C07D00137ACD /* alignmentcell.cpp */; };
+               373C68BC0FC1C1A600137ACD /* gotohoverlap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */; };
+               373C68C50FC1C25F00137ACD /* overlap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68C30FC1C25F00137ACD /* overlap.cpp */; };
+               373C68CF0FC1C2D900137ACD /* needlemanoverlap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68CD0FC1C2D900137ACD /* needlemanoverlap.cpp */; };
+               373C68DD0FC1C38D00137ACD /* blastalign.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68DB0FC1C38D00137ACD /* blastalign.cpp */; };
+               373C68E50FC1C4A500137ACD /* noalign.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68E30FC1C4A500137ACD /* noalign.cpp */; };
+               373C68F90FC1C6A800137ACD /* suffixdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68F30FC1C6A800137ACD /* suffixdb.cpp */; };
+               373C68FA0FC1C6A800137ACD /* suffixnodes.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68F50FC1C6A800137ACD /* suffixnodes.cpp */; };
+               373C68FB0FC1C6A800137ACD /* suffixtree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68F70FC1C6A800137ACD /* suffixtree.cpp */; };
+               373C69180FC1C8AF00137ACD /* blastdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C69160FC1C8AF00137ACD /* blastdb.cpp */; };
+               373C691F0FC1C98600137ACD /* nast.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C691D0FC1C98600137ACD /* nast.cpp */; };
+               373C692B0FC1C9EB00137ACD /* nastreport.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C69290FC1C9EB00137ACD /* nastreport.cpp */; };
+               373C69340FC1CA9E00137ACD /* distancedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C69320FC1CA9E00137ACD /* distancedb.cpp */; };
                374610780F40645300460C57 /* unifracweightedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374610770F40645300460C57 /* unifracweightedcommand.cpp */; };
                3746107E0F4064D100460C57 /* weighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3746107D0F4064D100460C57 /* weighted.cpp */; };
                374610830F40652400460C57 /* unweighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374610820F40652400460C57 /* unweighted.cpp */; };
@@ -45,6 +59,7 @@
                378C1B0A0FB0644E004D63F5 /* sequencedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378C1AFD0FB0644D004D63F5 /* sequencedb.cpp */; };
                378C1B0B0FB0644E004D63F5 /* sharedjackknife.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378C1AFF0FB0644D004D63F5 /* sharedjackknife.cpp */; };
                378C1B0C0FB0644E004D63F5 /* sharedmarczewski.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378C1B010FB0644D004D63F5 /* sharedmarczewski.cpp */; };
+               378DC5CF0FBDE1C8003B8607 /* aligncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378DC5CE0FBDE1C8003B8607 /* aligncommand.cpp */; };
                379293C30F2DE73400B9034A /* treemap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379293C20F2DE73400B9034A /* treemap.cpp */; };
                379294700F2E191800B9034A /* parsimonycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3792946F0F2E191800B9034A /* parsimonycommand.cpp */; };
                3792948A0F2E258500B9034A /* parsimony.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379294890F2E258500B9034A /* parsimony.cpp */; };
                372E12940F263D5A0095CF7E /* readdistcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readdistcommand.h; sourceTree = "<group>"; };
                372E12950F263D5A0095CF7E /* readdistcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readdistcommand.cpp; sourceTree = "<group>"; };
                372E12EC0F264D320095CF7E /* commandfactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = commandfactory.cpp; sourceTree = "<group>"; };
+               373C68A00FC1C07D00137ACD /* alignment.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignment.cpp; sourceTree = "<group>"; };
+               373C68A10FC1C07D00137ACD /* alignment.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = alignment.hpp; sourceTree = "<group>"; };
+               373C68A20FC1C07D00137ACD /* alignmentcell.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignmentcell.cpp; sourceTree = "<group>"; };
+               373C68A30FC1C07D00137ACD /* alignmentcell.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = alignmentcell.hpp; sourceTree = "<group>"; };
+               373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = gotohoverlap.cpp; sourceTree = "<group>"; };
+               373C68BB0FC1C1A600137ACD /* gotohoverlap.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = gotohoverlap.hpp; sourceTree = "<group>"; };
+               373C68C30FC1C25F00137ACD /* overlap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = overlap.cpp; sourceTree = "<group>"; };
+               373C68C40FC1C25F00137ACD /* overlap.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = overlap.hpp; sourceTree = "<group>"; };
+               373C68CD0FC1C2D900137ACD /* needlemanoverlap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = needlemanoverlap.cpp; sourceTree = "<group>"; };
+               373C68CE0FC1C2D900137ACD /* needlemanoverlap.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = needlemanoverlap.hpp; sourceTree = "<group>"; };
+               373C68DB0FC1C38D00137ACD /* blastalign.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = blastalign.cpp; sourceTree = "<group>"; };
+               373C68DC0FC1C38D00137ACD /* blastalign.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = blastalign.hpp; sourceTree = "<group>"; };
+               373C68E30FC1C4A500137ACD /* noalign.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = noalign.cpp; sourceTree = "<group>"; };
+               373C68E40FC1C4A500137ACD /* noalign.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = noalign.hpp; sourceTree = "<group>"; };
+               373C68F30FC1C6A800137ACD /* suffixdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixdb.cpp; sourceTree = "<group>"; };
+               373C68F40FC1C6A800137ACD /* suffixdb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixdb.hpp; sourceTree = "<group>"; };
+               373C68F50FC1C6A800137ACD /* suffixnodes.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixnodes.cpp; sourceTree = "<group>"; };
+               373C68F60FC1C6A800137ACD /* suffixnodes.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixnodes.hpp; sourceTree = "<group>"; };
+               373C68F70FC1C6A800137ACD /* suffixtree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixtree.cpp; sourceTree = "<group>"; };
+               373C68F80FC1C6A800137ACD /* suffixtree.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixtree.hpp; sourceTree = "<group>"; };
+               373C69160FC1C8AF00137ACD /* blastdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = blastdb.cpp; sourceTree = "<group>"; };
+               373C69170FC1C8AF00137ACD /* blastdb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = blastdb.hpp; sourceTree = "<group>"; };
+               373C691D0FC1C98600137ACD /* nast.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = nast.cpp; sourceTree = "<group>"; };
+               373C691E0FC1C98600137ACD /* nast.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = nast.hpp; sourceTree = "<group>"; };
+               373C69290FC1C9EB00137ACD /* nastreport.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = nastreport.cpp; sourceTree = "<group>"; };
+               373C692A0FC1C9EB00137ACD /* nastreport.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = nastreport.hpp; sourceTree = "<group>"; };
+               373C69320FC1CA9E00137ACD /* distancedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = distancedb.cpp; sourceTree = "<group>"; };
+               373C69330FC1CA9E00137ACD /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = "<group>"; };
                374610760F40645300460C57 /* unifracweightedcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = unifracweightedcommand.h; sourceTree = "<group>"; };
                374610770F40645300460C57 /* unifracweightedcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = unifracweightedcommand.cpp; sourceTree = "<group>"; };
                3746107C0F4064D100460C57 /* weighted.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = weighted.h; sourceTree = "<group>"; };
                378C1B000FB0644D004D63F5 /* sharedjackknife.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedjackknife.h; sourceTree = "<group>"; };
                378C1B010FB0644D004D63F5 /* sharedmarczewski.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedmarczewski.cpp; sourceTree = "<group>"; };
                378C1B020FB0644D004D63F5 /* sharedmarczewski.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedmarczewski.h; sourceTree = "<group>"; };
+               378DC5CD0FBDE1C8003B8607 /* aligncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligncommand.h; sourceTree = "<group>"; };
+               378DC5CE0FBDE1C8003B8607 /* aligncommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligncommand.cpp; sourceTree = "<group>"; };
                379293C10F2DE73400B9034A /* treemap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treemap.h; sourceTree = "<group>"; };
                379293C20F2DE73400B9034A /* treemap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = treemap.cpp; sourceTree = "<group>"; };
                3792946E0F2E191800B9034A /* parsimonycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsimonycommand.h; sourceTree = "<group>"; };
                        isa = PBXGroup;
                        children = (
                                37D927BA0F21331F001D4494 /* averagelinkage.cpp */,
+                               373C68DC0FC1C38D00137ACD /* blastalign.hpp */,
+                               373C68DB0FC1C38D00137ACD /* blastalign.cpp */,
                                37D928A60F2133C0001D4494 /* calculators */,
                                37D927C20F21331F001D4494 /* cluster.hpp */,
                                37D927C10F21331F001D4494 /* cluster.cpp */,
                                37D927D10F21331F001D4494 /* commandoptionparser.hpp */,
                                37D927D00F21331F001D4494 /* commandoptionparser.cpp */,
                                37D927D20F21331F001D4494 /* completelinkage.cpp */,
-                               37D927D40F21331F001D4494 /* database.hpp */,
-                               37D927D30F21331F001D4494 /* database.cpp */,
                                7E4130F70F8E58FA00381DD0 /* dlibshuff.h */,
                                7E4130F60F8E58FA00381DD0 /* dlibshuff.cpp */,
                                37D927D60F21331F001D4494 /* display.h */,
                                37D927DD0F21331F001D4494 /* fileoutput.cpp */,
                                37D927E00F21331F001D4494 /* globaldata.hpp */,
                                37D927DF0F21331F001D4494 /* globaldata.cpp */,
+                               373C68BB0FC1C1A600137ACD /* gotohoverlap.hpp */,
+                               373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */,
                                375873ED0F7D646F0040F377 /* heatmap.h */,
                                375873EE0F7D646F0040F377 /* heatmap.cpp */,
                                37D927E60F21331F001D4494 /* inputdata.h */,
                                37D927E50F21331F001D4494 /* inputdata.cpp */,
-                               37D927EA0F21331F001D4494 /* kmer.hpp */,
-                               37D927E90F21331F001D4494 /* kmer.cpp */,
-                               37D927EC0F21331F001D4494 /* kmerdb.hpp */,
-                               37D927EB0F21331F001D4494 /* kmerdb.cpp */,
                                7E412F420F8D213C00381DD0 /* libshuff.h */,
                                7E412FE90F8D3E2C00381DD0 /* libshuff.cpp */,
                                37E3ED800F4D9D0D00B5DF02 /* mothur.h */,
                                37D927EF0F21331F001D4494 /* mothur.cpp */,
                                37D927F10F21331F001D4494 /* nameassignment.hpp */,
                                37D927F00F21331F001D4494 /* nameassignment.cpp */,
+                               373C691E0FC1C98600137ACD /* nast.hpp */,
+                               373C691D0FC1C98600137ACD /* nast.cpp */,
+                               373C692A0FC1C9EB00137ACD /* nastreport.hpp */,
+                               373C69290FC1C9EB00137ACD /* nastreport.cpp */,
+                               373C68CE0FC1C2D900137ACD /* needlemanoverlap.hpp */,
+                               373C68CD0FC1C2D900137ACD /* needlemanoverlap.cpp */,
+                               373C68E40FC1C4A500137ACD /* noalign.hpp */,
+                               373C68E30FC1C4A500137ACD /* noalign.cpp */,
                                37D927F60F21331F001D4494 /* observable.h */,
+                               373C68C40FC1C25F00137ACD /* overlap.hpp */,
+                               373C68C30FC1C25F00137ACD /* overlap.cpp */,
                                37D927FC0F21331F001D4494 /* progress.hpp */,
                                37D927FB0F21331F001D4494 /* progress.cpp */,
                                37D928040F21331F001D4494 /* raredisplay.h */,
                        isa = PBXGroup;
                        children = (
                                37D927CD0F21331F001D4494 /* command.hpp */,
+                               378DC5CD0FBDE1C8003B8607 /* aligncommand.h */,
+                               378DC5CE0FBDE1C8003B8607 /* aligncommand.cpp */,
                                37C1D9710F86506E0059E3F0 /* binsequencecommand.h */,
                                37C1D9720F86506E0059E3F0 /* binsequencecommand.cpp */,
                                21DDC0190F97A8FE0060691C /* bootstrapsharedcommand.h */,
                37D928AC0F213420001D4494 /* containers */ = {
                        isa = PBXGroup;
                        children = (
+                               373C68A10FC1C07D00137ACD /* alignment.hpp */,
+                               373C68A00FC1C07D00137ACD /* alignment.cpp */,
+                               373C68A30FC1C07D00137ACD /* alignmentcell.hpp */,
+                               373C68A20FC1C07D00137ACD /* alignmentcell.cpp */,
+                               373C69170FC1C8AF00137ACD /* blastdb.hpp */,
+                               373C69160FC1C8AF00137ACD /* blastdb.cpp */,
+                               37D927D40F21331F001D4494 /* database.hpp */,
+                               37D927D30F21331F001D4494 /* database.cpp */,
                                37D927D50F21331F001D4494 /* datavector.hpp */,
+                               373C69330FC1CA9E00137ACD /* distancedb.hpp */,
+                               373C69320FC1CA9E00137ACD /* distancedb.cpp */,
                                37D927DC0F21331F001D4494 /* fastamap.h */,
                                37D927DB0F21331F001D4494 /* fastamap.cpp */,
                                375873EA0F7D64520040F377 /* fullmatrix.h */,
                                375873EB0F7D64520040F377 /* fullmatrix.cpp */,
                                37D927E20F21331F001D4494 /* groupmap.h */,
                                37D927E10F21331F001D4494 /* groupmap.cpp */,
+                               37D927EA0F21331F001D4494 /* kmer.hpp */,
+                               37D927E90F21331F001D4494 /* kmer.cpp */,
+                               37D927EC0F21331F001D4494 /* kmerdb.hpp */,
+                               37D927EB0F21331F001D4494 /* kmerdb.cpp */,
                                37D927EE0F21331F001D4494 /* listvector.hpp */,
                                37D927ED0F21331F001D4494 /* listvector.cpp */,
                                37D927F80F21331F001D4494 /* ordervector.hpp */,
                                37D928320F21331F001D4494 /* sharedsabundvector.cpp */,
                                37D928450F21331F001D4494 /* sparsematrix.hpp */,
                                37D928440F21331F001D4494 /* sparsematrix.cpp */,
+                               373C68F40FC1C6A800137ACD /* suffixdb.hpp */,
+                               373C68F30FC1C6A800137ACD /* suffixdb.cpp */,
+                               373C68F60FC1C6A800137ACD /* suffixnodes.hpp */,
+                               373C68F50FC1C6A800137ACD /* suffixnodes.cpp */,
+                               373C68F80FC1C6A800137ACD /* suffixtree.hpp */,
+                               373C68F70FC1C6A800137ACD /* suffixtree.cpp */,
                                37AD4DB90F28E2FE00AA2D49 /* tree.h */,
                                37AD4DBA0F28E2FE00AA2D49 /* tree.cpp */,
                                379293C10F2DE73400B9034A /* treemap.h */,
                                378C1B0C0FB0644E004D63F5 /* sharedmarczewski.cpp in Sources */,
                                37C753CE0FB3415200DBD02E /* distancecommand.cpp in Sources */,
                                379643ED0FB9B5A80081FDB6 /* readseqs.cpp in Sources */,
+                               378DC5CF0FBDE1C8003B8607 /* aligncommand.cpp in Sources */,
+                               373C68A40FC1C07D00137ACD /* alignment.cpp in Sources */,
+                               373C68A50FC1C07D00137ACD /* alignmentcell.cpp in Sources */,
+                               373C68BC0FC1C1A600137ACD /* gotohoverlap.cpp in Sources */,
+                               373C68C50FC1C25F00137ACD /* overlap.cpp in Sources */,
+                               373C68CF0FC1C2D900137ACD /* needlemanoverlap.cpp in Sources */,
+                               373C68DD0FC1C38D00137ACD /* blastalign.cpp in Sources */,
+                               373C68E50FC1C4A500137ACD /* noalign.cpp in Sources */,
+                               373C68F90FC1C6A800137ACD /* suffixdb.cpp in Sources */,
+                               373C68FA0FC1C6A800137ACD /* suffixnodes.cpp in Sources */,
+                               373C68FB0FC1C6A800137ACD /* suffixtree.cpp in Sources */,
+                               373C69180FC1C8AF00137ACD /* blastdb.cpp in Sources */,
+                               373C691F0FC1C98600137ACD /* nast.cpp in Sources */,
+                               373C692B0FC1C9EB00137ACD /* nastreport.cpp in Sources */,
+                               373C69340FC1CA9E00137ACD /* distancedb.cpp in Sources */,
                        );
                        runOnlyForDeploymentPostprocessing = 0;
                };
diff --git a/aligncommand.cpp b/aligncommand.cpp
new file mode 100644 (file)
index 0000000..81f8e6a
--- /dev/null
@@ -0,0 +1,151 @@
+/*
+ *  aligncommand.cpp
+ *  Mothur
+ *
+ *  Created by Sarah Westcott on 5/15/09.
+ *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ *     This version of nast does everything I think that the greengenes nast server does and then some.  I have added the 
+ *     feature of allowing users to define their database, kmer size for searching, alignment penalty values and alignment 
+ *     method.  This latter feature is perhaps most significant.  nastPlus enables a user to use either a Needleman-Wunsch 
+ *     (non-affine gap penalty) or Gotoh (affine gap penalty) pairwise alignment algorithm.  This is significant because it
+ *     allows for a global alignment and not the local alignment provided by bLAst.  Furthermore, it has the potential to
+ *     provide a better alignment because of the banding method employed by blast (I'm not sure about this).
+ *
+ *     to compile type:
+ *             make
+ *
+ *     for basic instructions on how to run nastPlus type:
+ *             ./nastPlus
+ */
+
+#include "aligncommand.h"
+#include "sequence.hpp"
+
+#include "alignment.hpp"
+#include "gotohoverlap.hpp"
+#include "needlemanoverlap.hpp"
+#include "blastalign.hpp"
+#include "noalign.hpp"
+
+#include "database.hpp"
+#include "kmerdb.hpp"
+#include "suffixdb.hpp"
+#include "blastdb.hpp"
+#include "distancedb.hpp"
+
+#include "nast.hpp"
+#include "nastreport.hpp"
+
+
+//**********************************************************************************************************************
+AlignCommand::AlignCommand(){
+       try {
+               globaldata = GlobalData::getInstance();
+               candidateFileName = globaldata->inputFileName;
+               templateFileName = globaldata->getTemplateFile();
+               openInputFile(candidateFileName, in);
+               convert(globaldata->getKSize(), kmerSize);
+               convert(globaldata->getMatch(), match);
+               convert(globaldata->getMismatch(), misMatch);
+               convert(globaldata->getGapopen(), gapOpen);
+               convert(globaldata->getGapextend(), gapExtend);
+               distanceFileName = "????";
+       }
+       catch(exception& e) {
+               cout << "Standard Error: " << e.what() << " has occurred in the AlignCommand class Function AlignCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               exit(1);
+       }
+       catch(...) {
+               cout << "An unknown error has occurred in the AlignCommand class function AlignCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               exit(1);
+       }       
+}
+
+//**********************************************************************************************************************
+
+AlignCommand::~AlignCommand(){
+       
+}
+
+//**********************************************************************************************************************
+
+int AlignCommand::execute(){
+       try {
+               srand( (unsigned)time( NULL ) );  //needed to assign names to temporary files
+               
+               Database* templateDB;
+               if(globaldata->getSearch() == "kmer")                   {       templateDB = new KmerDB(templateFileName, kmerSize);                                                            }
+               else if(globaldata->getSearch() == "suffix")    {       templateDB = new SuffixDB(templateFileName);                                                                            }
+               else if(globaldata->getSearch() == "blast")             {       templateDB = new BlastDB(templateFileName, gapOpen, gapExtend, match, misMatch);        }
+               else if(globaldata->getSearch() == "distance")  {       templateDB = new DistanceDB(templateFileName, distanceFileName);                                        }
+               else {  cout << globaldata->getSearch() << " is not a valid search option. I will run the command using suffix." << endl;
+                               templateDB = new SuffixDB(templateFileName);            }
+       
+               Alignment* alignment;
+               if(globaldata->getAlign() == "gotoh")                                   {       alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, 3000);        }
+               else if(globaldata->getAlign() == "needleman")                  {       alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000);                       }
+               else if(globaldata->getAlign() == "blast")                              {       alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch);            }
+               else if(globaldata->getAlign() == "noalign")                    {       alignment = new NoAlign();                                                                                                      }
+               else {  cout << globaldata->getAlign() << " is not a valid alignment option. I will run the command using blast." << endl;
+                               alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch);    }
+                               
+               int numFastaSeqs=count(istreambuf_iterator<char>(in),istreambuf_iterator<char>(), '>');
+               in.seekg(0);
+       
+               string candidateAligngmentFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + globaldata->getSearch() + '.' + globaldata->getAlign() + ".nast.align";
+               ofstream candidateAlignmentFile;
+               openOutputFile(candidateAligngmentFName, candidateAlignmentFile);
+
+               string candidateReportFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + globaldata->getSearch() + '.' + globaldata->getAlign() + ".nast.report";
+               NastReport report(candidateReportFName);
+
+               cout << "We are going to align the " << numFastaSeqs << " sequences in " << candidateFileName << "..." << endl;
+               cout.flush();
+       
+               int start = time(NULL);
+
+               for(int i=0;i<numFastaSeqs;i++){
+
+                       Sequence* candidateSeq = new Sequence(in);
+                       report.setCandidate(candidateSeq);
+
+                       Sequence* templateSeq = templateDB->findClosestSequence(candidateSeq);
+                       report.setTemplate(templateSeq);
+                       report.setSearchParameters(globaldata->getSearch(), templateDB->getSearchScore());
+                               
+                       Nast nast(alignment, candidateSeq, templateSeq);
+                       report.setAlignmentParameters(globaldata->getAlign(), alignment);
+                       report.setNastParameters(nast);
+
+                       candidateAlignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl;
+                       candidateAlignmentFile.flush();
+
+                       if((i+1) % 100 == 0){
+                               cout << "It has taken " << time(NULL) - start << " secs to align " << i+1 << " sequences" << endl;
+                       }
+                       report.print();
+               
+                       delete candidateSeq;            
+               }
+               
+               cout << "It took " << time(NULL) - start << " secs to align " << numFastaSeqs << " sequences" << endl;
+               cout << endl;
+
+               delete templateDB;
+               delete alignment;
+
+                               
+               return 0;
+       }
+       catch(exception& e) {
+               cout << "Standard Error: " << e.what() << " has occurred in the AlignCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               exit(1);
+       }
+       catch(...) {
+               cout << "An unknown error has occurred in the AlignCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+               exit(1);
+       }       
+}
+
+//**********************************************************************************************************************
\ No newline at end of file
diff --git a/aligncommand.h b/aligncommand.h
new file mode 100644 (file)
index 0000000..cf30a6b
--- /dev/null
@@ -0,0 +1,35 @@
+#ifndef ALIGNCOMMAND_H
+#define ALIGNCOMMAND_H
+
+/*
+ *  aligncommand.h
+ *  Mothur
+ *
+ *  Created by Sarah Westcott on 5/15/09.
+ *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+#include "command.hpp"
+#include "globaldata.hpp"
+
+class AlignCommand : public Command {
+       
+       public:
+               AlignCommand(); 
+               ~AlignCommand();
+               int execute();  
+       
+       private:
+               GlobalData* globaldata;
+               string candidateFileName, templateFileName, distanceFileName;
+               int kmerSize;
+               float match, misMatch, gapOpen, gapExtend;
+               ofstream out;
+               ifstream in;
+
+};
+
+
+
+#endif
\ No newline at end of file
diff --git a/alignment.cpp b/alignment.cpp
new file mode 100644 (file)
index 0000000..6db3b3d
--- /dev/null
@@ -0,0 +1,163 @@
+/*
+ *  alignment.cpp
+ *
+ *  Created by Pat Schloss on 12/15/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *  This is a class for an abstract datatype for classes that implement various types of alignment     algorithms.
+ *     As of 12/18/08 these included alignments based on blastn, needleman-wunsch, and the     Gotoh algorithms
+ *  
+ */
+
+using namespace std;
+
+#include <string>
+#include <vector>
+
+#include "alignmentcell.hpp"
+#include "alignment.hpp"
+
+#include <iostream>
+
+/**************************************************************************************************/
+
+Alignment::Alignment() {       /*      do nothing      */      }
+
+/**************************************************************************************************/
+
+Alignment::Alignment(int A) : nCols(A), nRows(A) {
+       
+       alignment.resize(nRows);                        //      For the Gotoh and Needleman-Wunsch we initialize the dynamic programming
+       for(int i=0;i<nRows;i++){                       //      matrix by initializing a matrix that is A x A.  By default we will set A
+               alignment[i].resize(nCols);             //      at 2000 for 16S rRNA gene sequences
+       }       
+       
+}
+
+/**************************************************************************************************/
+
+void Alignment::traceBack(){                   //      This traceback routine is used by the dynamic programming algorithms
+                                                                               //      to fill the values of seqAaln and seqBaln
+       seqAaln = "";
+       seqBaln = "";
+       int row = lB-1;
+       int column = lA-1;
+//     seqAstart = 1;
+//     seqAend = column;
+       
+       AlignmentCell currentCell = alignment[row][column];     //      Start the traceback from the bottom-right corner of the
+                                                                                                               //      matrix
+       
+       if(currentCell.prevCell == 'x'){        seqAaln = seqBaln = "NOALIGNMENT";              }//If there's an 'x' in the bottom-
+       else{                                                                                           //      right corner bail out because it means nothing got aligned
+               while(currentCell.prevCell != 'x'){                             //      while the previous cell isn't an 'x', keep going...
+                       
+                       if(currentCell.prevCell == 'u'){                        //      if the pointer to the previous cell is 'u', go up in the
+                               seqAaln = '-' + seqAaln;                                //      matrix.  this indicates that we need to insert a gap in
+                               seqBaln = seqB[row] + seqBaln;                  //      seqA and a base in seqB
+                               currentCell = alignment[--row][column];
+                       }
+                       else if(currentCell.prevCell == 'l'){           //      if the pointer to the previous cell is 'l', go to the left
+                               seqBaln = '-' + seqBaln;                                //      in the matrix.  this indicates that we need to insert a gap
+                               seqAaln = seqA[column] + seqAaln;               //      in seqB and a base in seqA
+                               currentCell = alignment[row][--column];
+                       }
+                       else{
+                               seqAaln = seqA[column] + seqAaln;               //      otherwise we need to go diagonally up and to the left,
+                               seqBaln = seqB[row] + seqBaln;                  //      here we add a base to both alignments
+                               currentCell = alignment[--row][--column];
+                       }
+               }
+       }
+       
+       pairwiseLength = seqAaln.length();
+       seqAstart = 1;  seqAend = 0;
+       seqBstart = 1;  seqBend = 0;
+       
+       for(int i=0;i<seqAaln.length();i++){
+               if(seqAaln[i] != '-' && seqBaln[i] == '-')              {       seqAstart++;    }
+               else if(seqAaln[i] == '-' && seqBaln[i] != '-') {       seqBstart++;    }
+               else                                                                                    {       break;                  }
+       }
+       
+       pairwiseLength -= (seqAstart + seqBstart - 2);
+       
+       for(int i=seqAaln.length()-1; i>=0;i--){
+               if(seqAaln[i] != '-' && seqBaln[i] == '-')              {       seqAend++;              }
+               else if(seqAaln[i] == '-' && seqBaln[i] != '-') {       seqBend++;              }
+               else                                                                                    {       break;                  }
+       }
+       pairwiseLength -= (seqAend + seqBend);
+
+       seqAend = seqA.length() - seqAend - 1;
+       seqBend = seqB.length() - seqBend - 1;
+
+}
+
+/**************************************************************************************************/
+
+string Alignment::getSeqAAln(){
+       return seqAaln;                                                                         //      this is called to get the alignment of seqA
+}
+
+/**************************************************************************************************/
+
+string Alignment::getSeqBAln(){
+       return seqBaln;                                                                         //      this is called to get the alignment of seqB                                                     
+}
+
+/**************************************************************************************************/
+
+int Alignment::getCandidateStartPos(){
+       return seqAstart;                                                                       //      this is called to report the quality of the alignment
+}
+
+/**************************************************************************************************/
+
+int Alignment::getCandidateEndPos(){
+       return seqAend;                                                                         //      this is called to report the quality of the alignment
+}
+
+/**************************************************************************************************/
+
+int Alignment::getTemplateStartPos(){
+       return seqBstart;                                                                       //      this is called to report the quality of the alignment
+}
+
+/**************************************************************************************************/
+
+int Alignment::getTemplateEndPos(){
+       return seqBend;                                                                         //      this is called to report the quality of the alignment
+}
+
+/**************************************************************************************************/
+
+int Alignment::getPairwiseLength(){
+       return pairwiseLength;                                                          //      this is the pairwise alignment length
+}
+
+/**************************************************************************************************/
+
+//int Alignment::getLongestTemplateGap(){
+//
+//     int length = seqBaln.length();
+//     int longGap = 0;
+//     int gapLength = 0;
+//     
+//     int start = seqAstart;
+//     if(seqAstart < seqBstart){      start = seqBstart;      }
+//     for(int i=seqAstart;i<length;i++){
+//             if(seqBaln[i] == '-'){
+//                     gapLength++;
+//             }
+//             else{
+//                     if(gapLength > 0){
+//                             if(gapLength > longGap){        longGap = gapLength;    }
+//                     }
+//                     gapLength = 0;
+//             }
+//     }
+//     return longGap;
+//}
+
+/**************************************************************************************************/
diff --git a/alignment.hpp b/alignment.hpp
new file mode 100644 (file)
index 0000000..3367812
--- /dev/null
@@ -0,0 +1,53 @@
+#ifndef DPALIGNMENT_H
+#define DPALIGNMENT_H
+
+/*
+ *  dpalignment.h
+ *  
+ *
+ *  Created by Pat Schloss on 12/15/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *  This is a class for an abstract datatype for classes that implement various types of alignment     algorithms.
+ *     As of 12/18/08 these included alignments based on blastn, needleman-wunsch, and the     Gotoh algorithms
+ * 
+ */
+using namespace std;
+
+#include "mothur.h"
+#include "alignmentcell.hpp"
+
+/**************************************************************************************************/
+
+class Alignment {
+       
+public:
+       Alignment(int);
+       Alignment();
+       virtual void align(string, string) = 0;
+       
+       float getAlignmentScore();
+       string getSeqAAln();
+       string getSeqBAln();
+       int getCandidateStartPos();
+       int getCandidateEndPos();
+       int getTemplateStartPos();
+       int getTemplateEndPos();
+
+       int getPairwiseLength();
+//     int getLongestTemplateGap();
+
+protected:
+       void traceBack();
+       string seqA, seqAaln;
+       string seqB, seqBaln;
+       int seqAstart, seqAend;
+       int seqBstart, seqBend;
+       int pairwiseLength;
+       int nRows, nCols, lA, lB;
+       vector<vector<AlignmentCell> > alignment;
+};
+
+/**************************************************************************************************/
+
+#endif
diff --git a/alignmentcell.cpp b/alignmentcell.cpp
new file mode 100644 (file)
index 0000000..544c218
--- /dev/null
@@ -0,0 +1,18 @@
+/*
+ *  alignmentcell.cpp
+ *
+ *  Created by Pat Schloss on 12/15/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This class is pretty basic.  Each AlignmentCell object contains a pointer to the previous cell and different values
+ *     used to calcualte the alignment.  Initially everything is set to zero and all pointers are set to 'x'
+ *  
+ */
+
+#include "alignmentcell.hpp"
+
+//********************************************************************************************************************
+
+AlignmentCell::AlignmentCell() : prevCell('x'), cValue(0.0000), dValue(0.0000), iValue(0.0000) {};
+
+//********************************************************************************************************************
diff --git a/alignmentcell.hpp b/alignmentcell.hpp
new file mode 100644 (file)
index 0000000..c309048
--- /dev/null
@@ -0,0 +1,30 @@
+#ifndef ALIGNMENTCELL_H
+#define ALIGNMENTCELL_H
+
+/*
+ *  alignmentcell.hpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/15/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This class is pretty basic.  Each AlignmentCell object contains a pointer to the previous cell and different values
+ *     used to calcualte the alignment.  Initially everything is set to zero and all pointers are set to 'x'
+ *  
+ */
+
+//********************************************************************************************************************
+
+class AlignmentCell {
+       
+public:
+       AlignmentCell();
+       char prevCell;
+       float cValue;
+       float dValue;
+       float iValue;
+};
+
+//********************************************************************************************************************
+
+#endif
diff --git a/blastalign.cpp b/blastalign.cpp
new file mode 100644 (file)
index 0000000..0a1350f
--- /dev/null
@@ -0,0 +1,156 @@
+/*
+ *  blastalign.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/16/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This is a basic alignment method that gets the blast program to do the heavy lifting.  In the future, we should
+ *     probably incorporate NCBI's library so that we don't have to call on a user-supplied executable.  This is a child
+ *     of the Alignment class, which requires a constructor and align method.
+ *
+ */
+
+using namespace std;
+
+#include "alignment.hpp"
+#include "blastalign.hpp"
+
+
+//**************************************************************************************************/
+
+BlastAlignment::BlastAlignment(float go, float ge, float m, float mm) : 
+                       match(m),                               //      This is the score to award for two nucleotides matching (match >= 0)
+                       mismatch(mm)                    //      This is the penalty to assess for a mismatch (mismatch <= 0)
+{
+       gapOpen = abs(go);                              //      This is the penalty to assess for opening a gap (gapOpen >= 0)
+       gapExtend = abs(ge);                            //      This is the penalty to assess for extending a gap (gapExtend >= 0)
+               
+       int randNumber = rand();
+       candidateFileName = toString(randNumber) + ".candidate";
+       templateFileName = toString(randNumber) + ".template";
+       blastFileName = toString(randNumber) + ".pairwise";
+}
+
+//**************************************************************************************************/
+
+BlastAlignment::~BlastAlignment(){             //      The desctructor should clean up by removing the temporary 
+       remove(candidateFileName.c_str());      //      files used to run bl2seq
+       remove(templateFileName.c_str());
+       remove(blastFileName.c_str());
+}
+
+//**************************************************************************************************/
+
+void BlastAlignment::align(string seqA, string seqB){  //Use blastn to align the two sequences
+
+       ofstream candidateFile(candidateFileName.c_str());      //      Write the sequence to be aligned to a temporary candidate seq file
+       candidateFile << ">candidate" << endl << seqA << endl;
+       candidateFile.close();
+       
+       ofstream templateFile(templateFileName.c_str());        //      Write the unaligned template sequence to a temporary candidate seq file
+       templateFile << ">template" << endl << seqB << endl;
+       templateFile.close();
+       
+       //      The blastCommand assumes that we have DNA sequences (blastn) and that they are fairly similar (-e 0.001) and
+       //      that we don't want to apply any kind of complexity filtering (-F F)
+       string blastCommand = "~/Pipeline/src/cpp/production/blast/bin/bl2seq -p blastn -i " + candidateFileName + " -j " + templateFileName + " -e 0.0001 -F F -o " + blastFileName + " -W 11";
+       blastCommand += " -r " + toString(match) + " -q " + toString(mismatch);
+       blastCommand += " -G " + toString(gapOpen) + " -E " + toString(gapExtend);
+       
+       system(blastCommand.c_str());   //      Here we assume that "bl2seq" is in the users path or in the same folder as
+                                                                       //      this executable
+       setPairwiseSeqs();
+}
+
+/**************************************************************************************************/
+
+void BlastAlignment::setPairwiseSeqs(){        //      This method call assigns the blast generated alignment
+                                                                                                                       //      to the pairwise entry in the Sequence class for the 
+                                                                                                                       //      candidate and template Sequence objects
+       ifstream blastFile;
+       openInputFile(blastFileName, blastFile);
+       
+       seqAaln = "";
+       seqBaln = "";
+       
+       int candidateLength, templateLength;
+       char d;
+       
+       string candidateName, templateName;
+       
+       while(d=blastFile.get() != '='){};
+       blastFile >> candidateName;                                     //      Get the candidate sequence name from flatfile
+       
+       while(d=blastFile.get() != '('){};
+       blastFile >> candidateLength;                           //      Get the candidate sequence length from flatfile
+       
+       while(d=blastFile.get()){
+               if(d == '>'){
+                       blastFile >> templateName;                      //      Get the template sequence name from flatfile
+                       break;
+               }
+               else if(d == '*'){                                                                      //      We go here if there is no significant match
+                       
+                       seqAstart = 0;
+                       seqBstart = 0;
+                       seqAend = 0;
+                       seqBend = 0;
+                       pairwiseLength = 0;
+                       
+//                     string dummy;
+//                     while(dummy != "query:"){       cout << dummy << endl;blastFile >> dummy;       }
+//                     blastFile >> seqBend;
+//                     cout << seqBend << endl;
+//                     for(int i=0;i<seqBend;i++){
+//                             seqAaln += 'Z';
+//                             seqBaln += 'X';
+//                     }
+//                     pairwiseLength = 0;
+                       return;
+               }
+       }
+       
+       while(d=blastFile.get() != '='){};
+       blastFile >> templateLength;                            //      Get the template sequence length from flatfile
+               
+       while(d=blastFile.get() != 'Q'){};                      //      Suck up everything else until we get to the start of the alignment
+       int queryStart, sbjctStart, queryEnd, sbjctEnd;
+       string queryLabel, sbjctLabel, query, sbjct;
+
+       blastFile >> queryLabel;        queryLabel = 'Q' + queryLabel;
+
+       
+       while(queryLabel == "Query:"){
+               blastFile >> queryStart >> query >> queryEnd;
+               
+               while(d=blastFile.get() != 'S'){};
+               
+               blastFile >> sbjctLabel >> sbjctStart >> sbjct >> sbjctEnd;
+               
+               if(seqAaln == ""){
+                       seqAstart = queryStart;
+                       seqBstart = sbjctStart;
+               }
+
+               seqAaln += query;                                       //      concatenate each line of the sequence to what we already have
+               seqBaln += sbjct;                                       //      for the query and template (subject) sequence
+               
+               blastFile >> queryLabel;
+       }
+       seqAend = queryEnd;
+       seqBend = sbjctEnd;
+       pairwiseLength = seqAaln.length();
+
+       for(int i=1;i<seqBstart;i++){                           //      Since the alignments don't always start at (1, 1), we need to pad
+               seqAaln = 'Z' + seqAaln;                                //      the sequences so that they start at the same point
+               seqBaln = 'X' + seqBaln;
+       }
+       
+       for(int i=seqBend+1;i<=templateLength;i++){     //      since the sequences don't necessarily end at the same point, we
+               seqAaln += 'Z';                                                 //      again need ot pad the sequences so that they extend to the length
+               seqBaln += 'X';                                                 //      of the template sequence
+       }
+}
+
+//**************************************************************************************************/
diff --git a/blastalign.hpp b/blastalign.hpp
new file mode 100644 (file)
index 0000000..0f755d8
--- /dev/null
@@ -0,0 +1,34 @@
+/*
+ *  blastalign.hpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/16/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This is a basic alignment method that gets the blast program to do the heavy lifting.  In the future, we should
+ *     probably incorporate NCBI's library so that we don't have to call on a user-supplied executable.  This is a child
+ *     of the Alignment class, which requires a constructor and align method.
+ *
+ */
+#include "mothur.h"
+
+class BlastAlignment : public Alignment {
+
+public:
+       BlastAlignment(float, float, float, float);
+       ~BlastAlignment();
+       void align(string, string);
+private:
+
+       string candidateFileName;
+       string templateFileName;
+       string blastFileName;
+
+       void setPairwiseSeqs();
+       float match;
+       float mismatch;
+       float gapOpen;
+       float gapExtend;
+};
+
diff --git a/blastdb.cpp b/blastdb.cpp
new file mode 100644 (file)
index 0000000..ef8f737
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ *  blastdb.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/22/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+using namespace std;
+
+
+#include "database.hpp"
+#include "sequence.hpp"
+#include "blastdb.hpp"
+
+/**************************************************************************************************/
+
+BlastDB::BlastDB(string fastaFileName, float gO, float gE, float m, float mM) : Database(fastaFileName), 
+gapOpen(gO), gapExtend(gE), match(m), misMatch(mM) {
+
+       cout << "Generating the temporary BLAST database...\t"; cout.flush();
+
+       int randNumber = rand();
+       dbFileName = toString(randNumber) + ".template.unaligned.fasta";
+       queryFileName = toString(randNumber) + ".candidate.unaligned.fasta";
+       blastFileName = toString(randNumber) + ".blast";
+
+
+       ofstream unalignedFastaFile;
+       openOutputFile(dbFileName, unalignedFastaFile);                         
+       
+       for(int i=0;i<numSeqs;i++){                                                                     //      generating a fasta file with unaligned template
+               unalignedFastaFile << '>' << i << endl;                                 //      sequences, which will be input to formatdb
+               unalignedFastaFile << templateSequences[i]->getUnaligned() << endl;
+       }
+       unalignedFastaFile.close();
+       
+       string formatdbCommand = "~/Pipeline/src/cpp/production/blast/bin/formatdb -p F -o T -i " + dbFileName; //      format the database, -o option gives us the ability
+       system(formatdbCommand.c_str());                                                                //      to get the right sequence names, i think. -p F
+                                                                                                                                       //      option tells formatdb that seqs are DNA, not prot
+       cout << "DONE." << endl << endl;        cout.flush();
+       emptySequence = new Sequence();
+       emptySequence->setName("no_match");
+       emptySequence->setUnaligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
+       emptySequence->setAligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
+
+
+}
+
+/**************************************************************************************************/
+
+BlastDB::~BlastDB(){
+       remove(queryFileName.c_str());                          //      let's clean stuff up and remove the temp files
+       remove(dbFileName.c_str());                                     //      let's clean stuff up and remove the temp files
+       remove(blastFileName.c_str());                          //      let's clean stuff up and remove the temp files
+}
+
+/**************************************************************************************************/
+
+Sequence* BlastDB::findClosestSequence(Sequence* candidate){
+
+       ofstream queryFile;
+       openOutputFile(queryFileName, queryFile);
+       queryFile << '>' << candidate->getName() << endl;
+       queryFile << candidate->getUnaligned() << endl;
+       queryFile.close();
+       
+       
+//     the goal here is to quickly survey the database to find the closest match.  To do this we are using the default
+//     wordsize used in megablast.  I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
+//     long.  With this setting, it seems comparable in speed to the suffix tree approach.
+
+       string blastCommand = "~/Pipeline/src/cpp/production/blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28";
+       blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
+       system(blastCommand.c_str());
+       
+       ifstream m8FileHandle;
+       openInputFile(blastFileName, m8FileHandle);
+       
+       string dummy;
+       int templateAccession;
+       gobble(m8FileHandle);
+       if(!m8FileHandle.eof()){
+               m8FileHandle >> dummy >> templateAccession >> searchScore;
+               m8FileHandle.close();
+               return templateSequences[templateAccession];
+       }
+       else{
+               searchScore = 0.00;
+               return emptySequence;
+       }
+}
+
+/**************************************************************************************************/
diff --git a/blastdb.hpp b/blastdb.hpp
new file mode 100644 (file)
index 0000000..a952897
--- /dev/null
@@ -0,0 +1,34 @@
+#ifndef BLASTDB_HPP
+#define BLASTDB_HPP
+
+
+/*
+ *  blastdb.hpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/22/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+
+class BlastDB : public Database {
+public:
+       BlastDB(string, float, float, float, float);
+       ~BlastDB();
+       Sequence* findClosestSequence(Sequence*);
+
+private:
+       string dbFileName;
+       string queryFileName;
+       string blastFileName;
+       
+       float gapOpen;
+       float gapExtend;
+       float match;
+       float misMatch;
+       Sequence* emptySequence;
+};
+
+#endif
index 2e8aca7823a76b6656e091b48aeda4a23813aaa1..d41e4a81c76269baf17ca1c36b25ca5d1a57711b 100644 (file)
@@ -41,6 +41,7 @@
 #include "bootstrapsharedcommand.h"
 #include "concensuscommand.h"
 #include "distancecommand.h"
+#include "aligncommand.h"
 
 
 /***********************************************************/
@@ -93,6 +94,7 @@ Command* CommandFactory::getCommand(string commandName){
                else if(commandName == "bootstrap.shared")              {   command = new BootSharedCommand();                  }
                else if(commandName == "concensus")                             {   command = new ConcensusCommand();                   }
                else if(commandName == "dist.seqs")                             {   command = new DistanceCommand();                    }
+               else if(commandName == "align.seqs")                    {   command = new AlignCommand();                               }
                else                                                                                    {       command = new NoCommand();                                      }
 
                return command;
index 933bed4769f9d79ff3ba7acd0ce26f472566be23..8c080dbf95e79d6dc574e52d5679747018df00fa 100644 (file)
@@ -15,24 +15,23 @@ using namespace std;
 
 /**************************************************************************************************/
 
-Database::Database(string fastaFileName){
-       
-       ifstream fastaFile(fastaFileName.c_str());
-       if(!fastaFile) {
-               cerr << "Error: Could not open " << fastaFileName << endl;
-               exit(1);
-       }
+Database::Database(string fastaFileName){              //      This assumes that the template database is in fasta format, may 
+                                                                                               //      need to alter this in the future?
+
+       ifstream fastaFile;
+       openInputFile(fastaFileName, fastaFile);
+
        cout << endl << "Reading in the " << fastaFileName << " template sequences...\t";       cout.flush();
 
-       numSeqs=count(istreambuf_iterator<char>(fastaFile),istreambuf_iterator<char>(), '>');
-       fastaFile.seekg(0);
+       numSeqs=count(istreambuf_iterator<char>(fastaFile),istreambuf_iterator<char>(), '>');   //      count the number of
+       fastaFile.seekg(0);                                                                                                                                             //      sequences
        
        templateSequences.resize(numSeqs);
        
        string seqName, sequence;
        for(int i=0;i<numSeqs;i++){
-               templateSequences[i] = new Sequence();
-               
+               templateSequences[i] = new Sequence();          //      We're storing the aligned template sequences as a vector of
+                                                                                                       //      pointers to Sequence objects
                fastaFile >> seqName;
                templateSequences[i]->setName(seqName);
                
@@ -42,12 +41,13 @@ Database::Database(string fastaFileName){
                while(fastaFile && (letter=fastaFile.get()) != '>'){
                        if(isprint(letter)){
                                letter = toupper(letter);
+                               if(letter == 'U'){letter = 'T';}
                                aligned += letter;
                        }
                }
-               templateSequences[i]->setAligned(aligned);
-               templateSequences[i]->setUnaligned(aligned);
-               fastaFile.putback(letter);
+               templateSequences[i]->setAligned(aligned);      //      Obviously, we need the fully aligned template sequence
+               templateSequences[i]->setUnaligned(aligned);//  We will also need the unaligned sequence for pairwise alignments
+               fastaFile.putback(letter);                                      //      and database searches
        }
        
        fastaFile.close();
@@ -56,3 +56,7 @@ Database::Database(string fastaFileName){
 }
 
 /**************************************************************************************************/
+
+float Database::getSearchScore()       {       return searchScore;             }       //      we're assuming that the search is already done
+
+/**************************************************************************************************/
index e80bfb5949a3ce9584ae15a136a255284b9f740f..09fea858247b23cf33b351b0d56278522cf930af 100644 (file)
 
 #include "mothur.h"
 
+class Sequence;
+
 class Database {
 public:
        Database(string);
        virtual Sequence* findClosestSequence(Sequence*) = 0;
-       
+       virtual float getSearchScore();
 protected:
        int numSeqs;
-       vector<Sequence*> templateSequences;    
+       float searchScore;
+       vector<Sequence*> templateSequences;
 };
 
 #endif
index e5878b3170fcb256d2a4d8655e32b8c6bb97a795..7d1f6a0308b5358b2ef39ef70579d866fafca6da 100644 (file)
@@ -88,11 +88,12 @@ int DistanceCommand::execute(){
                
                remove(distFile.c_str());
                
-       //      #       if defined (WIN_VERSION)
+               //#     if defined (_WIN32)
+                       //figure out how to implement the fork and wait commands in windows
                //      driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff);
-       //      #       endif
+               //#     endif
                
-               #       if defined (__APPLE__) || (__MACH__)
+               #       if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
                        if(processors == 1){
                                driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff);    
                        }       
@@ -168,6 +169,8 @@ int DistanceCommand::execute(){
                                wait(NULL);
                        }
                        wait(NULL);
+               #       else
+                       driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff);
                #       endif
        
                delete distCalculator;
diff --git a/distancedb.cpp b/distancedb.cpp
new file mode 100644 (file)
index 0000000..ce74854
--- /dev/null
@@ -0,0 +1,65 @@
+/*
+ *  distancedb.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/29/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+
+using namespace std;
+
+
+#include "database.hpp"
+#include "sequence.hpp"
+#include "distancedb.hpp"
+
+
+/**************************************************************************************************/
+
+DistanceDB::DistanceDB(string fastaFileName, string distanceFileName) : Database(fastaFileName) {
+       
+       ifstream inputData;
+       openInputFile(distanceFileName, inputData);
+       int numCandSeqs=count(istreambuf_iterator<char>(inputData),istreambuf_iterator<char>(), '\n');  //      count the number of
+       inputData.seekg(0);                                                                                                                                             //      sequences
+
+       hit closestMatch;
+       string candidateSeqName;
+       string junk;
+       
+       mostSimSequenceVector.resize(numCandSeqs);
+       
+       for(int i=0;i<numCandSeqs;i++){
+               inputData >> candidateSeqName >> closestMatch.seqName >> closestMatch.indexNumber >> closestMatch.simScore;
+//             getline(inputData, junk);       
+               mostSimSequenceVector[i] = closestMatch;
+       }
+       cout << numCandSeqs << endl;
+       searchIndex = 0;
+       inputData.close();
+}
+
+/**************************************************************************************************/
+
+Sequence* DistanceDB::findClosestSequence(Sequence* candidateSeq){
+       
+       hit simAccession = mostSimSequenceVector[searchIndex];
+//     string candidateSeqName, closestMatchSeqName, junk;
+//     int closestMatchIndexNumber;
+//     float closestMatchSimScore;
+//     
+//     inputData >> candidateSeqName >> closestMatchSeqName >> closestMatchIndexNumber >> closestMatchSimScore;
+//     getline(inputData, junk);       
+
+//     searchScore = 100. * closestMatchSimScore;
+
+       searchScore = 100. * simAccession.simScore;
+       searchIndex++;
+//     return templateSequences[closestMatchIndexNumber];
+       return templateSequences[simAccession.indexNumber];
+       
+}
+
+/**************************************************************************************************/
diff --git a/distancedb.hpp b/distancedb.hpp
new file mode 100644 (file)
index 0000000..dc4c9a6
--- /dev/null
@@ -0,0 +1,36 @@
+#ifndef DISTANCEDB_HPP
+#define DISTANCEDB_HPP
+
+/*
+ *  distancedb.hpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/29/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+
+using namespace std;
+
+#include "mothur.h"
+
+class DistanceDB : public Database {
+       
+public:
+       DistanceDB(string, string);
+       Sequence* findClosestSequence(Sequence*);
+       
+private:
+
+       struct hit{
+               string seqName;
+               int indexNumber;
+               float simScore;
+       };
+       vector<hit> mostSimSequenceVector;
+//     ifstream inputData;
+       int searchIndex;
+};
+
+#endif
index 8aaa8b04fbeeb80b4c727396bab06f9899840a7e..dfe96072e3e8ae8799856982f56b85fff15b0289 100644 (file)
@@ -118,7 +118,14 @@ bool ErrorCheck::checkInput(string input) {
                                if (parameter == "scale" )                      { scale = value;        }
                                if (parameter == "ends" )                       { ends = value; }
                                if (parameter == "processors" )         { processors = value;   }
-
+                               if (parameter == "template")            { templatefile = value; }
+                               if (parameter == "search")                      { search = value;               }
+                               if (parameter == "ksize")                       { ksize = value;                }
+                               if (parameter == "align")                   { align = value;            }
+                               if (parameter == "match")                       { match = value;                }
+                               if (parameter == "mismatch")            { mismatch = value;         }
+                               if (parameter == "gapopen")                     { gapopen = value;              }
+                               if (parameter == "gapextend" )          { gapextend = value;    }
                        }
                        
                        //gets the last parameter and value
@@ -160,6 +167,14 @@ bool ErrorCheck::checkInput(string input) {
                                if (parameter == "scale" )                      { scale = value;        }
                                if (parameter == "ends" )                       { ends = value; }
                                if (parameter == "processors" )         { processors = value;   }
+                               if (parameter == "template")            { templatefile = value; }
+                               if (parameter == "search")                      { search = value;               }
+                               if (parameter == "ksize")                       { ksize = value;                }
+                               if (parameter == "align")                   { align = value;            }
+                               if (parameter == "match")                       { match = value;                }
+                               if (parameter == "mismatch")            { mismatch = value;         }
+                               if (parameter == "gapopen")                     { gapopen = value;              }
+                               if (parameter == "gapextend" )          { gapextend = value;    }
 
                        }
                }
@@ -246,9 +261,11 @@ bool ErrorCheck::checkInput(string input) {
                        }
                }
                
-               if ((commandName == "filter.seqs") || (commandName == "dist.seqs")) { 
+               if ((commandName == "filter.seqs") || (commandName == "dist.seqs") || (commandName == "align.seqs")) { 
                        if ((fastafile == "") && (nexusfile == "") && (clustalfile == "") && (phylipfile == "")) {
-                                cout << "You must read either a fasta, nexus, clustal, or phylip file before you can use the filter.seqs command." << endl; return false; 
+                                cout << "You must enter either a fasta, nexus, clustal, or phylip file before you can use the filter.seqs, dist.seqs or align.seqs command." << endl; return false; 
+                       }else if ((commandName == "align.seqs") && (templatefile == "")) {
+                               cout << "You must enter a template to use the align.seqs command." << endl; return false; 
                        }
                        validateSeqsFiles();
                }
@@ -558,9 +575,15 @@ void ErrorCheck::validateSeqsFiles() {
                                        errorFree = false;
                                }
                        }
-
+               }else if (templatefile != "") {
+                       ableToOpen = openInputFile(templatefile, filehandle);
+                       filehandle.close();
+                       if (ableToOpen == 1) { //unable to open
+                               errorFree = false;
+                       }
                }
                
+               
        }
        catch(exception& e) {
                cout << "Standard Error: " << e.what() << " has occurred in the ErrorCheck class Function validateSeqsFiles. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
@@ -642,6 +665,7 @@ void ErrorCheck::clear() {
        fastafile       =   "";
        nexusfile       =   "";
        clustalfile     =   "";
+       templatefile    =       "";
        line                    =       "";
        label                   =       "";
        method                  =   "furthest";
index 8de923499a5f84d520063d337c4be688a57d21ad..d6388f7d2337bacafd148a201cc62a27f83e1375 100644 (file)
@@ -36,7 +36,7 @@ class ErrorCheck {
                void refresh();
                string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, fastafile, nexusfile, clustalfile, treefile, sharedfile, cutoff, format; 
                string precision, method, fileroot, label, line, iters, jumble, freq, single, rarefaction, shared, summary, randomtree, abund, sorted, trump, soft, filter, scale, ends, processors;
-
+               string templatefile, search, ksize, align, match, mismatch, gapopen, gapextend;
                string commandName, optionText;
                bool errorFree;
 
index 4dfcbc1cf7129a6a9044b74f3ffb76671a0fd059..b4239f36bebb2fce5544cb171d6493c0d4a86aaf 100644 (file)
@@ -85,10 +85,14 @@ void GlobalData::parseGlobalData(string commandString, string optionText){
                                if (key == "scale")                     { scale = value;                }
                                if (key == "ends" )                     { ends = value;                 }
                                if (key == "processors" )       { processors = value;   }
-                               
-
-                               
-
+                               if (key == "template")          { templatefile = value; }
+                               if (key == "search")            { search = value;               }
+                               if (key == "ksize")                     { ksize = value;                }
+                               if (key == "align")                 { align = value;            }
+                               if (key == "match")                     { match = value;                }
+                               if (key == "mismatch")          { mismatch = value;         }
+                               if (key == "gapopen")           { gapopen = value;              }
+                               if (key == "gapextend" )        { gapextend = value;    }
                                
                                if (key == "line") {//stores lines to be used in a set
                                        lines.clear();
@@ -151,7 +155,14 @@ void GlobalData::parseGlobalData(string commandString, string optionText){
                        if (key == "scale")                     { scale = value;                }
                        if (key == "ends" )                     { ends = value;                 }
                        if (key == "processors" )       { processors = value;   }
-
+                       if (key == "template")          { templatefile = value; }
+                       if (key == "search")            { search = value;               }
+                       if (key == "ksize")                     { ksize = value;                }
+                       if (key == "align")                 { align = value;            }
+                       if (key == "match")                     { match = value;                }
+                       if (key == "mismatch")          { mismatch = value;         }
+                       if (key == "gapopen")           { gapopen = value;              }
+                       if (key == "gapextend" )        { gapextend = value;    }
 
                        if (key == "line") {//stores lines to be used in a vector
                                lines.clear();
@@ -289,6 +300,14 @@ string GlobalData::getFilter()                     {   return filter;              }
 string GlobalData::getScale()                  {       return scale;           }
 string GlobalData::getEnds()                   {   return ends;                }
 string GlobalData::getProcessors()             {       return processors;      }
+string GlobalData::getTemplateFile()   {       return templatefile;}
+string GlobalData::getSearch()                 {       return search;          }
+string GlobalData::getKSize()                  {       return ksize;           }
+string GlobalData::getAlign()                  {       return align;           }
+string GlobalData::getMatch()                  {       return match;           }
+string GlobalData::getMismatch()               {       return mismatch;        }
+string GlobalData::getGapopen()                        {       return gapopen;         }
+string GlobalData::getGapextend()              {       return gapextend;       }
 
 void GlobalData::setListFile(string file)              {       listfile = file;        inputFileName = file;}
 void GlobalData::setGroupFile(string file)             {       groupfile = file;       }
@@ -334,6 +353,7 @@ void GlobalData::clear() {
        clustalfile             =   "";
        treefile                =       "";
        sharedfile              =       "";
+       templatefile    =       "";
        cutoff                  =       "10.00";
        format                  =       "";
        precision               =       "100";
@@ -357,6 +377,14 @@ void GlobalData::clear() {
        scale                   =       "log10";
        ends                    =   "T";  //yes
        processors              =       "1";
+       search                  =       "suffix";
+       ksize                   =       "7";
+       align                   =       "blast";
+       match                   =       "1.0";
+       mismatch                =       "-1.0";
+       gapopen                 =       "-5.0";
+       gapextend               =       "-2.0";
+       
 
 }
 
@@ -379,6 +407,13 @@ void GlobalData::reset() {
        form                    =       "integral";
        ends                    =   "T";
        processors              =       "1";
+       search                  =       "suffix";
+       ksize                   =       "7";
+       align                   =       "blast";
+       match                   =       "1.0";
+       mismatch                =       "-1.0";
+       gapopen                 =       "-5.0";
+       gapextend               =       "-2.0";
 }
 /*******************************************************/
 
index 3a469f08cc52dc15ca51e13d9edf23f7a380a752..cb9aff67dfb4c07918891630c8729b61960451b5 100644 (file)
@@ -77,7 +77,14 @@ public:
        string getSorted();
        string getEnds();
        string getProcessors();
-
+       string getTemplateFile();
+       string getSearch();
+       string getKSize();
+       string getAlign();
+       string getMatch();
+       string getMismatch();
+       string getGapopen();
+       string getGapextend();
        string getTrump();
        string getSoft();
        string getFilter();
@@ -116,7 +123,8 @@ public:
 private:
 
        string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, fastafile, nexusfile, clustalfile, treefile, sharedfile, line, label, randomtree, groups;
-       string cutoff, format, precision, method, fileroot, iters, jumble, freq, calc, abund, step, form, sorted, trump, soft, filter, scale, ends, processors;
+       string cutoff, format, precision, method, fileroot, iters, jumble, freq, calc, abund, step, form, sorted, trump, soft, filter, scale, ends, processors, templatefile, search, ksize, align, match;
+       string mismatch, gapopen, gapextend;
 
 
        static GlobalData* _uniqueInstance;
diff --git a/gotohoverlap.cpp b/gotohoverlap.cpp
new file mode 100644 (file)
index 0000000..57d884b
--- /dev/null
@@ -0,0 +1,90 @@
+/*
+ *  gotohoverlap.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/15/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This class is an Alignment child class that implements the Gotoh pairwise alignment algorithm as described in:
+ *             
+ *             Gotoh O. 1982.  An improved algorithm for matching biological sequences.  J. Mol. Biol.  162:705-8.
+ *             Myers, EW & Miller, W.  1988.  Optimal alignments in linear space.  Comput Appl Biosci. 4:11-7.
+ *
+ *     This method is nice because it allows for an affine gap penalty to be assessed, which is analogous to what is used
+ *     in blast and is an alternative to Needleman-Wunsch, which only charges the same penalty for each gap position.
+ *     Because this method typically has problems at the ends when two sequences do not full overlap, we employ a separate
+ *     method to fix the ends (see Overlap class documentation)
+ *
+ */
+
+using namespace std;
+
+#include "alignmentcell.hpp"
+#include "overlap.hpp"
+#include "alignment.hpp"
+#include "gotohoverlap.hpp"
+
+/**************************************************************************************************/
+
+GotohOverlap::GotohOverlap(float gO, float gE, float m, float mm, int r) :
+       gapOpen(gO), gapExtend(gE), match(m), mismatch(mm), Alignment(r) {
+       
+       for(int i=1;i<nCols;i++){                               //      we initialize the dynamic programming matrix by setting the pointers in
+               alignment[0][i].prevCell = 'l';         //      the first row to the left
+               alignment[0][i].cValue = 0;
+               alignment[0][i].dValue = 0;
+       }
+       
+       for(int i=1;i<nRows;i++){                               //      we initialize the dynamic programming matrix by setting the pointers in
+               alignment[i][0].prevCell = 'u';         //      the first column upward
+               alignment[i][0].cValue = 0;
+               alignment[i][0].iValue = 0;
+       }
+}
+
+/**************************************************************************************************/
+
+void GotohOverlap::align(string A, string B){
+       
+       seqA = ' ' + A; lA = seqA.length();             //      the algorithm requires that the first character be a dummy value
+       seqB = ' ' + B; lB = seqB.length();             //      the algorithm requires that the first character be a dummy value
+       
+       for(int i=1;i<lB;i++){                                  //      the recursion here is shown in Webb and Miller, Fig. 1A.  Note that 
+               for(int j=1;j<lA;j++){                          //      if we need to conserve on space we should see Fig. 1B, which is linear
+                                                                                       //      in space, which I think is unnecessary
+                       float diagonal;
+                       if(seqB[i] == seqA[j])  {       diagonal = alignment[i-1][j-1].cValue + match;          }
+                       else                                    {       diagonal = alignment[i-1][j-1].cValue + mismatch;       }
+                       
+                       alignment[i][j].iValue = max(alignment[i][j-1].iValue, alignment[i][j-1].cValue + gapOpen) + gapExtend;
+                       alignment[i][j].dValue = max(alignment[i-1][j].dValue, alignment[i-1][j].cValue + gapOpen) + gapExtend;
+                       
+                       if(alignment[i][j].iValue > alignment[i][j].dValue){
+                               if(alignment[i][j].iValue > diagonal){
+                                       alignment[i][j].cValue = alignment[i][j].iValue;
+                                       alignment[i][j].prevCell = 'l';
+                               }
+                               else{
+                                       alignment[i][j].cValue = diagonal;
+                                       alignment[i][j].prevCell = 'd';
+                               }
+                       }
+                       else{
+                               if(alignment[i][j].dValue > diagonal){
+                                       alignment[i][j].cValue = alignment[i][j].dValue;
+                                       alignment[i][j].prevCell = 'u';
+                               }
+                               else{
+                                       alignment[i][j].cValue = diagonal;
+                                       alignment[i][j].prevCell = 'd';
+                               }
+                       }
+                       
+               }
+       }
+       Overlap over;
+       over.setOverlap(alignment, lA, lB, 0);  //      Fix the gaps at the ends of the sequences
+       traceBack();                                                    //      Construct the alignment and set seqAaln and seqBaln
+}
+
+/**************************************************************************************************/
diff --git a/gotohoverlap.hpp b/gotohoverlap.hpp
new file mode 100644 (file)
index 0000000..bf3c636
--- /dev/null
@@ -0,0 +1,43 @@
+#ifndef GOTOHOVERLAP_H
+#define GOTOHOVERLAP_H
+
+/*
+ *  gotohoverlap.h
+ *  
+ *
+ *  Created by Pat Schloss on 12/15/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This class is an Alignment child class that implements the Gotoh pairwise alignment algorithm as described in:
+ *             
+ *             Gotoh O. 1982.  An improved algorithm for matching biological sequences.  J. Mol. Biol.  162:705-8.
+ *             Myers, EW & Miller, W.  1988.  Optimal alignments in linear space.  Comput Appl Biosci. 4:11-7.
+ *
+ *     This method is nice because it allows for an affine gap penalty to be assessed, which is analogous to what is used
+ *     in blast and is an alternative to Needleman-Wunsch, which only charges the same penalty for each gap position.
+ *     Because this method typically has problems at the ends when two sequences do not full overlap, we employ a separate
+ *     method to fix the ends (see Overlap class documentation)
+ *
+ */
+
+#include "mothur.h"
+#include "alignment.hpp"
+
+/**************************************************************************************************/
+
+class GotohOverlap : public Alignment {
+       
+public:
+       GotohOverlap(float, float, float, float, int);
+       void align(string, string);
+       
+private:
+       float gapOpen;
+       float gapExtend;
+       float match;
+       float mismatch;
+};
+
+/**************************************************************************************************/
+
+#endif
index 875cec54e9a8547341f060de8cba73eca1517a4d..dcb6475161ce6f29f89d746fa1581ca7cab380c4 100644 (file)
--- a/kmer.cpp
+++ b/kmer.cpp
@@ -14,28 +14,29 @@ using namespace std;
 
 /**************************************************************************************************/
 
-Kmer::Kmer(int size) : kmerSize(size) {
-       
-       int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 };
-       maxKmer = power4s[kmerSize]+1;// (int)pow(4.,k)+1;
+Kmer::Kmer(int size) : kmerSize(size) {        //      The constructor sets the size of the kmer
        
+       int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+                                                                                                                                               //      No reason to waste the time of recalculating
+       maxKmer = power4s[kmerSize]+1;// (int)pow(4.,k)+1;                                      //      powers of 4 everytime through.  We need an
+                                                                                                                                               //      extra kmer if we get a non-ATGCU base
 }
 
 /**************************************************************************************************/
 
-string Kmer::getKmerString(string sequence){
-       int length = sequence.length();
-       int nKmers = length - kmerSize + 1;
+string Kmer::getKmerString(string sequence){   //      Calculate kmer for each position in the sequence, count the freq
+       int length = sequence.length();                         //      of each kmer, and convert it to an ascii character with base '!'.
+       int nKmers = length - kmerSize + 1;                     //      Export the string of characters as a string
        vector<int> counts(maxKmer, 0);
        
-       for(int i=0;i<nKmers;i++){
-               int kmerNumber = getKmerNumber(sequence, i);
+       for(int i=0;i<nKmers;i++){                                      //      Go though sequence and get the number between 0 and maxKmer for that
+               int kmerNumber = getKmerNumber(sequence, i);//  kmer.  Increase the count for the kmer in the counts vector
                counts[kmerNumber]++;
        }
        
-       string kmerString = "";
-       for(int i=0;i<maxKmer;i++){
-               kmerString += getASCII(counts[i]);
+       string kmerString = "";                                         
+       for(int i=0;i<maxKmer;i++){                                     //      Scan through the vector of counts and convert each element of the 
+               kmerString += getASCII(counts[i]);              //      vector to an ascii character that is equal to or greater than '!'
        }               
        
        return kmerString;      
@@ -44,8 +45,24 @@ string Kmer::getKmerString(string sequence){
 /**************************************************************************************************/
 
 int Kmer::getKmerNumber(string sequence, int index){
-       int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 };
+       
+//     Here we convert a kmer to a number between 0 and maxKmer.  For example, AAAA would equal 0 and TTTT would equal 255.
+//     If there's an N in the kmer, it is set to 256 (if we are looking at 4mers).  The largest we can look at are 8mers,
+//     this could be easily increased.
+//     
+//     Example:   ATGCAT (kSize = 6)
+//               i:   012345
+//
+//             Score   = 0*4^(6-0-1) + 3*4^(6-1-1) + 2*4^(6-2-1) + 1*4^(6-3-1) + 0*4^(6-4-1) + 3*4^(6-5-1)
+//                             = 0*4^5 +       3*4^4   +       2*4^3   +       1*4^2   +       0*4^1   +       3*4^0
+//                             = 0 + 3*256 + 2*64 + 1*16 + 0*4 + 3*1
+//                             = 0 + 768 + 128 + 16 + 0 + 3
+//                             = 915
+       
+       int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+       
        int kmer = 0;
+       
        for(int i=0;i<kmerSize;i++){
                if(toupper(sequence[i+index]) == 'A')           {       kmer += (0 * power4s[kmerSize-i-1]);    }
                else if(toupper(sequence[i+index]) == 'C')      {       kmer += (1 * power4s[kmerSize-i-1]);    }
@@ -60,19 +77,30 @@ int Kmer::getKmerNumber(string sequence, int index){
 /**************************************************************************************************/
        
 string Kmer::getKmerBases(int kmerNumber){
-       int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 };
+       
+//     Here we convert the kmer number into the kmer in terms of bases.
+//
+//     Example:        Score = 915 (for a 6-mer)
+//                             Base6 = (915 / 4^0) % 4 = 915 % 4 = 3 => T      [T]
+//                             Base5 = (915 / 4^1) % 4 = 228 % 4 = 0 => A      [AT]
+//                             Base4 = (915 / 4^2) % 4 = 57 % 4 = 1 => C       [CAT]
+//                             Base3 = (915 / 4^3) % 4 = 14 % 4 = 2 => G       [GCAT]
+//                             Base2 = (915 / 4^4) % 4 = 3 % 4 = 3 => T        [TGCAT]
+//                             Base1 = (915 / 4^5) % 4 = 0 % 4 = 0 => A        [ATGCAT] -> this checks out with the previous method
+       
+       int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
        
        string kmer = "";
        
-       if(kmerNumber == power4s[kmerSize]){//pow(4.,7)){       
-               for(int i=0;i<kmerSize;i++){
+       if(kmerNumber == power4s[kmerSize]){//pow(4.,7)){       //      if the kmer number is the same as the maxKmer then it must
+               for(int i=0;i<kmerSize;i++){                                    //      have had an N in it and so we'll just call it N x kmerSize
                        kmer += 'N';
                }
        }
        else{
                for(int i=0;i<kmerSize;i++){
-                       int nt = (int)(kmerNumber / (float)power4s[i]) % 4;
-                       if(nt == 0)             {       kmer = 'A' + kmer;      }
+                       int nt = (int)(kmerNumber / (float)power4s[i]) % 4;             //      the '%' operator returns the remainder 
+                       if(nt == 0)             {       kmer = 'A' + kmer;      }                               //      from int-based division ]
                        else if(nt == 1){       kmer = 'C' + kmer;      }
                        else if(nt == 2){       kmer = 'G' + kmer;      }
                        else if(nt == 3){       kmer = 'T' + kmer;      }
@@ -83,10 +111,10 @@ string Kmer::getKmerBases(int kmerNumber){
 
 /**************************************************************************************************/
 
-char Kmer::getASCII(int number)                {       return (char)(33+number);                       }
-
+char Kmer::getASCII(int number)                {       return (char)(33+number);                       }       // '!' is the first printable char and
+                                                                                                                                                               // has the int value of 33
 /**************************************************************************************************/
 
-int Kmer::getNumber(char character)    {       return ((int)(character-'!'));          }
+int Kmer::getNumber(char character)    {       return ((int)(character-'!'));          }       // '!' has the value of 33
 
 /**************************************************************************************************/
index 21a18e3ab801a9c07a3aa18f5a16e0481f65eb6b..31ca9fb4b7442a03757fc7893efa201a7d3f24be 100644 (file)
--- a/kmer.hpp
+++ b/kmer.hpp
@@ -20,11 +20,10 @@ public:
        Kmer(int);
        string getKmerString(string);
        int getKmerNumber(string, int);
+       string getKmerBases(int);
        
        
 private:
-       string getKmerBases(int);
-       
        char getASCII(int);
        int getNumber(char);
        int kmerSize;
@@ -34,4 +33,5 @@ private:
 
 /**************************************************************************************************/
 
+
 #endif
index ee8168a328a07bd8dbed1da1f6b4fa3ab0aebca7..27e6997d27878577e9121e6ecfb8950cee02e0c7 100644 (file)
@@ -5,6 +5,21 @@
  *  Created by Pat Schloss on 12/16/08.
  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
  *
+ *     This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
+ *     a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
+ *     kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
+ *     different number of kmers and each column contains the index to sequences that use that kmer.
+ *
+ *     Construction of an object of this type will first look for an appropriately named database file and if it is found
+ *     then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
+ *     (generateKmerDB)
+ *
+ *     The search method used here is roughly the same as that used in the SimRank program that is found at the
+ *     greengenes website.  The default kmer size is 7.  The speed complexity is between O(L) and O(LN).  When I use 7mers
+ *     on average a kmer is found in ~100 other sequences with a database of ~5000 sequences.  If this is the case then the
+ *     time would be on the order of O(0.1LN) -> fast.
+ *
+
  */
 
 using namespace std;
@@ -22,16 +37,16 @@ KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerS
        string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
        ifstream kmerFileTest(kmerDBName.c_str());
        
-       int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 };
+       int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
        
        maxKmer = power4s[kmerSize];
        kmerLocations.resize(maxKmer+1);
        
-       if(!kmerFileTest){
+       if(!kmerFileTest){              //      if we can open the kmer db file, then read it in...
                cout << "Generating the " << kmerDBName << " database...\t";    cout.flush();
                generateKmerDB(kmerDBName);     
        }
-       else{
+       else{                                   //      ...otherwise generate it.
                cout << "Reading in the " << kmerDBName << " database...\t";    cout.flush();
                readKmerDB(kmerDBName, kmerFileTest);
        }
@@ -42,68 +57,67 @@ KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerS
 /**************************************************************************************************/
 
 Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){
+
+       vector<int> matches(numSeqs, 0);                                                //      a record of the sequences with shared kmers
+       vector<int> timesKmerFound(kmerLocations.size()+1, 0);  //      a record of the kmers that we have already found
        
-       vector<int> matches(numSeqs, 0);
-       vector<int> timesKmerFound(kmerLocations.size()+1, 0);
-       
-       int maxMatches = 0;
+       searchScore = 0;
        int maxSequence = 0;
        
-       string query = candidateSeq->getUnaligned();
-       
-       int numKmers = query.length() - kmerSize + 1;
+       string query = candidateSeq->getUnaligned();                    //      we want to search using an unaligned dna sequence
+
+       int numKmers = query.length() - kmerSize + 1;   
        Kmer kmer(kmerSize);
        
        for(int i=0;i<numKmers;i++){
-               
-               int kmerNumber = kmer.getKmerNumber(query, i);
-               
-               if(timesKmerFound[kmerNumber] == 0){
-                       for(int j=0;j<kmerLocations[kmerNumber].size();j++){
-                               matches[kmerLocations[kmerNumber][j]]++;
+               int kmerNumber = kmer.getKmerNumber(query, i);          //      go through the query sequence and get a kmer number
+               if(timesKmerFound[kmerNumber] == 0){                            //      if we haven't seen it before...
+                       for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
+                               matches[kmerLocations[kmerNumber][j]]++;        //      that kmer
                        }
                }
-               timesKmerFound[kmerNumber] = 1;
-               
+               timesKmerFound[kmerNumber] = 1;                                         //      ok, we've seen the kmer now
        }
-       for(int i=0;i<numSeqs;i++){
-               if(matches[i] > maxMatches){
-                       maxMatches = matches[i];
+
+       for(int i=0;i<numSeqs;i++){                                                             //      find the db sequence that shares the most kmers with
+               if(matches[i] > searchScore){                                           //      the query sequence
+                       searchScore = matches[i];
                        maxSequence = i;
                }
        }
-       return templateSequences[maxSequence];
-       
+       searchScore = 100 * searchScore / (float)numKmers;
+       return templateSequences[maxSequence];                                  //      return the Sequence object corresponding to the db
+                                                                                                                       //      sequence with the most shared kmers.
 }
 
 /**************************************************************************************************/
 
 void KmerDB::generateKmerDB(string kmerDBName){
        
-       
        Kmer kmer(kmerSize);
        
-       for(int i=0;i<numSeqs;i++){
+       for(int i=0;i<numSeqs;i++){                                                             //      for all of the template sequences...
 
-               string seq = templateSequences[i]->getUnaligned();
+               string seq = templateSequences[i]->getUnaligned();      //      ...take the unaligned sequence...
                int numKmers = seq.length() - kmerSize + 1;
                
-               for(int j=0;j<numKmers;j++){
+               vector<int> seenBefore(maxKmer+1,0);
+               for(int j=0;j<numKmers;j++){                                            //      ...step though the sequence and get each kmer...
                        int kmerNumber = kmer.getKmerNumber(seq, j);
-                       kmerLocations[kmerNumber].push_back(i);
-               }
+                       if(seenBefore[kmerNumber] == 0){
+                               kmerLocations[kmerNumber].push_back(i);         //      ...insert the sequence index into kmerLocations for
+                       }                                                                                               //      the appropriate kmer number
+                       seenBefore[kmerNumber] = 1;
+               }                                                                                                       
        }
        
-       ofstream kmerFile(kmerDBName.c_str(), ios::trunc);
-       if(!kmerFile) {
-               cerr << "Error: Could not open " << kmerDBName << endl;
-               exit(1);
-       }
+       ofstream kmerFile;                                                                              //      once we have the kmerLocations folder print it out
+       openOutputFile(kmerDBName, kmerFile);                                   //      to a file
        
-       for(int i=0;i<maxKmer;i++){
-               kmerFile << i << ' ' << kmerLocations[i].size();
-               for(int j=0;j<kmerLocations[i].size();j++){
-                       kmerFile << ' ' << kmerLocations[i][j];
+       for(int i=0;i<maxKmer;i++){                                                             //      step through all of the possible kmer numbers
+               kmerFile << i << ' ' << kmerLocations[i].size();        //      print the kmer number and the number of sequences with
+               for(int j=0;j<kmerLocations[i].size();j++){                     //      that kmer.  then print out the indices of the sequences
+                       kmerFile << ' ' << kmerLocations[i][j];                 //      with that kmer.
                }
                kmerFile << endl;
        }
@@ -115,18 +129,18 @@ void KmerDB::generateKmerDB(string kmerDBName){
 
 void KmerDB::readKmerDB(string kmerDBName, ifstream& kmerDBFile){
 
-       kmerDBFile.seekg(0);
+       kmerDBFile.seekg(0);                                                                    //      start at the beginning of the file
        
        string seqName;
        int seqNumber;
        
-       for(int i=0;i<numSeqs;i++){
-               int numValues;
+       for(int i=0;i<maxKmer;i++){
+               int numValues;  
                kmerDBFile >> seqName >> numValues;
                
-               for(int j=0;j<numValues;j++){
-                       kmerDBFile >> seqNumber;
-                       kmerLocations[i].push_back(seqNumber);
+               for(int j=0;j<numValues;j++){                                           //      for each kmer number get the...
+                       kmerDBFile >> seqNumber;                                                //              1. number of sequences with the kmer number
+                       kmerLocations[i].push_back(seqNumber);                  //              2. sequence indices
                }
        }
        kmerDBFile.close();
index af52789dedc231eb5495f4a2f10cbc0efe73a37c..eb10061c9c36eab1053a07fe2cc7d0f4e3eb90cd 100644 (file)
@@ -8,6 +8,15 @@
  *  Created by Pat Schloss on 12/16/08.
  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
  *
+ *     This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
+ *     a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
+ *     kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
+ *     different number of kmers and each column contains the index to sequences that use that kmer.
+ *
+ *     Construction of an object of this type will first look for an appropriately named database file and if it is found
+ *     then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
+ *     (generateKmerDB)
+
  */
 
 #include "mothur.h"
diff --git a/nast.cpp b/nast.cpp
new file mode 100644 (file)
index 0000000..0fae6ab
--- /dev/null
+++ b/nast.cpp
@@ -0,0 +1,362 @@
+/*
+ *  nast.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/17/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This is my implementation of the NAST (nearest alignment space termination) algorithm as described in:
+ *
+ *     DeSantis TZ, Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, & Anderson GL.  2006.  NAST: a multiple
+ *             sequence alignment server for comparative analysis of 16S rRNA genes.  Nucleic Acids Research.  34:W394-9.
+ *
+ *     To construct an object one needs to provide a method of getting a pairwise alignment (alignment) and the template
+ *     and candidate sequence that are to be aligned to each other.
+ *
+ */
+
+using namespace std;
+
+#include "sequence.hpp"
+#include "alignment.hpp"
+#include "nast.hpp"
+
+/**************************************************************************************************/
+
+Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method), candidateSeq(cand), templateSeq(temp) {
+       maxInsertLength = 0;
+       pairwiseAlignSeqs();    //      This is part A in Fig. 2 of DeSantis et al.
+       regapSequences();               //      This is parts B-F in Fig. 2 of DeSantis et al.
+}
+
+/**************************************************************************************************/
+
+void Nast::pairwiseAlignSeqs(){        //      Here we call one of the pairwise alignment methods to align our unaligned candidate
+                                                               //      and template sequences
+       alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned());
+       
+       string candAln = alignment->getSeqAAln();
+       string tempAln = alignment->getSeqBAln();
+
+       if(candAln == ""){
+               candidateSeq->setPairwise("");
+               templateSeq->setPairwise(templateSeq->getUnaligned());
+       }
+       else{
+               if(tempAln[0] == '-'){
+                       int pairwiseAlignmentLength = tempAln.length(); //      we need to make sure that the candidate sequence alignment
+                       for(int i=0;i<pairwiseAlignmentLength;i++){             //      starts where the template sequence alignment starts, if it
+                               if(isalpha(tempAln[i])){                                        //      starts early, we nuke the beginning of the candidate
+                                       candAln = candAln.substr(i);                    //      sequence
+                                       tempAln = tempAln.substr(i);
+                                       break;
+                               }
+                       }
+               }
+               
+               int pairwiseAlignmentLength = tempAln.length();
+               if(tempAln[pairwiseAlignmentLength-1] == '-'){          //      we need to make sure that the candidate sequence alignment
+                       for(int i=pairwiseAlignmentLength-1; i>=0; i--){//      ends where the template sequence alignment ends, if it runs
+                               if(isalpha(tempAln[i])){                                        //      long, we nuke the end of the candidate sequence
+                                       candAln = candAln.substr(0,i+1);
+                                       tempAln = tempAln.substr(0,i+1);
+                                       break;
+                               }               
+                       }
+               }
+       }
+       candidateSeq->setPairwise(candAln);                                     //      set the pairwise sequences in the Sequence objects for
+       templateSeq->setPairwise(tempAln);                                      //      the candidate and template sequences
+       
+}
+
+/**************************************************************************************************/
+
+void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAlign){
+
+//     here we do steps C-F of Fig. 2 from DeSantis et al.
+       
+       int longAlignmentLength = newTemplateAlign.length();    
+
+       for(int i=0; i<longAlignmentLength; i++){                               //      use the long alignment as the standard
+               int rightIndex, rightRoom, leftIndex, leftRoom;
+
+               //      Part C of Fig. 2 from DeSantis et al.
+               if((isalpha(newTemplateAlign[i]) != isalpha(tempAln[i]))){      //if there is a discrepancy between the regapped
+                       
+//                     cout << i << '\t';cout.flush();
+                       
+                       //      Part D of Fig. 2 from DeSantis et al.           //      template sequence and the official template sequence
+                       for(leftIndex=i-1;leftIndex>=0;leftIndex--){    //      then we've got problems...
+                               if(!isalpha(candAln[leftIndex])){
+                                       leftRoom = 1;   //count how far it is to the nearest gap on the LEFT side of the anomaly
+                                       while(leftIndex-leftRoom>=0 && !isalpha(candAln[leftIndex-leftRoom]))   {       leftRoom++;             }
+                                       break;
+                               }
+                       }
+//                     cout << leftIndex << '\t' << leftRoom << endl;
+                       
+                       for(rightIndex=i+1;rightIndex<longAlignmentLength;rightIndex++){
+                               if(!isalpha(candAln[rightIndex])){
+                                       rightRoom = 1;  //count how far it is to the nearest gap on the RIGHT side of the anomaly
+                                       while(rightIndex+rightRoom<longAlignmentLength && !isalpha(candAln[rightIndex+rightRoom]))      {       rightRoom++;    }
+                                       break;
+                               }
+                       }
+                       
+                       int insertLength = 0;                                                   //      figure out how long the anomaly is
+                       while(!isalpha(newTemplateAlign[i + insertLength]))     {       insertLength++; }
+                       if(insertLength > maxInsertLength){     maxInsertLength = insertLength; }
+                       
+                       if((leftRoom + rightRoom) >= insertLength){
+                               //      Parts D & E from Fig. 2 of DeSantis et al.
+                               if((i-leftIndex) <= (rightIndex-i)){            //      the left gap is closer - > move stuff left there's
+                                       if(leftRoom >= insertLength){                   //      enough room to the left to move
+                                               string leftTemplateString = newTemplateAlign.substr(0,i);
+                                               string rightTemplateString = newTemplateAlign.substr(i+insertLength);
+                                               newTemplateAlign = leftTemplateString + rightTemplateString;
+                                               longAlignmentLength = newTemplateAlign.length();
+                                               
+                                               string leftCandidateString = candAln.substr(0,leftIndex-insertLength+1);
+                                               string rightCandidateString = candAln.substr(leftIndex+1);
+                                               candAln = leftCandidateString + rightCandidateString;   
+                                       }
+                                       else{                                                                   //      not enough room to the left, have to steal some space to
+                                               string leftTemplateString = newTemplateAlign.substr(0,i);       //      the right
+                                               string rightTemplateString = newTemplateAlign.substr(i+insertLength);
+                                               newTemplateAlign = leftTemplateString + rightTemplateString;
+                                               longAlignmentLength = newTemplateAlign.length();
+                                               
+                                               string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1);
+                                               string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
+                                               string rightCandidateString = candAln.substr(rightIndex+(insertLength-leftRoom));
+                                               candAln = leftCandidateString + insertString + rightCandidateString;    
+                                       }
+                               }
+                               else{                                                                           //      the right gap is closer - > move stuff right there's
+                                       if(rightRoom >= insertLength){                  //      enough room to the right to move
+                                               string leftTemplateString = newTemplateAlign.substr(0,i);
+                                               string rightTemplateString = newTemplateAlign.substr(i+insertLength);
+                                               newTemplateAlign = leftTemplateString + rightTemplateString;
+                                               longAlignmentLength = newTemplateAlign.length();
+                                               
+                                               string leftCandidateString = candAln.substr(0,rightIndex);
+                                               string rightCandidateString = candAln.substr(rightIndex+insertLength);
+                                               candAln = leftCandidateString + rightCandidateString;   
+                                               
+                                       }
+                                       else{                                                                   //      not enough room to the right, have to steal some 
+                                                                                                                       //      space to the left lets move left and then right...
+                                               string leftTemplateString = newTemplateAlign.substr(0,i);
+                                               string rightTemplateString = newTemplateAlign.substr(i+insertLength);
+                                               newTemplateAlign = leftTemplateString + rightTemplateString;
+                                               longAlignmentLength = newTemplateAlign.length();
+                                               
+                                               string leftCandidateString = candAln.substr(0,leftIndex-(insertLength-rightRoom)+1);
+                                               string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
+                                               string rightCandidateString = candAln.substr(rightIndex+rightRoom);
+                                               candAln = leftCandidateString + insertString + rightCandidateString;    
+                                               
+                                       }
+                               }
+                       }
+                       else{                                                                                   //      there could be a case where there isn't enough room in
+                               string leftTemplateString = newTemplateAlign.substr(0,i);                       //      either direction to move stuff
+                               string rightTemplateString = newTemplateAlign.substr(i+leftRoom+rightRoom);
+                               newTemplateAlign = leftTemplateString + rightTemplateString;
+                               longAlignmentLength = newTemplateAlign.length();
+                               
+                               string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1);
+                               string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
+                               string rightCandidateString = candAln.substr(rightIndex+rightRoom);
+                               candAln = leftCandidateString + insertString + rightCandidateString;    
+                       }
+                       
+                       i -= insertLength;
+               } 
+       }
+//     cout << candAln << endl << tempAln << endl << newTemplateAlign << endl;
+}
+
+/**************************************************************************************************/
+
+void Nast::regapSequences(){   //This is essentially part B in Fig 2. of DeSantis et al.
+       
+       string candPair = candidateSeq->getPairwise();
+       string candAln = "";
+       
+       string tempPair = templateSeq->getPairwise();
+       string tempAln = templateSeq->getAligned();             //      we use the template aligned sequence as our guide
+       
+       int pairwiseLength = candPair.length();
+       int fullAlignLength = tempAln.length();
+       
+       if(candPair == ""){
+               for(int i=0;i<fullAlignLength;i++)      {       candAln += '.';         }
+               candidateSeq->setAligned(candAln);
+               return;
+       }
+       
+       int fullAlignIndex = 0;
+       int pairwiseAlignIndex = 0;
+       string newTemplateAlign = "";                                   //      this is going to be messy so we want a temporary template
+                                                                                                       //      alignment string
+       while(tempAln[fullAlignIndex] == '.' || tempAln[fullAlignIndex]  == '-'){
+               candAln += '.';                                                         //      add the initial '-' and '.' to the candidate and template
+               newTemplateAlign += tempAln[fullAlignIndex];//  pairwise sequences
+               fullAlignIndex++;
+       }
+       
+       string lastLoop = "";
+       
+       while(pairwiseAlignIndex<pairwiseLength){
+               if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
+                  && isalpha(candPair[pairwiseAlignIndex])){
+                       //  the template and candidate pairwise and template aligned have characters
+                       //      need to add character onto the candidatSeq.aligned sequence
+                       
+                       candAln += candPair[pairwiseAlignIndex];
+                       newTemplateAlign += tempPair[pairwiseAlignIndex];//
+                       
+                       pairwiseAlignIndex++;
+                       fullAlignIndex++;
+               }
+               else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
+                               && isalpha(candPair[pairwiseAlignIndex])){
+                       //      the template pairwise and candidate pairwise are characters and the template aligned is a gap
+                       //      need to insert gaps into the candidateSeq.aligned sequence
+                       
+                       candAln += '-';
+                       newTemplateAlign += '-';//
+                       fullAlignIndex++;
+               }
+               else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
+                               && isalpha(candPair[pairwiseAlignIndex])){
+                       //  the template pairwise is a gap and the template aligned and pairwise sequences have characters
+                       //      this is the alpha scenario.  add character to the candidateSeq.aligned sequence without progressing
+                       //      further through the tempAln sequence.
+                       
+                       candAln += candPair[pairwiseAlignIndex];
+                       newTemplateAlign += '-';//
+                       pairwiseAlignIndex++;
+               }
+               else if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
+                               && !isalpha(candPair[pairwiseAlignIndex])){
+                       //  the template pairwise and full alignment are characters and the candidate sequence has a gap
+                       //      should not be a big deal, just add the gap position to the candidateSeq.aligned sequence;
+                       
+                       candAln += candPair[pairwiseAlignIndex];
+                       newTemplateAlign += tempAln[fullAlignIndex];//
+                       fullAlignIndex++;                       
+                       pairwiseAlignIndex++;
+               }
+               else if(!isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
+                               && isalpha(candPair[pairwiseAlignIndex])){
+                       //      the template pairwise and aligned are gaps while the candidate pairwise has a character
+                       //      this would be an insertion, go ahead and add the character->seems to be the opposite of the alpha scenario
+                       
+                       candAln += candPair[pairwiseAlignIndex];
+                       newTemplateAlign += tempAln[fullAlignIndex];//
+                       pairwiseAlignIndex++;
+                       fullAlignIndex++;                       
+               }
+               else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
+                               && !isalpha(candPair[pairwiseAlignIndex])){
+                       //      template pairwise has a character, but its full aligned sequence and candidate sequence have gaps
+                       //      this would happen like we need to add a gap.  basically the opposite of the alpha situation
+                       
+                       newTemplateAlign += tempAln[fullAlignIndex];//
+                       candAln += "-";
+                       fullAlignIndex++;                       
+               }
+               else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
+                               && !isalpha(candPair[pairwiseAlignIndex])){
+                       //      template and candidate pairwise are gaps and the template aligned is not a gap this should not be possible
+                       //      would skip the gaps and not progress through full alignment sequence
+                       //      not tested yet
+                       
+                       cout << "We're into D" << ' ' << fullAlignIndex << ' ' << pairwiseAlignIndex <<  endl;
+                       pairwiseAlignIndex++;
+               }
+               else{
+                       //      everything has a gap - not possible
+                       //      not tested yet
+                       
+                       cout << "We're into F" << ' ' << fullAlignIndex << ' ' << pairwiseAlignIndex <<  endl;
+                       pairwiseAlignIndex++;
+                       fullAlignIndex++;                       
+               }               
+       }
+       
+       for(int i=fullAlignIndex;i<fullAlignLength;i++){
+               candAln += '.';
+               newTemplateAlign += tempAln[i];//
+       }
+       
+       int start, end;
+       for(int i=0;i<candAln.length();i++){
+               if(candAln[i] == 'Z' || !isalnum(candAln[i]))   {       candAln[i] = '.';       }       //      if we padded the alignemnt from
+               else{                   start = i;                      break;          }                                                       //      blast with Z's, change them to
+       }                                                                                                                                                               //      '.' characters
+       
+       for(int i=candAln.length()-1;i>=0;i--){                                                                                 //      ditto.
+               if(candAln[i] == 'Z' || !isalnum(candAln[i]))   {       candAln[i] = '.';       }
+               else{                   end = i;                        break;          }
+       }
+       
+       for(int i=start;i<=end;i++){                                    //      go through the candidate alignment sequence and make sure that
+               candAln[i] = toupper(candAln[i]);                       //      everything is upper case
+       }
+
+//     cout << candAln << endl;
+//     cout << tempAln << endl;
+//     cout << newTemplateAlign << endl;
+
+       if(candAln.length() != tempAln.length()){               //      if the regapped candidate sequence is longer than the official
+               removeExtraGaps(candAln, tempAln, newTemplateAlign);//  template alignment then we need to do steps C-F in Fig.
+       }                                                                                               //      2 of Desantis et al.
+       
+//     cout << candAln << endl;
+//     cout << tempAln << endl;
+//     cout << newTemplateAlign << endl;
+
+       candidateSeq->setAligned(candAln);
+}
+
+/**************************************************************************************************/
+
+float Nast::getSimilarityScore(){
+
+       string cand = candidateSeq->getAligned();
+       string temp = templateSeq->getAligned();
+       int alignmentLength = temp.length();
+       int mismatch = 0;
+       int denominator = 0;
+       
+       for(int i=0;i<alignmentLength;i++){
+               if(cand[i] == '-' && temp[i] == '-'){
+                       
+               }
+               else if(cand[i] != '.' && temp[i] != '.'){
+                       denominator++;
+                       
+                       if(cand[i] != temp[i]){
+                               mismatch++;
+                       }
+               }
+       }
+       float similarity = 100 * (1. - mismatch / (float)denominator);
+       if(denominator == 0){   similarity = 0.0000;    }
+
+       return similarity;
+}
+
+/**************************************************************************************************/
+
+int Nast::getMaxInsertLength(){
+       
+       return maxInsertLength;
+       
+}
+       
+/**************************************************************************************************/
diff --git a/nast.hpp b/nast.hpp
new file mode 100644 (file)
index 0000000..4cf1b85
--- /dev/null
+++ b/nast.hpp
@@ -0,0 +1,49 @@
+#ifndef NAST_HPP
+#define NAST_HPP
+
+/*
+ *  nast.hpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/17/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This is my implementation of the NAST (nearest alignment space termination) algorithm as described in:
+ *
+ *     DeSantis TZ, Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, & Anderson GL.  2006.  NAST: a multiple
+ *             sequence alignment server for comparative analysis of 16S rRNA genes.  Nucleic Acids Research.  34:W394-9.
+ *
+ *     To construct an object one needs to provide a method of getting a pairwise alignment (alignment) and the template
+ *     and candidate sequence that are to be aligned to each other.
+ *
+ */
+
+#include "mothur.h"
+
+class Alignment;
+class Sequence;
+
+/**************************************************************************************************/
+
+class Nast {
+       
+public:
+       Nast(Alignment*, Sequence*, Sequence*);
+       float getSimilarityScore();
+       int getMaxInsertLength();
+       
+private:
+       void pairwiseAlignSeqs();
+       void regapSequences();
+       void removeExtraGaps(string&, string, string);
+       
+       Alignment* alignment;
+       Sequence* candidateSeq;
+       Sequence* templateSeq;
+       
+       int maxInsertLength;
+};
+
+/**************************************************************************************************/
+
+#endif
diff --git a/nastreport.cpp b/nastreport.cpp
new file mode 100644 (file)
index 0000000..67ec702
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ *  nastreport.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/19/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+using namespace std;
+
+#include "sequence.hpp"
+#include "nast.hpp"
+#include "alignment.hpp"
+#include "nastreport.hpp"
+
+/******************************************************************************************************************/
+
+NastReport::NastReport(string candidateReportFName) {
+       openOutputFile(candidateReportFName, candidateReportFile);
+       
+       candidateReportFile << "QueryName\tQueryLength\tTemplateName\tTemplateLength\t";
+       candidateReportFile << "SearchMethod\tSearchScore\t";
+       candidateReportFile << "AlignmentMethod\tQueryStart\tQueryEnd\tTemplateStart\tTemplateEnd\t";
+       candidateReportFile << "PairwiseAlignmentLength\tGapsInQuery\tGapsInTemplate\t";
+       candidateReportFile << "LongestInsert\t";
+       candidateReportFile << "SimBtwnQuery&Template" << endl;
+}
+
+/******************************************************************************************************************/
+
+void NastReport::print(){
+       
+       candidateReportFile << queryName << '\t' << queryLength << '\t' << templateName << '\t' << templateLength << '\t';
+       candidateReportFile << searchMethod << '\t' << setprecision(2) << fixed << searchScore << '\t';
+
+       candidateReportFile << alignmentMethod << '\t' << candidateStartPosition << "\t" << candidateEndPosition << '\t';
+       candidateReportFile << templateStartPosition << "\t" << templateEndPosition << '\t';
+       candidateReportFile << pairwiseAlignmentLength << '\t' << totalGapsInQuery << '\t' << totalGapsInTemplate << '\t';
+       candidateReportFile << longestInsert << '\t';;
+       candidateReportFile << setprecision(2) << similarityToTemplate;
+       
+       candidateReportFile << endl;
+       candidateReportFile.flush();
+}
+
+/******************************************************************************************************************/
+
+void NastReport::setCandidate(Sequence* candSeq){ 
+       queryName = candSeq->getName();
+       queryLength = candSeq->getUnalignLength();
+}
+
+/******************************************************************************************************************/
+
+void NastReport::setTemplate(Sequence* tempSeq){ 
+       templateName = tempSeq->getName();
+       templateLength = tempSeq->getUnalignLength();
+}
+
+/******************************************************************************************************************/
+
+void NastReport::setSearchParameters(string method, float score){
+       searchMethod = method;
+       searchScore = score;
+}
+
+/******************************************************************************************************************/
+
+void NastReport::setAlignmentParameters(string method, Alignment* align){
+       alignmentMethod = method;
+       
+       candidateStartPosition = align->getCandidateStartPos();
+       candidateEndPosition = align->getCandidateEndPos();
+       templateStartPosition = align->getTemplateStartPos();
+       templateEndPosition = align->getTemplateEndPos();
+       pairwiseAlignmentLength = align->getPairwiseLength();
+
+       totalGapsInQuery = pairwiseAlignmentLength - (candidateEndPosition - candidateStartPosition + 1);
+       totalGapsInTemplate = pairwiseAlignmentLength - (templateEndPosition - templateStartPosition + 1);
+}
+
+/******************************************************************************************************************/
+
+void NastReport::setNastParameters(Nast nast){
+
+       longestInsert = nast.getMaxInsertLength();
+       similarityToTemplate = nast.getSimilarityScore();
+       
+}
+
+/******************************************************************************************************************/
diff --git a/nastreport.hpp b/nastreport.hpp
new file mode 100644 (file)
index 0000000..424b0bb
--- /dev/null
@@ -0,0 +1,51 @@
+#ifndef NASTREPORT_HPP
+#define NASTREPORT_HPP
+
+
+/*
+ *  nastreport.hpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/19/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+
+using namespace std;
+
+/******************************************************************************************************************/
+
+class NastReport {
+
+public:
+       NastReport(string);
+       void setCandidate(Sequence*);
+       void setTemplate(Sequence*);
+       void setSearchParameters(string, float);
+       void setAlignmentParameters(string, Alignment*);
+       void setNastParameters(Nast);
+       void print();
+       
+private:
+       string queryName;
+       int queryLength;
+       string templateName;
+       int templateLength;
+       string searchMethod;
+       float searchScore;
+       string alignmentMethod;
+       int candidateStartPosition, candidateEndPosition;
+       int templateStartPosition, templateEndPosition;
+
+       int pairwiseAlignmentLength;
+       int longestInsert;
+       int totalGapsInQuery, totalGapsInTemplate;
+       float similarityToTemplate;
+       ofstream candidateReportFile;
+};
+
+/******************************************************************************************************************/
+
+#endif
diff --git a/needlemanoverlap.cpp b/needlemanoverlap.cpp
new file mode 100644 (file)
index 0000000..8f5d464
--- /dev/null
@@ -0,0 +1,92 @@
+/*
+ *  needleman.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/15/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This class is an Alignment child class that implements the Gotoh pairwise alignment algorithm as described in:
+ *             
+ *             Gotoh O. 1982.  An improved algorithm for matching biological sequences.  J. Mol. Biol.  162:705-8.
+ *             Myers, EW & Miller, W.  1988.  Optimal alignments in linear space.  Comput Appl Biosci. 4:11-7.
+ *
+ *     This method is nice because it allows for an affine gap penalty to be assessed, which is analogous to what is used
+ *     in blast and is an alternative to Needleman-Wunsch, which only charges the same penalty for each gap position.
+ *     Because this method typically has problems at the ends when two sequences do not full overlap, we employ a separate
+ *     method to fix the ends (see Overlap class documentation)
+ *
+ */
+
+using namespace std;
+
+#include "alignmentcell.hpp"
+#include "alignment.hpp"
+#include "overlap.hpp"
+#include "needlemanoverlap.hpp"
+
+/**************************************************************************************************/
+
+NeedlemanOverlap::NeedlemanOverlap(float gO, float m, float mm, int r) ://     note that we don't have a gap extend
+gap(gO), match(m), mismatch(mm), Alignment(r) {                                                        //      the gap openning penalty is assessed for
+                                                                                                                                               //      every gapped position
+       for(int i=1;i<nCols;i++){
+               alignment[0][i].prevCell = 'l';                                 //      initialize first row by pointing all poiters to the left
+               alignment[0][i].cValue = 0;                                             //      and the score to zero
+       }
+       
+       for(int i=1;i<nRows;i++){
+               alignment[i][0].prevCell = 'u';                                 //      initialize first column by pointing all poiters upwards
+               alignment[i][0].cValue = 0;                                             //      and the score to zero
+       }
+}
+
+/**************************************************************************************************/
+
+NeedlemanOverlap::~NeedlemanOverlap(){ /*      do nothing      */      }
+
+/**************************************************************************************************/
+
+void NeedlemanOverlap::align(string A, string B){
+       
+       seqA = ' ' + A; lA = seqA.length();             //      algorithm requires a dummy space at the beginning of each string
+       seqB = ' ' + B; lB = seqB.length();             //      algorithm requires a dummy space at the beginning of each string
+       
+       for(int i=1;i<lB;i++){                                  //      This code was largely translated from Perl code provided in Ex 3.1 
+               for(int j=1;j<lA;j++){                          //      of the O'Reilly BLAST book.  I found that the example output had a
+                                                                                       //      number of errors
+                       float diagonal;
+                       if(seqB[i] == seqA[j])  {       diagonal = alignment[i-1][j-1].cValue + match;          }
+                       else                                    {       diagonal = alignment[i-1][j-1].cValue + mismatch;       }
+                       
+                       float up        = alignment[i-1][j].cValue + gap;
+                       float left      = alignment[i][j-1].cValue + gap;
+                       
+                       if(diagonal >= up){
+                               if(diagonal >= left){
+                                       alignment[i][j].cValue = diagonal;
+                                       alignment[i][j].prevCell = 'd';
+                               }
+                               else{
+                                       alignment[i][j].cValue = left;
+                                       alignment[i][j].prevCell = 'l';
+                               }
+                       }
+                       else{
+                               if(up >= left){
+                                       alignment[i][j].cValue = up;
+                                       alignment[i][j].prevCell = 'u';
+                               }
+                               else{
+                                       alignment[i][j].cValue = left;
+                                       alignment[i][j].prevCell = 'l';
+                               }
+                       }
+               }
+       }
+       Overlap over;                                           
+       over.setOverlap(alignment, lA, lB, 0);          //      Fix gaps at the beginning and end of the sequences
+       traceBack();                                                            //      Traceback the alignment to populate seqAaln and seqBaln
+}
+
+/**************************************************************************************************/
+
diff --git a/needlemanoverlap.hpp b/needlemanoverlap.hpp
new file mode 100644 (file)
index 0000000..57d3284
--- /dev/null
@@ -0,0 +1,43 @@
+#ifndef NEEDLEMAN_H
+#define NEEDLEMAN_H
+
+/*
+ *  needleman.h
+ *  
+ *
+ *  Created by Pat Schloss on 12/15/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This class is an Alignment child class that implements the Needleman-Wunsch pairwise alignment algorithm as
+ *     described in:
+ *             
+ *     Needleman SB & Wunsch CD.  1970.  A general method applicable to the search for similarities in the amino acid
+ *             sequence of two proteins.  J Mol Biol.  48:443-53.
+ *     Korf I, Yandell M, & Bedell J.  2003.  BLAST.  O'Reilly & Associates.  Sebastopol, CA.
+ *
+ *     This method is simple as it assesses a consistent penalty for each gap position.  Because this method typically has
+ *     problems at the ends when two sequences do not full overlap, we employ a separate method to fix the ends (see
+ *     Overlap class documentation)
+ *
+ */
+
+#include "mothur.h"
+
+/**************************************************************************************************/
+
+class NeedlemanOverlap : public Alignment {
+       
+public:
+       NeedlemanOverlap(float, float, float, int);
+       ~NeedlemanOverlap();
+       void align(string, string);
+       
+private:       
+       float gap;
+       float match;
+       float mismatch;
+};
+
+/**************************************************************************************************/
+
+#endif
diff --git a/noalign.cpp b/noalign.cpp
new file mode 100644 (file)
index 0000000..14e34f6
--- /dev/null
@@ -0,0 +1,25 @@
+/*
+ *  noalign.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 2/19/09.
+ *  Copyright 2009 __MyCompanyName__. All rights reserved.
+ *
+ */
+
+#include "alignment.hpp"
+#include "noalign.hpp"
+
+/**************************************************************************************************/
+
+NoAlign::NoAlign(){    /*      do nothing      */      }
+
+/**************************************************************************************************/
+
+NoAlign::~NoAlign(){   /*      do nothing      */      }
+
+/**************************************************************************************************/
+
+void NoAlign::align(string A, string B){       }
+
+/**************************************************************************************************/
diff --git a/noalign.hpp b/noalign.hpp
new file mode 100644 (file)
index 0000000..dd1e4ba
--- /dev/null
@@ -0,0 +1,31 @@
+#ifndef NOALIGN_HPP
+#define NOALIGN_HPP
+
+/*
+ *  noalign.hpp
+ *  
+ *
+ *  Created by Pat Schloss on 2/19/09.
+ *  Copyright 2009 __MyCompanyName__. All rights reserved.
+ *
+ */
+using namespace std;
+
+#include "mothur.h"
+
+/**************************************************************************************************/
+
+class NoAlign : public Alignment {
+       
+public:
+       NoAlign();
+       ~NoAlign();
+       void align(string, string);
+       
+private:       
+};
+
+/**************************************************************************************************/
+
+
+#endif
diff --git a/overlap.cpp b/overlap.cpp
new file mode 100644 (file)
index 0000000..9645e7a
--- /dev/null
@@ -0,0 +1,83 @@
+/*
+ *  overlap.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/15/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This class cleans up the alignment at the 3' end of the alignments.  Because the Gotoh and Needleman-Wunsch
+ *     algorithms start the traceback from the lower-right corner of the dynamic programming matrix, there may be a lot of
+ *     scattered bases in the alignment near the 3' end of the alignment.  Here we basically look for the largest score
+ *     in the last column and row to determine whether there should be exta gaps in sequence A or sequence B.  The gap
+ *     issues at the 5' end of the alignment seem to take care of themselves in the traceback.
+ *
+ */
+using namespace std;
+
+#include "alignmentcell.hpp"
+#include "overlap.hpp"
+
+
+/**************************************************************************************************/
+
+int Overlap::maxRow(vector<vector<AlignmentCell> >& alignment, const int band){
+       
+       float max = -100;
+       int end = lA - 1;
+       int index = end;
+       
+       for(int i=band;i<lB;i++){                                       //      find the row where the right most column has the highest alignment
+               if(alignment[i][end].cValue >= max){    //      score.
+                       index = i;
+                       max = alignment[i][end].cValue;
+               }
+       }
+       return index;
+}
+
+/**************************************************************************************************/
+
+int Overlap::maxColumn(vector<vector<AlignmentCell> >& alignment, const int band){
+       
+       float max = -100;
+       int end = lB - 1;
+       int index = end;
+       
+       for(int i=band;i<lA;i++){                                       //      find the column where the bottom most column has the highest
+               if(alignment[end][i].cValue >= max){    //      alignment score.
+                       index = i;
+                       max = alignment[end][i].cValue;
+               }
+       }
+       return index;
+}
+
+/**************************************************************************************************/
+
+void Overlap::setOverlap(vector<vector<AlignmentCell> >& alignment, const int nA, const int nB, const int band=0){
+       
+       lA = nA;
+       lB = nB;        
+       
+       int rowIndex = maxRow(alignment, band);         //      get the index for the row with the highest right hand side score
+       int colIndex = maxColumn(alignment, band);      //      get the index for the column with the highest bottom row score
+       
+       int row = lB-1;
+       int column = lA-1;
+       
+       if(colIndex == column && rowIndex == row){}     //      if the max values are the lower right corner, then we're good
+       else if(alignment[row][colIndex].cValue < alignment[rowIndex][column].cValue){
+               for(int i=rowIndex+1;i<lB;i++){                 //      decide whether sequence A or B needs the gaps at the end either set 
+                       alignment[i][column].prevCell = 'u';//  the pointer upwards or...
+               }
+               
+       }
+       else {
+               for(int i=colIndex+1;i<lA;i++){
+                       alignment[row][i].prevCell = 'l';       //      ...to the left
+               }
+       }
+}                                                                                              //      the traceback should take care of the gaps at the 5' end
+
+/**************************************************************************************************/
+
diff --git a/overlap.hpp b/overlap.hpp
new file mode 100644 (file)
index 0000000..f87f808
--- /dev/null
@@ -0,0 +1,37 @@
+#ifndef OVERLAP_H
+#define OVERLAP_H
+
+/*
+ *  overlap.hpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/15/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This class cleans up the alignment at the 3' end of the alignments.  Because the Gotoh and Needleman-Wunsch
+ *     algorithms start the traceback from the lower-right corner of the dynamic programming matrix, there may be a lot of
+ *     scattered bases in the alignment near the 3' end of the alignment.  Here we basically look for the largest score
+ *     in the last column and row to determine whether there should be exta gaps in sequence A or sequence B.  The gap
+ *     issues at the 5' end of the alignment seem to take care of themselves in the traceback.
+ *
+ */
+
+#include "mothur.h"
+
+/**************************************************************************************************/
+
+class Overlap {
+       
+public:
+       Overlap(){};
+       ~Overlap(){};
+       void setOverlap(vector<vector<AlignmentCell> >&, const int, const int, const int);
+private:
+       int maxRow(vector<vector<AlignmentCell> >&, const int);
+       int maxColumn(vector<vector<AlignmentCell> >&, const int);
+       int lA, lB;
+};
+
+/**************************************************************************************************/
+
+#endif
index 5c1ce741c951a620fb68529b834a3fd44e4b1840..ba0c02e2a474545474e4cf13efbdc633bfbbb3a3 100644 (file)
@@ -94,15 +94,23 @@ try {
                        //randomize the groups
                        random_shuffle(lookup.begin(), lookup.end());
                        
+                       //make merge the size of lookup[0]
+                       SharedRAbundVector* merge = new SharedRAbundVector(lookup[0]->size());
+                       
+                       //make copy of lookup zero
+                       for(int i = 0; i<lookup[0]->size(); i++) {
+                               merge->set(i, lookup[0]->getAbundance(i), "merge");
+                       }
+                       
                        vector<SharedRAbundVector*> subset;
                        //send each group one at a time
                        for (int k = 0; k < lookup.size(); k++) { 
                                subset.clear(); //clears out old pair of sharedrabunds
                                //add in new pair of sharedrabunds
-                               subset.push_back(lookup[0]); subset.push_back(lookup[k]);
+                               subset.push_back(merge); subset.push_back(lookup[k]);
                                
                                rcd->updateSharedData(subset, k+1, numGroupComb);
-                               mergeVectors(lookup[0], lookup[k]);
+                               mergeVectors(merge, lookup[k]);
                        }
 
                        //resets output files
@@ -134,9 +142,7 @@ void Rarefact::mergeVectors(SharedRAbundVector* shared1, SharedRAbundVector* sha
        try{
                for (int k = 0; k < shared1->size(); k++) {
                        //merge new species into shared1
-                       if ((shared1->getAbundance(k) == 0) && (shared2->getAbundance(k) != 0)) {
-                               shared1->set(k, shared2->getAbundance(k), "combo");  //set to 'combo' since this vector now contains multiple groups
-                       }
+                       shared1->set(k, (shared1->getAbundance(k) + shared2->getAbundance(k)), "combo");  //set to 'combo' since this vector now contains multiple groups
                }
        }
        catch(exception& e) {
index b59363e13cc92f969145434d1ba8c0fa64573778..f136426b327fdbd13d4143645b80cb060059325e 100644 (file)
@@ -24,6 +24,36 @@ Sequence::Sequence(string newName, string sequence) {
        }
        setUnaligned(sequence);
 }
+//********************************************************************************************************************
+
+Sequence::Sequence(ifstream& fastaFile){
+       
+       string accession;                               //      provided a file handle to a fasta-formatted sequence file, read in the next
+       fastaFile >> accession;                 //      accession number and sequence we find...
+       setName(accession);
+
+       char letter;
+       string sequence;
+       
+       while(fastaFile){
+               letter= fastaFile.get();
+               if(letter == '>'){
+                       fastaFile.putback(letter);
+                       break;
+               }
+               else if(isprint(letter)){
+                       letter = toupper(letter);
+                       if(letter == 'U'){letter = 'T';}
+                       sequence += letter;
+               }
+               
+       }
+
+       if(sequence.find_first_of('-') != string::npos){        //      if there are any gaps in the sequence, assume that it is
+               setAligned(sequence);                                                   //      an alignment file
+       }
+       setUnaligned(sequence);                                                         //      also set the unaligned sequence file
+}
 
 //********************************************************************************************************************
 
@@ -122,3 +152,18 @@ void Sequence::printSequence(ostream& out){
 }
 
 //********************************************************************************************************************
+
+int Sequence::getUnalignLength(){
+       return unaligned.length();
+}
+
+//********************************************************************************************************************
+
+int Sequence::getAlignLength(){
+       return aligned.length();
+}
+
+//********************************************************************************************************************
+
+
+
index 03cbab7ffddce5531e272cf6116069edfa0ebade..37ca87cf019ead5dd230bb8226d09761e7d16880 100644 (file)
@@ -8,6 +8,10 @@
  *  Created by Pat Schloss on 12/15/08.
  *  Copyright 2008 Patrick D. Schloss. All rights reserved.
  *
+ *     A sequence object has three components: i) an accession number / name, ii) the unaligned primary sequence, iii) a
+ *     pairwise aligned sequence, and iv) a sequence that is aligned to a reference alignment.  This class has methods
+ *     to set and get these values for the other classes where they are needed.  *
+ *
  */
 using namespace std;
 
@@ -20,6 +24,7 @@ class Sequence {
 public:
        Sequence();
        Sequence(string, string);
+       Sequence(ifstream&);
        
        void setName(string);
        void setUnaligned(string);
@@ -32,7 +37,9 @@ public:
        string getAligned();
        string getPairwise();
        string getUnaligned();
-       int getLength();
+       int getLength();  //the greater of the lengths of unaligned and aligned
+       int getUnalignLength();
+       int getAlignLength();
        void printSequence(ostream&);
        
 private:
diff --git a/suffixdb.cpp b/suffixdb.cpp
new file mode 100644 (file)
index 0000000..b4639f9
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ *  suffixdb.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/16/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This is a child class of the Database abstract datatype.  The class is basically a database of suffix trees and an
+ *     encapsulation of the method for finding the most similar tree to an inputted sequence.  the suffixForest objecct
+ *     is a vector of SuffixTrees, with each template sequence being represented by a different SuffixTree.  The class also
+ *     provides a method to take an unaligned sequence and find the closest sequence in the suffixForest.  The search
+ *     method is inspired by the article and Perl source code provided at http://www.ddj.com/web-development/184416093.  I 
+ *     would estimate that the time complexity is O(LN) for each search, which is slower than the kmer searching, but 
+ *     faster than blast
+ *
+ */
+
+using namespace std;
+
+#include "database.hpp"
+#include "sequence.hpp"
+#include "suffixtree.hpp"
+#include "suffixdb.hpp"
+
+/**************************************************************************************************/
+
+SuffixDB::SuffixDB(string fastaFileName) : Database(fastaFileName) {
+
+       suffixForest.resize(numSeqs);
+       cout << "Generating the suffix tree database...\t";     cout.flush();
+       for(int i=0;i<numSeqs;i++){                                                             //      The parent class' constructor generates the vector of
+               suffixForest[i].loadSequence(templateSequences[i]);     //      template Sequence objects.  Here each of these objects
+       }                                                                                                               //      is used to generate a suffix tree, aka the suffix forest
+       cout << "DONE." << endl << endl;        cout.flush();
+
+}
+
+/**************************************************************************************************/
+
+Sequence* SuffixDB::findClosestSequence(Sequence* candidateSeq){
+
+       int minValue = 2000;
+       int closestSequenceNo = 0;
+       string processedSeq = candidateSeq->convert2ints();             //      the candidate sequence needs to be a string of ints
+       for(int i=0;i<suffixForest.size();i++){                                 //      scan through the forest and see what the minimum
+               int count = suffixForest[i].countSuffixes(processedSeq, minValue);      //      return score is and keep track of the
+               if(count == minValue){                                                          //      template sequence index that corresponds to that score
+                       closestSequenceNo = i;
+               }
+       }
+       searchScore = 100 * (1. - minValue / (float)processedSeq.length());
+       return templateSequences[closestSequenceNo];                    //      return the Sequence object that has the minimum score
+       
+}
+
+/**************************************************************************************************/
diff --git a/suffixdb.hpp b/suffixdb.hpp
new file mode 100644 (file)
index 0000000..3d0b01d
--- /dev/null
@@ -0,0 +1,36 @@
+#ifndef SUFFIXDB_HPP
+#define SUFFIXDB_HPP
+
+/*
+ *  suffixdb.hpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/16/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This is a child class of the Database abstract datatype.  The class is basically a database of suffix trees and an
+ *     encapsulation of the method for finding the most similar tree to an inputted sequence.  the suffixForest objecct
+ *     is a vector of SuffixTrees, with each template sequence being represented by a different SuffixTree.  The class also
+ *     provides a method to take an unaligned sequence and find the closest sequence in the suffixForest.  The search
+ *     method is inspired by the article and Perl source code provided at http://www.ddj.com/web-development/184416093.  I 
+ *     would estimate that the time complexity is O(LN) for each search, which is slower than the kmer searching, but 
+ *     faster than blast
+ *
+ */
+
+#include "mothur.h"
+#include "database.hpp"
+
+class SuffixTree;
+
+class SuffixDB : public Database {
+       
+public:
+       SuffixDB(string);
+       Sequence* findClosestSequence(Sequence*);
+
+private:
+       vector<SuffixTree> suffixForest;
+};
+
+#endif
diff --git a/suffixnodes.cpp b/suffixnodes.cpp
new file mode 100644 (file)
index 0000000..79826ff
--- /dev/null
@@ -0,0 +1,96 @@
+/*
+ *  SuffixNodes.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/15/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     There are two types of nodes in a suffix tree as I have implemented it.  First, there are the internal nodes that
+ *     have children, these are the SuffixBranch objects.  There are also the terminal nodes, which are the suffixBranches.
+ *  I divided them into two groups to save on memory.  A SuffixTree object will be a vector of SuffixNodes; therefore,
+ *     the values of parentNode, children nodes, and suffix nodes are stored as ints that correspond to indices in the 
+ *     vector
+ *
+ */
+
+#include "suffixnodes.hpp"
+
+using namespace std;
+
+//********************************************************************************************************************
+
+inline char deCodeSequence(char code){
+       
+       if(code == '0')                 {       return 'a';     }       //      this method allows us to go from the int string to a char string;
+       else if(code == '1')    {       return 'c';     }       //      it's only really useful if we want to print out the tree
+       else if(code == '2')    {       return 'g';     }
+       else if(code == '3')    {       return 't';     }
+       else if(code == '4')    {       return 'n';     }
+       else                                    {       return '$';     }
+       
+}
+
+//********************************************************************************************************************
+
+SuffixNode::SuffixNode(int parent, int start, int end) : 
+               parentNode(parent),                     //      we store the parent node as an int
+               startCharPosition(start),       //      the suffix tree class will hold the sequence that the startCharPosition and 
+               endCharPosition(end)            //      endCharPosition indices correspond to
+               {       /*      do nothing      */                      }
+
+
+void SuffixNode::setChildren(char, int)                        {       /*      do nothing      */                      }       //      there's no children in a leaf
+int SuffixNode::getNumChildren()                               {       return 0;                                       }       //      ditto
+void SuffixNode::eraseChild(char)                              {       /*      do nothing      */                      }       //      ditto
+int SuffixNode::getChild(char)                                 {       return -1;                                      }       //      ditto
+void SuffixNode::setSuffixNode(int)                            {       /*      do nothing      */                      }       //      there's no suffix node in a leaf
+int SuffixNode::getSuffixNode()                                        {       return -1;                                      }       //      ditto
+int SuffixNode::getParentNode()                                        {       return parentNode;                      }
+void SuffixNode::setParentNode(int number)             {       parentNode = number;            }
+int SuffixNode::getStartCharPos()                              {       return startCharPosition;       }
+void SuffixNode::setStartCharPos(int start)            {       startCharPosition = start;      }
+int SuffixNode::getEndCharPos()                                        {       return endCharPosition;         }       
+
+//********************************************************************************************************************
+
+SuffixLeaf::SuffixLeaf(int parent, int start, int end) : SuffixNode(parent, start, end) {      /*      do nothing      */      }
+
+
+void SuffixLeaf::print(string sequence, int nodeNumber){
+       
+       cout << this << '\t' << parentNode << '\t' << nodeNumber << '\t' <<
+       -1 << '\t' << startCharPosition << '\t' << endCharPosition << '\t';
+       
+       cout << '\'';
+       for(int i=startCharPosition;i<=endCharPosition;i++){
+               cout << deCodeSequence(sequence[i]);
+       }
+       cout << '\'' << endl;
+}
+
+//********************************************************************************************************************
+
+SuffixBranch::SuffixBranch(int parent, int start, int end) : SuffixNode(parent, start, end), suffixNode(-1){
+               childNodes.assign(6, -1);
+}
+       
+void SuffixBranch::print(string sequence, int nodeNumber){                                             //      this method is different that than
+       cout << this << '\t' << parentNode << '\t' << nodeNumber << '\t' <<                     //      of a leaf because it prints out a
+       suffixNode << '\t' << startCharPosition << '\t' << endCharPosition << '\t';     //      value for the suffix node       
+       
+       cout << '\'';
+       for(int i=startCharPosition;i<=endCharPosition;i++){
+               cout << deCodeSequence(sequence[i]);
+       }
+       cout << '\'' << endl;
+}
+
+//     we can access the children by subtracting '0' from the the char value from the string, the difference is an int
+//     value and the index we need to access.
+void SuffixBranch::eraseChild(char base)       {       childNodes[base - '0'] = -1;    }       //to erase set the child index to -1
+void SuffixBranch::setChildren(char base, int nodeIndex){      childNodes[base - '0'] = nodeIndex;     }
+void SuffixBranch::setSuffixNode(int nodeIndex){       suffixNode = nodeIndex;         }
+int SuffixBranch::getSuffixNode()                      {       return suffixNode;                              }
+int SuffixBranch::getChild(char base)          {       return childNodes[base - '0'];  }
+       
+//********************************************************************************************************************
diff --git a/suffixnodes.hpp b/suffixnodes.hpp
new file mode 100644 (file)
index 0000000..863a3ae
--- /dev/null
@@ -0,0 +1,77 @@
+#ifndef SUFFIXNODES_H
+#define SUFFIXNODES_H
+
+/*
+ *  SuffixNodes.h
+ *  
+ *
+ *  Created by Pat Schloss on 12/15/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     There are two types of nodes in a suffix tree as I have implemented it.  First, there are the internal nodes that
+ *     have children, these are the SuffixBranch objects.  There are also the terminal nodes, which are the suffixBranches.
+ *  I divided them into two groups to save on memory.  A SuffixTree object will be a vector of SuffixNodes; therefore,
+ *     the values of parentNode, children nodes, and suffix nodes are stored as ints that correspond to indices in the 
+ *     vector
+ *
+ */
+
+#include "mothur.h"
+
+using namespace std;
+
+//********************************************************************************************************************
+
+class SuffixNode {
+       
+public:
+       SuffixNode(int, int, int);
+       virtual void print(string, int) = 0;
+       virtual void setChildren(char, int);
+       virtual int getNumChildren();
+       virtual void eraseChild(char);
+       virtual void setSuffixNode(int);
+       virtual int getSuffixNode();
+       virtual int getChild(char);
+       int getParentNode();
+       void setParentNode(int);
+       int getStartCharPos();
+       void setStartCharPos(int start);
+       int getEndCharPos();
+       
+protected:
+       int parentNode;
+       int startCharPosition;
+       int endCharPosition;
+};
+
+//********************************************************************************************************************
+
+class SuffixLeaf : public SuffixNode { //      most of the methods are already set in the parent class
+       
+public:
+       SuffixLeaf(int, int, int);              //      we just need to define a constructor and
+       void print(string, int);                //      print method
+};
+
+//********************************************************************************************************************
+
+class SuffixBranch : public SuffixNode {
+       
+public:
+       SuffixBranch(int, int, int);
+       void print(string, int);                //      need a special method for printing the node because there are children
+       void eraseChild(char);                  //      need a special method for erasing the children
+       void setChildren(char, int);    //      need a special method for setting children
+       void setSuffixNode(int);                //      need a special method for setting the suffix node
+       int getSuffixNode();                    //      need a special method for returning the suffix node
+       int getChild(char);                             //      need a special method for return children
+       
+private:
+       vector<int> childNodes;                 //      a suffix branch is unique because it has children and a suffixNode.  The 
+       int suffixNode;                                 //      are stored in a vector for super-fast lookup.  If the alphabet were bigger, this
+};                                                                     //      might not be practical.  Since we only have 5 possible letters, it makes sense
+
+//********************************************************************************************************************
+
+#endif
diff --git a/suffixtree.cpp b/suffixtree.cpp
new file mode 100644 (file)
index 0000000..56ce6ed
--- /dev/null
@@ -0,0 +1,247 @@
+/*
+ *  suffixtree.cpp
+ *  
+ *
+ *  Created by Pat Schloss on 12/15/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This is my half-assed attempt to implement a suffix tree.  This is a cobbled together algorithm using materials that
+ *     I found at http://marknelson.us/1996/08/01/suffix-trees/ and:
+ *
+ *             Ukkonen E. (1995). On-line construction of suffix trees. Algorithmica 14 (3): 249--260
+ *             Gusfield, Dan (1999). Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology. 
+ *                     USA: Cambridge University Press
+ *
+ *     The Ukkonen paper is the seminal paper describing the on-line method of constructing a suffix tree.
+ *
+ *     I have chosen to store the nodes of the tree as a vector of pointers to SuffixNode objects.  The root is stored at
+ *     nodeVector[0].  Each tree also stores the sequence name and the string that corresponds to the actual sequence. 
+ *     Finally, this class provides a way of counting the number of suffixes that are needed in one tree to generate a new
+ *     sequence (countSuffixes).  This method is used to determine similarity between sequences and was inspired by the
+ *     article and Perl source code provided at http://www.ddj.com/web-development/184416093.
+ *
+ */
+
+#include "sequence.hpp"
+#include "suffixnodes.hpp"
+#include "suffixtree.hpp"
+
+using namespace std;
+
+//********************************************************************************************************************
+
+inline bool compareParents(SuffixNode* left, SuffixNode* right){//     this is necessary to print the tree and to sort the
+       return (left->getParentNode() < right->getParentNode());        //      nodes in order of their parent
+}
+
+//********************************************************************************************************************
+
+SuffixTree::SuffixTree(){}
+
+//********************************************************************************************************************
+
+SuffixTree::~SuffixTree(){
+       for(int i=0;i<nodeVector.size();i++){   delete nodeVector[i];   }       
+}
+
+//********************************************************************************************************************
+
+void SuffixTree::loadSequence(Sequence* seq){
+       nodeCounter = 0;                                                        //      initially there are 0 nodes in the tree
+       activeStartPosition = 0;
+       activeEndPosition = -1;                                         
+       seqName = seq->getName();
+       sequence = seq->convert2ints();
+       sequence += '5';                                                        //      this essentially concatenates a '$' to the end of the sequence to
+       int seqLength = sequence.length();                      //      make it a cononical suffix tree
+       
+       nodeVector.push_back(new SuffixBranch(-1, 0, -1));      //      enter the root of the suffix tree
+       
+       activeNode = root = 0;
+       string hold;
+       for(int i=0;i<seqLength;i++){
+               addPrefix(i);                                                   //      step through the sequence adding each prefix
+       }
+}
+
+//********************************************************************************************************************
+
+string SuffixTree::getSeqName()        {
+       return seqName;         
+}
+
+//********************************************************************************************************************
+
+void SuffixTree::print(){
+       vector<SuffixNode*> hold = nodeVector;
+       sort(hold.begin(), hold.end(), compareParents);
+       cout << "Address\t\tParent\tNode\tSuffix\tStartC\tEndC\tSuffix" << endl;
+       for(int i=1;i<=nodeCounter;i++){
+               hold[i]->print(sequence, i);
+       }
+}
+
+//********************************************************************************************************************
+
+int SuffixTree::countSuffixes(string compareSequence, int& minValue){  //      here we count the number of suffix parts 
+                                                                                                                       //      we need to rewrite a user supplied sequence.  if the 
+       int numSuffixes = 0;                                                                    //      count exceeds the supplied minValue, bail out.  The
+       int seqLength = compareSequence.length();                               //      time complexity should be O(L)
+       int position = 0;
+       
+       int presentNode = 0;
+       
+       while(position < seqLength){            //      while the position in the query sequence isn't at the end...
+               
+               if(numSuffixes > minValue)      {       return 1000000;         }       //      bail if the count gets too high
+               
+               int newNode = nodeVector[presentNode]->getChild(compareSequence[position]);     //      see if the current node has a
+                                                                                                                               //      child that matches the next character in the query
+               if(newNode == -1){                                                                              
+                       if(presentNode == 0){   position++;             }                       //      if not, go back to the root and increase the count
+                       numSuffixes++;                                                                          //      by one.
+                       presentNode = 0;
+               }
+               else{                                                                                                   //      if there is, move to that node and see how far down
+                       presentNode = newNode;                                                          //      it we can get
+                       
+                       for(int i=nodeVector[newNode]->getStartCharPos(); i<=nodeVector[newNode]->getEndCharPos(); i++){
+                               if(compareSequence[position] == sequence[i]){
+                                       position++;                                                                     //      as long as the query and branch agree, keep going
+                               }
+                               else{
+                                       numSuffixes++;                                                          //      if there is a mismatch, increase the number of 
+                                       presentNode = 0;                                                        //      suffixes and go back to the root
+                                       break;
+                               }
+                       }
+               }
+               //      if we get all the way through the node we'll go to the top of the while loop and find the child node
+               //      that corresponds to what we are interested in           
+       }
+       numSuffixes--;                                                                                          //      the method puts an extra count on numSuffixes
+       
+       if(numSuffixes < minValue)      {       minValue = numSuffixes; }       //      if the count is less than the previous minValue,
+       return numSuffixes;                                                                                     //      change the value and return the number of suffixes
+       
+}
+
+//********************************************************************************************************************
+
+void SuffixTree::canonize(){   //      if you have to ask how this works, you don't really want to know and this really
+                                                               //      isn't the place to ask.
+       if ( isExplicit() == 0 ) {      //      if the node has no children...
+               
+               int tempNodeIndex = nodeVector[activeNode]->getChild(sequence[activeStartPosition]);
+               SuffixNode* tempNode = nodeVector[tempNodeIndex];
+               
+               int span = tempNode->getEndCharPos() - tempNode->getStartCharPos();
+               
+               while ( span <= ( activeEndPosition - activeStartPosition ) ) {
+                       
+            activeStartPosition = activeStartPosition + span + 1;
+                       
+                       activeNode = tempNodeIndex;
+                       
+            if ( activeStartPosition <= activeEndPosition ) {
+                               tempNodeIndex = nodeVector[tempNodeIndex]->getChild(sequence[activeStartPosition]);
+                               tempNode = nodeVector[tempNodeIndex];
+                               span = tempNode->getEndCharPos() - tempNode->getStartCharPos();
+            }
+                       
+        }
+    }
+}
+
+//********************************************************************************************************************
+
+int SuffixTree::split(int nodeIndex, int position){    //      leaves stay leaves, etc, to split a leaf we make a new interior 
+                                                                                                       //      node and reconnect everything
+       SuffixNode* node = nodeVector[nodeIndex];                                       //      get the node that needs to be split
+       SuffixNode* parentNode = nodeVector[node->getParentNode()];     //      get it's parent node
+       
+       parentNode->eraseChild(sequence[node->getStartCharPos()]);      //      erase the present node from the registry of its parent
+       
+       nodeCounter++;
+       SuffixNode* newNode = new SuffixBranch(node->getParentNode(), node->getStartCharPos(), node->getStartCharPos() + activeEndPosition - activeStartPosition);      //      create a new node that will link the parent with the old child
+       parentNode->setChildren(sequence[newNode->getStartCharPos()], nodeCounter);//   give the parent the new child
+       nodeVector.push_back(newNode);
+       
+       node->setParentNode(nodeCounter);       //      give the original node the new node as its parent
+       newNode->setChildren(sequence[node->getStartCharPos() + activeEndPosition - activeStartPosition + 1], nodeIndex);
+       //      put the original node in the registry of the new node's children
+       newNode->setSuffixNode(activeNode);//link the new node with the old active node
+       
+       //      recalculate the startCharPosition of the outermost node
+       node->setStartCharPos(node->getStartCharPos() + activeEndPosition - activeStartPosition + 1 );
+       
+       return node->getParentNode();
+}
+
+//********************************************************************************************************************
+
+void SuffixTree::makeSuffixLink(int& previous, int present){
+       
+//     here we link the nodes that are suffixes of one another to rapidly speed through the tree
+       if ( previous > 0 ) {   nodeVector[previous]->setSuffixNode(present);   }
+       else                            {       /*      do nothing                                                              */      }
+       
+    previous = present;
+}
+
+//********************************************************************************************************************
+
+void SuffixTree::addPrefix(int prefixPosition){
+       
+       int lastParentNode = -1;        //      we need to place a new prefix in the suffix tree
+       int parentNode = 0;
+       
+       while(1){
+               
+               parentNode = activeNode;
+               
+               if(isExplicit() == 1){  //      if the node is explicit (has kids), try to follow it down the branch if its there...
+                       if(nodeVector[activeNode]->getChild(sequence[prefixPosition]) != -1){   //      break out and get next prefix...
+                               break;                                                                                          
+                       }
+                       else{                           //      ...otherwise continue, we'll need to make a new node later on...
+                       }
+               }
+               else{                                   //      if it's not explicit (no kids), read through and see if all of the chars agree...
+                       int tempNode = nodeVector[activeNode]->getChild(sequence[activeStartPosition]);
+                       int span = activeEndPosition - activeStartPosition;
+                       
+                       if(sequence[nodeVector[tempNode]->getStartCharPos() + span + 1] == sequence[prefixPosition] ){
+                               break;                  //      if the existing suffix agrees with the new one, grab a new prefix...
+                       }
+                       else{
+                               parentNode = split(tempNode, prefixPosition);   //      ... otherwise we need to split the node
+                       }
+                       
+               }
+               
+               nodeCounter++;  //      we need to generate a new node here if the kid didn't exist, or we split a node
+               SuffixNode* newSuffixLeaf = new SuffixLeaf(parentNode, prefixPosition, sequence.length()-1);
+               nodeVector[parentNode]->setChildren(sequence[prefixPosition], nodeCounter);
+               nodeVector.push_back(newSuffixLeaf);
+               
+               makeSuffixLink( lastParentNode, parentNode );           //      make a suffix link for the parent node
+               
+               if(nodeVector[activeNode]->getParentNode() == -1){      //      move along the start position for the tree
+            activeStartPosition++;
+        } 
+               else {
+            activeNode = nodeVector[activeNode]->getSuffixNode();
+               }
+               canonize();                                                                                     //      frankly, i'm not entirely clear on what canonize does.
+       }
+       
+       makeSuffixLink( lastParentNode, parentNode );
+       activeEndPosition++;                                                                    //      move along the end position for the tree
+       
+       canonize();                                                                                             //      frankly, i'm not entirely clear on what canonize does.
+       
+}
+
+//********************************************************************************************************************
+
diff --git a/suffixtree.hpp b/suffixtree.hpp
new file mode 100644 (file)
index 0000000..387d4bd
--- /dev/null
@@ -0,0 +1,69 @@
+#ifndef SUFFIXTREE_H
+#define SUFFIXTREE_H
+
+/*
+ *  suffixtree.h
+ *  
+ *
+ *  Created by Pat Schloss on 12/15/08.
+ *  Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ *     This is my half-assed attempt to implement a suffix tree.  This is a cobbled together algorithm using materials that
+ *     I found at http://marknelson.us/1996/08/01/suffix-trees/ and:
+ *
+ *             Ukkonen E. (1995). On-line construction of suffix trees. Algorithmica 14 (3): 249--260
+ *             Gusfield, Dan (1999). Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology. 
+ *                     USA: Cambridge University Press
+ *
+ *     The Ukkonen paper is the seminal paper describing the on-line method of constructing a suffix tree.
+ *
+ *     I have chosen to store the nodes of the tree as a vector of pointers to SuffixNode objects.  The root is stored at
+ *     nodeVector[0].  Each tree also stores the sequence name and the string that corresponds to the actual sequence. 
+ *     Finally, this class provides a way of counting the number of suffixes that are needed in one tree to generate a new
+ *     sequence (countSuffixes).  This method is used to determine similarity between sequences and was inspired by the
+ *     article and Perl source code provided at http://www.ddj.com/web-development/184416093.
+ *
+ */
+
+#include "mothur.h"
+
+using namespace std;
+
+class SuffixNode;
+
+//********************************************************************************************************************
+
+class SuffixTree {
+       
+public:
+       SuffixTree();
+       ~SuffixTree();
+//     SuffixTree(string, string);
+
+       void loadSequence(Sequence*);
+       string getSeqName();
+       void print();   
+       int countSuffixes(string, int&);
+
+private:       
+       void addPrefix(int);
+       void canonize();
+       int split(int, int);
+       void makeSuffixLink(int&, int);
+       bool isExplicit(){      return activeStartPosition > activeEndPosition; }
+       
+       int activeStartPosition;
+       int activeEndPosition;
+       
+       vector<SuffixNode*> nodeVector;
+       int root;
+       int activeNode;
+       int nodeCounter;
+       string seqName;
+       string sequence;
+       
+};
+
+//********************************************************************************************************************
+
+#endif
index 5549e6396527c198a5131ba85c42ee356c07774f..a5f36519b7af379e3e5e761021f6b1c060e297c9 100644 (file)
@@ -43,6 +43,7 @@ ValidCommands::ValidCommands() {
                commands["concensus"]                   = "concensus";
                commands["help"]                                = "help"; 
                commands["filter.seqs"]                 = "filter.seqs";
+               commands["align.seqs"]                  = "align.seqs";
                commands["quit"]                                = "quit"; 
 
                                
index 8af4a8154e74fab87e3985222a0e0e8ac93a0bfd..e79a2db2663a48ceee335dba6c1a77973dd09bb6 100644 (file)
@@ -211,7 +211,6 @@ void ValidParameters::initCommandParameters() {
                //{"parameter1","parameter2",...,"last parameter"};
                
                string readdistArray[] = {"phylip","column", "name","cutoff","precision", "group"};
-
                commandParameters["read.dist"] = addParameters(readdistArray, sizeof(readdistArray)/sizeof(string));
 
                string readotuArray[] =  {"list","order","shared", "line", "label","group","sabund", "rabund"};
@@ -292,6 +291,9 @@ void ValidParameters::initCommandParameters() {
                string distanceArray[] =  {"fasta","phylip","clustal","nexus", "calc", "ends", "cutoff", "processors"};
                commandParameters["dist.seqs"] = addParameters(distanceArray, sizeof(distanceArray)/sizeof(string));
                
+               string AlignArray[] =  {"fasta","phylip","clustal","nexus", "template", "ksize", "align", "match", "mismatch", "gapopen", "gapextend"};
+               commandParameters["align.seqs"] = addParameters(AlignArray, sizeof(AlignArray)/sizeof(string));
+               
                string quitArray[] = {};
                commandParameters["quit"] = addParameters(quitArray, sizeof(quitArray)/sizeof(string));