From 526a868606faa50caf86e7399f7554c0335b39e5 Mon Sep 17 00:00:00 2001 From: westcott Date: Mon, 18 May 2009 17:37:53 +0000 Subject: [PATCH] added alignment code --- Mothur.xcodeproj/project.pbxproj | 102 ++++++++- aligncommand.cpp | 151 +++++++++++++ aligncommand.h | 35 +++ alignment.cpp | 163 ++++++++++++++ alignment.hpp | 53 +++++ alignmentcell.cpp | 18 ++ alignmentcell.hpp | 30 +++ blastalign.cpp | 156 +++++++++++++ blastalign.hpp | 34 +++ blastdb.cpp | 95 ++++++++ blastdb.hpp | 34 +++ commandfactory.cpp | 2 + database.cpp | 32 +-- database.hpp | 7 +- distancecommand.cpp | 9 +- distancedb.cpp | 65 ++++++ distancedb.hpp | 36 +++ errorchecking.cpp | 32 ++- errorchecking.h | 2 +- globaldata.cpp | 45 +++- globaldata.hpp | 12 +- gotohoverlap.cpp | 90 ++++++++ gotohoverlap.hpp | 43 ++++ kmer.cpp | 70 ++++-- kmer.hpp | 4 +- kmerdb.cpp | 102 +++++---- kmerdb.hpp | 9 + nast.cpp | 362 +++++++++++++++++++++++++++++++ nast.hpp | 49 +++++ nastreport.cpp | 92 ++++++++ nastreport.hpp | 51 +++++ needlemanoverlap.cpp | 92 ++++++++ needlemanoverlap.hpp | 43 ++++ noalign.cpp | 25 +++ noalign.hpp | 31 +++ overlap.cpp | 83 +++++++ overlap.hpp | 37 ++++ rarefact.cpp | 16 +- sequence.cpp | 45 ++++ sequence.hpp | 9 +- suffixdb.cpp | 56 +++++ suffixdb.hpp | 36 +++ suffixnodes.cpp | 96 ++++++++ suffixnodes.hpp | 77 +++++++ suffixtree.cpp | 247 +++++++++++++++++++++ suffixtree.hpp | 69 ++++++ validcommands.cpp | 1 + validparameter.cpp | 4 +- 48 files changed, 2841 insertions(+), 111 deletions(-) create mode 100644 aligncommand.cpp create mode 100644 aligncommand.h create mode 100644 alignment.cpp create mode 100644 alignment.hpp create mode 100644 alignmentcell.cpp create mode 100644 alignmentcell.hpp create mode 100644 blastalign.cpp create mode 100644 blastalign.hpp create mode 100644 blastdb.cpp create mode 100644 blastdb.hpp create mode 100644 distancedb.cpp create mode 100644 distancedb.hpp create mode 100644 gotohoverlap.cpp create mode 100644 gotohoverlap.hpp create mode 100644 nast.cpp create mode 100644 nast.hpp create mode 100644 nastreport.cpp create mode 100644 nastreport.hpp create mode 100644 needlemanoverlap.cpp create mode 100644 needlemanoverlap.hpp create mode 100644 noalign.cpp create mode 100644 noalign.hpp create mode 100644 overlap.cpp create mode 100644 overlap.hpp create mode 100644 suffixdb.cpp create mode 100644 suffixdb.hpp create mode 100644 suffixnodes.cpp create mode 100644 suffixnodes.hpp create mode 100644 suffixtree.cpp create mode 100644 suffixtree.hpp diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 7beb6cb..7bd433f 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -14,6 +14,20 @@ 372E12700F26365B0095CF7E /* readotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E126F0F26365B0095CF7E /* readotucommand.cpp */; }; 372E12960F263D5A0095CF7E /* readdistcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E12950F263D5A0095CF7E /* readdistcommand.cpp */; }; 372E12ED0F264D320095CF7E /* commandfactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E12EC0F264D320095CF7E /* commandfactory.cpp */; }; + 373C68A40FC1C07D00137ACD /* alignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68A00FC1C07D00137ACD /* alignment.cpp */; }; + 373C68A50FC1C07D00137ACD /* alignmentcell.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68A20FC1C07D00137ACD /* alignmentcell.cpp */; }; + 373C68BC0FC1C1A600137ACD /* gotohoverlap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */; }; + 373C68C50FC1C25F00137ACD /* overlap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68C30FC1C25F00137ACD /* overlap.cpp */; }; + 373C68CF0FC1C2D900137ACD /* needlemanoverlap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68CD0FC1C2D900137ACD /* needlemanoverlap.cpp */; }; + 373C68DD0FC1C38D00137ACD /* blastalign.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68DB0FC1C38D00137ACD /* blastalign.cpp */; }; + 373C68E50FC1C4A500137ACD /* noalign.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68E30FC1C4A500137ACD /* noalign.cpp */; }; + 373C68F90FC1C6A800137ACD /* suffixdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68F30FC1C6A800137ACD /* suffixdb.cpp */; }; + 373C68FA0FC1C6A800137ACD /* suffixnodes.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68F50FC1C6A800137ACD /* suffixnodes.cpp */; }; + 373C68FB0FC1C6A800137ACD /* suffixtree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68F70FC1C6A800137ACD /* suffixtree.cpp */; }; + 373C69180FC1C8AF00137ACD /* blastdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C69160FC1C8AF00137ACD /* blastdb.cpp */; }; + 373C691F0FC1C98600137ACD /* nast.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C691D0FC1C98600137ACD /* nast.cpp */; }; + 373C692B0FC1C9EB00137ACD /* nastreport.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C69290FC1C9EB00137ACD /* nastreport.cpp */; }; + 373C69340FC1CA9E00137ACD /* distancedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C69320FC1CA9E00137ACD /* distancedb.cpp */; }; 374610780F40645300460C57 /* unifracweightedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374610770F40645300460C57 /* unifracweightedcommand.cpp */; }; 3746107E0F4064D100460C57 /* weighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3746107D0F4064D100460C57 /* weighted.cpp */; }; 374610830F40652400460C57 /* unweighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374610820F40652400460C57 /* unweighted.cpp */; }; @@ -45,6 +59,7 @@ 378C1B0A0FB0644E004D63F5 /* sequencedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378C1AFD0FB0644D004D63F5 /* sequencedb.cpp */; }; 378C1B0B0FB0644E004D63F5 /* sharedjackknife.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378C1AFF0FB0644D004D63F5 /* sharedjackknife.cpp */; }; 378C1B0C0FB0644E004D63F5 /* sharedmarczewski.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378C1B010FB0644D004D63F5 /* sharedmarczewski.cpp */; }; + 378DC5CF0FBDE1C8003B8607 /* aligncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378DC5CE0FBDE1C8003B8607 /* aligncommand.cpp */; }; 379293C30F2DE73400B9034A /* treemap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379293C20F2DE73400B9034A /* treemap.cpp */; }; 379294700F2E191800B9034A /* parsimonycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3792946F0F2E191800B9034A /* parsimonycommand.cpp */; }; 3792948A0F2E258500B9034A /* parsimony.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379294890F2E258500B9034A /* parsimony.cpp */; }; @@ -168,6 +183,34 @@ 372E12940F263D5A0095CF7E /* readdistcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readdistcommand.h; sourceTree = ""; }; 372E12950F263D5A0095CF7E /* readdistcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readdistcommand.cpp; sourceTree = ""; }; 372E12EC0F264D320095CF7E /* commandfactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = commandfactory.cpp; sourceTree = ""; }; + 373C68A00FC1C07D00137ACD /* alignment.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignment.cpp; sourceTree = ""; }; + 373C68A10FC1C07D00137ACD /* alignment.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = alignment.hpp; sourceTree = ""; }; + 373C68A20FC1C07D00137ACD /* alignmentcell.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignmentcell.cpp; sourceTree = ""; }; + 373C68A30FC1C07D00137ACD /* alignmentcell.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = alignmentcell.hpp; sourceTree = ""; }; + 373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = gotohoverlap.cpp; sourceTree = ""; }; + 373C68BB0FC1C1A600137ACD /* gotohoverlap.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = gotohoverlap.hpp; sourceTree = ""; }; + 373C68C30FC1C25F00137ACD /* overlap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = overlap.cpp; sourceTree = ""; }; + 373C68C40FC1C25F00137ACD /* overlap.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = overlap.hpp; sourceTree = ""; }; + 373C68CD0FC1C2D900137ACD /* needlemanoverlap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = needlemanoverlap.cpp; sourceTree = ""; }; + 373C68CE0FC1C2D900137ACD /* needlemanoverlap.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = needlemanoverlap.hpp; sourceTree = ""; }; + 373C68DB0FC1C38D00137ACD /* blastalign.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = blastalign.cpp; sourceTree = ""; }; + 373C68DC0FC1C38D00137ACD /* blastalign.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = blastalign.hpp; sourceTree = ""; }; + 373C68E30FC1C4A500137ACD /* noalign.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = noalign.cpp; sourceTree = ""; }; + 373C68E40FC1C4A500137ACD /* noalign.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = noalign.hpp; sourceTree = ""; }; + 373C68F30FC1C6A800137ACD /* suffixdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixdb.cpp; sourceTree = ""; }; + 373C68F40FC1C6A800137ACD /* suffixdb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixdb.hpp; sourceTree = ""; }; + 373C68F50FC1C6A800137ACD /* suffixnodes.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixnodes.cpp; sourceTree = ""; }; + 373C68F60FC1C6A800137ACD /* suffixnodes.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixnodes.hpp; sourceTree = ""; }; + 373C68F70FC1C6A800137ACD /* suffixtree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixtree.cpp; sourceTree = ""; }; + 373C68F80FC1C6A800137ACD /* suffixtree.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixtree.hpp; sourceTree = ""; }; + 373C69160FC1C8AF00137ACD /* blastdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = blastdb.cpp; sourceTree = ""; }; + 373C69170FC1C8AF00137ACD /* blastdb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = blastdb.hpp; sourceTree = ""; }; + 373C691D0FC1C98600137ACD /* nast.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = nast.cpp; sourceTree = ""; }; + 373C691E0FC1C98600137ACD /* nast.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = nast.hpp; sourceTree = ""; }; + 373C69290FC1C9EB00137ACD /* nastreport.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = nastreport.cpp; sourceTree = ""; }; + 373C692A0FC1C9EB00137ACD /* nastreport.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = nastreport.hpp; sourceTree = ""; }; + 373C69320FC1CA9E00137ACD /* distancedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = distancedb.cpp; sourceTree = ""; }; + 373C69330FC1CA9E00137ACD /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = ""; }; 374610760F40645300460C57 /* unifracweightedcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = unifracweightedcommand.h; sourceTree = ""; }; 374610770F40645300460C57 /* unifracweightedcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = unifracweightedcommand.cpp; sourceTree = ""; }; 3746107C0F4064D100460C57 /* weighted.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = weighted.h; sourceTree = ""; }; @@ -233,6 +276,8 @@ 378C1B000FB0644D004D63F5 /* sharedjackknife.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedjackknife.h; sourceTree = ""; }; 378C1B010FB0644D004D63F5 /* sharedmarczewski.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedmarczewski.cpp; sourceTree = ""; }; 378C1B020FB0644D004D63F5 /* sharedmarczewski.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedmarczewski.h; sourceTree = ""; }; + 378DC5CD0FBDE1C8003B8607 /* aligncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligncommand.h; sourceTree = ""; }; + 378DC5CE0FBDE1C8003B8607 /* aligncommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligncommand.cpp; sourceTree = ""; }; 379293C10F2DE73400B9034A /* treemap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treemap.h; sourceTree = ""; }; 379293C20F2DE73400B9034A /* treemap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = treemap.cpp; sourceTree = ""; }; 3792946E0F2E191800B9034A /* parsimonycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsimonycommand.h; sourceTree = ""; }; @@ -465,6 +510,8 @@ isa = PBXGroup; children = ( 37D927BA0F21331F001D4494 /* averagelinkage.cpp */, + 373C68DC0FC1C38D00137ACD /* blastalign.hpp */, + 373C68DB0FC1C38D00137ACD /* blastalign.cpp */, 37D928A60F2133C0001D4494 /* calculators */, 37D927C20F21331F001D4494 /* cluster.hpp */, 37D927C10F21331F001D4494 /* cluster.cpp */, @@ -479,8 +526,6 @@ 37D927D10F21331F001D4494 /* commandoptionparser.hpp */, 37D927D00F21331F001D4494 /* commandoptionparser.cpp */, 37D927D20F21331F001D4494 /* completelinkage.cpp */, - 37D927D40F21331F001D4494 /* database.hpp */, - 37D927D30F21331F001D4494 /* database.cpp */, 7E4130F70F8E58FA00381DD0 /* dlibshuff.h */, 7E4130F60F8E58FA00381DD0 /* dlibshuff.cpp */, 37D927D60F21331F001D4494 /* display.h */, @@ -491,21 +536,29 @@ 37D927DD0F21331F001D4494 /* fileoutput.cpp */, 37D927E00F21331F001D4494 /* globaldata.hpp */, 37D927DF0F21331F001D4494 /* globaldata.cpp */, + 373C68BB0FC1C1A600137ACD /* gotohoverlap.hpp */, + 373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */, 375873ED0F7D646F0040F377 /* heatmap.h */, 375873EE0F7D646F0040F377 /* heatmap.cpp */, 37D927E60F21331F001D4494 /* inputdata.h */, 37D927E50F21331F001D4494 /* inputdata.cpp */, - 37D927EA0F21331F001D4494 /* kmer.hpp */, - 37D927E90F21331F001D4494 /* kmer.cpp */, - 37D927EC0F21331F001D4494 /* kmerdb.hpp */, - 37D927EB0F21331F001D4494 /* kmerdb.cpp */, 7E412F420F8D213C00381DD0 /* libshuff.h */, 7E412FE90F8D3E2C00381DD0 /* libshuff.cpp */, 37E3ED800F4D9D0D00B5DF02 /* mothur.h */, 37D927EF0F21331F001D4494 /* mothur.cpp */, 37D927F10F21331F001D4494 /* nameassignment.hpp */, 37D927F00F21331F001D4494 /* nameassignment.cpp */, + 373C691E0FC1C98600137ACD /* nast.hpp */, + 373C691D0FC1C98600137ACD /* nast.cpp */, + 373C692A0FC1C9EB00137ACD /* nastreport.hpp */, + 373C69290FC1C9EB00137ACD /* nastreport.cpp */, + 373C68CE0FC1C2D900137ACD /* needlemanoverlap.hpp */, + 373C68CD0FC1C2D900137ACD /* needlemanoverlap.cpp */, + 373C68E40FC1C4A500137ACD /* noalign.hpp */, + 373C68E30FC1C4A500137ACD /* noalign.cpp */, 37D927F60F21331F001D4494 /* observable.h */, + 373C68C40FC1C25F00137ACD /* overlap.hpp */, + 373C68C30FC1C25F00137ACD /* overlap.cpp */, 37D927FC0F21331F001D4494 /* progress.hpp */, 37D927FB0F21331F001D4494 /* progress.cpp */, 37D928040F21331F001D4494 /* raredisplay.h */, @@ -672,6 +725,8 @@ isa = PBXGroup; children = ( 37D927CD0F21331F001D4494 /* command.hpp */, + 378DC5CD0FBDE1C8003B8607 /* aligncommand.h */, + 378DC5CE0FBDE1C8003B8607 /* aligncommand.cpp */, 37C1D9710F86506E0059E3F0 /* binsequencecommand.h */, 37C1D9720F86506E0059E3F0 /* binsequencecommand.cpp */, 21DDC0190F97A8FE0060691C /* bootstrapsharedcommand.h */, @@ -743,13 +798,27 @@ 37D928AC0F213420001D4494 /* containers */ = { isa = PBXGroup; children = ( + 373C68A10FC1C07D00137ACD /* alignment.hpp */, + 373C68A00FC1C07D00137ACD /* alignment.cpp */, + 373C68A30FC1C07D00137ACD /* alignmentcell.hpp */, + 373C68A20FC1C07D00137ACD /* alignmentcell.cpp */, + 373C69170FC1C8AF00137ACD /* blastdb.hpp */, + 373C69160FC1C8AF00137ACD /* blastdb.cpp */, + 37D927D40F21331F001D4494 /* database.hpp */, + 37D927D30F21331F001D4494 /* database.cpp */, 37D927D50F21331F001D4494 /* datavector.hpp */, + 373C69330FC1CA9E00137ACD /* distancedb.hpp */, + 373C69320FC1CA9E00137ACD /* distancedb.cpp */, 37D927DC0F21331F001D4494 /* fastamap.h */, 37D927DB0F21331F001D4494 /* fastamap.cpp */, 375873EA0F7D64520040F377 /* fullmatrix.h */, 375873EB0F7D64520040F377 /* fullmatrix.cpp */, 37D927E20F21331F001D4494 /* groupmap.h */, 37D927E10F21331F001D4494 /* groupmap.cpp */, + 37D927EA0F21331F001D4494 /* kmer.hpp */, + 37D927E90F21331F001D4494 /* kmer.cpp */, + 37D927EC0F21331F001D4494 /* kmerdb.hpp */, + 37D927EB0F21331F001D4494 /* kmerdb.cpp */, 37D927EE0F21331F001D4494 /* listvector.hpp */, 37D927ED0F21331F001D4494 /* listvector.cpp */, 37D927F80F21331F001D4494 /* ordervector.hpp */, @@ -772,6 +841,12 @@ 37D928320F21331F001D4494 /* sharedsabundvector.cpp */, 37D928450F21331F001D4494 /* sparsematrix.hpp */, 37D928440F21331F001D4494 /* sparsematrix.cpp */, + 373C68F40FC1C6A800137ACD /* suffixdb.hpp */, + 373C68F30FC1C6A800137ACD /* suffixdb.cpp */, + 373C68F60FC1C6A800137ACD /* suffixnodes.hpp */, + 373C68F50FC1C6A800137ACD /* suffixnodes.cpp */, + 373C68F80FC1C6A800137ACD /* suffixtree.hpp */, + 373C68F70FC1C6A800137ACD /* suffixtree.cpp */, 37AD4DB90F28E2FE00AA2D49 /* tree.h */, 37AD4DBA0F28E2FE00AA2D49 /* tree.cpp */, 379293C10F2DE73400B9034A /* treemap.h */, @@ -979,6 +1054,21 @@ 378C1B0C0FB0644E004D63F5 /* sharedmarczewski.cpp in Sources */, 37C753CE0FB3415200DBD02E /* distancecommand.cpp in Sources */, 379643ED0FB9B5A80081FDB6 /* readseqs.cpp in Sources */, + 378DC5CF0FBDE1C8003B8607 /* aligncommand.cpp in Sources */, + 373C68A40FC1C07D00137ACD /* alignment.cpp in Sources */, + 373C68A50FC1C07D00137ACD /* alignmentcell.cpp in Sources */, + 373C68BC0FC1C1A600137ACD /* gotohoverlap.cpp in Sources */, + 373C68C50FC1C25F00137ACD /* overlap.cpp in Sources */, + 373C68CF0FC1C2D900137ACD /* needlemanoverlap.cpp in Sources */, + 373C68DD0FC1C38D00137ACD /* blastalign.cpp in Sources */, + 373C68E50FC1C4A500137ACD /* noalign.cpp in Sources */, + 373C68F90FC1C6A800137ACD /* suffixdb.cpp in Sources */, + 373C68FA0FC1C6A800137ACD /* suffixnodes.cpp in Sources */, + 373C68FB0FC1C6A800137ACD /* suffixtree.cpp in Sources */, + 373C69180FC1C8AF00137ACD /* blastdb.cpp in Sources */, + 373C691F0FC1C98600137ACD /* nast.cpp in Sources */, + 373C692B0FC1C9EB00137ACD /* nastreport.cpp in Sources */, + 373C69340FC1CA9E00137ACD /* distancedb.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/aligncommand.cpp b/aligncommand.cpp new file mode 100644 index 0000000..81f8e6a --- /dev/null +++ b/aligncommand.cpp @@ -0,0 +1,151 @@ +/* + * aligncommand.cpp + * Mothur + * + * Created by Sarah Westcott on 5/15/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + * This version of nast does everything I think that the greengenes nast server does and then some. I have added the + * feature of allowing users to define their database, kmer size for searching, alignment penalty values and alignment + * method. This latter feature is perhaps most significant. nastPlus enables a user to use either a Needleman-Wunsch + * (non-affine gap penalty) or Gotoh (affine gap penalty) pairwise alignment algorithm. This is significant because it + * allows for a global alignment and not the local alignment provided by bLAst. Furthermore, it has the potential to + * provide a better alignment because of the banding method employed by blast (I'm not sure about this). + * + * to compile type: + * make + * + * for basic instructions on how to run nastPlus type: + * ./nastPlus + */ + +#include "aligncommand.h" +#include "sequence.hpp" + +#include "alignment.hpp" +#include "gotohoverlap.hpp" +#include "needlemanoverlap.hpp" +#include "blastalign.hpp" +#include "noalign.hpp" + +#include "database.hpp" +#include "kmerdb.hpp" +#include "suffixdb.hpp" +#include "blastdb.hpp" +#include "distancedb.hpp" + +#include "nast.hpp" +#include "nastreport.hpp" + + +//********************************************************************************************************************** +AlignCommand::AlignCommand(){ + try { + globaldata = GlobalData::getInstance(); + candidateFileName = globaldata->inputFileName; + templateFileName = globaldata->getTemplateFile(); + openInputFile(candidateFileName, in); + convert(globaldata->getKSize(), kmerSize); + convert(globaldata->getMatch(), match); + convert(globaldata->getMismatch(), misMatch); + convert(globaldata->getGapopen(), gapOpen); + convert(globaldata->getGapextend(), gapExtend); + distanceFileName = "????"; + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the AlignCommand class Function AlignCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the AlignCommand class function AlignCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} + +//********************************************************************************************************************** + +AlignCommand::~AlignCommand(){ + +} + +//********************************************************************************************************************** + +int AlignCommand::execute(){ + try { + srand( (unsigned)time( NULL ) ); //needed to assign names to temporary files + + Database* templateDB; + if(globaldata->getSearch() == "kmer") { templateDB = new KmerDB(templateFileName, kmerSize); } + else if(globaldata->getSearch() == "suffix") { templateDB = new SuffixDB(templateFileName); } + else if(globaldata->getSearch() == "blast") { templateDB = new BlastDB(templateFileName, gapOpen, gapExtend, match, misMatch); } + else if(globaldata->getSearch() == "distance") { templateDB = new DistanceDB(templateFileName, distanceFileName); } + else { cout << globaldata->getSearch() << " is not a valid search option. I will run the command using suffix." << endl; + templateDB = new SuffixDB(templateFileName); } + + Alignment* alignment; + if(globaldata->getAlign() == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, 3000); } + else if(globaldata->getAlign() == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000); } + else if(globaldata->getAlign() == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } + else if(globaldata->getAlign() == "noalign") { alignment = new NoAlign(); } + else { cout << globaldata->getAlign() << " is not a valid alignment option. I will run the command using blast." << endl; + alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } + + int numFastaSeqs=count(istreambuf_iterator(in),istreambuf_iterator(), '>'); + in.seekg(0); + + string candidateAligngmentFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + globaldata->getSearch() + '.' + globaldata->getAlign() + ".nast.align"; + ofstream candidateAlignmentFile; + openOutputFile(candidateAligngmentFName, candidateAlignmentFile); + + string candidateReportFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + globaldata->getSearch() + '.' + globaldata->getAlign() + ".nast.report"; + NastReport report(candidateReportFName); + + cout << "We are going to align the " << numFastaSeqs << " sequences in " << candidateFileName << "..." << endl; + cout.flush(); + + int start = time(NULL); + + for(int i=0;ifindClosestSequence(candidateSeq); + report.setTemplate(templateSeq); + report.setSearchParameters(globaldata->getSearch(), templateDB->getSearchScore()); + + Nast nast(alignment, candidateSeq, templateSeq); + report.setAlignmentParameters(globaldata->getAlign(), alignment); + report.setNastParameters(nast); + + candidateAlignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl; + candidateAlignmentFile.flush(); + + if((i+1) % 100 == 0){ + cout << "It has taken " << time(NULL) - start << " secs to align " << i+1 << " sequences" << endl; + } + report.print(); + + delete candidateSeq; + } + + cout << "It took " << time(NULL) - start << " secs to align " << numFastaSeqs << " sequences" << endl; + cout << endl; + + delete templateDB; + delete alignment; + + + return 0; + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the AlignCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the AlignCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} + +//********************************************************************************************************************** \ No newline at end of file diff --git a/aligncommand.h b/aligncommand.h new file mode 100644 index 0000000..cf30a6b --- /dev/null +++ b/aligncommand.h @@ -0,0 +1,35 @@ +#ifndef ALIGNCOMMAND_H +#define ALIGNCOMMAND_H + +/* + * aligncommand.h + * Mothur + * + * Created by Sarah Westcott on 5/15/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + */ + +#include "command.hpp" +#include "globaldata.hpp" + +class AlignCommand : public Command { + + public: + AlignCommand(); + ~AlignCommand(); + int execute(); + + private: + GlobalData* globaldata; + string candidateFileName, templateFileName, distanceFileName; + int kmerSize; + float match, misMatch, gapOpen, gapExtend; + ofstream out; + ifstream in; + +}; + + + +#endif \ No newline at end of file diff --git a/alignment.cpp b/alignment.cpp new file mode 100644 index 0000000..6db3b3d --- /dev/null +++ b/alignment.cpp @@ -0,0 +1,163 @@ +/* + * alignment.cpp + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This is a class for an abstract datatype for classes that implement various types of alignment algorithms. + * As of 12/18/08 these included alignments based on blastn, needleman-wunsch, and the Gotoh algorithms + * + */ + +using namespace std; + +#include +#include + +#include "alignmentcell.hpp" +#include "alignment.hpp" + +#include + +/**************************************************************************************************/ + +Alignment::Alignment() { /* do nothing */ } + +/**************************************************************************************************/ + +Alignment::Alignment(int A) : nCols(A), nRows(A) { + + alignment.resize(nRows); // For the Gotoh and Needleman-Wunsch we initialize the dynamic programming + for(int i=0;i=0;i--){ + if(seqAaln[i] != '-' && seqBaln[i] == '-') { seqAend++; } + else if(seqAaln[i] == '-' && seqBaln[i] != '-') { seqBend++; } + else { break; } + } + pairwiseLength -= (seqAend + seqBend); + + seqAend = seqA.length() - seqAend - 1; + seqBend = seqB.length() - seqBend - 1; + +} + +/**************************************************************************************************/ + +string Alignment::getSeqAAln(){ + return seqAaln; // this is called to get the alignment of seqA +} + +/**************************************************************************************************/ + +string Alignment::getSeqBAln(){ + return seqBaln; // this is called to get the alignment of seqB +} + +/**************************************************************************************************/ + +int Alignment::getCandidateStartPos(){ + return seqAstart; // this is called to report the quality of the alignment +} + +/**************************************************************************************************/ + +int Alignment::getCandidateEndPos(){ + return seqAend; // this is called to report the quality of the alignment +} + +/**************************************************************************************************/ + +int Alignment::getTemplateStartPos(){ + return seqBstart; // this is called to report the quality of the alignment +} + +/**************************************************************************************************/ + +int Alignment::getTemplateEndPos(){ + return seqBend; // this is called to report the quality of the alignment +} + +/**************************************************************************************************/ + +int Alignment::getPairwiseLength(){ + return pairwiseLength; // this is the pairwise alignment length +} + +/**************************************************************************************************/ + +//int Alignment::getLongestTemplateGap(){ +// +// int length = seqBaln.length(); +// int longGap = 0; +// int gapLength = 0; +// +// int start = seqAstart; +// if(seqAstart < seqBstart){ start = seqBstart; } +// for(int i=seqAstart;i 0){ +// if(gapLength > longGap){ longGap = gapLength; } +// } +// gapLength = 0; +// } +// } +// return longGap; +//} + +/**************************************************************************************************/ diff --git a/alignment.hpp b/alignment.hpp new file mode 100644 index 0000000..3367812 --- /dev/null +++ b/alignment.hpp @@ -0,0 +1,53 @@ +#ifndef DPALIGNMENT_H +#define DPALIGNMENT_H + +/* + * dpalignment.h + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This is a class for an abstract datatype for classes that implement various types of alignment algorithms. + * As of 12/18/08 these included alignments based on blastn, needleman-wunsch, and the Gotoh algorithms + * + */ +using namespace std; + +#include "mothur.h" +#include "alignmentcell.hpp" + +/**************************************************************************************************/ + +class Alignment { + +public: + Alignment(int); + Alignment(); + virtual void align(string, string) = 0; + + float getAlignmentScore(); + string getSeqAAln(); + string getSeqBAln(); + int getCandidateStartPos(); + int getCandidateEndPos(); + int getTemplateStartPos(); + int getTemplateEndPos(); + + int getPairwiseLength(); +// int getLongestTemplateGap(); + +protected: + void traceBack(); + string seqA, seqAaln; + string seqB, seqBaln; + int seqAstart, seqAend; + int seqBstart, seqBend; + int pairwiseLength; + int nRows, nCols, lA, lB; + vector > alignment; +}; + +/**************************************************************************************************/ + +#endif diff --git a/alignmentcell.cpp b/alignmentcell.cpp new file mode 100644 index 0000000..544c218 --- /dev/null +++ b/alignmentcell.cpp @@ -0,0 +1,18 @@ +/* + * alignmentcell.cpp + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This class is pretty basic. Each AlignmentCell object contains a pointer to the previous cell and different values + * used to calcualte the alignment. Initially everything is set to zero and all pointers are set to 'x' + * + */ + +#include "alignmentcell.hpp" + +//******************************************************************************************************************** + +AlignmentCell::AlignmentCell() : prevCell('x'), cValue(0.0000), dValue(0.0000), iValue(0.0000) {}; + +//******************************************************************************************************************** diff --git a/alignmentcell.hpp b/alignmentcell.hpp new file mode 100644 index 0000000..c309048 --- /dev/null +++ b/alignmentcell.hpp @@ -0,0 +1,30 @@ +#ifndef ALIGNMENTCELL_H +#define ALIGNMENTCELL_H + +/* + * alignmentcell.hpp + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This class is pretty basic. Each AlignmentCell object contains a pointer to the previous cell and different values + * used to calcualte the alignment. Initially everything is set to zero and all pointers are set to 'x' + * + */ + +//******************************************************************************************************************** + +class AlignmentCell { + +public: + AlignmentCell(); + char prevCell; + float cValue; + float dValue; + float iValue; +}; + +//******************************************************************************************************************** + +#endif diff --git a/blastalign.cpp b/blastalign.cpp new file mode 100644 index 0000000..0a1350f --- /dev/null +++ b/blastalign.cpp @@ -0,0 +1,156 @@ +/* + * blastalign.cpp + * + * + * Created by Pat Schloss on 12/16/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This is a basic alignment method that gets the blast program to do the heavy lifting. In the future, we should + * probably incorporate NCBI's library so that we don't have to call on a user-supplied executable. This is a child + * of the Alignment class, which requires a constructor and align method. + * + */ + +using namespace std; + +#include "alignment.hpp" +#include "blastalign.hpp" + + +//**************************************************************************************************/ + +BlastAlignment::BlastAlignment(float go, float ge, float m, float mm) : + match(m), // This is the score to award for two nucleotides matching (match >= 0) + mismatch(mm) // This is the penalty to assess for a mismatch (mismatch <= 0) +{ + gapOpen = abs(go); // This is the penalty to assess for opening a gap (gapOpen >= 0) + gapExtend = abs(ge); // This is the penalty to assess for extending a gap (gapExtend >= 0) + + int randNumber = rand(); + candidateFileName = toString(randNumber) + ".candidate"; + templateFileName = toString(randNumber) + ".template"; + blastFileName = toString(randNumber) + ".pairwise"; +} + +//**************************************************************************************************/ + +BlastAlignment::~BlastAlignment(){ // The desctructor should clean up by removing the temporary + remove(candidateFileName.c_str()); // files used to run bl2seq + remove(templateFileName.c_str()); + remove(blastFileName.c_str()); +} + +//**************************************************************************************************/ + +void BlastAlignment::align(string seqA, string seqB){ //Use blastn to align the two sequences + + ofstream candidateFile(candidateFileName.c_str()); // Write the sequence to be aligned to a temporary candidate seq file + candidateFile << ">candidate" << endl << seqA << endl; + candidateFile.close(); + + ofstream templateFile(templateFileName.c_str()); // Write the unaligned template sequence to a temporary candidate seq file + templateFile << ">template" << endl << seqB << endl; + templateFile.close(); + + // The blastCommand assumes that we have DNA sequences (blastn) and that they are fairly similar (-e 0.001) and + // that we don't want to apply any kind of complexity filtering (-F F) + string blastCommand = "~/Pipeline/src/cpp/production/blast/bin/bl2seq -p blastn -i " + candidateFileName + " -j " + templateFileName + " -e 0.0001 -F F -o " + blastFileName + " -W 11"; + blastCommand += " -r " + toString(match) + " -q " + toString(mismatch); + blastCommand += " -G " + toString(gapOpen) + " -E " + toString(gapExtend); + + system(blastCommand.c_str()); // Here we assume that "bl2seq" is in the users path or in the same folder as + // this executable + setPairwiseSeqs(); +} + +/**************************************************************************************************/ + +void BlastAlignment::setPairwiseSeqs(){ // This method call assigns the blast generated alignment + // to the pairwise entry in the Sequence class for the + // candidate and template Sequence objects + ifstream blastFile; + openInputFile(blastFileName, blastFile); + + seqAaln = ""; + seqBaln = ""; + + int candidateLength, templateLength; + char d; + + string candidateName, templateName; + + while(d=blastFile.get() != '='){}; + blastFile >> candidateName; // Get the candidate sequence name from flatfile + + while(d=blastFile.get() != '('){}; + blastFile >> candidateLength; // Get the candidate sequence length from flatfile + + while(d=blastFile.get()){ + if(d == '>'){ + blastFile >> templateName; // Get the template sequence name from flatfile + break; + } + else if(d == '*'){ // We go here if there is no significant match + + seqAstart = 0; + seqBstart = 0; + seqAend = 0; + seqBend = 0; + pairwiseLength = 0; + +// string dummy; +// while(dummy != "query:"){ cout << dummy << endl;blastFile >> dummy; } +// blastFile >> seqBend; +// cout << seqBend << endl; +// for(int i=0;i> templateLength; // Get the template sequence length from flatfile + + while(d=blastFile.get() != 'Q'){}; // Suck up everything else until we get to the start of the alignment + int queryStart, sbjctStart, queryEnd, sbjctEnd; + string queryLabel, sbjctLabel, query, sbjct; + + blastFile >> queryLabel; queryLabel = 'Q' + queryLabel; + + + while(queryLabel == "Query:"){ + blastFile >> queryStart >> query >> queryEnd; + + while(d=blastFile.get() != 'S'){}; + + blastFile >> sbjctLabel >> sbjctStart >> sbjct >> sbjctEnd; + + if(seqAaln == ""){ + seqAstart = queryStart; + seqBstart = sbjctStart; + } + + seqAaln += query; // concatenate each line of the sequence to what we already have + seqBaln += sbjct; // for the query and template (subject) sequence + + blastFile >> queryLabel; + } + seqAend = queryEnd; + seqBend = sbjctEnd; + pairwiseLength = seqAaln.length(); + + for(int i=1;i' << i << endl; // sequences, which will be input to formatdb + unalignedFastaFile << templateSequences[i]->getUnaligned() << endl; + } + unalignedFastaFile.close(); + + string formatdbCommand = "~/Pipeline/src/cpp/production/blast/bin/formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability + system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F + // option tells formatdb that seqs are DNA, not prot + cout << "DONE." << endl << endl; cout.flush(); + emptySequence = new Sequence(); + emptySequence->setName("no_match"); + emptySequence->setUnaligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + emptySequence->setAligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + + +} + +/**************************************************************************************************/ + +BlastDB::~BlastDB(){ + remove(queryFileName.c_str()); // let's clean stuff up and remove the temp files + remove(dbFileName.c_str()); // let's clean stuff up and remove the temp files + remove(blastFileName.c_str()); // let's clean stuff up and remove the temp files +} + +/**************************************************************************************************/ + +Sequence* BlastDB::findClosestSequence(Sequence* candidate){ + + ofstream queryFile; + openOutputFile(queryFileName, queryFile); + queryFile << '>' << candidate->getName() << endl; + queryFile << candidate->getUnaligned() << endl; + queryFile.close(); + + +// the goal here is to quickly survey the database to find the closest match. To do this we are using the default +// wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too +// long. With this setting, it seems comparable in speed to the suffix tree approach. + + string blastCommand = "~/Pipeline/src/cpp/production/blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28"; + blastCommand += (" -i " + queryFileName + " -o " + blastFileName); + system(blastCommand.c_str()); + + ifstream m8FileHandle; + openInputFile(blastFileName, m8FileHandle); + + string dummy; + int templateAccession; + gobble(m8FileHandle); + if(!m8FileHandle.eof()){ + m8FileHandle >> dummy >> templateAccession >> searchScore; + m8FileHandle.close(); + return templateSequences[templateAccession]; + } + else{ + searchScore = 0.00; + return emptySequence; + } +} + +/**************************************************************************************************/ diff --git a/blastdb.hpp b/blastdb.hpp new file mode 100644 index 0000000..a952897 --- /dev/null +++ b/blastdb.hpp @@ -0,0 +1,34 @@ +#ifndef BLASTDB_HPP +#define BLASTDB_HPP + + +/* + * blastdb.hpp + * + * + * Created by Pat Schloss on 12/22/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + */ + +#include "mothur.h" + +class BlastDB : public Database { +public: + BlastDB(string, float, float, float, float); + ~BlastDB(); + Sequence* findClosestSequence(Sequence*); + +private: + string dbFileName; + string queryFileName; + string blastFileName; + + float gapOpen; + float gapExtend; + float match; + float misMatch; + Sequence* emptySequence; +}; + +#endif diff --git a/commandfactory.cpp b/commandfactory.cpp index 2e8aca7..d41e4a8 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -41,6 +41,7 @@ #include "bootstrapsharedcommand.h" #include "concensuscommand.h" #include "distancecommand.h" +#include "aligncommand.h" /***********************************************************/ @@ -93,6 +94,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "bootstrap.shared") { command = new BootSharedCommand(); } else if(commandName == "concensus") { command = new ConcensusCommand(); } else if(commandName == "dist.seqs") { command = new DistanceCommand(); } + else if(commandName == "align.seqs") { command = new AlignCommand(); } else { command = new NoCommand(); } return command; diff --git a/database.cpp b/database.cpp index 933bed4..8c080db 100644 --- a/database.cpp +++ b/database.cpp @@ -15,24 +15,23 @@ using namespace std; /**************************************************************************************************/ -Database::Database(string fastaFileName){ - - ifstream fastaFile(fastaFileName.c_str()); - if(!fastaFile) { - cerr << "Error: Could not open " << fastaFileName << endl; - exit(1); - } +Database::Database(string fastaFileName){ // This assumes that the template database is in fasta format, may + // need to alter this in the future? + + ifstream fastaFile; + openInputFile(fastaFileName, fastaFile); + cout << endl << "Reading in the " << fastaFileName << " template sequences...\t"; cout.flush(); - numSeqs=count(istreambuf_iterator(fastaFile),istreambuf_iterator(), '>'); - fastaFile.seekg(0); + numSeqs=count(istreambuf_iterator(fastaFile),istreambuf_iterator(), '>'); // count the number of + fastaFile.seekg(0); // sequences templateSequences.resize(numSeqs); string seqName, sequence; for(int i=0;i> seqName; templateSequences[i]->setName(seqName); @@ -42,12 +41,13 @@ Database::Database(string fastaFileName){ while(fastaFile && (letter=fastaFile.get()) != '>'){ if(isprint(letter)){ letter = toupper(letter); + if(letter == 'U'){letter = 'T';} aligned += letter; } } - templateSequences[i]->setAligned(aligned); - templateSequences[i]->setUnaligned(aligned); - fastaFile.putback(letter); + templateSequences[i]->setAligned(aligned); // Obviously, we need the fully aligned template sequence + templateSequences[i]->setUnaligned(aligned);// We will also need the unaligned sequence for pairwise alignments + fastaFile.putback(letter); // and database searches } fastaFile.close(); @@ -56,3 +56,7 @@ Database::Database(string fastaFileName){ } /**************************************************************************************************/ + +float Database::getSearchScore() { return searchScore; } // we're assuming that the search is already done + +/**************************************************************************************************/ diff --git a/database.hpp b/database.hpp index e80bfb5..09fea85 100644 --- a/database.hpp +++ b/database.hpp @@ -12,14 +12,17 @@ #include "mothur.h" +class Sequence; + class Database { public: Database(string); virtual Sequence* findClosestSequence(Sequence*) = 0; - + virtual float getSearchScore(); protected: int numSeqs; - vector templateSequences; + float searchScore; + vector templateSequences; }; #endif diff --git a/distancecommand.cpp b/distancecommand.cpp index e5878b3..7d1f6a0 100644 --- a/distancecommand.cpp +++ b/distancecommand.cpp @@ -88,11 +88,12 @@ int DistanceCommand::execute(){ remove(distFile.c_str()); - // # if defined (WIN_VERSION) + //# if defined (_WIN32) + //figure out how to implement the fork and wait commands in windows // driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff); - // # endif + //# endif - # if defined (__APPLE__) || (__MACH__) + # if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff); } @@ -168,6 +169,8 @@ int DistanceCommand::execute(){ wait(NULL); } wait(NULL); + # else + driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff); # endif delete distCalculator; diff --git a/distancedb.cpp b/distancedb.cpp new file mode 100644 index 0000000..ce74854 --- /dev/null +++ b/distancedb.cpp @@ -0,0 +1,65 @@ +/* + * distancedb.cpp + * + * + * Created by Pat Schloss on 12/29/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + */ + + +using namespace std; + + +#include "database.hpp" +#include "sequence.hpp" +#include "distancedb.hpp" + + +/**************************************************************************************************/ + +DistanceDB::DistanceDB(string fastaFileName, string distanceFileName) : Database(fastaFileName) { + + ifstream inputData; + openInputFile(distanceFileName, inputData); + int numCandSeqs=count(istreambuf_iterator(inputData),istreambuf_iterator(), '\n'); // count the number of + inputData.seekg(0); // sequences + + hit closestMatch; + string candidateSeqName; + string junk; + + mostSimSequenceVector.resize(numCandSeqs); + + for(int i=0;i> candidateSeqName >> closestMatch.seqName >> closestMatch.indexNumber >> closestMatch.simScore; +// getline(inputData, junk); + mostSimSequenceVector[i] = closestMatch; + } + cout << numCandSeqs << endl; + searchIndex = 0; + inputData.close(); +} + +/**************************************************************************************************/ + +Sequence* DistanceDB::findClosestSequence(Sequence* candidateSeq){ + + hit simAccession = mostSimSequenceVector[searchIndex]; +// string candidateSeqName, closestMatchSeqName, junk; +// int closestMatchIndexNumber; +// float closestMatchSimScore; +// +// inputData >> candidateSeqName >> closestMatchSeqName >> closestMatchIndexNumber >> closestMatchSimScore; +// getline(inputData, junk); + +// searchScore = 100. * closestMatchSimScore; + + searchScore = 100. * simAccession.simScore; + searchIndex++; +// return templateSequences[closestMatchIndexNumber]; + return templateSequences[simAccession.indexNumber]; + +} + +/**************************************************************************************************/ diff --git a/distancedb.hpp b/distancedb.hpp new file mode 100644 index 0000000..dc4c9a6 --- /dev/null +++ b/distancedb.hpp @@ -0,0 +1,36 @@ +#ifndef DISTANCEDB_HPP +#define DISTANCEDB_HPP + +/* + * distancedb.hpp + * + * + * Created by Pat Schloss on 12/29/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + */ + + +using namespace std; + +#include "mothur.h" + +class DistanceDB : public Database { + +public: + DistanceDB(string, string); + Sequence* findClosestSequence(Sequence*); + +private: + + struct hit{ + string seqName; + int indexNumber; + float simScore; + }; + vector mostSimSequenceVector; +// ifstream inputData; + int searchIndex; +}; + +#endif diff --git a/errorchecking.cpp b/errorchecking.cpp index 8aaa8b0..dfe9607 100644 --- a/errorchecking.cpp +++ b/errorchecking.cpp @@ -118,7 +118,14 @@ bool ErrorCheck::checkInput(string input) { if (parameter == "scale" ) { scale = value; } if (parameter == "ends" ) { ends = value; } if (parameter == "processors" ) { processors = value; } - + if (parameter == "template") { templatefile = value; } + if (parameter == "search") { search = value; } + if (parameter == "ksize") { ksize = value; } + if (parameter == "align") { align = value; } + if (parameter == "match") { match = value; } + if (parameter == "mismatch") { mismatch = value; } + if (parameter == "gapopen") { gapopen = value; } + if (parameter == "gapextend" ) { gapextend = value; } } //gets the last parameter and value @@ -160,6 +167,14 @@ bool ErrorCheck::checkInput(string input) { if (parameter == "scale" ) { scale = value; } if (parameter == "ends" ) { ends = value; } if (parameter == "processors" ) { processors = value; } + if (parameter == "template") { templatefile = value; } + if (parameter == "search") { search = value; } + if (parameter == "ksize") { ksize = value; } + if (parameter == "align") { align = value; } + if (parameter == "match") { match = value; } + if (parameter == "mismatch") { mismatch = value; } + if (parameter == "gapopen") { gapopen = value; } + if (parameter == "gapextend" ) { gapextend = value; } } } @@ -246,9 +261,11 @@ bool ErrorCheck::checkInput(string input) { } } - if ((commandName == "filter.seqs") || (commandName == "dist.seqs")) { + if ((commandName == "filter.seqs") || (commandName == "dist.seqs") || (commandName == "align.seqs")) { if ((fastafile == "") && (nexusfile == "") && (clustalfile == "") && (phylipfile == "")) { - cout << "You must read either a fasta, nexus, clustal, or phylip file before you can use the filter.seqs command." << endl; return false; + cout << "You must enter either a fasta, nexus, clustal, or phylip file before you can use the filter.seqs, dist.seqs or align.seqs command." << endl; return false; + }else if ((commandName == "align.seqs") && (templatefile == "")) { + cout << "You must enter a template to use the align.seqs command." << endl; return false; } validateSeqsFiles(); } @@ -558,9 +575,15 @@ void ErrorCheck::validateSeqsFiles() { errorFree = false; } } - + }else if (templatefile != "") { + ableToOpen = openInputFile(templatefile, filehandle); + filehandle.close(); + if (ableToOpen == 1) { //unable to open + errorFree = false; + } } + } catch(exception& e) { cout << "Standard Error: " << e.what() << " has occurred in the ErrorCheck class Function validateSeqsFiles. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; @@ -642,6 +665,7 @@ void ErrorCheck::clear() { fastafile = ""; nexusfile = ""; clustalfile = ""; + templatefile = ""; line = ""; label = ""; method = "furthest"; diff --git a/errorchecking.h b/errorchecking.h index 8de9234..d6388f7 100644 --- a/errorchecking.h +++ b/errorchecking.h @@ -36,7 +36,7 @@ class ErrorCheck { void refresh(); string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, fastafile, nexusfile, clustalfile, treefile, sharedfile, cutoff, format; string precision, method, fileroot, label, line, iters, jumble, freq, single, rarefaction, shared, summary, randomtree, abund, sorted, trump, soft, filter, scale, ends, processors; - + string templatefile, search, ksize, align, match, mismatch, gapopen, gapextend; string commandName, optionText; bool errorFree; diff --git a/globaldata.cpp b/globaldata.cpp index 4dfcbc1..b4239f3 100644 --- a/globaldata.cpp +++ b/globaldata.cpp @@ -85,10 +85,14 @@ void GlobalData::parseGlobalData(string commandString, string optionText){ if (key == "scale") { scale = value; } if (key == "ends" ) { ends = value; } if (key == "processors" ) { processors = value; } - - - - + if (key == "template") { templatefile = value; } + if (key == "search") { search = value; } + if (key == "ksize") { ksize = value; } + if (key == "align") { align = value; } + if (key == "match") { match = value; } + if (key == "mismatch") { mismatch = value; } + if (key == "gapopen") { gapopen = value; } + if (key == "gapextend" ) { gapextend = value; } if (key == "line") {//stores lines to be used in a set lines.clear(); @@ -151,7 +155,14 @@ void GlobalData::parseGlobalData(string commandString, string optionText){ if (key == "scale") { scale = value; } if (key == "ends" ) { ends = value; } if (key == "processors" ) { processors = value; } - + if (key == "template") { templatefile = value; } + if (key == "search") { search = value; } + if (key == "ksize") { ksize = value; } + if (key == "align") { align = value; } + if (key == "match") { match = value; } + if (key == "mismatch") { mismatch = value; } + if (key == "gapopen") { gapopen = value; } + if (key == "gapextend" ) { gapextend = value; } if (key == "line") {//stores lines to be used in a vector lines.clear(); @@ -289,6 +300,14 @@ string GlobalData::getFilter() { return filter; } string GlobalData::getScale() { return scale; } string GlobalData::getEnds() { return ends; } string GlobalData::getProcessors() { return processors; } +string GlobalData::getTemplateFile() { return templatefile;} +string GlobalData::getSearch() { return search; } +string GlobalData::getKSize() { return ksize; } +string GlobalData::getAlign() { return align; } +string GlobalData::getMatch() { return match; } +string GlobalData::getMismatch() { return mismatch; } +string GlobalData::getGapopen() { return gapopen; } +string GlobalData::getGapextend() { return gapextend; } void GlobalData::setListFile(string file) { listfile = file; inputFileName = file;} void GlobalData::setGroupFile(string file) { groupfile = file; } @@ -334,6 +353,7 @@ void GlobalData::clear() { clustalfile = ""; treefile = ""; sharedfile = ""; + templatefile = ""; cutoff = "10.00"; format = ""; precision = "100"; @@ -357,6 +377,14 @@ void GlobalData::clear() { scale = "log10"; ends = "T"; //yes processors = "1"; + search = "suffix"; + ksize = "7"; + align = "blast"; + match = "1.0"; + mismatch = "-1.0"; + gapopen = "-5.0"; + gapextend = "-2.0"; + } @@ -379,6 +407,13 @@ void GlobalData::reset() { form = "integral"; ends = "T"; processors = "1"; + search = "suffix"; + ksize = "7"; + align = "blast"; + match = "1.0"; + mismatch = "-1.0"; + gapopen = "-5.0"; + gapextend = "-2.0"; } /*******************************************************/ diff --git a/globaldata.hpp b/globaldata.hpp index 3a469f0..cb9aff6 100644 --- a/globaldata.hpp +++ b/globaldata.hpp @@ -77,7 +77,14 @@ public: string getSorted(); string getEnds(); string getProcessors(); - + string getTemplateFile(); + string getSearch(); + string getKSize(); + string getAlign(); + string getMatch(); + string getMismatch(); + string getGapopen(); + string getGapextend(); string getTrump(); string getSoft(); string getFilter(); @@ -116,7 +123,8 @@ public: private: string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, fastafile, nexusfile, clustalfile, treefile, sharedfile, line, label, randomtree, groups; - string cutoff, format, precision, method, fileroot, iters, jumble, freq, calc, abund, step, form, sorted, trump, soft, filter, scale, ends, processors; + string cutoff, format, precision, method, fileroot, iters, jumble, freq, calc, abund, step, form, sorted, trump, soft, filter, scale, ends, processors, templatefile, search, ksize, align, match; + string mismatch, gapopen, gapextend; static GlobalData* _uniqueInstance; diff --git a/gotohoverlap.cpp b/gotohoverlap.cpp new file mode 100644 index 0000000..57d884b --- /dev/null +++ b/gotohoverlap.cpp @@ -0,0 +1,90 @@ +/* + * gotohoverlap.cpp + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This class is an Alignment child class that implements the Gotoh pairwise alignment algorithm as described in: + * + * Gotoh O. 1982. An improved algorithm for matching biological sequences. J. Mol. Biol. 162:705-8. + * Myers, EW & Miller, W. 1988. Optimal alignments in linear space. Comput Appl Biosci. 4:11-7. + * + * This method is nice because it allows for an affine gap penalty to be assessed, which is analogous to what is used + * in blast and is an alternative to Needleman-Wunsch, which only charges the same penalty for each gap position. + * Because this method typically has problems at the ends when two sequences do not full overlap, we employ a separate + * method to fix the ends (see Overlap class documentation) + * + */ + +using namespace std; + +#include "alignmentcell.hpp" +#include "overlap.hpp" +#include "alignment.hpp" +#include "gotohoverlap.hpp" + +/**************************************************************************************************/ + +GotohOverlap::GotohOverlap(float gO, float gE, float m, float mm, int r) : + gapOpen(gO), gapExtend(gE), match(m), mismatch(mm), Alignment(r) { + + for(int i=1;i alignment[i][j].dValue){ + if(alignment[i][j].iValue > diagonal){ + alignment[i][j].cValue = alignment[i][j].iValue; + alignment[i][j].prevCell = 'l'; + } + else{ + alignment[i][j].cValue = diagonal; + alignment[i][j].prevCell = 'd'; + } + } + else{ + if(alignment[i][j].dValue > diagonal){ + alignment[i][j].cValue = alignment[i][j].dValue; + alignment[i][j].prevCell = 'u'; + } + else{ + alignment[i][j].cValue = diagonal; + alignment[i][j].prevCell = 'd'; + } + } + + } + } + Overlap over; + over.setOverlap(alignment, lA, lB, 0); // Fix the gaps at the ends of the sequences + traceBack(); // Construct the alignment and set seqAaln and seqBaln +} + +/**************************************************************************************************/ diff --git a/gotohoverlap.hpp b/gotohoverlap.hpp new file mode 100644 index 0000000..bf3c636 --- /dev/null +++ b/gotohoverlap.hpp @@ -0,0 +1,43 @@ +#ifndef GOTOHOVERLAP_H +#define GOTOHOVERLAP_H + +/* + * gotohoverlap.h + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This class is an Alignment child class that implements the Gotoh pairwise alignment algorithm as described in: + * + * Gotoh O. 1982. An improved algorithm for matching biological sequences. J. Mol. Biol. 162:705-8. + * Myers, EW & Miller, W. 1988. Optimal alignments in linear space. Comput Appl Biosci. 4:11-7. + * + * This method is nice because it allows for an affine gap penalty to be assessed, which is analogous to what is used + * in blast and is an alternative to Needleman-Wunsch, which only charges the same penalty for each gap position. + * Because this method typically has problems at the ends when two sequences do not full overlap, we employ a separate + * method to fix the ends (see Overlap class documentation) + * + */ + +#include "mothur.h" +#include "alignment.hpp" + +/**************************************************************************************************/ + +class GotohOverlap : public Alignment { + +public: + GotohOverlap(float, float, float, float, int); + void align(string, string); + +private: + float gapOpen; + float gapExtend; + float match; + float mismatch; +}; + +/**************************************************************************************************/ + +#endif diff --git a/kmer.cpp b/kmer.cpp index 875cec5..dcb6475 100644 --- a/kmer.cpp +++ b/kmer.cpp @@ -14,28 +14,29 @@ using namespace std; /**************************************************************************************************/ -Kmer::Kmer(int size) : kmerSize(size) { - - int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 }; - maxKmer = power4s[kmerSize]+1;// (int)pow(4.,k)+1; +Kmer::Kmer(int size) : kmerSize(size) { // The constructor sets the size of the kmer + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + // No reason to waste the time of recalculating + maxKmer = power4s[kmerSize]+1;// (int)pow(4.,k)+1; // powers of 4 everytime through. We need an + // extra kmer if we get a non-ATGCU base } /**************************************************************************************************/ -string Kmer::getKmerString(string sequence){ - int length = sequence.length(); - int nKmers = length - kmerSize + 1; +string Kmer::getKmerString(string sequence){ // Calculate kmer for each position in the sequence, count the freq + int length = sequence.length(); // of each kmer, and convert it to an ascii character with base '!'. + int nKmers = length - kmerSize + 1; // Export the string of characters as a string vector counts(maxKmer, 0); - for(int i=0;i T [T] +// Base5 = (915 / 4^1) % 4 = 228 % 4 = 0 => A [AT] +// Base4 = (915 / 4^2) % 4 = 57 % 4 = 1 => C [CAT] +// Base3 = (915 / 4^3) % 4 = 14 % 4 = 2 => G [GCAT] +// Base2 = (915 / 4^4) % 4 = 3 % 4 = 3 => T [TGCAT] +// Base1 = (915 / 4^5) % 4 = 0 % 4 = 0 => A [ATGCAT] -> this checks out with the previous method + + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; string kmer = ""; - if(kmerNumber == power4s[kmerSize]){//pow(4.,7)){ - for(int i=0;i fast. + * + */ using namespace std; @@ -22,16 +37,16 @@ KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerS string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; ifstream kmerFileTest(kmerDBName.c_str()); - int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 }; + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; maxKmer = power4s[kmerSize]; kmerLocations.resize(maxKmer+1); - if(!kmerFileTest){ + if(!kmerFileTest){ // if we can open the kmer db file, then read it in... cout << "Generating the " << kmerDBName << " database...\t"; cout.flush(); generateKmerDB(kmerDBName); } - else{ + else{ // ...otherwise generate it. cout << "Reading in the " << kmerDBName << " database...\t"; cout.flush(); readKmerDB(kmerDBName, kmerFileTest); } @@ -42,68 +57,67 @@ KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerS /**************************************************************************************************/ Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){ + + vector matches(numSeqs, 0); // a record of the sequences with shared kmers + vector timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found - vector matches(numSeqs, 0); - vector timesKmerFound(kmerLocations.size()+1, 0); - - int maxMatches = 0; + searchScore = 0; int maxSequence = 0; - string query = candidateSeq->getUnaligned(); - - int numKmers = query.length() - kmerSize + 1; + string query = candidateSeq->getUnaligned(); // we want to search using an unaligned dna sequence + + int numKmers = query.length() - kmerSize + 1; Kmer kmer(kmerSize); for(int i=0;i maxMatches){ - maxMatches = matches[i]; + + for(int i=0;i searchScore){ // the query sequence + searchScore = matches[i]; maxSequence = i; } } - return templateSequences[maxSequence]; - + searchScore = 100 * searchScore / (float)numKmers; + return templateSequences[maxSequence]; // return the Sequence object corresponding to the db + // sequence with the most shared kmers. } /**************************************************************************************************/ void KmerDB::generateKmerDB(string kmerDBName){ - Kmer kmer(kmerSize); - for(int i=0;igetUnaligned(); + string seq = templateSequences[i]->getUnaligned(); // ...take the unaligned sequence... int numKmers = seq.length() - kmerSize + 1; - for(int j=0;j seenBefore(maxKmer+1,0); + for(int j=0;j> seqName >> numValues; - for(int j=0;j> seqNumber; - kmerLocations[i].push_back(seqNumber); + for(int j=0;j> seqNumber; // 1. number of sequences with the kmer number + kmerLocations[i].push_back(seqNumber); // 2. sequence indices } } kmerDBFile.close(); diff --git a/kmerdb.hpp b/kmerdb.hpp index af52789..eb10061 100644 --- a/kmerdb.hpp +++ b/kmerdb.hpp @@ -8,6 +8,15 @@ * Created by Pat Schloss on 12/16/08. * Copyright 2008 Patrick D. Schloss. All rights reserved. * + * This class is a child class of the Database class, which stores the template sequences as a kmer table and provides + * a method of searching the kmer table for the sequence with the most kmers in common with a query sequence. + * kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the + * different number of kmers and each column contains the index to sequences that use that kmer. + * + * Construction of an object of this type will first look for an appropriately named database file and if it is found + * then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory + * (generateKmerDB) + */ #include "mothur.h" diff --git a/nast.cpp b/nast.cpp new file mode 100644 index 0000000..0fae6ab --- /dev/null +++ b/nast.cpp @@ -0,0 +1,362 @@ +/* + * nast.cpp + * + * + * Created by Pat Schloss on 12/17/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This is my implementation of the NAST (nearest alignment space termination) algorithm as described in: + * + * DeSantis TZ, Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, & Anderson GL. 2006. NAST: a multiple + * sequence alignment server for comparative analysis of 16S rRNA genes. Nucleic Acids Research. 34:W394-9. + * + * To construct an object one needs to provide a method of getting a pairwise alignment (alignment) and the template + * and candidate sequence that are to be aligned to each other. + * + */ + +using namespace std; + +#include "sequence.hpp" +#include "alignment.hpp" +#include "nast.hpp" + +/**************************************************************************************************/ + +Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method), candidateSeq(cand), templateSeq(temp) { + maxInsertLength = 0; + pairwiseAlignSeqs(); // This is part A in Fig. 2 of DeSantis et al. + regapSequences(); // This is parts B-F in Fig. 2 of DeSantis et al. +} + +/**************************************************************************************************/ + +void Nast::pairwiseAlignSeqs(){ // Here we call one of the pairwise alignment methods to align our unaligned candidate + // and template sequences + alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned()); + + string candAln = alignment->getSeqAAln(); + string tempAln = alignment->getSeqBAln(); + + if(candAln == ""){ + candidateSeq->setPairwise(""); + templateSeq->setPairwise(templateSeq->getUnaligned()); + } + else{ + if(tempAln[0] == '-'){ + int pairwiseAlignmentLength = tempAln.length(); // we need to make sure that the candidate sequence alignment + for(int i=0;i=0; i--){// ends where the template sequence alignment ends, if it runs + if(isalpha(tempAln[i])){ // long, we nuke the end of the candidate sequence + candAln = candAln.substr(0,i+1); + tempAln = tempAln.substr(0,i+1); + break; + } + } + } + } + candidateSeq->setPairwise(candAln); // set the pairwise sequences in the Sequence objects for + templateSeq->setPairwise(tempAln); // the candidate and template sequences + +} + +/**************************************************************************************************/ + +void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAlign){ + +// here we do steps C-F of Fig. 2 from DeSantis et al. + + int longAlignmentLength = newTemplateAlign.length(); + + for(int i=0; i=0;leftIndex--){ // then we've got problems... + if(!isalpha(candAln[leftIndex])){ + leftRoom = 1; //count how far it is to the nearest gap on the LEFT side of the anomaly + while(leftIndex-leftRoom>=0 && !isalpha(candAln[leftIndex-leftRoom])) { leftRoom++; } + break; + } + } +// cout << leftIndex << '\t' << leftRoom << endl; + + for(rightIndex=i+1;rightIndex maxInsertLength){ maxInsertLength = insertLength; } + + if((leftRoom + rightRoom) >= insertLength){ + // Parts D & E from Fig. 2 of DeSantis et al. + if((i-leftIndex) <= (rightIndex-i)){ // the left gap is closer - > move stuff left there's + if(leftRoom >= insertLength){ // enough room to the left to move + string leftTemplateString = newTemplateAlign.substr(0,i); + string rightTemplateString = newTemplateAlign.substr(i+insertLength); + newTemplateAlign = leftTemplateString + rightTemplateString; + longAlignmentLength = newTemplateAlign.length(); + + string leftCandidateString = candAln.substr(0,leftIndex-insertLength+1); + string rightCandidateString = candAln.substr(leftIndex+1); + candAln = leftCandidateString + rightCandidateString; + } + else{ // not enough room to the left, have to steal some space to + string leftTemplateString = newTemplateAlign.substr(0,i); // the right + string rightTemplateString = newTemplateAlign.substr(i+insertLength); + newTemplateAlign = leftTemplateString + rightTemplateString; + longAlignmentLength = newTemplateAlign.length(); + + string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1); + string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1); + string rightCandidateString = candAln.substr(rightIndex+(insertLength-leftRoom)); + candAln = leftCandidateString + insertString + rightCandidateString; + } + } + else{ // the right gap is closer - > move stuff right there's + if(rightRoom >= insertLength){ // enough room to the right to move + string leftTemplateString = newTemplateAlign.substr(0,i); + string rightTemplateString = newTemplateAlign.substr(i+insertLength); + newTemplateAlign = leftTemplateString + rightTemplateString; + longAlignmentLength = newTemplateAlign.length(); + + string leftCandidateString = candAln.substr(0,rightIndex); + string rightCandidateString = candAln.substr(rightIndex+insertLength); + candAln = leftCandidateString + rightCandidateString; + + } + else{ // not enough room to the right, have to steal some + // space to the left lets move left and then right... + string leftTemplateString = newTemplateAlign.substr(0,i); + string rightTemplateString = newTemplateAlign.substr(i+insertLength); + newTemplateAlign = leftTemplateString + rightTemplateString; + longAlignmentLength = newTemplateAlign.length(); + + string leftCandidateString = candAln.substr(0,leftIndex-(insertLength-rightRoom)+1); + string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1); + string rightCandidateString = candAln.substr(rightIndex+rightRoom); + candAln = leftCandidateString + insertString + rightCandidateString; + + } + } + } + else{ // there could be a case where there isn't enough room in + string leftTemplateString = newTemplateAlign.substr(0,i); // either direction to move stuff + string rightTemplateString = newTemplateAlign.substr(i+leftRoom+rightRoom); + newTemplateAlign = leftTemplateString + rightTemplateString; + longAlignmentLength = newTemplateAlign.length(); + + string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1); + string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1); + string rightCandidateString = candAln.substr(rightIndex+rightRoom); + candAln = leftCandidateString + insertString + rightCandidateString; + } + + i -= insertLength; + } + } +// cout << candAln << endl << tempAln << endl << newTemplateAlign << endl; +} + +/**************************************************************************************************/ + +void Nast::regapSequences(){ //This is essentially part B in Fig 2. of DeSantis et al. + + string candPair = candidateSeq->getPairwise(); + string candAln = ""; + + string tempPair = templateSeq->getPairwise(); + string tempAln = templateSeq->getAligned(); // we use the template aligned sequence as our guide + + int pairwiseLength = candPair.length(); + int fullAlignLength = tempAln.length(); + + if(candPair == ""){ + for(int i=0;isetAligned(candAln); + return; + } + + int fullAlignIndex = 0; + int pairwiseAlignIndex = 0; + string newTemplateAlign = ""; // this is going to be messy so we want a temporary template + // alignment string + while(tempAln[fullAlignIndex] == '.' || tempAln[fullAlignIndex] == '-'){ + candAln += '.'; // add the initial '-' and '.' to the candidate and template + newTemplateAlign += tempAln[fullAlignIndex];// pairwise sequences + fullAlignIndex++; + } + + string lastLoop = ""; + + while(pairwiseAlignIndexseems to be the opposite of the alpha scenario + + candAln += candPair[pairwiseAlignIndex]; + newTemplateAlign += tempAln[fullAlignIndex];// + pairwiseAlignIndex++; + fullAlignIndex++; + } + else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex]) + && !isalpha(candPair[pairwiseAlignIndex])){ + // template pairwise has a character, but its full aligned sequence and candidate sequence have gaps + // this would happen like we need to add a gap. basically the opposite of the alpha situation + + newTemplateAlign += tempAln[fullAlignIndex];// + candAln += "-"; + fullAlignIndex++; + } + else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex]) + && !isalpha(candPair[pairwiseAlignIndex])){ + // template and candidate pairwise are gaps and the template aligned is not a gap this should not be possible + // would skip the gaps and not progress through full alignment sequence + // not tested yet + + cout << "We're into D" << ' ' << fullAlignIndex << ' ' << pairwiseAlignIndex << endl; + pairwiseAlignIndex++; + } + else{ + // everything has a gap - not possible + // not tested yet + + cout << "We're into F" << ' ' << fullAlignIndex << ' ' << pairwiseAlignIndex << endl; + pairwiseAlignIndex++; + fullAlignIndex++; + } + } + + for(int i=fullAlignIndex;i=0;i--){ // ditto. + if(candAln[i] == 'Z' || !isalnum(candAln[i])) { candAln[i] = '.'; } + else{ end = i; break; } + } + + for(int i=start;i<=end;i++){ // go through the candidate alignment sequence and make sure that + candAln[i] = toupper(candAln[i]); // everything is upper case + } + +// cout << candAln << endl; +// cout << tempAln << endl; +// cout << newTemplateAlign << endl; + + if(candAln.length() != tempAln.length()){ // if the regapped candidate sequence is longer than the official + removeExtraGaps(candAln, tempAln, newTemplateAlign);// template alignment then we need to do steps C-F in Fig. + } // 2 of Desantis et al. + +// cout << candAln << endl; +// cout << tempAln << endl; +// cout << newTemplateAlign << endl; + + candidateSeq->setAligned(candAln); +} + +/**************************************************************************************************/ + +float Nast::getSimilarityScore(){ + + string cand = candidateSeq->getAligned(); + string temp = templateSeq->getAligned(); + int alignmentLength = temp.length(); + int mismatch = 0; + int denominator = 0; + + for(int i=0;igetName(); + queryLength = candSeq->getUnalignLength(); +} + +/******************************************************************************************************************/ + +void NastReport::setTemplate(Sequence* tempSeq){ + templateName = tempSeq->getName(); + templateLength = tempSeq->getUnalignLength(); +} + +/******************************************************************************************************************/ + +void NastReport::setSearchParameters(string method, float score){ + searchMethod = method; + searchScore = score; +} + +/******************************************************************************************************************/ + +void NastReport::setAlignmentParameters(string method, Alignment* align){ + alignmentMethod = method; + + candidateStartPosition = align->getCandidateStartPos(); + candidateEndPosition = align->getCandidateEndPos(); + templateStartPosition = align->getTemplateStartPos(); + templateEndPosition = align->getTemplateEndPos(); + pairwiseAlignmentLength = align->getPairwiseLength(); + + totalGapsInQuery = pairwiseAlignmentLength - (candidateEndPosition - candidateStartPosition + 1); + totalGapsInTemplate = pairwiseAlignmentLength - (templateEndPosition - templateStartPosition + 1); +} + +/******************************************************************************************************************/ + +void NastReport::setNastParameters(Nast nast){ + + longestInsert = nast.getMaxInsertLength(); + similarityToTemplate = nast.getSimilarityScore(); + +} + +/******************************************************************************************************************/ diff --git a/nastreport.hpp b/nastreport.hpp new file mode 100644 index 0000000..424b0bb --- /dev/null +++ b/nastreport.hpp @@ -0,0 +1,51 @@ +#ifndef NASTREPORT_HPP +#define NASTREPORT_HPP + + +/* + * nastreport.hpp + * + * + * Created by Pat Schloss on 12/19/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + */ + +#include "mothur.h" + +using namespace std; + +/******************************************************************************************************************/ + +class NastReport { + +public: + NastReport(string); + void setCandidate(Sequence*); + void setTemplate(Sequence*); + void setSearchParameters(string, float); + void setAlignmentParameters(string, Alignment*); + void setNastParameters(Nast); + void print(); + +private: + string queryName; + int queryLength; + string templateName; + int templateLength; + string searchMethod; + float searchScore; + string alignmentMethod; + int candidateStartPosition, candidateEndPosition; + int templateStartPosition, templateEndPosition; + + int pairwiseAlignmentLength; + int longestInsert; + int totalGapsInQuery, totalGapsInTemplate; + float similarityToTemplate; + ofstream candidateReportFile; +}; + +/******************************************************************************************************************/ + +#endif diff --git a/needlemanoverlap.cpp b/needlemanoverlap.cpp new file mode 100644 index 0000000..8f5d464 --- /dev/null +++ b/needlemanoverlap.cpp @@ -0,0 +1,92 @@ +/* + * needleman.cpp + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This class is an Alignment child class that implements the Gotoh pairwise alignment algorithm as described in: + * + * Gotoh O. 1982. An improved algorithm for matching biological sequences. J. Mol. Biol. 162:705-8. + * Myers, EW & Miller, W. 1988. Optimal alignments in linear space. Comput Appl Biosci. 4:11-7. + * + * This method is nice because it allows for an affine gap penalty to be assessed, which is analogous to what is used + * in blast and is an alternative to Needleman-Wunsch, which only charges the same penalty for each gap position. + * Because this method typically has problems at the ends when two sequences do not full overlap, we employ a separate + * method to fix the ends (see Overlap class documentation) + * + */ + +using namespace std; + +#include "alignmentcell.hpp" +#include "alignment.hpp" +#include "overlap.hpp" +#include "needlemanoverlap.hpp" + +/**************************************************************************************************/ + +NeedlemanOverlap::NeedlemanOverlap(float gO, float m, float mm, int r) :// note that we don't have a gap extend +gap(gO), match(m), mismatch(mm), Alignment(r) { // the gap openning penalty is assessed for + // every gapped position + for(int i=1;i= up){ + if(diagonal >= left){ + alignment[i][j].cValue = diagonal; + alignment[i][j].prevCell = 'd'; + } + else{ + alignment[i][j].cValue = left; + alignment[i][j].prevCell = 'l'; + } + } + else{ + if(up >= left){ + alignment[i][j].cValue = up; + alignment[i][j].prevCell = 'u'; + } + else{ + alignment[i][j].cValue = left; + alignment[i][j].prevCell = 'l'; + } + } + } + } + Overlap over; + over.setOverlap(alignment, lA, lB, 0); // Fix gaps at the beginning and end of the sequences + traceBack(); // Traceback the alignment to populate seqAaln and seqBaln +} + +/**************************************************************************************************/ + diff --git a/needlemanoverlap.hpp b/needlemanoverlap.hpp new file mode 100644 index 0000000..57d3284 --- /dev/null +++ b/needlemanoverlap.hpp @@ -0,0 +1,43 @@ +#ifndef NEEDLEMAN_H +#define NEEDLEMAN_H + +/* + * needleman.h + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This class is an Alignment child class that implements the Needleman-Wunsch pairwise alignment algorithm as + * described in: + * + * Needleman SB & Wunsch CD. 1970. A general method applicable to the search for similarities in the amino acid + * sequence of two proteins. J Mol Biol. 48:443-53. + * Korf I, Yandell M, & Bedell J. 2003. BLAST. O'Reilly & Associates. Sebastopol, CA. + * + * This method is simple as it assesses a consistent penalty for each gap position. Because this method typically has + * problems at the ends when two sequences do not full overlap, we employ a separate method to fix the ends (see + * Overlap class documentation) + * + */ + +#include "mothur.h" + +/**************************************************************************************************/ + +class NeedlemanOverlap : public Alignment { + +public: + NeedlemanOverlap(float, float, float, int); + ~NeedlemanOverlap(); + void align(string, string); + +private: + float gap; + float match; + float mismatch; +}; + +/**************************************************************************************************/ + +#endif diff --git a/noalign.cpp b/noalign.cpp new file mode 100644 index 0000000..14e34f6 --- /dev/null +++ b/noalign.cpp @@ -0,0 +1,25 @@ +/* + * noalign.cpp + * + * + * Created by Pat Schloss on 2/19/09. + * Copyright 2009 __MyCompanyName__. All rights reserved. + * + */ + +#include "alignment.hpp" +#include "noalign.hpp" + +/**************************************************************************************************/ + +NoAlign::NoAlign(){ /* do nothing */ } + +/**************************************************************************************************/ + +NoAlign::~NoAlign(){ /* do nothing */ } + +/**************************************************************************************************/ + +void NoAlign::align(string A, string B){ } + +/**************************************************************************************************/ diff --git a/noalign.hpp b/noalign.hpp new file mode 100644 index 0000000..dd1e4ba --- /dev/null +++ b/noalign.hpp @@ -0,0 +1,31 @@ +#ifndef NOALIGN_HPP +#define NOALIGN_HPP + +/* + * noalign.hpp + * + * + * Created by Pat Schloss on 2/19/09. + * Copyright 2009 __MyCompanyName__. All rights reserved. + * + */ +using namespace std; + +#include "mothur.h" + +/**************************************************************************************************/ + +class NoAlign : public Alignment { + +public: + NoAlign(); + ~NoAlign(); + void align(string, string); + +private: +}; + +/**************************************************************************************************/ + + +#endif diff --git a/overlap.cpp b/overlap.cpp new file mode 100644 index 0000000..9645e7a --- /dev/null +++ b/overlap.cpp @@ -0,0 +1,83 @@ +/* + * overlap.cpp + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This class cleans up the alignment at the 3' end of the alignments. Because the Gotoh and Needleman-Wunsch + * algorithms start the traceback from the lower-right corner of the dynamic programming matrix, there may be a lot of + * scattered bases in the alignment near the 3' end of the alignment. Here we basically look for the largest score + * in the last column and row to determine whether there should be exta gaps in sequence A or sequence B. The gap + * issues at the 5' end of the alignment seem to take care of themselves in the traceback. + * + */ +using namespace std; + +#include "alignmentcell.hpp" +#include "overlap.hpp" + + +/**************************************************************************************************/ + +int Overlap::maxRow(vector >& alignment, const int band){ + + float max = -100; + int end = lA - 1; + int index = end; + + for(int i=band;i= max){ // score. + index = i; + max = alignment[i][end].cValue; + } + } + return index; +} + +/**************************************************************************************************/ + +int Overlap::maxColumn(vector >& alignment, const int band){ + + float max = -100; + int end = lB - 1; + int index = end; + + for(int i=band;i= max){ // alignment score. + index = i; + max = alignment[end][i].cValue; + } + } + return index; +} + +/**************************************************************************************************/ + +void Overlap::setOverlap(vector >& alignment, const int nA, const int nB, const int band=0){ + + lA = nA; + lB = nB; + + int rowIndex = maxRow(alignment, band); // get the index for the row with the highest right hand side score + int colIndex = maxColumn(alignment, band); // get the index for the column with the highest bottom row score + + int row = lB-1; + int column = lA-1; + + if(colIndex == column && rowIndex == row){} // if the max values are the lower right corner, then we're good + else if(alignment[row][colIndex].cValue < alignment[rowIndex][column].cValue){ + for(int i=rowIndex+1;i >&, const int, const int, const int); +private: + int maxRow(vector >&, const int); + int maxColumn(vector >&, const int); + int lA, lB; +}; + +/**************************************************************************************************/ + +#endif diff --git a/rarefact.cpp b/rarefact.cpp index 5c1ce74..ba0c02e 100644 --- a/rarefact.cpp +++ b/rarefact.cpp @@ -94,15 +94,23 @@ try { //randomize the groups random_shuffle(lookup.begin(), lookup.end()); + //make merge the size of lookup[0] + SharedRAbundVector* merge = new SharedRAbundVector(lookup[0]->size()); + + //make copy of lookup zero + for(int i = 0; isize(); i++) { + merge->set(i, lookup[0]->getAbundance(i), "merge"); + } + vector subset; //send each group one at a time for (int k = 0; k < lookup.size(); k++) { subset.clear(); //clears out old pair of sharedrabunds //add in new pair of sharedrabunds - subset.push_back(lookup[0]); subset.push_back(lookup[k]); + subset.push_back(merge); subset.push_back(lookup[k]); rcd->updateSharedData(subset, k+1, numGroupComb); - mergeVectors(lookup[0], lookup[k]); + mergeVectors(merge, lookup[k]); } //resets output files @@ -134,9 +142,7 @@ void Rarefact::mergeVectors(SharedRAbundVector* shared1, SharedRAbundVector* sha try{ for (int k = 0; k < shared1->size(); k++) { //merge new species into shared1 - if ((shared1->getAbundance(k) == 0) && (shared2->getAbundance(k) != 0)) { - shared1->set(k, shared2->getAbundance(k), "combo"); //set to 'combo' since this vector now contains multiple groups - } + shared1->set(k, (shared1->getAbundance(k) + shared2->getAbundance(k)), "combo"); //set to 'combo' since this vector now contains multiple groups } } catch(exception& e) { diff --git a/sequence.cpp b/sequence.cpp index b59363e..f136426 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -24,6 +24,36 @@ Sequence::Sequence(string newName, string sequence) { } setUnaligned(sequence); } +//******************************************************************************************************************** + +Sequence::Sequence(ifstream& fastaFile){ + + string accession; // provided a file handle to a fasta-formatted sequence file, read in the next + fastaFile >> accession; // accession number and sequence we find... + setName(accession); + + char letter; + string sequence; + + while(fastaFile){ + letter= fastaFile.get(); + if(letter == '>'){ + fastaFile.putback(letter); + break; + } + else if(isprint(letter)){ + letter = toupper(letter); + if(letter == 'U'){letter = 'T';} + sequence += letter; + } + + } + + if(sequence.find_first_of('-') != string::npos){ // if there are any gaps in the sequence, assume that it is + setAligned(sequence); // an alignment file + } + setUnaligned(sequence); // also set the unaligned sequence file +} //******************************************************************************************************************** @@ -122,3 +152,18 @@ void Sequence::printSequence(ostream& out){ } //******************************************************************************************************************** + +int Sequence::getUnalignLength(){ + return unaligned.length(); +} + +//******************************************************************************************************************** + +int Sequence::getAlignLength(){ + return aligned.length(); +} + +//******************************************************************************************************************** + + + diff --git a/sequence.hpp b/sequence.hpp index 03cbab7..37ca87c 100644 --- a/sequence.hpp +++ b/sequence.hpp @@ -8,6 +8,10 @@ * Created by Pat Schloss on 12/15/08. * Copyright 2008 Patrick D. Schloss. All rights reserved. * + * A sequence object has three components: i) an accession number / name, ii) the unaligned primary sequence, iii) a + * pairwise aligned sequence, and iv) a sequence that is aligned to a reference alignment. This class has methods + * to set and get these values for the other classes where they are needed. * + * */ using namespace std; @@ -20,6 +24,7 @@ class Sequence { public: Sequence(); Sequence(string, string); + Sequence(ifstream&); void setName(string); void setUnaligned(string); @@ -32,7 +37,9 @@ public: string getAligned(); string getPairwise(); string getUnaligned(); - int getLength(); + int getLength(); //the greater of the lengths of unaligned and aligned + int getUnalignLength(); + int getAlignLength(); void printSequence(ostream&); private: diff --git a/suffixdb.cpp b/suffixdb.cpp new file mode 100644 index 0000000..b4639f9 --- /dev/null +++ b/suffixdb.cpp @@ -0,0 +1,56 @@ +/* + * suffixdb.cpp + * + * + * Created by Pat Schloss on 12/16/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This is a child class of the Database abstract datatype. The class is basically a database of suffix trees and an + * encapsulation of the method for finding the most similar tree to an inputted sequence. the suffixForest objecct + * is a vector of SuffixTrees, with each template sequence being represented by a different SuffixTree. The class also + * provides a method to take an unaligned sequence and find the closest sequence in the suffixForest. The search + * method is inspired by the article and Perl source code provided at http://www.ddj.com/web-development/184416093. I + * would estimate that the time complexity is O(LN) for each search, which is slower than the kmer searching, but + * faster than blast + * + */ + +using namespace std; + +#include "database.hpp" +#include "sequence.hpp" +#include "suffixtree.hpp" +#include "suffixdb.hpp" + +/**************************************************************************************************/ + +SuffixDB::SuffixDB(string fastaFileName) : Database(fastaFileName) { + + suffixForest.resize(numSeqs); + cout << "Generating the suffix tree database...\t"; cout.flush(); + for(int i=0;iconvert2ints(); // the candidate sequence needs to be a string of ints + for(int i=0;i suffixForest; +}; + +#endif diff --git a/suffixnodes.cpp b/suffixnodes.cpp new file mode 100644 index 0000000..79826ff --- /dev/null +++ b/suffixnodes.cpp @@ -0,0 +1,96 @@ +/* + * SuffixNodes.cpp + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * There are two types of nodes in a suffix tree as I have implemented it. First, there are the internal nodes that + * have children, these are the SuffixBranch objects. There are also the terminal nodes, which are the suffixBranches. + * I divided them into two groups to save on memory. A SuffixTree object will be a vector of SuffixNodes; therefore, + * the values of parentNode, children nodes, and suffix nodes are stored as ints that correspond to indices in the + * vector + * + */ + +#include "suffixnodes.hpp" + +using namespace std; + +//******************************************************************************************************************** + +inline char deCodeSequence(char code){ + + if(code == '0') { return 'a'; } // this method allows us to go from the int string to a char string; + else if(code == '1') { return 'c'; } // it's only really useful if we want to print out the tree + else if(code == '2') { return 'g'; } + else if(code == '3') { return 't'; } + else if(code == '4') { return 'n'; } + else { return '$'; } + +} + +//******************************************************************************************************************** + +SuffixNode::SuffixNode(int parent, int start, int end) : + parentNode(parent), // we store the parent node as an int + startCharPosition(start), // the suffix tree class will hold the sequence that the startCharPosition and + endCharPosition(end) // endCharPosition indices correspond to + { /* do nothing */ } + + +void SuffixNode::setChildren(char, int) { /* do nothing */ } // there's no children in a leaf +int SuffixNode::getNumChildren() { return 0; } // ditto +void SuffixNode::eraseChild(char) { /* do nothing */ } // ditto +int SuffixNode::getChild(char) { return -1; } // ditto +void SuffixNode::setSuffixNode(int) { /* do nothing */ } // there's no suffix node in a leaf +int SuffixNode::getSuffixNode() { return -1; } // ditto +int SuffixNode::getParentNode() { return parentNode; } +void SuffixNode::setParentNode(int number) { parentNode = number; } +int SuffixNode::getStartCharPos() { return startCharPosition; } +void SuffixNode::setStartCharPos(int start) { startCharPosition = start; } +int SuffixNode::getEndCharPos() { return endCharPosition; } + +//******************************************************************************************************************** + +SuffixLeaf::SuffixLeaf(int parent, int start, int end) : SuffixNode(parent, start, end) { /* do nothing */ } + + +void SuffixLeaf::print(string sequence, int nodeNumber){ + + cout << this << '\t' << parentNode << '\t' << nodeNumber << '\t' << + -1 << '\t' << startCharPosition << '\t' << endCharPosition << '\t'; + + cout << '\''; + for(int i=startCharPosition;i<=endCharPosition;i++){ + cout << deCodeSequence(sequence[i]); + } + cout << '\'' << endl; +} + +//******************************************************************************************************************** + +SuffixBranch::SuffixBranch(int parent, int start, int end) : SuffixNode(parent, start, end), suffixNode(-1){ + childNodes.assign(6, -1); +} + +void SuffixBranch::print(string sequence, int nodeNumber){ // this method is different that than + cout << this << '\t' << parentNode << '\t' << nodeNumber << '\t' << // of a leaf because it prints out a + suffixNode << '\t' << startCharPosition << '\t' << endCharPosition << '\t'; // value for the suffix node + + cout << '\''; + for(int i=startCharPosition;i<=endCharPosition;i++){ + cout << deCodeSequence(sequence[i]); + } + cout << '\'' << endl; +} + +// we can access the children by subtracting '0' from the the char value from the string, the difference is an int +// value and the index we need to access. +void SuffixBranch::eraseChild(char base) { childNodes[base - '0'] = -1; } //to erase set the child index to -1 +void SuffixBranch::setChildren(char base, int nodeIndex){ childNodes[base - '0'] = nodeIndex; } +void SuffixBranch::setSuffixNode(int nodeIndex){ suffixNode = nodeIndex; } +int SuffixBranch::getSuffixNode() { return suffixNode; } +int SuffixBranch::getChild(char base) { return childNodes[base - '0']; } + +//******************************************************************************************************************** diff --git a/suffixnodes.hpp b/suffixnodes.hpp new file mode 100644 index 0000000..863a3ae --- /dev/null +++ b/suffixnodes.hpp @@ -0,0 +1,77 @@ +#ifndef SUFFIXNODES_H +#define SUFFIXNODES_H + +/* + * SuffixNodes.h + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * There are two types of nodes in a suffix tree as I have implemented it. First, there are the internal nodes that + * have children, these are the SuffixBranch objects. There are also the terminal nodes, which are the suffixBranches. + * I divided them into two groups to save on memory. A SuffixTree object will be a vector of SuffixNodes; therefore, + * the values of parentNode, children nodes, and suffix nodes are stored as ints that correspond to indices in the + * vector + * + */ + +#include "mothur.h" + +using namespace std; + +//******************************************************************************************************************** + +class SuffixNode { + +public: + SuffixNode(int, int, int); + virtual void print(string, int) = 0; + virtual void setChildren(char, int); + virtual int getNumChildren(); + virtual void eraseChild(char); + virtual void setSuffixNode(int); + virtual int getSuffixNode(); + virtual int getChild(char); + int getParentNode(); + void setParentNode(int); + int getStartCharPos(); + void setStartCharPos(int start); + int getEndCharPos(); + +protected: + int parentNode; + int startCharPosition; + int endCharPosition; +}; + +//******************************************************************************************************************** + +class SuffixLeaf : public SuffixNode { // most of the methods are already set in the parent class + +public: + SuffixLeaf(int, int, int); // we just need to define a constructor and + void print(string, int); // print method +}; + +//******************************************************************************************************************** + +class SuffixBranch : public SuffixNode { + +public: + SuffixBranch(int, int, int); + void print(string, int); // need a special method for printing the node because there are children + void eraseChild(char); // need a special method for erasing the children + void setChildren(char, int); // need a special method for setting children + void setSuffixNode(int); // need a special method for setting the suffix node + int getSuffixNode(); // need a special method for returning the suffix node + int getChild(char); // need a special method for return children + +private: + vector childNodes; // a suffix branch is unique because it has children and a suffixNode. The + int suffixNode; // are stored in a vector for super-fast lookup. If the alphabet were bigger, this +}; // might not be practical. Since we only have 5 possible letters, it makes sense + +//******************************************************************************************************************** + +#endif diff --git a/suffixtree.cpp b/suffixtree.cpp new file mode 100644 index 0000000..56ce6ed --- /dev/null +++ b/suffixtree.cpp @@ -0,0 +1,247 @@ +/* + * suffixtree.cpp + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This is my half-assed attempt to implement a suffix tree. This is a cobbled together algorithm using materials that + * I found at http://marknelson.us/1996/08/01/suffix-trees/ and: + * + * Ukkonen E. (1995). On-line construction of suffix trees. Algorithmica 14 (3): 249--260 + * Gusfield, Dan (1999). Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology. + * USA: Cambridge University Press + * + * The Ukkonen paper is the seminal paper describing the on-line method of constructing a suffix tree. + * + * I have chosen to store the nodes of the tree as a vector of pointers to SuffixNode objects. The root is stored at + * nodeVector[0]. Each tree also stores the sequence name and the string that corresponds to the actual sequence. + * Finally, this class provides a way of counting the number of suffixes that are needed in one tree to generate a new + * sequence (countSuffixes). This method is used to determine similarity between sequences and was inspired by the + * article and Perl source code provided at http://www.ddj.com/web-development/184416093. + * + */ + +#include "sequence.hpp" +#include "suffixnodes.hpp" +#include "suffixtree.hpp" + +using namespace std; + +//******************************************************************************************************************** + +inline bool compareParents(SuffixNode* left, SuffixNode* right){// this is necessary to print the tree and to sort the + return (left->getParentNode() < right->getParentNode()); // nodes in order of their parent +} + +//******************************************************************************************************************** + +SuffixTree::SuffixTree(){} + +//******************************************************************************************************************** + +SuffixTree::~SuffixTree(){ + for(int i=0;igetName(); + sequence = seq->convert2ints(); + sequence += '5'; // this essentially concatenates a '$' to the end of the sequence to + int seqLength = sequence.length(); // make it a cononical suffix tree + + nodeVector.push_back(new SuffixBranch(-1, 0, -1)); // enter the root of the suffix tree + + activeNode = root = 0; + string hold; + for(int i=0;i hold = nodeVector; + sort(hold.begin(), hold.end(), compareParents); + cout << "Address\t\tParent\tNode\tSuffix\tStartC\tEndC\tSuffix" << endl; + for(int i=1;i<=nodeCounter;i++){ + hold[i]->print(sequence, i); + } +} + +//******************************************************************************************************************** + +int SuffixTree::countSuffixes(string compareSequence, int& minValue){ // here we count the number of suffix parts + // we need to rewrite a user supplied sequence. if the + int numSuffixes = 0; // count exceeds the supplied minValue, bail out. The + int seqLength = compareSequence.length(); // time complexity should be O(L) + int position = 0; + + int presentNode = 0; + + while(position < seqLength){ // while the position in the query sequence isn't at the end... + + if(numSuffixes > minValue) { return 1000000; } // bail if the count gets too high + + int newNode = nodeVector[presentNode]->getChild(compareSequence[position]); // see if the current node has a + // child that matches the next character in the query + if(newNode == -1){ + if(presentNode == 0){ position++; } // if not, go back to the root and increase the count + numSuffixes++; // by one. + presentNode = 0; + } + else{ // if there is, move to that node and see how far down + presentNode = newNode; // it we can get + + for(int i=nodeVector[newNode]->getStartCharPos(); i<=nodeVector[newNode]->getEndCharPos(); i++){ + if(compareSequence[position] == sequence[i]){ + position++; // as long as the query and branch agree, keep going + } + else{ + numSuffixes++; // if there is a mismatch, increase the number of + presentNode = 0; // suffixes and go back to the root + break; + } + } + } + // if we get all the way through the node we'll go to the top of the while loop and find the child node + // that corresponds to what we are interested in + } + numSuffixes--; // the method puts an extra count on numSuffixes + + if(numSuffixes < minValue) { minValue = numSuffixes; } // if the count is less than the previous minValue, + return numSuffixes; // change the value and return the number of suffixes + +} + +//******************************************************************************************************************** + +void SuffixTree::canonize(){ // if you have to ask how this works, you don't really want to know and this really + // isn't the place to ask. + if ( isExplicit() == 0 ) { // if the node has no children... + + int tempNodeIndex = nodeVector[activeNode]->getChild(sequence[activeStartPosition]); + SuffixNode* tempNode = nodeVector[tempNodeIndex]; + + int span = tempNode->getEndCharPos() - tempNode->getStartCharPos(); + + while ( span <= ( activeEndPosition - activeStartPosition ) ) { + + activeStartPosition = activeStartPosition + span + 1; + + activeNode = tempNodeIndex; + + if ( activeStartPosition <= activeEndPosition ) { + tempNodeIndex = nodeVector[tempNodeIndex]->getChild(sequence[activeStartPosition]); + tempNode = nodeVector[tempNodeIndex]; + span = tempNode->getEndCharPos() - tempNode->getStartCharPos(); + } + + } + } +} + +//******************************************************************************************************************** + +int SuffixTree::split(int nodeIndex, int position){ // leaves stay leaves, etc, to split a leaf we make a new interior + // node and reconnect everything + SuffixNode* node = nodeVector[nodeIndex]; // get the node that needs to be split + SuffixNode* parentNode = nodeVector[node->getParentNode()]; // get it's parent node + + parentNode->eraseChild(sequence[node->getStartCharPos()]); // erase the present node from the registry of its parent + + nodeCounter++; + SuffixNode* newNode = new SuffixBranch(node->getParentNode(), node->getStartCharPos(), node->getStartCharPos() + activeEndPosition - activeStartPosition); // create a new node that will link the parent with the old child + parentNode->setChildren(sequence[newNode->getStartCharPos()], nodeCounter);// give the parent the new child + nodeVector.push_back(newNode); + + node->setParentNode(nodeCounter); // give the original node the new node as its parent + newNode->setChildren(sequence[node->getStartCharPos() + activeEndPosition - activeStartPosition + 1], nodeIndex); + // put the original node in the registry of the new node's children + newNode->setSuffixNode(activeNode);//link the new node with the old active node + + // recalculate the startCharPosition of the outermost node + node->setStartCharPos(node->getStartCharPos() + activeEndPosition - activeStartPosition + 1 ); + + return node->getParentNode(); +} + +//******************************************************************************************************************** + +void SuffixTree::makeSuffixLink(int& previous, int present){ + +// here we link the nodes that are suffixes of one another to rapidly speed through the tree + if ( previous > 0 ) { nodeVector[previous]->setSuffixNode(present); } + else { /* do nothing */ } + + previous = present; +} + +//******************************************************************************************************************** + +void SuffixTree::addPrefix(int prefixPosition){ + + int lastParentNode = -1; // we need to place a new prefix in the suffix tree + int parentNode = 0; + + while(1){ + + parentNode = activeNode; + + if(isExplicit() == 1){ // if the node is explicit (has kids), try to follow it down the branch if its there... + if(nodeVector[activeNode]->getChild(sequence[prefixPosition]) != -1){ // break out and get next prefix... + break; + } + else{ // ...otherwise continue, we'll need to make a new node later on... + } + } + else{ // if it's not explicit (no kids), read through and see if all of the chars agree... + int tempNode = nodeVector[activeNode]->getChild(sequence[activeStartPosition]); + int span = activeEndPosition - activeStartPosition; + + if(sequence[nodeVector[tempNode]->getStartCharPos() + span + 1] == sequence[prefixPosition] ){ + break; // if the existing suffix agrees with the new one, grab a new prefix... + } + else{ + parentNode = split(tempNode, prefixPosition); // ... otherwise we need to split the node + } + + } + + nodeCounter++; // we need to generate a new node here if the kid didn't exist, or we split a node + SuffixNode* newSuffixLeaf = new SuffixLeaf(parentNode, prefixPosition, sequence.length()-1); + nodeVector[parentNode]->setChildren(sequence[prefixPosition], nodeCounter); + nodeVector.push_back(newSuffixLeaf); + + makeSuffixLink( lastParentNode, parentNode ); // make a suffix link for the parent node + + if(nodeVector[activeNode]->getParentNode() == -1){ // move along the start position for the tree + activeStartPosition++; + } + else { + activeNode = nodeVector[activeNode]->getSuffixNode(); + } + canonize(); // frankly, i'm not entirely clear on what canonize does. + } + + makeSuffixLink( lastParentNode, parentNode ); + activeEndPosition++; // move along the end position for the tree + + canonize(); // frankly, i'm not entirely clear on what canonize does. + +} + +//******************************************************************************************************************** + diff --git a/suffixtree.hpp b/suffixtree.hpp new file mode 100644 index 0000000..387d4bd --- /dev/null +++ b/suffixtree.hpp @@ -0,0 +1,69 @@ +#ifndef SUFFIXTREE_H +#define SUFFIXTREE_H + +/* + * suffixtree.h + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This is my half-assed attempt to implement a suffix tree. This is a cobbled together algorithm using materials that + * I found at http://marknelson.us/1996/08/01/suffix-trees/ and: + * + * Ukkonen E. (1995). On-line construction of suffix trees. Algorithmica 14 (3): 249--260 + * Gusfield, Dan (1999). Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology. + * USA: Cambridge University Press + * + * The Ukkonen paper is the seminal paper describing the on-line method of constructing a suffix tree. + * + * I have chosen to store the nodes of the tree as a vector of pointers to SuffixNode objects. The root is stored at + * nodeVector[0]. Each tree also stores the sequence name and the string that corresponds to the actual sequence. + * Finally, this class provides a way of counting the number of suffixes that are needed in one tree to generate a new + * sequence (countSuffixes). This method is used to determine similarity between sequences and was inspired by the + * article and Perl source code provided at http://www.ddj.com/web-development/184416093. + * + */ + +#include "mothur.h" + +using namespace std; + +class SuffixNode; + +//******************************************************************************************************************** + +class SuffixTree { + +public: + SuffixTree(); + ~SuffixTree(); +// SuffixTree(string, string); + + void loadSequence(Sequence*); + string getSeqName(); + void print(); + int countSuffixes(string, int&); + +private: + void addPrefix(int); + void canonize(); + int split(int, int); + void makeSuffixLink(int&, int); + bool isExplicit(){ return activeStartPosition > activeEndPosition; } + + int activeStartPosition; + int activeEndPosition; + + vector nodeVector; + int root; + int activeNode; + int nodeCounter; + string seqName; + string sequence; + +}; + +//******************************************************************************************************************** + +#endif diff --git a/validcommands.cpp b/validcommands.cpp index 5549e63..a5f3651 100644 --- a/validcommands.cpp +++ b/validcommands.cpp @@ -43,6 +43,7 @@ ValidCommands::ValidCommands() { commands["concensus"] = "concensus"; commands["help"] = "help"; commands["filter.seqs"] = "filter.seqs"; + commands["align.seqs"] = "align.seqs"; commands["quit"] = "quit"; diff --git a/validparameter.cpp b/validparameter.cpp index 8af4a81..e79a2db 100644 --- a/validparameter.cpp +++ b/validparameter.cpp @@ -211,7 +211,6 @@ void ValidParameters::initCommandParameters() { //{"parameter1","parameter2",...,"last parameter"}; string readdistArray[] = {"phylip","column", "name","cutoff","precision", "group"}; - commandParameters["read.dist"] = addParameters(readdistArray, sizeof(readdistArray)/sizeof(string)); string readotuArray[] = {"list","order","shared", "line", "label","group","sabund", "rabund"}; @@ -292,6 +291,9 @@ void ValidParameters::initCommandParameters() { string distanceArray[] = {"fasta","phylip","clustal","nexus", "calc", "ends", "cutoff", "processors"}; commandParameters["dist.seqs"] = addParameters(distanceArray, sizeof(distanceArray)/sizeof(string)); + string AlignArray[] = {"fasta","phylip","clustal","nexus", "template", "ksize", "align", "match", "mismatch", "gapopen", "gapextend"}; + commandParameters["align.seqs"] = addParameters(AlignArray, sizeof(AlignArray)/sizeof(string)); + string quitArray[] = {}; commandParameters["quit"] = addParameters(quitArray, sizeof(quitArray)/sizeof(string)); -- 2.39.2