From: westcott Date: Mon, 18 May 2009 17:37:53 +0000 (+0000) Subject: added alignment code X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=526a868606faa50caf86e7399f7554c0335b39e5 added alignment code --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 7beb6cb..7bd433f 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -14,6 +14,20 @@ 372E12700F26365B0095CF7E /* readotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E126F0F26365B0095CF7E /* readotucommand.cpp */; }; 372E12960F263D5A0095CF7E /* readdistcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E12950F263D5A0095CF7E /* readdistcommand.cpp */; }; 372E12ED0F264D320095CF7E /* commandfactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E12EC0F264D320095CF7E /* commandfactory.cpp */; }; + 373C68A40FC1C07D00137ACD /* alignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68A00FC1C07D00137ACD /* alignment.cpp */; }; + 373C68A50FC1C07D00137ACD /* alignmentcell.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68A20FC1C07D00137ACD /* alignmentcell.cpp */; }; + 373C68BC0FC1C1A600137ACD /* gotohoverlap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */; }; + 373C68C50FC1C25F00137ACD /* overlap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68C30FC1C25F00137ACD /* overlap.cpp */; }; + 373C68CF0FC1C2D900137ACD /* needlemanoverlap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68CD0FC1C2D900137ACD /* needlemanoverlap.cpp */; }; + 373C68DD0FC1C38D00137ACD /* blastalign.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68DB0FC1C38D00137ACD /* blastalign.cpp */; }; + 373C68E50FC1C4A500137ACD /* noalign.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68E30FC1C4A500137ACD /* noalign.cpp */; }; + 373C68F90FC1C6A800137ACD /* suffixdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68F30FC1C6A800137ACD /* suffixdb.cpp */; }; + 373C68FA0FC1C6A800137ACD /* suffixnodes.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68F50FC1C6A800137ACD /* suffixnodes.cpp */; }; + 373C68FB0FC1C6A800137ACD /* suffixtree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68F70FC1C6A800137ACD /* suffixtree.cpp */; }; + 373C69180FC1C8AF00137ACD /* blastdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C69160FC1C8AF00137ACD /* blastdb.cpp */; }; + 373C691F0FC1C98600137ACD /* nast.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C691D0FC1C98600137ACD /* nast.cpp */; }; + 373C692B0FC1C9EB00137ACD /* nastreport.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C69290FC1C9EB00137ACD /* nastreport.cpp */; }; + 373C69340FC1CA9E00137ACD /* distancedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C69320FC1CA9E00137ACD /* distancedb.cpp */; }; 374610780F40645300460C57 /* unifracweightedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374610770F40645300460C57 /* unifracweightedcommand.cpp */; }; 3746107E0F4064D100460C57 /* weighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3746107D0F4064D100460C57 /* weighted.cpp */; }; 374610830F40652400460C57 /* unweighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374610820F40652400460C57 /* unweighted.cpp */; }; @@ -45,6 +59,7 @@ 378C1B0A0FB0644E004D63F5 /* sequencedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378C1AFD0FB0644D004D63F5 /* sequencedb.cpp */; }; 378C1B0B0FB0644E004D63F5 /* sharedjackknife.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378C1AFF0FB0644D004D63F5 /* sharedjackknife.cpp */; }; 378C1B0C0FB0644E004D63F5 /* sharedmarczewski.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378C1B010FB0644D004D63F5 /* sharedmarczewski.cpp */; }; + 378DC5CF0FBDE1C8003B8607 /* aligncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378DC5CE0FBDE1C8003B8607 /* aligncommand.cpp */; }; 379293C30F2DE73400B9034A /* treemap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379293C20F2DE73400B9034A /* treemap.cpp */; }; 379294700F2E191800B9034A /* parsimonycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3792946F0F2E191800B9034A /* parsimonycommand.cpp */; }; 3792948A0F2E258500B9034A /* parsimony.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379294890F2E258500B9034A /* parsimony.cpp */; }; @@ -168,6 +183,34 @@ 372E12940F263D5A0095CF7E /* readdistcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readdistcommand.h; sourceTree = ""; }; 372E12950F263D5A0095CF7E /* readdistcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readdistcommand.cpp; sourceTree = ""; }; 372E12EC0F264D320095CF7E /* commandfactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = commandfactory.cpp; sourceTree = ""; }; + 373C68A00FC1C07D00137ACD /* alignment.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignment.cpp; sourceTree = ""; }; + 373C68A10FC1C07D00137ACD /* alignment.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = alignment.hpp; sourceTree = ""; }; + 373C68A20FC1C07D00137ACD /* alignmentcell.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignmentcell.cpp; sourceTree = ""; }; + 373C68A30FC1C07D00137ACD /* alignmentcell.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = alignmentcell.hpp; sourceTree = ""; }; + 373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = gotohoverlap.cpp; sourceTree = ""; }; + 373C68BB0FC1C1A600137ACD /* gotohoverlap.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = gotohoverlap.hpp; sourceTree = ""; }; + 373C68C30FC1C25F00137ACD /* overlap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = overlap.cpp; sourceTree = ""; }; + 373C68C40FC1C25F00137ACD /* overlap.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = overlap.hpp; sourceTree = ""; }; + 373C68CD0FC1C2D900137ACD /* needlemanoverlap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = needlemanoverlap.cpp; sourceTree = ""; }; + 373C68CE0FC1C2D900137ACD /* needlemanoverlap.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = needlemanoverlap.hpp; sourceTree = ""; }; + 373C68DB0FC1C38D00137ACD /* blastalign.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = blastalign.cpp; sourceTree = ""; }; + 373C68DC0FC1C38D00137ACD /* blastalign.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = blastalign.hpp; sourceTree = ""; }; + 373C68E30FC1C4A500137ACD /* noalign.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = noalign.cpp; sourceTree = ""; }; + 373C68E40FC1C4A500137ACD /* noalign.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = noalign.hpp; sourceTree = ""; }; + 373C68F30FC1C6A800137ACD /* suffixdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixdb.cpp; sourceTree = ""; }; + 373C68F40FC1C6A800137ACD /* suffixdb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixdb.hpp; sourceTree = ""; }; + 373C68F50FC1C6A800137ACD /* suffixnodes.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixnodes.cpp; sourceTree = ""; }; + 373C68F60FC1C6A800137ACD /* suffixnodes.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixnodes.hpp; sourceTree = ""; }; + 373C68F70FC1C6A800137ACD /* suffixtree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixtree.cpp; sourceTree = ""; }; + 373C68F80FC1C6A800137ACD /* suffixtree.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixtree.hpp; sourceTree = ""; }; + 373C69160FC1C8AF00137ACD /* blastdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = blastdb.cpp; sourceTree = ""; }; + 373C69170FC1C8AF00137ACD /* blastdb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = blastdb.hpp; sourceTree = ""; }; + 373C691D0FC1C98600137ACD /* nast.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = nast.cpp; sourceTree = ""; }; + 373C691E0FC1C98600137ACD /* nast.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = nast.hpp; sourceTree = ""; }; + 373C69290FC1C9EB00137ACD /* nastreport.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = nastreport.cpp; sourceTree = ""; }; + 373C692A0FC1C9EB00137ACD /* nastreport.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = nastreport.hpp; sourceTree = ""; }; + 373C69320FC1CA9E00137ACD /* distancedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = distancedb.cpp; sourceTree = ""; }; + 373C69330FC1CA9E00137ACD /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = ""; }; 374610760F40645300460C57 /* unifracweightedcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = unifracweightedcommand.h; sourceTree = ""; }; 374610770F40645300460C57 /* unifracweightedcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = unifracweightedcommand.cpp; sourceTree = ""; }; 3746107C0F4064D100460C57 /* weighted.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = weighted.h; sourceTree = ""; }; @@ -233,6 +276,8 @@ 378C1B000FB0644D004D63F5 /* sharedjackknife.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedjackknife.h; sourceTree = ""; }; 378C1B010FB0644D004D63F5 /* sharedmarczewski.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedmarczewski.cpp; sourceTree = ""; }; 378C1B020FB0644D004D63F5 /* sharedmarczewski.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedmarczewski.h; sourceTree = ""; }; + 378DC5CD0FBDE1C8003B8607 /* aligncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligncommand.h; sourceTree = ""; }; + 378DC5CE0FBDE1C8003B8607 /* aligncommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligncommand.cpp; sourceTree = ""; }; 379293C10F2DE73400B9034A /* treemap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treemap.h; sourceTree = ""; }; 379293C20F2DE73400B9034A /* treemap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = treemap.cpp; sourceTree = ""; }; 3792946E0F2E191800B9034A /* parsimonycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsimonycommand.h; sourceTree = ""; }; @@ -465,6 +510,8 @@ isa = PBXGroup; children = ( 37D927BA0F21331F001D4494 /* averagelinkage.cpp */, + 373C68DC0FC1C38D00137ACD /* blastalign.hpp */, + 373C68DB0FC1C38D00137ACD /* blastalign.cpp */, 37D928A60F2133C0001D4494 /* calculators */, 37D927C20F21331F001D4494 /* cluster.hpp */, 37D927C10F21331F001D4494 /* cluster.cpp */, @@ -479,8 +526,6 @@ 37D927D10F21331F001D4494 /* commandoptionparser.hpp */, 37D927D00F21331F001D4494 /* commandoptionparser.cpp */, 37D927D20F21331F001D4494 /* completelinkage.cpp */, - 37D927D40F21331F001D4494 /* database.hpp */, - 37D927D30F21331F001D4494 /* database.cpp */, 7E4130F70F8E58FA00381DD0 /* dlibshuff.h */, 7E4130F60F8E58FA00381DD0 /* dlibshuff.cpp */, 37D927D60F21331F001D4494 /* display.h */, @@ -491,21 +536,29 @@ 37D927DD0F21331F001D4494 /* fileoutput.cpp */, 37D927E00F21331F001D4494 /* globaldata.hpp */, 37D927DF0F21331F001D4494 /* globaldata.cpp */, + 373C68BB0FC1C1A600137ACD /* gotohoverlap.hpp */, + 373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */, 375873ED0F7D646F0040F377 /* heatmap.h */, 375873EE0F7D646F0040F377 /* heatmap.cpp */, 37D927E60F21331F001D4494 /* inputdata.h */, 37D927E50F21331F001D4494 /* inputdata.cpp */, - 37D927EA0F21331F001D4494 /* kmer.hpp */, - 37D927E90F21331F001D4494 /* kmer.cpp */, - 37D927EC0F21331F001D4494 /* kmerdb.hpp */, - 37D927EB0F21331F001D4494 /* kmerdb.cpp */, 7E412F420F8D213C00381DD0 /* libshuff.h */, 7E412FE90F8D3E2C00381DD0 /* libshuff.cpp */, 37E3ED800F4D9D0D00B5DF02 /* mothur.h */, 37D927EF0F21331F001D4494 /* mothur.cpp */, 37D927F10F21331F001D4494 /* nameassignment.hpp */, 37D927F00F21331F001D4494 /* nameassignment.cpp */, + 373C691E0FC1C98600137ACD /* nast.hpp */, + 373C691D0FC1C98600137ACD /* nast.cpp */, + 373C692A0FC1C9EB00137ACD /* nastreport.hpp */, + 373C69290FC1C9EB00137ACD /* nastreport.cpp */, + 373C68CE0FC1C2D900137ACD /* needlemanoverlap.hpp */, + 373C68CD0FC1C2D900137ACD /* needlemanoverlap.cpp */, + 373C68E40FC1C4A500137ACD /* noalign.hpp */, + 373C68E30FC1C4A500137ACD /* noalign.cpp */, 37D927F60F21331F001D4494 /* observable.h */, + 373C68C40FC1C25F00137ACD /* overlap.hpp */, + 373C68C30FC1C25F00137ACD /* overlap.cpp */, 37D927FC0F21331F001D4494 /* progress.hpp */, 37D927FB0F21331F001D4494 /* progress.cpp */, 37D928040F21331F001D4494 /* raredisplay.h */, @@ -672,6 +725,8 @@ isa = PBXGroup; children = ( 37D927CD0F21331F001D4494 /* command.hpp */, + 378DC5CD0FBDE1C8003B8607 /* aligncommand.h */, + 378DC5CE0FBDE1C8003B8607 /* aligncommand.cpp */, 37C1D9710F86506E0059E3F0 /* binsequencecommand.h */, 37C1D9720F86506E0059E3F0 /* binsequencecommand.cpp */, 21DDC0190F97A8FE0060691C /* bootstrapsharedcommand.h */, @@ -743,13 +798,27 @@ 37D928AC0F213420001D4494 /* containers */ = { isa = PBXGroup; children = ( + 373C68A10FC1C07D00137ACD /* alignment.hpp */, + 373C68A00FC1C07D00137ACD /* alignment.cpp */, + 373C68A30FC1C07D00137ACD /* alignmentcell.hpp */, + 373C68A20FC1C07D00137ACD /* alignmentcell.cpp */, + 373C69170FC1C8AF00137ACD /* blastdb.hpp */, + 373C69160FC1C8AF00137ACD /* blastdb.cpp */, + 37D927D40F21331F001D4494 /* database.hpp */, + 37D927D30F21331F001D4494 /* database.cpp */, 37D927D50F21331F001D4494 /* datavector.hpp */, + 373C69330FC1CA9E00137ACD /* distancedb.hpp */, + 373C69320FC1CA9E00137ACD /* distancedb.cpp */, 37D927DC0F21331F001D4494 /* fastamap.h */, 37D927DB0F21331F001D4494 /* fastamap.cpp */, 375873EA0F7D64520040F377 /* fullmatrix.h */, 375873EB0F7D64520040F377 /* fullmatrix.cpp */, 37D927E20F21331F001D4494 /* groupmap.h */, 37D927E10F21331F001D4494 /* groupmap.cpp */, + 37D927EA0F21331F001D4494 /* kmer.hpp */, + 37D927E90F21331F001D4494 /* kmer.cpp */, + 37D927EC0F21331F001D4494 /* kmerdb.hpp */, + 37D927EB0F21331F001D4494 /* kmerdb.cpp */, 37D927EE0F21331F001D4494 /* listvector.hpp */, 37D927ED0F21331F001D4494 /* listvector.cpp */, 37D927F80F21331F001D4494 /* ordervector.hpp */, @@ -772,6 +841,12 @@ 37D928320F21331F001D4494 /* sharedsabundvector.cpp */, 37D928450F21331F001D4494 /* sparsematrix.hpp */, 37D928440F21331F001D4494 /* sparsematrix.cpp */, + 373C68F40FC1C6A800137ACD /* suffixdb.hpp */, + 373C68F30FC1C6A800137ACD /* suffixdb.cpp */, + 373C68F60FC1C6A800137ACD /* suffixnodes.hpp */, + 373C68F50FC1C6A800137ACD /* suffixnodes.cpp */, + 373C68F80FC1C6A800137ACD /* suffixtree.hpp */, + 373C68F70FC1C6A800137ACD /* suffixtree.cpp */, 37AD4DB90F28E2FE00AA2D49 /* tree.h */, 37AD4DBA0F28E2FE00AA2D49 /* tree.cpp */, 379293C10F2DE73400B9034A /* treemap.h */, @@ -979,6 +1054,21 @@ 378C1B0C0FB0644E004D63F5 /* sharedmarczewski.cpp in Sources */, 37C753CE0FB3415200DBD02E /* distancecommand.cpp in Sources */, 379643ED0FB9B5A80081FDB6 /* readseqs.cpp in Sources */, + 378DC5CF0FBDE1C8003B8607 /* aligncommand.cpp in Sources */, + 373C68A40FC1C07D00137ACD /* alignment.cpp in Sources */, + 373C68A50FC1C07D00137ACD /* alignmentcell.cpp in Sources */, + 373C68BC0FC1C1A600137ACD /* gotohoverlap.cpp in Sources */, + 373C68C50FC1C25F00137ACD /* overlap.cpp in Sources */, + 373C68CF0FC1C2D900137ACD /* needlemanoverlap.cpp in Sources */, + 373C68DD0FC1C38D00137ACD /* blastalign.cpp in Sources */, + 373C68E50FC1C4A500137ACD /* noalign.cpp in Sources */, + 373C68F90FC1C6A800137ACD /* suffixdb.cpp in Sources */, + 373C68FA0FC1C6A800137ACD /* suffixnodes.cpp in Sources */, + 373C68FB0FC1C6A800137ACD /* suffixtree.cpp in Sources */, + 373C69180FC1C8AF00137ACD /* blastdb.cpp in Sources */, + 373C691F0FC1C98600137ACD /* nast.cpp in Sources */, + 373C692B0FC1C9EB00137ACD /* nastreport.cpp in Sources */, + 373C69340FC1CA9E00137ACD /* distancedb.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/aligncommand.cpp b/aligncommand.cpp new file mode 100644 index 0000000..81f8e6a --- /dev/null +++ b/aligncommand.cpp @@ -0,0 +1,151 @@ +/* + * aligncommand.cpp + * Mothur + * + * Created by Sarah Westcott on 5/15/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + * This version of nast does everything I think that the greengenes nast server does and then some. I have added the + * feature of allowing users to define their database, kmer size for searching, alignment penalty values and alignment + * method. This latter feature is perhaps most significant. nastPlus enables a user to use either a Needleman-Wunsch + * (non-affine gap penalty) or Gotoh (affine gap penalty) pairwise alignment algorithm. This is significant because it + * allows for a global alignment and not the local alignment provided by bLAst. Furthermore, it has the potential to + * provide a better alignment because of the banding method employed by blast (I'm not sure about this). + * + * to compile type: + * make + * + * for basic instructions on how to run nastPlus type: + * ./nastPlus + */ + +#include "aligncommand.h" +#include "sequence.hpp" + +#include "alignment.hpp" +#include "gotohoverlap.hpp" +#include "needlemanoverlap.hpp" +#include "blastalign.hpp" +#include "noalign.hpp" + +#include "database.hpp" +#include "kmerdb.hpp" +#include "suffixdb.hpp" +#include "blastdb.hpp" +#include "distancedb.hpp" + +#include "nast.hpp" +#include "nastreport.hpp" + + +//********************************************************************************************************************** +AlignCommand::AlignCommand(){ + try { + globaldata = GlobalData::getInstance(); + candidateFileName = globaldata->inputFileName; + templateFileName = globaldata->getTemplateFile(); + openInputFile(candidateFileName, in); + convert(globaldata->getKSize(), kmerSize); + convert(globaldata->getMatch(), match); + convert(globaldata->getMismatch(), misMatch); + convert(globaldata->getGapopen(), gapOpen); + convert(globaldata->getGapextend(), gapExtend); + distanceFileName = "????"; + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the AlignCommand class Function AlignCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the AlignCommand class function AlignCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} + +//********************************************************************************************************************** + +AlignCommand::~AlignCommand(){ + +} + +//********************************************************************************************************************** + +int AlignCommand::execute(){ + try { + srand( (unsigned)time( NULL ) ); //needed to assign names to temporary files + + Database* templateDB; + if(globaldata->getSearch() == "kmer") { templateDB = new KmerDB(templateFileName, kmerSize); } + else if(globaldata->getSearch() == "suffix") { templateDB = new SuffixDB(templateFileName); } + else if(globaldata->getSearch() == "blast") { templateDB = new BlastDB(templateFileName, gapOpen, gapExtend, match, misMatch); } + else if(globaldata->getSearch() == "distance") { templateDB = new DistanceDB(templateFileName, distanceFileName); } + else { cout << globaldata->getSearch() << " is not a valid search option. I will run the command using suffix." << endl; + templateDB = new SuffixDB(templateFileName); } + + Alignment* alignment; + if(globaldata->getAlign() == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, 3000); } + else if(globaldata->getAlign() == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000); } + else if(globaldata->getAlign() == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } + else if(globaldata->getAlign() == "noalign") { alignment = new NoAlign(); } + else { cout << globaldata->getAlign() << " is not a valid alignment option. I will run the command using blast." << endl; + alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); } + + int numFastaSeqs=count(istreambuf_iterator(in),istreambuf_iterator(), '>'); + in.seekg(0); + + string candidateAligngmentFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + globaldata->getSearch() + '.' + globaldata->getAlign() + ".nast.align"; + ofstream candidateAlignmentFile; + openOutputFile(candidateAligngmentFName, candidateAlignmentFile); + + string candidateReportFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + globaldata->getSearch() + '.' + globaldata->getAlign() + ".nast.report"; + NastReport report(candidateReportFName); + + cout << "We are going to align the " << numFastaSeqs << " sequences in " << candidateFileName << "..." << endl; + cout.flush(); + + int start = time(NULL); + + for(int i=0;ifindClosestSequence(candidateSeq); + report.setTemplate(templateSeq); + report.setSearchParameters(globaldata->getSearch(), templateDB->getSearchScore()); + + Nast nast(alignment, candidateSeq, templateSeq); + report.setAlignmentParameters(globaldata->getAlign(), alignment); + report.setNastParameters(nast); + + candidateAlignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl; + candidateAlignmentFile.flush(); + + if((i+1) % 100 == 0){ + cout << "It has taken " << time(NULL) - start << " secs to align " << i+1 << " sequences" << endl; + } + report.print(); + + delete candidateSeq; + } + + cout << "It took " << time(NULL) - start << " secs to align " << numFastaSeqs << " sequences" << endl; + cout << endl; + + delete templateDB; + delete alignment; + + + return 0; + } + catch(exception& e) { + cout << "Standard Error: " << e.what() << " has occurred in the AlignCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } + catch(...) { + cout << "An unknown error has occurred in the AlignCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; + exit(1); + } +} + +//********************************************************************************************************************** \ No newline at end of file diff --git a/aligncommand.h b/aligncommand.h new file mode 100644 index 0000000..cf30a6b --- /dev/null +++ b/aligncommand.h @@ -0,0 +1,35 @@ +#ifndef ALIGNCOMMAND_H +#define ALIGNCOMMAND_H + +/* + * aligncommand.h + * Mothur + * + * Created by Sarah Westcott on 5/15/09. + * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved. + * + */ + +#include "command.hpp" +#include "globaldata.hpp" + +class AlignCommand : public Command { + + public: + AlignCommand(); + ~AlignCommand(); + int execute(); + + private: + GlobalData* globaldata; + string candidateFileName, templateFileName, distanceFileName; + int kmerSize; + float match, misMatch, gapOpen, gapExtend; + ofstream out; + ifstream in; + +}; + + + +#endif \ No newline at end of file diff --git a/alignment.cpp b/alignment.cpp new file mode 100644 index 0000000..6db3b3d --- /dev/null +++ b/alignment.cpp @@ -0,0 +1,163 @@ +/* + * alignment.cpp + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This is a class for an abstract datatype for classes that implement various types of alignment algorithms. + * As of 12/18/08 these included alignments based on blastn, needleman-wunsch, and the Gotoh algorithms + * + */ + +using namespace std; + +#include +#include + +#include "alignmentcell.hpp" +#include "alignment.hpp" + +#include + +/**************************************************************************************************/ + +Alignment::Alignment() { /* do nothing */ } + +/**************************************************************************************************/ + +Alignment::Alignment(int A) : nCols(A), nRows(A) { + + alignment.resize(nRows); // For the Gotoh and Needleman-Wunsch we initialize the dynamic programming + for(int i=0;i=0;i--){ + if(seqAaln[i] != '-' && seqBaln[i] == '-') { seqAend++; } + else if(seqAaln[i] == '-' && seqBaln[i] != '-') { seqBend++; } + else { break; } + } + pairwiseLength -= (seqAend + seqBend); + + seqAend = seqA.length() - seqAend - 1; + seqBend = seqB.length() - seqBend - 1; + +} + +/**************************************************************************************************/ + +string Alignment::getSeqAAln(){ + return seqAaln; // this is called to get the alignment of seqA +} + +/**************************************************************************************************/ + +string Alignment::getSeqBAln(){ + return seqBaln; // this is called to get the alignment of seqB +} + +/**************************************************************************************************/ + +int Alignment::getCandidateStartPos(){ + return seqAstart; // this is called to report the quality of the alignment +} + +/**************************************************************************************************/ + +int Alignment::getCandidateEndPos(){ + return seqAend; // this is called to report the quality of the alignment +} + +/**************************************************************************************************/ + +int Alignment::getTemplateStartPos(){ + return seqBstart; // this is called to report the quality of the alignment +} + +/**************************************************************************************************/ + +int Alignment::getTemplateEndPos(){ + return seqBend; // this is called to report the quality of the alignment +} + +/**************************************************************************************************/ + +int Alignment::getPairwiseLength(){ + return pairwiseLength; // this is the pairwise alignment length +} + +/**************************************************************************************************/ + +//int Alignment::getLongestTemplateGap(){ +// +// int length = seqBaln.length(); +// int longGap = 0; +// int gapLength = 0; +// +// int start = seqAstart; +// if(seqAstart < seqBstart){ start = seqBstart; } +// for(int i=seqAstart;i 0){ +// if(gapLength > longGap){ longGap = gapLength; } +// } +// gapLength = 0; +// } +// } +// return longGap; +//} + +/**************************************************************************************************/ diff --git a/alignment.hpp b/alignment.hpp new file mode 100644 index 0000000..3367812 --- /dev/null +++ b/alignment.hpp @@ -0,0 +1,53 @@ +#ifndef DPALIGNMENT_H +#define DPALIGNMENT_H + +/* + * dpalignment.h + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This is a class for an abstract datatype for classes that implement various types of alignment algorithms. + * As of 12/18/08 these included alignments based on blastn, needleman-wunsch, and the Gotoh algorithms + * + */ +using namespace std; + +#include "mothur.h" +#include "alignmentcell.hpp" + +/**************************************************************************************************/ + +class Alignment { + +public: + Alignment(int); + Alignment(); + virtual void align(string, string) = 0; + + float getAlignmentScore(); + string getSeqAAln(); + string getSeqBAln(); + int getCandidateStartPos(); + int getCandidateEndPos(); + int getTemplateStartPos(); + int getTemplateEndPos(); + + int getPairwiseLength(); +// int getLongestTemplateGap(); + +protected: + void traceBack(); + string seqA, seqAaln; + string seqB, seqBaln; + int seqAstart, seqAend; + int seqBstart, seqBend; + int pairwiseLength; + int nRows, nCols, lA, lB; + vector > alignment; +}; + +/**************************************************************************************************/ + +#endif diff --git a/alignmentcell.cpp b/alignmentcell.cpp new file mode 100644 index 0000000..544c218 --- /dev/null +++ b/alignmentcell.cpp @@ -0,0 +1,18 @@ +/* + * alignmentcell.cpp + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This class is pretty basic. Each AlignmentCell object contains a pointer to the previous cell and different values + * used to calcualte the alignment. Initially everything is set to zero and all pointers are set to 'x' + * + */ + +#include "alignmentcell.hpp" + +//******************************************************************************************************************** + +AlignmentCell::AlignmentCell() : prevCell('x'), cValue(0.0000), dValue(0.0000), iValue(0.0000) {}; + +//******************************************************************************************************************** diff --git a/alignmentcell.hpp b/alignmentcell.hpp new file mode 100644 index 0000000..c309048 --- /dev/null +++ b/alignmentcell.hpp @@ -0,0 +1,30 @@ +#ifndef ALIGNMENTCELL_H +#define ALIGNMENTCELL_H + +/* + * alignmentcell.hpp + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This class is pretty basic. Each AlignmentCell object contains a pointer to the previous cell and different values + * used to calcualte the alignment. Initially everything is set to zero and all pointers are set to 'x' + * + */ + +//******************************************************************************************************************** + +class AlignmentCell { + +public: + AlignmentCell(); + char prevCell; + float cValue; + float dValue; + float iValue; +}; + +//******************************************************************************************************************** + +#endif diff --git a/blastalign.cpp b/blastalign.cpp new file mode 100644 index 0000000..0a1350f --- /dev/null +++ b/blastalign.cpp @@ -0,0 +1,156 @@ +/* + * blastalign.cpp + * + * + * Created by Pat Schloss on 12/16/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This is a basic alignment method that gets the blast program to do the heavy lifting. In the future, we should + * probably incorporate NCBI's library so that we don't have to call on a user-supplied executable. This is a child + * of the Alignment class, which requires a constructor and align method. + * + */ + +using namespace std; + +#include "alignment.hpp" +#include "blastalign.hpp" + + +//**************************************************************************************************/ + +BlastAlignment::BlastAlignment(float go, float ge, float m, float mm) : + match(m), // This is the score to award for two nucleotides matching (match >= 0) + mismatch(mm) // This is the penalty to assess for a mismatch (mismatch <= 0) +{ + gapOpen = abs(go); // This is the penalty to assess for opening a gap (gapOpen >= 0) + gapExtend = abs(ge); // This is the penalty to assess for extending a gap (gapExtend >= 0) + + int randNumber = rand(); + candidateFileName = toString(randNumber) + ".candidate"; + templateFileName = toString(randNumber) + ".template"; + blastFileName = toString(randNumber) + ".pairwise"; +} + +//**************************************************************************************************/ + +BlastAlignment::~BlastAlignment(){ // The desctructor should clean up by removing the temporary + remove(candidateFileName.c_str()); // files used to run bl2seq + remove(templateFileName.c_str()); + remove(blastFileName.c_str()); +} + +//**************************************************************************************************/ + +void BlastAlignment::align(string seqA, string seqB){ //Use blastn to align the two sequences + + ofstream candidateFile(candidateFileName.c_str()); // Write the sequence to be aligned to a temporary candidate seq file + candidateFile << ">candidate" << endl << seqA << endl; + candidateFile.close(); + + ofstream templateFile(templateFileName.c_str()); // Write the unaligned template sequence to a temporary candidate seq file + templateFile << ">template" << endl << seqB << endl; + templateFile.close(); + + // The blastCommand assumes that we have DNA sequences (blastn) and that they are fairly similar (-e 0.001) and + // that we don't want to apply any kind of complexity filtering (-F F) + string blastCommand = "~/Pipeline/src/cpp/production/blast/bin/bl2seq -p blastn -i " + candidateFileName + " -j " + templateFileName + " -e 0.0001 -F F -o " + blastFileName + " -W 11"; + blastCommand += " -r " + toString(match) + " -q " + toString(mismatch); + blastCommand += " -G " + toString(gapOpen) + " -E " + toString(gapExtend); + + system(blastCommand.c_str()); // Here we assume that "bl2seq" is in the users path or in the same folder as + // this executable + setPairwiseSeqs(); +} + +/**************************************************************************************************/ + +void BlastAlignment::setPairwiseSeqs(){ // This method call assigns the blast generated alignment + // to the pairwise entry in the Sequence class for the + // candidate and template Sequence objects + ifstream blastFile; + openInputFile(blastFileName, blastFile); + + seqAaln = ""; + seqBaln = ""; + + int candidateLength, templateLength; + char d; + + string candidateName, templateName; + + while(d=blastFile.get() != '='){}; + blastFile >> candidateName; // Get the candidate sequence name from flatfile + + while(d=blastFile.get() != '('){}; + blastFile >> candidateLength; // Get the candidate sequence length from flatfile + + while(d=blastFile.get()){ + if(d == '>'){ + blastFile >> templateName; // Get the template sequence name from flatfile + break; + } + else if(d == '*'){ // We go here if there is no significant match + + seqAstart = 0; + seqBstart = 0; + seqAend = 0; + seqBend = 0; + pairwiseLength = 0; + +// string dummy; +// while(dummy != "query:"){ cout << dummy << endl;blastFile >> dummy; } +// blastFile >> seqBend; +// cout << seqBend << endl; +// for(int i=0;i> templateLength; // Get the template sequence length from flatfile + + while(d=blastFile.get() != 'Q'){}; // Suck up everything else until we get to the start of the alignment + int queryStart, sbjctStart, queryEnd, sbjctEnd; + string queryLabel, sbjctLabel, query, sbjct; + + blastFile >> queryLabel; queryLabel = 'Q' + queryLabel; + + + while(queryLabel == "Query:"){ + blastFile >> queryStart >> query >> queryEnd; + + while(d=blastFile.get() != 'S'){}; + + blastFile >> sbjctLabel >> sbjctStart >> sbjct >> sbjctEnd; + + if(seqAaln == ""){ + seqAstart = queryStart; + seqBstart = sbjctStart; + } + + seqAaln += query; // concatenate each line of the sequence to what we already have + seqBaln += sbjct; // for the query and template (subject) sequence + + blastFile >> queryLabel; + } + seqAend = queryEnd; + seqBend = sbjctEnd; + pairwiseLength = seqAaln.length(); + + for(int i=1;i' << i << endl; // sequences, which will be input to formatdb + unalignedFastaFile << templateSequences[i]->getUnaligned() << endl; + } + unalignedFastaFile.close(); + + string formatdbCommand = "~/Pipeline/src/cpp/production/blast/bin/formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability + system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F + // option tells formatdb that seqs are DNA, not prot + cout << "DONE." << endl << endl; cout.flush(); + emptySequence = new Sequence(); + emptySequence->setName("no_match"); + emptySequence->setUnaligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + emptySequence->setAligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); + + +} + +/**************************************************************************************************/ + +BlastDB::~BlastDB(){ + remove(queryFileName.c_str()); // let's clean stuff up and remove the temp files + remove(dbFileName.c_str()); // let's clean stuff up and remove the temp files + remove(blastFileName.c_str()); // let's clean stuff up and remove the temp files +} + +/**************************************************************************************************/ + +Sequence* BlastDB::findClosestSequence(Sequence* candidate){ + + ofstream queryFile; + openOutputFile(queryFileName, queryFile); + queryFile << '>' << candidate->getName() << endl; + queryFile << candidate->getUnaligned() << endl; + queryFile.close(); + + +// the goal here is to quickly survey the database to find the closest match. To do this we are using the default +// wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too +// long. With this setting, it seems comparable in speed to the suffix tree approach. + + string blastCommand = "~/Pipeline/src/cpp/production/blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28"; + blastCommand += (" -i " + queryFileName + " -o " + blastFileName); + system(blastCommand.c_str()); + + ifstream m8FileHandle; + openInputFile(blastFileName, m8FileHandle); + + string dummy; + int templateAccession; + gobble(m8FileHandle); + if(!m8FileHandle.eof()){ + m8FileHandle >> dummy >> templateAccession >> searchScore; + m8FileHandle.close(); + return templateSequences[templateAccession]; + } + else{ + searchScore = 0.00; + return emptySequence; + } +} + +/**************************************************************************************************/ diff --git a/blastdb.hpp b/blastdb.hpp new file mode 100644 index 0000000..a952897 --- /dev/null +++ b/blastdb.hpp @@ -0,0 +1,34 @@ +#ifndef BLASTDB_HPP +#define BLASTDB_HPP + + +/* + * blastdb.hpp + * + * + * Created by Pat Schloss on 12/22/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + */ + +#include "mothur.h" + +class BlastDB : public Database { +public: + BlastDB(string, float, float, float, float); + ~BlastDB(); + Sequence* findClosestSequence(Sequence*); + +private: + string dbFileName; + string queryFileName; + string blastFileName; + + float gapOpen; + float gapExtend; + float match; + float misMatch; + Sequence* emptySequence; +}; + +#endif diff --git a/commandfactory.cpp b/commandfactory.cpp index 2e8aca7..d41e4a8 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -41,6 +41,7 @@ #include "bootstrapsharedcommand.h" #include "concensuscommand.h" #include "distancecommand.h" +#include "aligncommand.h" /***********************************************************/ @@ -93,6 +94,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "bootstrap.shared") { command = new BootSharedCommand(); } else if(commandName == "concensus") { command = new ConcensusCommand(); } else if(commandName == "dist.seqs") { command = new DistanceCommand(); } + else if(commandName == "align.seqs") { command = new AlignCommand(); } else { command = new NoCommand(); } return command; diff --git a/database.cpp b/database.cpp index 933bed4..8c080db 100644 --- a/database.cpp +++ b/database.cpp @@ -15,24 +15,23 @@ using namespace std; /**************************************************************************************************/ -Database::Database(string fastaFileName){ - - ifstream fastaFile(fastaFileName.c_str()); - if(!fastaFile) { - cerr << "Error: Could not open " << fastaFileName << endl; - exit(1); - } +Database::Database(string fastaFileName){ // This assumes that the template database is in fasta format, may + // need to alter this in the future? + + ifstream fastaFile; + openInputFile(fastaFileName, fastaFile); + cout << endl << "Reading in the " << fastaFileName << " template sequences...\t"; cout.flush(); - numSeqs=count(istreambuf_iterator(fastaFile),istreambuf_iterator(), '>'); - fastaFile.seekg(0); + numSeqs=count(istreambuf_iterator(fastaFile),istreambuf_iterator(), '>'); // count the number of + fastaFile.seekg(0); // sequences templateSequences.resize(numSeqs); string seqName, sequence; for(int i=0;i> seqName; templateSequences[i]->setName(seqName); @@ -42,12 +41,13 @@ Database::Database(string fastaFileName){ while(fastaFile && (letter=fastaFile.get()) != '>'){ if(isprint(letter)){ letter = toupper(letter); + if(letter == 'U'){letter = 'T';} aligned += letter; } } - templateSequences[i]->setAligned(aligned); - templateSequences[i]->setUnaligned(aligned); - fastaFile.putback(letter); + templateSequences[i]->setAligned(aligned); // Obviously, we need the fully aligned template sequence + templateSequences[i]->setUnaligned(aligned);// We will also need the unaligned sequence for pairwise alignments + fastaFile.putback(letter); // and database searches } fastaFile.close(); @@ -56,3 +56,7 @@ Database::Database(string fastaFileName){ } /**************************************************************************************************/ + +float Database::getSearchScore() { return searchScore; } // we're assuming that the search is already done + +/**************************************************************************************************/ diff --git a/database.hpp b/database.hpp index e80bfb5..09fea85 100644 --- a/database.hpp +++ b/database.hpp @@ -12,14 +12,17 @@ #include "mothur.h" +class Sequence; + class Database { public: Database(string); virtual Sequence* findClosestSequence(Sequence*) = 0; - + virtual float getSearchScore(); protected: int numSeqs; - vector templateSequences; + float searchScore; + vector templateSequences; }; #endif diff --git a/distancecommand.cpp b/distancecommand.cpp index e5878b3..7d1f6a0 100644 --- a/distancecommand.cpp +++ b/distancecommand.cpp @@ -88,11 +88,12 @@ int DistanceCommand::execute(){ remove(distFile.c_str()); - // # if defined (WIN_VERSION) + //# if defined (_WIN32) + //figure out how to implement the fork and wait commands in windows // driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff); - // # endif + //# endif - # if defined (__APPLE__) || (__MACH__) + # if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) if(processors == 1){ driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff); } @@ -168,6 +169,8 @@ int DistanceCommand::execute(){ wait(NULL); } wait(NULL); + # else + driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff); # endif delete distCalculator; diff --git a/distancedb.cpp b/distancedb.cpp new file mode 100644 index 0000000..ce74854 --- /dev/null +++ b/distancedb.cpp @@ -0,0 +1,65 @@ +/* + * distancedb.cpp + * + * + * Created by Pat Schloss on 12/29/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + */ + + +using namespace std; + + +#include "database.hpp" +#include "sequence.hpp" +#include "distancedb.hpp" + + +/**************************************************************************************************/ + +DistanceDB::DistanceDB(string fastaFileName, string distanceFileName) : Database(fastaFileName) { + + ifstream inputData; + openInputFile(distanceFileName, inputData); + int numCandSeqs=count(istreambuf_iterator(inputData),istreambuf_iterator(), '\n'); // count the number of + inputData.seekg(0); // sequences + + hit closestMatch; + string candidateSeqName; + string junk; + + mostSimSequenceVector.resize(numCandSeqs); + + for(int i=0;i> candidateSeqName >> closestMatch.seqName >> closestMatch.indexNumber >> closestMatch.simScore; +// getline(inputData, junk); + mostSimSequenceVector[i] = closestMatch; + } + cout << numCandSeqs << endl; + searchIndex = 0; + inputData.close(); +} + +/**************************************************************************************************/ + +Sequence* DistanceDB::findClosestSequence(Sequence* candidateSeq){ + + hit simAccession = mostSimSequenceVector[searchIndex]; +// string candidateSeqName, closestMatchSeqName, junk; +// int closestMatchIndexNumber; +// float closestMatchSimScore; +// +// inputData >> candidateSeqName >> closestMatchSeqName >> closestMatchIndexNumber >> closestMatchSimScore; +// getline(inputData, junk); + +// searchScore = 100. * closestMatchSimScore; + + searchScore = 100. * simAccession.simScore; + searchIndex++; +// return templateSequences[closestMatchIndexNumber]; + return templateSequences[simAccession.indexNumber]; + +} + +/**************************************************************************************************/ diff --git a/distancedb.hpp b/distancedb.hpp new file mode 100644 index 0000000..dc4c9a6 --- /dev/null +++ b/distancedb.hpp @@ -0,0 +1,36 @@ +#ifndef DISTANCEDB_HPP +#define DISTANCEDB_HPP + +/* + * distancedb.hpp + * + * + * Created by Pat Schloss on 12/29/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + */ + + +using namespace std; + +#include "mothur.h" + +class DistanceDB : public Database { + +public: + DistanceDB(string, string); + Sequence* findClosestSequence(Sequence*); + +private: + + struct hit{ + string seqName; + int indexNumber; + float simScore; + }; + vector mostSimSequenceVector; +// ifstream inputData; + int searchIndex; +}; + +#endif diff --git a/errorchecking.cpp b/errorchecking.cpp index 8aaa8b0..dfe9607 100644 --- a/errorchecking.cpp +++ b/errorchecking.cpp @@ -118,7 +118,14 @@ bool ErrorCheck::checkInput(string input) { if (parameter == "scale" ) { scale = value; } if (parameter == "ends" ) { ends = value; } if (parameter == "processors" ) { processors = value; } - + if (parameter == "template") { templatefile = value; } + if (parameter == "search") { search = value; } + if (parameter == "ksize") { ksize = value; } + if (parameter == "align") { align = value; } + if (parameter == "match") { match = value; } + if (parameter == "mismatch") { mismatch = value; } + if (parameter == "gapopen") { gapopen = value; } + if (parameter == "gapextend" ) { gapextend = value; } } //gets the last parameter and value @@ -160,6 +167,14 @@ bool ErrorCheck::checkInput(string input) { if (parameter == "scale" ) { scale = value; } if (parameter == "ends" ) { ends = value; } if (parameter == "processors" ) { processors = value; } + if (parameter == "template") { templatefile = value; } + if (parameter == "search") { search = value; } + if (parameter == "ksize") { ksize = value; } + if (parameter == "align") { align = value; } + if (parameter == "match") { match = value; } + if (parameter == "mismatch") { mismatch = value; } + if (parameter == "gapopen") { gapopen = value; } + if (parameter == "gapextend" ) { gapextend = value; } } } @@ -246,9 +261,11 @@ bool ErrorCheck::checkInput(string input) { } } - if ((commandName == "filter.seqs") || (commandName == "dist.seqs")) { + if ((commandName == "filter.seqs") || (commandName == "dist.seqs") || (commandName == "align.seqs")) { if ((fastafile == "") && (nexusfile == "") && (clustalfile == "") && (phylipfile == "")) { - cout << "You must read either a fasta, nexus, clustal, or phylip file before you can use the filter.seqs command." << endl; return false; + cout << "You must enter either a fasta, nexus, clustal, or phylip file before you can use the filter.seqs, dist.seqs or align.seqs command." << endl; return false; + }else if ((commandName == "align.seqs") && (templatefile == "")) { + cout << "You must enter a template to use the align.seqs command." << endl; return false; } validateSeqsFiles(); } @@ -558,9 +575,15 @@ void ErrorCheck::validateSeqsFiles() { errorFree = false; } } - + }else if (templatefile != "") { + ableToOpen = openInputFile(templatefile, filehandle); + filehandle.close(); + if (ableToOpen == 1) { //unable to open + errorFree = false; + } } + } catch(exception& e) { cout << "Standard Error: " << e.what() << " has occurred in the ErrorCheck class Function validateSeqsFiles. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n"; @@ -642,6 +665,7 @@ void ErrorCheck::clear() { fastafile = ""; nexusfile = ""; clustalfile = ""; + templatefile = ""; line = ""; label = ""; method = "furthest"; diff --git a/errorchecking.h b/errorchecking.h index 8de9234..d6388f7 100644 --- a/errorchecking.h +++ b/errorchecking.h @@ -36,7 +36,7 @@ class ErrorCheck { void refresh(); string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, fastafile, nexusfile, clustalfile, treefile, sharedfile, cutoff, format; string precision, method, fileroot, label, line, iters, jumble, freq, single, rarefaction, shared, summary, randomtree, abund, sorted, trump, soft, filter, scale, ends, processors; - + string templatefile, search, ksize, align, match, mismatch, gapopen, gapextend; string commandName, optionText; bool errorFree; diff --git a/globaldata.cpp b/globaldata.cpp index 4dfcbc1..b4239f3 100644 --- a/globaldata.cpp +++ b/globaldata.cpp @@ -85,10 +85,14 @@ void GlobalData::parseGlobalData(string commandString, string optionText){ if (key == "scale") { scale = value; } if (key == "ends" ) { ends = value; } if (key == "processors" ) { processors = value; } - - - - + if (key == "template") { templatefile = value; } + if (key == "search") { search = value; } + if (key == "ksize") { ksize = value; } + if (key == "align") { align = value; } + if (key == "match") { match = value; } + if (key == "mismatch") { mismatch = value; } + if (key == "gapopen") { gapopen = value; } + if (key == "gapextend" ) { gapextend = value; } if (key == "line") {//stores lines to be used in a set lines.clear(); @@ -151,7 +155,14 @@ void GlobalData::parseGlobalData(string commandString, string optionText){ if (key == "scale") { scale = value; } if (key == "ends" ) { ends = value; } if (key == "processors" ) { processors = value; } - + if (key == "template") { templatefile = value; } + if (key == "search") { search = value; } + if (key == "ksize") { ksize = value; } + if (key == "align") { align = value; } + if (key == "match") { match = value; } + if (key == "mismatch") { mismatch = value; } + if (key == "gapopen") { gapopen = value; } + if (key == "gapextend" ) { gapextend = value; } if (key == "line") {//stores lines to be used in a vector lines.clear(); @@ -289,6 +300,14 @@ string GlobalData::getFilter() { return filter; } string GlobalData::getScale() { return scale; } string GlobalData::getEnds() { return ends; } string GlobalData::getProcessors() { return processors; } +string GlobalData::getTemplateFile() { return templatefile;} +string GlobalData::getSearch() { return search; } +string GlobalData::getKSize() { return ksize; } +string GlobalData::getAlign() { return align; } +string GlobalData::getMatch() { return match; } +string GlobalData::getMismatch() { return mismatch; } +string GlobalData::getGapopen() { return gapopen; } +string GlobalData::getGapextend() { return gapextend; } void GlobalData::setListFile(string file) { listfile = file; inputFileName = file;} void GlobalData::setGroupFile(string file) { groupfile = file; } @@ -334,6 +353,7 @@ void GlobalData::clear() { clustalfile = ""; treefile = ""; sharedfile = ""; + templatefile = ""; cutoff = "10.00"; format = ""; precision = "100"; @@ -357,6 +377,14 @@ void GlobalData::clear() { scale = "log10"; ends = "T"; //yes processors = "1"; + search = "suffix"; + ksize = "7"; + align = "blast"; + match = "1.0"; + mismatch = "-1.0"; + gapopen = "-5.0"; + gapextend = "-2.0"; + } @@ -379,6 +407,13 @@ void GlobalData::reset() { form = "integral"; ends = "T"; processors = "1"; + search = "suffix"; + ksize = "7"; + align = "blast"; + match = "1.0"; + mismatch = "-1.0"; + gapopen = "-5.0"; + gapextend = "-2.0"; } /*******************************************************/ diff --git a/globaldata.hpp b/globaldata.hpp index 3a469f0..cb9aff6 100644 --- a/globaldata.hpp +++ b/globaldata.hpp @@ -77,7 +77,14 @@ public: string getSorted(); string getEnds(); string getProcessors(); - + string getTemplateFile(); + string getSearch(); + string getKSize(); + string getAlign(); + string getMatch(); + string getMismatch(); + string getGapopen(); + string getGapextend(); string getTrump(); string getSoft(); string getFilter(); @@ -116,7 +123,8 @@ public: private: string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, fastafile, nexusfile, clustalfile, treefile, sharedfile, line, label, randomtree, groups; - string cutoff, format, precision, method, fileroot, iters, jumble, freq, calc, abund, step, form, sorted, trump, soft, filter, scale, ends, processors; + string cutoff, format, precision, method, fileroot, iters, jumble, freq, calc, abund, step, form, sorted, trump, soft, filter, scale, ends, processors, templatefile, search, ksize, align, match; + string mismatch, gapopen, gapextend; static GlobalData* _uniqueInstance; diff --git a/gotohoverlap.cpp b/gotohoverlap.cpp new file mode 100644 index 0000000..57d884b --- /dev/null +++ b/gotohoverlap.cpp @@ -0,0 +1,90 @@ +/* + * gotohoverlap.cpp + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This class is an Alignment child class that implements the Gotoh pairwise alignment algorithm as described in: + * + * Gotoh O. 1982. An improved algorithm for matching biological sequences. J. Mol. Biol. 162:705-8. + * Myers, EW & Miller, W. 1988. Optimal alignments in linear space. Comput Appl Biosci. 4:11-7. + * + * This method is nice because it allows for an affine gap penalty to be assessed, which is analogous to what is used + * in blast and is an alternative to Needleman-Wunsch, which only charges the same penalty for each gap position. + * Because this method typically has problems at the ends when two sequences do not full overlap, we employ a separate + * method to fix the ends (see Overlap class documentation) + * + */ + +using namespace std; + +#include "alignmentcell.hpp" +#include "overlap.hpp" +#include "alignment.hpp" +#include "gotohoverlap.hpp" + +/**************************************************************************************************/ + +GotohOverlap::GotohOverlap(float gO, float gE, float m, float mm, int r) : + gapOpen(gO), gapExtend(gE), match(m), mismatch(mm), Alignment(r) { + + for(int i=1;i alignment[i][j].dValue){ + if(alignment[i][j].iValue > diagonal){ + alignment[i][j].cValue = alignment[i][j].iValue; + alignment[i][j].prevCell = 'l'; + } + else{ + alignment[i][j].cValue = diagonal; + alignment[i][j].prevCell = 'd'; + } + } + else{ + if(alignment[i][j].dValue > diagonal){ + alignment[i][j].cValue = alignment[i][j].dValue; + alignment[i][j].prevCell = 'u'; + } + else{ + alignment[i][j].cValue = diagonal; + alignment[i][j].prevCell = 'd'; + } + } + + } + } + Overlap over; + over.setOverlap(alignment, lA, lB, 0); // Fix the gaps at the ends of the sequences + traceBack(); // Construct the alignment and set seqAaln and seqBaln +} + +/**************************************************************************************************/ diff --git a/gotohoverlap.hpp b/gotohoverlap.hpp new file mode 100644 index 0000000..bf3c636 --- /dev/null +++ b/gotohoverlap.hpp @@ -0,0 +1,43 @@ +#ifndef GOTOHOVERLAP_H +#define GOTOHOVERLAP_H + +/* + * gotohoverlap.h + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This class is an Alignment child class that implements the Gotoh pairwise alignment algorithm as described in: + * + * Gotoh O. 1982. An improved algorithm for matching biological sequences. J. Mol. Biol. 162:705-8. + * Myers, EW & Miller, W. 1988. Optimal alignments in linear space. Comput Appl Biosci. 4:11-7. + * + * This method is nice because it allows for an affine gap penalty to be assessed, which is analogous to what is used + * in blast and is an alternative to Needleman-Wunsch, which only charges the same penalty for each gap position. + * Because this method typically has problems at the ends when two sequences do not full overlap, we employ a separate + * method to fix the ends (see Overlap class documentation) + * + */ + +#include "mothur.h" +#include "alignment.hpp" + +/**************************************************************************************************/ + +class GotohOverlap : public Alignment { + +public: + GotohOverlap(float, float, float, float, int); + void align(string, string); + +private: + float gapOpen; + float gapExtend; + float match; + float mismatch; +}; + +/**************************************************************************************************/ + +#endif diff --git a/kmer.cpp b/kmer.cpp index 875cec5..dcb6475 100644 --- a/kmer.cpp +++ b/kmer.cpp @@ -14,28 +14,29 @@ using namespace std; /**************************************************************************************************/ -Kmer::Kmer(int size) : kmerSize(size) { - - int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 }; - maxKmer = power4s[kmerSize]+1;// (int)pow(4.,k)+1; +Kmer::Kmer(int size) : kmerSize(size) { // The constructor sets the size of the kmer + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; + // No reason to waste the time of recalculating + maxKmer = power4s[kmerSize]+1;// (int)pow(4.,k)+1; // powers of 4 everytime through. We need an + // extra kmer if we get a non-ATGCU base } /**************************************************************************************************/ -string Kmer::getKmerString(string sequence){ - int length = sequence.length(); - int nKmers = length - kmerSize + 1; +string Kmer::getKmerString(string sequence){ // Calculate kmer for each position in the sequence, count the freq + int length = sequence.length(); // of each kmer, and convert it to an ascii character with base '!'. + int nKmers = length - kmerSize + 1; // Export the string of characters as a string vector counts(maxKmer, 0); - for(int i=0;i T [T] +// Base5 = (915 / 4^1) % 4 = 228 % 4 = 0 => A [AT] +// Base4 = (915 / 4^2) % 4 = 57 % 4 = 1 => C [CAT] +// Base3 = (915 / 4^3) % 4 = 14 % 4 = 2 => G [GCAT] +// Base2 = (915 / 4^4) % 4 = 3 % 4 = 3 => T [TGCAT] +// Base1 = (915 / 4^5) % 4 = 0 % 4 = 0 => A [ATGCAT] -> this checks out with the previous method + + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; string kmer = ""; - if(kmerNumber == power4s[kmerSize]){//pow(4.,7)){ - for(int i=0;i fast. + * + */ using namespace std; @@ -22,16 +37,16 @@ KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerS string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer"; ifstream kmerFileTest(kmerDBName.c_str()); - int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 }; + int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 }; maxKmer = power4s[kmerSize]; kmerLocations.resize(maxKmer+1); - if(!kmerFileTest){ + if(!kmerFileTest){ // if we can open the kmer db file, then read it in... cout << "Generating the " << kmerDBName << " database...\t"; cout.flush(); generateKmerDB(kmerDBName); } - else{ + else{ // ...otherwise generate it. cout << "Reading in the " << kmerDBName << " database...\t"; cout.flush(); readKmerDB(kmerDBName, kmerFileTest); } @@ -42,68 +57,67 @@ KmerDB::KmerDB(string fastaFileName, int kSize) : Database(fastaFileName), kmerS /**************************************************************************************************/ Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){ + + vector matches(numSeqs, 0); // a record of the sequences with shared kmers + vector timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found - vector matches(numSeqs, 0); - vector timesKmerFound(kmerLocations.size()+1, 0); - - int maxMatches = 0; + searchScore = 0; int maxSequence = 0; - string query = candidateSeq->getUnaligned(); - - int numKmers = query.length() - kmerSize + 1; + string query = candidateSeq->getUnaligned(); // we want to search using an unaligned dna sequence + + int numKmers = query.length() - kmerSize + 1; Kmer kmer(kmerSize); for(int i=0;i maxMatches){ - maxMatches = matches[i]; + + for(int i=0;i searchScore){ // the query sequence + searchScore = matches[i]; maxSequence = i; } } - return templateSequences[maxSequence]; - + searchScore = 100 * searchScore / (float)numKmers; + return templateSequences[maxSequence]; // return the Sequence object corresponding to the db + // sequence with the most shared kmers. } /**************************************************************************************************/ void KmerDB::generateKmerDB(string kmerDBName){ - Kmer kmer(kmerSize); - for(int i=0;igetUnaligned(); + string seq = templateSequences[i]->getUnaligned(); // ...take the unaligned sequence... int numKmers = seq.length() - kmerSize + 1; - for(int j=0;j seenBefore(maxKmer+1,0); + for(int j=0;j> seqName >> numValues; - for(int j=0;j> seqNumber; - kmerLocations[i].push_back(seqNumber); + for(int j=0;j> seqNumber; // 1. number of sequences with the kmer number + kmerLocations[i].push_back(seqNumber); // 2. sequence indices } } kmerDBFile.close(); diff --git a/kmerdb.hpp b/kmerdb.hpp index af52789..eb10061 100644 --- a/kmerdb.hpp +++ b/kmerdb.hpp @@ -8,6 +8,15 @@ * Created by Pat Schloss on 12/16/08. * Copyright 2008 Patrick D. Schloss. All rights reserved. * + * This class is a child class of the Database class, which stores the template sequences as a kmer table and provides + * a method of searching the kmer table for the sequence with the most kmers in common with a query sequence. + * kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the + * different number of kmers and each column contains the index to sequences that use that kmer. + * + * Construction of an object of this type will first look for an appropriately named database file and if it is found + * then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory + * (generateKmerDB) + */ #include "mothur.h" diff --git a/nast.cpp b/nast.cpp new file mode 100644 index 0000000..0fae6ab --- /dev/null +++ b/nast.cpp @@ -0,0 +1,362 @@ +/* + * nast.cpp + * + * + * Created by Pat Schloss on 12/17/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This is my implementation of the NAST (nearest alignment space termination) algorithm as described in: + * + * DeSantis TZ, Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, & Anderson GL. 2006. NAST: a multiple + * sequence alignment server for comparative analysis of 16S rRNA genes. Nucleic Acids Research. 34:W394-9. + * + * To construct an object one needs to provide a method of getting a pairwise alignment (alignment) and the template + * and candidate sequence that are to be aligned to each other. + * + */ + +using namespace std; + +#include "sequence.hpp" +#include "alignment.hpp" +#include "nast.hpp" + +/**************************************************************************************************/ + +Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method), candidateSeq(cand), templateSeq(temp) { + maxInsertLength = 0; + pairwiseAlignSeqs(); // This is part A in Fig. 2 of DeSantis et al. + regapSequences(); // This is parts B-F in Fig. 2 of DeSantis et al. +} + +/**************************************************************************************************/ + +void Nast::pairwiseAlignSeqs(){ // Here we call one of the pairwise alignment methods to align our unaligned candidate + // and template sequences + alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned()); + + string candAln = alignment->getSeqAAln(); + string tempAln = alignment->getSeqBAln(); + + if(candAln == ""){ + candidateSeq->setPairwise(""); + templateSeq->setPairwise(templateSeq->getUnaligned()); + } + else{ + if(tempAln[0] == '-'){ + int pairwiseAlignmentLength = tempAln.length(); // we need to make sure that the candidate sequence alignment + for(int i=0;i=0; i--){// ends where the template sequence alignment ends, if it runs + if(isalpha(tempAln[i])){ // long, we nuke the end of the candidate sequence + candAln = candAln.substr(0,i+1); + tempAln = tempAln.substr(0,i+1); + break; + } + } + } + } + candidateSeq->setPairwise(candAln); // set the pairwise sequences in the Sequence objects for + templateSeq->setPairwise(tempAln); // the candidate and template sequences + +} + +/**************************************************************************************************/ + +void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAlign){ + +// here we do steps C-F of Fig. 2 from DeSantis et al. + + int longAlignmentLength = newTemplateAlign.length(); + + for(int i=0; i=0;leftIndex--){ // then we've got problems... + if(!isalpha(candAln[leftIndex])){ + leftRoom = 1; //count how far it is to the nearest gap on the LEFT side of the anomaly + while(leftIndex-leftRoom>=0 && !isalpha(candAln[leftIndex-leftRoom])) { leftRoom++; } + break; + } + } +// cout << leftIndex << '\t' << leftRoom << endl; + + for(rightIndex=i+1;rightIndex maxInsertLength){ maxInsertLength = insertLength; } + + if((leftRoom + rightRoom) >= insertLength){ + // Parts D & E from Fig. 2 of DeSantis et al. + if((i-leftIndex) <= (rightIndex-i)){ // the left gap is closer - > move stuff left there's + if(leftRoom >= insertLength){ // enough room to the left to move + string leftTemplateString = newTemplateAlign.substr(0,i); + string rightTemplateString = newTemplateAlign.substr(i+insertLength); + newTemplateAlign = leftTemplateString + rightTemplateString; + longAlignmentLength = newTemplateAlign.length(); + + string leftCandidateString = candAln.substr(0,leftIndex-insertLength+1); + string rightCandidateString = candAln.substr(leftIndex+1); + candAln = leftCandidateString + rightCandidateString; + } + else{ // not enough room to the left, have to steal some space to + string leftTemplateString = newTemplateAlign.substr(0,i); // the right + string rightTemplateString = newTemplateAlign.substr(i+insertLength); + newTemplateAlign = leftTemplateString + rightTemplateString; + longAlignmentLength = newTemplateAlign.length(); + + string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1); + string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1); + string rightCandidateString = candAln.substr(rightIndex+(insertLength-leftRoom)); + candAln = leftCandidateString + insertString + rightCandidateString; + } + } + else{ // the right gap is closer - > move stuff right there's + if(rightRoom >= insertLength){ // enough room to the right to move + string leftTemplateString = newTemplateAlign.substr(0,i); + string rightTemplateString = newTemplateAlign.substr(i+insertLength); + newTemplateAlign = leftTemplateString + rightTemplateString; + longAlignmentLength = newTemplateAlign.length(); + + string leftCandidateString = candAln.substr(0,rightIndex); + string rightCandidateString = candAln.substr(rightIndex+insertLength); + candAln = leftCandidateString + rightCandidateString; + + } + else{ // not enough room to the right, have to steal some + // space to the left lets move left and then right... + string leftTemplateString = newTemplateAlign.substr(0,i); + string rightTemplateString = newTemplateAlign.substr(i+insertLength); + newTemplateAlign = leftTemplateString + rightTemplateString; + longAlignmentLength = newTemplateAlign.length(); + + string leftCandidateString = candAln.substr(0,leftIndex-(insertLength-rightRoom)+1); + string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1); + string rightCandidateString = candAln.substr(rightIndex+rightRoom); + candAln = leftCandidateString + insertString + rightCandidateString; + + } + } + } + else{ // there could be a case where there isn't enough room in + string leftTemplateString = newTemplateAlign.substr(0,i); // either direction to move stuff + string rightTemplateString = newTemplateAlign.substr(i+leftRoom+rightRoom); + newTemplateAlign = leftTemplateString + rightTemplateString; + longAlignmentLength = newTemplateAlign.length(); + + string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1); + string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1); + string rightCandidateString = candAln.substr(rightIndex+rightRoom); + candAln = leftCandidateString + insertString + rightCandidateString; + } + + i -= insertLength; + } + } +// cout << candAln << endl << tempAln << endl << newTemplateAlign << endl; +} + +/**************************************************************************************************/ + +void Nast::regapSequences(){ //This is essentially part B in Fig 2. of DeSantis et al. + + string candPair = candidateSeq->getPairwise(); + string candAln = ""; + + string tempPair = templateSeq->getPairwise(); + string tempAln = templateSeq->getAligned(); // we use the template aligned sequence as our guide + + int pairwiseLength = candPair.length(); + int fullAlignLength = tempAln.length(); + + if(candPair == ""){ + for(int i=0;isetAligned(candAln); + return; + } + + int fullAlignIndex = 0; + int pairwiseAlignIndex = 0; + string newTemplateAlign = ""; // this is going to be messy so we want a temporary template + // alignment string + while(tempAln[fullAlignIndex] == '.' || tempAln[fullAlignIndex] == '-'){ + candAln += '.'; // add the initial '-' and '.' to the candidate and template + newTemplateAlign += tempAln[fullAlignIndex];// pairwise sequences + fullAlignIndex++; + } + + string lastLoop = ""; + + while(pairwiseAlignIndexseems to be the opposite of the alpha scenario + + candAln += candPair[pairwiseAlignIndex]; + newTemplateAlign += tempAln[fullAlignIndex];// + pairwiseAlignIndex++; + fullAlignIndex++; + } + else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex]) + && !isalpha(candPair[pairwiseAlignIndex])){ + // template pairwise has a character, but its full aligned sequence and candidate sequence have gaps + // this would happen like we need to add a gap. basically the opposite of the alpha situation + + newTemplateAlign += tempAln[fullAlignIndex];// + candAln += "-"; + fullAlignIndex++; + } + else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex]) + && !isalpha(candPair[pairwiseAlignIndex])){ + // template and candidate pairwise are gaps and the template aligned is not a gap this should not be possible + // would skip the gaps and not progress through full alignment sequence + // not tested yet + + cout << "We're into D" << ' ' << fullAlignIndex << ' ' << pairwiseAlignIndex << endl; + pairwiseAlignIndex++; + } + else{ + // everything has a gap - not possible + // not tested yet + + cout << "We're into F" << ' ' << fullAlignIndex << ' ' << pairwiseAlignIndex << endl; + pairwiseAlignIndex++; + fullAlignIndex++; + } + } + + for(int i=fullAlignIndex;i=0;i--){ // ditto. + if(candAln[i] == 'Z' || !isalnum(candAln[i])) { candAln[i] = '.'; } + else{ end = i; break; } + } + + for(int i=start;i<=end;i++){ // go through the candidate alignment sequence and make sure that + candAln[i] = toupper(candAln[i]); // everything is upper case + } + +// cout << candAln << endl; +// cout << tempAln << endl; +// cout << newTemplateAlign << endl; + + if(candAln.length() != tempAln.length()){ // if the regapped candidate sequence is longer than the official + removeExtraGaps(candAln, tempAln, newTemplateAlign);// template alignment then we need to do steps C-F in Fig. + } // 2 of Desantis et al. + +// cout << candAln << endl; +// cout << tempAln << endl; +// cout << newTemplateAlign << endl; + + candidateSeq->setAligned(candAln); +} + +/**************************************************************************************************/ + +float Nast::getSimilarityScore(){ + + string cand = candidateSeq->getAligned(); + string temp = templateSeq->getAligned(); + int alignmentLength = temp.length(); + int mismatch = 0; + int denominator = 0; + + for(int i=0;igetName(); + queryLength = candSeq->getUnalignLength(); +} + +/******************************************************************************************************************/ + +void NastReport::setTemplate(Sequence* tempSeq){ + templateName = tempSeq->getName(); + templateLength = tempSeq->getUnalignLength(); +} + +/******************************************************************************************************************/ + +void NastReport::setSearchParameters(string method, float score){ + searchMethod = method; + searchScore = score; +} + +/******************************************************************************************************************/ + +void NastReport::setAlignmentParameters(string method, Alignment* align){ + alignmentMethod = method; + + candidateStartPosition = align->getCandidateStartPos(); + candidateEndPosition = align->getCandidateEndPos(); + templateStartPosition = align->getTemplateStartPos(); + templateEndPosition = align->getTemplateEndPos(); + pairwiseAlignmentLength = align->getPairwiseLength(); + + totalGapsInQuery = pairwiseAlignmentLength - (candidateEndPosition - candidateStartPosition + 1); + totalGapsInTemplate = pairwiseAlignmentLength - (templateEndPosition - templateStartPosition + 1); +} + +/******************************************************************************************************************/ + +void NastReport::setNastParameters(Nast nast){ + + longestInsert = nast.getMaxInsertLength(); + similarityToTemplate = nast.getSimilarityScore(); + +} + +/******************************************************************************************************************/ diff --git a/nastreport.hpp b/nastreport.hpp new file mode 100644 index 0000000..424b0bb --- /dev/null +++ b/nastreport.hpp @@ -0,0 +1,51 @@ +#ifndef NASTREPORT_HPP +#define NASTREPORT_HPP + + +/* + * nastreport.hpp + * + * + * Created by Pat Schloss on 12/19/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + */ + +#include "mothur.h" + +using namespace std; + +/******************************************************************************************************************/ + +class NastReport { + +public: + NastReport(string); + void setCandidate(Sequence*); + void setTemplate(Sequence*); + void setSearchParameters(string, float); + void setAlignmentParameters(string, Alignment*); + void setNastParameters(Nast); + void print(); + +private: + string queryName; + int queryLength; + string templateName; + int templateLength; + string searchMethod; + float searchScore; + string alignmentMethod; + int candidateStartPosition, candidateEndPosition; + int templateStartPosition, templateEndPosition; + + int pairwiseAlignmentLength; + int longestInsert; + int totalGapsInQuery, totalGapsInTemplate; + float similarityToTemplate; + ofstream candidateReportFile; +}; + +/******************************************************************************************************************/ + +#endif diff --git a/needlemanoverlap.cpp b/needlemanoverlap.cpp new file mode 100644 index 0000000..8f5d464 --- /dev/null +++ b/needlemanoverlap.cpp @@ -0,0 +1,92 @@ +/* + * needleman.cpp + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This class is an Alignment child class that implements the Gotoh pairwise alignment algorithm as described in: + * + * Gotoh O. 1982. An improved algorithm for matching biological sequences. J. Mol. Biol. 162:705-8. + * Myers, EW & Miller, W. 1988. Optimal alignments in linear space. Comput Appl Biosci. 4:11-7. + * + * This method is nice because it allows for an affine gap penalty to be assessed, which is analogous to what is used + * in blast and is an alternative to Needleman-Wunsch, which only charges the same penalty for each gap position. + * Because this method typically has problems at the ends when two sequences do not full overlap, we employ a separate + * method to fix the ends (see Overlap class documentation) + * + */ + +using namespace std; + +#include "alignmentcell.hpp" +#include "alignment.hpp" +#include "overlap.hpp" +#include "needlemanoverlap.hpp" + +/**************************************************************************************************/ + +NeedlemanOverlap::NeedlemanOverlap(float gO, float m, float mm, int r) :// note that we don't have a gap extend +gap(gO), match(m), mismatch(mm), Alignment(r) { // the gap openning penalty is assessed for + // every gapped position + for(int i=1;i= up){ + if(diagonal >= left){ + alignment[i][j].cValue = diagonal; + alignment[i][j].prevCell = 'd'; + } + else{ + alignment[i][j].cValue = left; + alignment[i][j].prevCell = 'l'; + } + } + else{ + if(up >= left){ + alignment[i][j].cValue = up; + alignment[i][j].prevCell = 'u'; + } + else{ + alignment[i][j].cValue = left; + alignment[i][j].prevCell = 'l'; + } + } + } + } + Overlap over; + over.setOverlap(alignment, lA, lB, 0); // Fix gaps at the beginning and end of the sequences + traceBack(); // Traceback the alignment to populate seqAaln and seqBaln +} + +/**************************************************************************************************/ + diff --git a/needlemanoverlap.hpp b/needlemanoverlap.hpp new file mode 100644 index 0000000..57d3284 --- /dev/null +++ b/needlemanoverlap.hpp @@ -0,0 +1,43 @@ +#ifndef NEEDLEMAN_H +#define NEEDLEMAN_H + +/* + * needleman.h + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This class is an Alignment child class that implements the Needleman-Wunsch pairwise alignment algorithm as + * described in: + * + * Needleman SB & Wunsch CD. 1970. A general method applicable to the search for similarities in the amino acid + * sequence of two proteins. J Mol Biol. 48:443-53. + * Korf I, Yandell M, & Bedell J. 2003. BLAST. O'Reilly & Associates. Sebastopol, CA. + * + * This method is simple as it assesses a consistent penalty for each gap position. Because this method typically has + * problems at the ends when two sequences do not full overlap, we employ a separate method to fix the ends (see + * Overlap class documentation) + * + */ + +#include "mothur.h" + +/**************************************************************************************************/ + +class NeedlemanOverlap : public Alignment { + +public: + NeedlemanOverlap(float, float, float, int); + ~NeedlemanOverlap(); + void align(string, string); + +private: + float gap; + float match; + float mismatch; +}; + +/**************************************************************************************************/ + +#endif diff --git a/noalign.cpp b/noalign.cpp new file mode 100644 index 0000000..14e34f6 --- /dev/null +++ b/noalign.cpp @@ -0,0 +1,25 @@ +/* + * noalign.cpp + * + * + * Created by Pat Schloss on 2/19/09. + * Copyright 2009 __MyCompanyName__. All rights reserved. + * + */ + +#include "alignment.hpp" +#include "noalign.hpp" + +/**************************************************************************************************/ + +NoAlign::NoAlign(){ /* do nothing */ } + +/**************************************************************************************************/ + +NoAlign::~NoAlign(){ /* do nothing */ } + +/**************************************************************************************************/ + +void NoAlign::align(string A, string B){ } + +/**************************************************************************************************/ diff --git a/noalign.hpp b/noalign.hpp new file mode 100644 index 0000000..dd1e4ba --- /dev/null +++ b/noalign.hpp @@ -0,0 +1,31 @@ +#ifndef NOALIGN_HPP +#define NOALIGN_HPP + +/* + * noalign.hpp + * + * + * Created by Pat Schloss on 2/19/09. + * Copyright 2009 __MyCompanyName__. All rights reserved. + * + */ +using namespace std; + +#include "mothur.h" + +/**************************************************************************************************/ + +class NoAlign : public Alignment { + +public: + NoAlign(); + ~NoAlign(); + void align(string, string); + +private: +}; + +/**************************************************************************************************/ + + +#endif diff --git a/overlap.cpp b/overlap.cpp new file mode 100644 index 0000000..9645e7a --- /dev/null +++ b/overlap.cpp @@ -0,0 +1,83 @@ +/* + * overlap.cpp + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This class cleans up the alignment at the 3' end of the alignments. Because the Gotoh and Needleman-Wunsch + * algorithms start the traceback from the lower-right corner of the dynamic programming matrix, there may be a lot of + * scattered bases in the alignment near the 3' end of the alignment. Here we basically look for the largest score + * in the last column and row to determine whether there should be exta gaps in sequence A or sequence B. The gap + * issues at the 5' end of the alignment seem to take care of themselves in the traceback. + * + */ +using namespace std; + +#include "alignmentcell.hpp" +#include "overlap.hpp" + + +/**************************************************************************************************/ + +int Overlap::maxRow(vector >& alignment, const int band){ + + float max = -100; + int end = lA - 1; + int index = end; + + for(int i=band;i= max){ // score. + index = i; + max = alignment[i][end].cValue; + } + } + return index; +} + +/**************************************************************************************************/ + +int Overlap::maxColumn(vector >& alignment, const int band){ + + float max = -100; + int end = lB - 1; + int index = end; + + for(int i=band;i= max){ // alignment score. + index = i; + max = alignment[end][i].cValue; + } + } + return index; +} + +/**************************************************************************************************/ + +void Overlap::setOverlap(vector >& alignment, const int nA, const int nB, const int band=0){ + + lA = nA; + lB = nB; + + int rowIndex = maxRow(alignment, band); // get the index for the row with the highest right hand side score + int colIndex = maxColumn(alignment, band); // get the index for the column with the highest bottom row score + + int row = lB-1; + int column = lA-1; + + if(colIndex == column && rowIndex == row){} // if the max values are the lower right corner, then we're good + else if(alignment[row][colIndex].cValue < alignment[rowIndex][column].cValue){ + for(int i=rowIndex+1;i >&, const int, const int, const int); +private: + int maxRow(vector >&, const int); + int maxColumn(vector >&, const int); + int lA, lB; +}; + +/**************************************************************************************************/ + +#endif diff --git a/rarefact.cpp b/rarefact.cpp index 5c1ce74..ba0c02e 100644 --- a/rarefact.cpp +++ b/rarefact.cpp @@ -94,15 +94,23 @@ try { //randomize the groups random_shuffle(lookup.begin(), lookup.end()); + //make merge the size of lookup[0] + SharedRAbundVector* merge = new SharedRAbundVector(lookup[0]->size()); + + //make copy of lookup zero + for(int i = 0; isize(); i++) { + merge->set(i, lookup[0]->getAbundance(i), "merge"); + } + vector subset; //send each group one at a time for (int k = 0; k < lookup.size(); k++) { subset.clear(); //clears out old pair of sharedrabunds //add in new pair of sharedrabunds - subset.push_back(lookup[0]); subset.push_back(lookup[k]); + subset.push_back(merge); subset.push_back(lookup[k]); rcd->updateSharedData(subset, k+1, numGroupComb); - mergeVectors(lookup[0], lookup[k]); + mergeVectors(merge, lookup[k]); } //resets output files @@ -134,9 +142,7 @@ void Rarefact::mergeVectors(SharedRAbundVector* shared1, SharedRAbundVector* sha try{ for (int k = 0; k < shared1->size(); k++) { //merge new species into shared1 - if ((shared1->getAbundance(k) == 0) && (shared2->getAbundance(k) != 0)) { - shared1->set(k, shared2->getAbundance(k), "combo"); //set to 'combo' since this vector now contains multiple groups - } + shared1->set(k, (shared1->getAbundance(k) + shared2->getAbundance(k)), "combo"); //set to 'combo' since this vector now contains multiple groups } } catch(exception& e) { diff --git a/sequence.cpp b/sequence.cpp index b59363e..f136426 100644 --- a/sequence.cpp +++ b/sequence.cpp @@ -24,6 +24,36 @@ Sequence::Sequence(string newName, string sequence) { } setUnaligned(sequence); } +//******************************************************************************************************************** + +Sequence::Sequence(ifstream& fastaFile){ + + string accession; // provided a file handle to a fasta-formatted sequence file, read in the next + fastaFile >> accession; // accession number and sequence we find... + setName(accession); + + char letter; + string sequence; + + while(fastaFile){ + letter= fastaFile.get(); + if(letter == '>'){ + fastaFile.putback(letter); + break; + } + else if(isprint(letter)){ + letter = toupper(letter); + if(letter == 'U'){letter = 'T';} + sequence += letter; + } + + } + + if(sequence.find_first_of('-') != string::npos){ // if there are any gaps in the sequence, assume that it is + setAligned(sequence); // an alignment file + } + setUnaligned(sequence); // also set the unaligned sequence file +} //******************************************************************************************************************** @@ -122,3 +152,18 @@ void Sequence::printSequence(ostream& out){ } //******************************************************************************************************************** + +int Sequence::getUnalignLength(){ + return unaligned.length(); +} + +//******************************************************************************************************************** + +int Sequence::getAlignLength(){ + return aligned.length(); +} + +//******************************************************************************************************************** + + + diff --git a/sequence.hpp b/sequence.hpp index 03cbab7..37ca87c 100644 --- a/sequence.hpp +++ b/sequence.hpp @@ -8,6 +8,10 @@ * Created by Pat Schloss on 12/15/08. * Copyright 2008 Patrick D. Schloss. All rights reserved. * + * A sequence object has three components: i) an accession number / name, ii) the unaligned primary sequence, iii) a + * pairwise aligned sequence, and iv) a sequence that is aligned to a reference alignment. This class has methods + * to set and get these values for the other classes where they are needed. * + * */ using namespace std; @@ -20,6 +24,7 @@ class Sequence { public: Sequence(); Sequence(string, string); + Sequence(ifstream&); void setName(string); void setUnaligned(string); @@ -32,7 +37,9 @@ public: string getAligned(); string getPairwise(); string getUnaligned(); - int getLength(); + int getLength(); //the greater of the lengths of unaligned and aligned + int getUnalignLength(); + int getAlignLength(); void printSequence(ostream&); private: diff --git a/suffixdb.cpp b/suffixdb.cpp new file mode 100644 index 0000000..b4639f9 --- /dev/null +++ b/suffixdb.cpp @@ -0,0 +1,56 @@ +/* + * suffixdb.cpp + * + * + * Created by Pat Schloss on 12/16/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This is a child class of the Database abstract datatype. The class is basically a database of suffix trees and an + * encapsulation of the method for finding the most similar tree to an inputted sequence. the suffixForest objecct + * is a vector of SuffixTrees, with each template sequence being represented by a different SuffixTree. The class also + * provides a method to take an unaligned sequence and find the closest sequence in the suffixForest. The search + * method is inspired by the article and Perl source code provided at http://www.ddj.com/web-development/184416093. I + * would estimate that the time complexity is O(LN) for each search, which is slower than the kmer searching, but + * faster than blast + * + */ + +using namespace std; + +#include "database.hpp" +#include "sequence.hpp" +#include "suffixtree.hpp" +#include "suffixdb.hpp" + +/**************************************************************************************************/ + +SuffixDB::SuffixDB(string fastaFileName) : Database(fastaFileName) { + + suffixForest.resize(numSeqs); + cout << "Generating the suffix tree database...\t"; cout.flush(); + for(int i=0;iconvert2ints(); // the candidate sequence needs to be a string of ints + for(int i=0;i suffixForest; +}; + +#endif diff --git a/suffixnodes.cpp b/suffixnodes.cpp new file mode 100644 index 0000000..79826ff --- /dev/null +++ b/suffixnodes.cpp @@ -0,0 +1,96 @@ +/* + * SuffixNodes.cpp + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * There are two types of nodes in a suffix tree as I have implemented it. First, there are the internal nodes that + * have children, these are the SuffixBranch objects. There are also the terminal nodes, which are the suffixBranches. + * I divided them into two groups to save on memory. A SuffixTree object will be a vector of SuffixNodes; therefore, + * the values of parentNode, children nodes, and suffix nodes are stored as ints that correspond to indices in the + * vector + * + */ + +#include "suffixnodes.hpp" + +using namespace std; + +//******************************************************************************************************************** + +inline char deCodeSequence(char code){ + + if(code == '0') { return 'a'; } // this method allows us to go from the int string to a char string; + else if(code == '1') { return 'c'; } // it's only really useful if we want to print out the tree + else if(code == '2') { return 'g'; } + else if(code == '3') { return 't'; } + else if(code == '4') { return 'n'; } + else { return '$'; } + +} + +//******************************************************************************************************************** + +SuffixNode::SuffixNode(int parent, int start, int end) : + parentNode(parent), // we store the parent node as an int + startCharPosition(start), // the suffix tree class will hold the sequence that the startCharPosition and + endCharPosition(end) // endCharPosition indices correspond to + { /* do nothing */ } + + +void SuffixNode::setChildren(char, int) { /* do nothing */ } // there's no children in a leaf +int SuffixNode::getNumChildren() { return 0; } // ditto +void SuffixNode::eraseChild(char) { /* do nothing */ } // ditto +int SuffixNode::getChild(char) { return -1; } // ditto +void SuffixNode::setSuffixNode(int) { /* do nothing */ } // there's no suffix node in a leaf +int SuffixNode::getSuffixNode() { return -1; } // ditto +int SuffixNode::getParentNode() { return parentNode; } +void SuffixNode::setParentNode(int number) { parentNode = number; } +int SuffixNode::getStartCharPos() { return startCharPosition; } +void SuffixNode::setStartCharPos(int start) { startCharPosition = start; } +int SuffixNode::getEndCharPos() { return endCharPosition; } + +//******************************************************************************************************************** + +SuffixLeaf::SuffixLeaf(int parent, int start, int end) : SuffixNode(parent, start, end) { /* do nothing */ } + + +void SuffixLeaf::print(string sequence, int nodeNumber){ + + cout << this << '\t' << parentNode << '\t' << nodeNumber << '\t' << + -1 << '\t' << startCharPosition << '\t' << endCharPosition << '\t'; + + cout << '\''; + for(int i=startCharPosition;i<=endCharPosition;i++){ + cout << deCodeSequence(sequence[i]); + } + cout << '\'' << endl; +} + +//******************************************************************************************************************** + +SuffixBranch::SuffixBranch(int parent, int start, int end) : SuffixNode(parent, start, end), suffixNode(-1){ + childNodes.assign(6, -1); +} + +void SuffixBranch::print(string sequence, int nodeNumber){ // this method is different that than + cout << this << '\t' << parentNode << '\t' << nodeNumber << '\t' << // of a leaf because it prints out a + suffixNode << '\t' << startCharPosition << '\t' << endCharPosition << '\t'; // value for the suffix node + + cout << '\''; + for(int i=startCharPosition;i<=endCharPosition;i++){ + cout << deCodeSequence(sequence[i]); + } + cout << '\'' << endl; +} + +// we can access the children by subtracting '0' from the the char value from the string, the difference is an int +// value and the index we need to access. +void SuffixBranch::eraseChild(char base) { childNodes[base - '0'] = -1; } //to erase set the child index to -1 +void SuffixBranch::setChildren(char base, int nodeIndex){ childNodes[base - '0'] = nodeIndex; } +void SuffixBranch::setSuffixNode(int nodeIndex){ suffixNode = nodeIndex; } +int SuffixBranch::getSuffixNode() { return suffixNode; } +int SuffixBranch::getChild(char base) { return childNodes[base - '0']; } + +//******************************************************************************************************************** diff --git a/suffixnodes.hpp b/suffixnodes.hpp new file mode 100644 index 0000000..863a3ae --- /dev/null +++ b/suffixnodes.hpp @@ -0,0 +1,77 @@ +#ifndef SUFFIXNODES_H +#define SUFFIXNODES_H + +/* + * SuffixNodes.h + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * There are two types of nodes in a suffix tree as I have implemented it. First, there are the internal nodes that + * have children, these are the SuffixBranch objects. There are also the terminal nodes, which are the suffixBranches. + * I divided them into two groups to save on memory. A SuffixTree object will be a vector of SuffixNodes; therefore, + * the values of parentNode, children nodes, and suffix nodes are stored as ints that correspond to indices in the + * vector + * + */ + +#include "mothur.h" + +using namespace std; + +//******************************************************************************************************************** + +class SuffixNode { + +public: + SuffixNode(int, int, int); + virtual void print(string, int) = 0; + virtual void setChildren(char, int); + virtual int getNumChildren(); + virtual void eraseChild(char); + virtual void setSuffixNode(int); + virtual int getSuffixNode(); + virtual int getChild(char); + int getParentNode(); + void setParentNode(int); + int getStartCharPos(); + void setStartCharPos(int start); + int getEndCharPos(); + +protected: + int parentNode; + int startCharPosition; + int endCharPosition; +}; + +//******************************************************************************************************************** + +class SuffixLeaf : public SuffixNode { // most of the methods are already set in the parent class + +public: + SuffixLeaf(int, int, int); // we just need to define a constructor and + void print(string, int); // print method +}; + +//******************************************************************************************************************** + +class SuffixBranch : public SuffixNode { + +public: + SuffixBranch(int, int, int); + void print(string, int); // need a special method for printing the node because there are children + void eraseChild(char); // need a special method for erasing the children + void setChildren(char, int); // need a special method for setting children + void setSuffixNode(int); // need a special method for setting the suffix node + int getSuffixNode(); // need a special method for returning the suffix node + int getChild(char); // need a special method for return children + +private: + vector childNodes; // a suffix branch is unique because it has children and a suffixNode. The + int suffixNode; // are stored in a vector for super-fast lookup. If the alphabet were bigger, this +}; // might not be practical. Since we only have 5 possible letters, it makes sense + +//******************************************************************************************************************** + +#endif diff --git a/suffixtree.cpp b/suffixtree.cpp new file mode 100644 index 0000000..56ce6ed --- /dev/null +++ b/suffixtree.cpp @@ -0,0 +1,247 @@ +/* + * suffixtree.cpp + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This is my half-assed attempt to implement a suffix tree. This is a cobbled together algorithm using materials that + * I found at http://marknelson.us/1996/08/01/suffix-trees/ and: + * + * Ukkonen E. (1995). On-line construction of suffix trees. Algorithmica 14 (3): 249--260 + * Gusfield, Dan (1999). Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology. + * USA: Cambridge University Press + * + * The Ukkonen paper is the seminal paper describing the on-line method of constructing a suffix tree. + * + * I have chosen to store the nodes of the tree as a vector of pointers to SuffixNode objects. The root is stored at + * nodeVector[0]. Each tree also stores the sequence name and the string that corresponds to the actual sequence. + * Finally, this class provides a way of counting the number of suffixes that are needed in one tree to generate a new + * sequence (countSuffixes). This method is used to determine similarity between sequences and was inspired by the + * article and Perl source code provided at http://www.ddj.com/web-development/184416093. + * + */ + +#include "sequence.hpp" +#include "suffixnodes.hpp" +#include "suffixtree.hpp" + +using namespace std; + +//******************************************************************************************************************** + +inline bool compareParents(SuffixNode* left, SuffixNode* right){// this is necessary to print the tree and to sort the + return (left->getParentNode() < right->getParentNode()); // nodes in order of their parent +} + +//******************************************************************************************************************** + +SuffixTree::SuffixTree(){} + +//******************************************************************************************************************** + +SuffixTree::~SuffixTree(){ + for(int i=0;igetName(); + sequence = seq->convert2ints(); + sequence += '5'; // this essentially concatenates a '$' to the end of the sequence to + int seqLength = sequence.length(); // make it a cononical suffix tree + + nodeVector.push_back(new SuffixBranch(-1, 0, -1)); // enter the root of the suffix tree + + activeNode = root = 0; + string hold; + for(int i=0;i hold = nodeVector; + sort(hold.begin(), hold.end(), compareParents); + cout << "Address\t\tParent\tNode\tSuffix\tStartC\tEndC\tSuffix" << endl; + for(int i=1;i<=nodeCounter;i++){ + hold[i]->print(sequence, i); + } +} + +//******************************************************************************************************************** + +int SuffixTree::countSuffixes(string compareSequence, int& minValue){ // here we count the number of suffix parts + // we need to rewrite a user supplied sequence. if the + int numSuffixes = 0; // count exceeds the supplied minValue, bail out. The + int seqLength = compareSequence.length(); // time complexity should be O(L) + int position = 0; + + int presentNode = 0; + + while(position < seqLength){ // while the position in the query sequence isn't at the end... + + if(numSuffixes > minValue) { return 1000000; } // bail if the count gets too high + + int newNode = nodeVector[presentNode]->getChild(compareSequence[position]); // see if the current node has a + // child that matches the next character in the query + if(newNode == -1){ + if(presentNode == 0){ position++; } // if not, go back to the root and increase the count + numSuffixes++; // by one. + presentNode = 0; + } + else{ // if there is, move to that node and see how far down + presentNode = newNode; // it we can get + + for(int i=nodeVector[newNode]->getStartCharPos(); i<=nodeVector[newNode]->getEndCharPos(); i++){ + if(compareSequence[position] == sequence[i]){ + position++; // as long as the query and branch agree, keep going + } + else{ + numSuffixes++; // if there is a mismatch, increase the number of + presentNode = 0; // suffixes and go back to the root + break; + } + } + } + // if we get all the way through the node we'll go to the top of the while loop and find the child node + // that corresponds to what we are interested in + } + numSuffixes--; // the method puts an extra count on numSuffixes + + if(numSuffixes < minValue) { minValue = numSuffixes; } // if the count is less than the previous minValue, + return numSuffixes; // change the value and return the number of suffixes + +} + +//******************************************************************************************************************** + +void SuffixTree::canonize(){ // if you have to ask how this works, you don't really want to know and this really + // isn't the place to ask. + if ( isExplicit() == 0 ) { // if the node has no children... + + int tempNodeIndex = nodeVector[activeNode]->getChild(sequence[activeStartPosition]); + SuffixNode* tempNode = nodeVector[tempNodeIndex]; + + int span = tempNode->getEndCharPos() - tempNode->getStartCharPos(); + + while ( span <= ( activeEndPosition - activeStartPosition ) ) { + + activeStartPosition = activeStartPosition + span + 1; + + activeNode = tempNodeIndex; + + if ( activeStartPosition <= activeEndPosition ) { + tempNodeIndex = nodeVector[tempNodeIndex]->getChild(sequence[activeStartPosition]); + tempNode = nodeVector[tempNodeIndex]; + span = tempNode->getEndCharPos() - tempNode->getStartCharPos(); + } + + } + } +} + +//******************************************************************************************************************** + +int SuffixTree::split(int nodeIndex, int position){ // leaves stay leaves, etc, to split a leaf we make a new interior + // node and reconnect everything + SuffixNode* node = nodeVector[nodeIndex]; // get the node that needs to be split + SuffixNode* parentNode = nodeVector[node->getParentNode()]; // get it's parent node + + parentNode->eraseChild(sequence[node->getStartCharPos()]); // erase the present node from the registry of its parent + + nodeCounter++; + SuffixNode* newNode = new SuffixBranch(node->getParentNode(), node->getStartCharPos(), node->getStartCharPos() + activeEndPosition - activeStartPosition); // create a new node that will link the parent with the old child + parentNode->setChildren(sequence[newNode->getStartCharPos()], nodeCounter);// give the parent the new child + nodeVector.push_back(newNode); + + node->setParentNode(nodeCounter); // give the original node the new node as its parent + newNode->setChildren(sequence[node->getStartCharPos() + activeEndPosition - activeStartPosition + 1], nodeIndex); + // put the original node in the registry of the new node's children + newNode->setSuffixNode(activeNode);//link the new node with the old active node + + // recalculate the startCharPosition of the outermost node + node->setStartCharPos(node->getStartCharPos() + activeEndPosition - activeStartPosition + 1 ); + + return node->getParentNode(); +} + +//******************************************************************************************************************** + +void SuffixTree::makeSuffixLink(int& previous, int present){ + +// here we link the nodes that are suffixes of one another to rapidly speed through the tree + if ( previous > 0 ) { nodeVector[previous]->setSuffixNode(present); } + else { /* do nothing */ } + + previous = present; +} + +//******************************************************************************************************************** + +void SuffixTree::addPrefix(int prefixPosition){ + + int lastParentNode = -1; // we need to place a new prefix in the suffix tree + int parentNode = 0; + + while(1){ + + parentNode = activeNode; + + if(isExplicit() == 1){ // if the node is explicit (has kids), try to follow it down the branch if its there... + if(nodeVector[activeNode]->getChild(sequence[prefixPosition]) != -1){ // break out and get next prefix... + break; + } + else{ // ...otherwise continue, we'll need to make a new node later on... + } + } + else{ // if it's not explicit (no kids), read through and see if all of the chars agree... + int tempNode = nodeVector[activeNode]->getChild(sequence[activeStartPosition]); + int span = activeEndPosition - activeStartPosition; + + if(sequence[nodeVector[tempNode]->getStartCharPos() + span + 1] == sequence[prefixPosition] ){ + break; // if the existing suffix agrees with the new one, grab a new prefix... + } + else{ + parentNode = split(tempNode, prefixPosition); // ... otherwise we need to split the node + } + + } + + nodeCounter++; // we need to generate a new node here if the kid didn't exist, or we split a node + SuffixNode* newSuffixLeaf = new SuffixLeaf(parentNode, prefixPosition, sequence.length()-1); + nodeVector[parentNode]->setChildren(sequence[prefixPosition], nodeCounter); + nodeVector.push_back(newSuffixLeaf); + + makeSuffixLink( lastParentNode, parentNode ); // make a suffix link for the parent node + + if(nodeVector[activeNode]->getParentNode() == -1){ // move along the start position for the tree + activeStartPosition++; + } + else { + activeNode = nodeVector[activeNode]->getSuffixNode(); + } + canonize(); // frankly, i'm not entirely clear on what canonize does. + } + + makeSuffixLink( lastParentNode, parentNode ); + activeEndPosition++; // move along the end position for the tree + + canonize(); // frankly, i'm not entirely clear on what canonize does. + +} + +//******************************************************************************************************************** + diff --git a/suffixtree.hpp b/suffixtree.hpp new file mode 100644 index 0000000..387d4bd --- /dev/null +++ b/suffixtree.hpp @@ -0,0 +1,69 @@ +#ifndef SUFFIXTREE_H +#define SUFFIXTREE_H + +/* + * suffixtree.h + * + * + * Created by Pat Schloss on 12/15/08. + * Copyright 2008 Patrick D. Schloss. All rights reserved. + * + * This is my half-assed attempt to implement a suffix tree. This is a cobbled together algorithm using materials that + * I found at http://marknelson.us/1996/08/01/suffix-trees/ and: + * + * Ukkonen E. (1995). On-line construction of suffix trees. Algorithmica 14 (3): 249--260 + * Gusfield, Dan (1999). Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology. + * USA: Cambridge University Press + * + * The Ukkonen paper is the seminal paper describing the on-line method of constructing a suffix tree. + * + * I have chosen to store the nodes of the tree as a vector of pointers to SuffixNode objects. The root is stored at + * nodeVector[0]. Each tree also stores the sequence name and the string that corresponds to the actual sequence. + * Finally, this class provides a way of counting the number of suffixes that are needed in one tree to generate a new + * sequence (countSuffixes). This method is used to determine similarity between sequences and was inspired by the + * article and Perl source code provided at http://www.ddj.com/web-development/184416093. + * + */ + +#include "mothur.h" + +using namespace std; + +class SuffixNode; + +//******************************************************************************************************************** + +class SuffixTree { + +public: + SuffixTree(); + ~SuffixTree(); +// SuffixTree(string, string); + + void loadSequence(Sequence*); + string getSeqName(); + void print(); + int countSuffixes(string, int&); + +private: + void addPrefix(int); + void canonize(); + int split(int, int); + void makeSuffixLink(int&, int); + bool isExplicit(){ return activeStartPosition > activeEndPosition; } + + int activeStartPosition; + int activeEndPosition; + + vector nodeVector; + int root; + int activeNode; + int nodeCounter; + string seqName; + string sequence; + +}; + +//******************************************************************************************************************** + +#endif diff --git a/validcommands.cpp b/validcommands.cpp index 5549e63..a5f3651 100644 --- a/validcommands.cpp +++ b/validcommands.cpp @@ -43,6 +43,7 @@ ValidCommands::ValidCommands() { commands["concensus"] = "concensus"; commands["help"] = "help"; commands["filter.seqs"] = "filter.seqs"; + commands["align.seqs"] = "align.seqs"; commands["quit"] = "quit"; diff --git a/validparameter.cpp b/validparameter.cpp index 8af4a81..e79a2db 100644 --- a/validparameter.cpp +++ b/validparameter.cpp @@ -211,7 +211,6 @@ void ValidParameters::initCommandParameters() { //{"parameter1","parameter2",...,"last parameter"}; string readdistArray[] = {"phylip","column", "name","cutoff","precision", "group"}; - commandParameters["read.dist"] = addParameters(readdistArray, sizeof(readdistArray)/sizeof(string)); string readotuArray[] = {"list","order","shared", "line", "label","group","sabund", "rabund"}; @@ -292,6 +291,9 @@ void ValidParameters::initCommandParameters() { string distanceArray[] = {"fasta","phylip","clustal","nexus", "calc", "ends", "cutoff", "processors"}; commandParameters["dist.seqs"] = addParameters(distanceArray, sizeof(distanceArray)/sizeof(string)); + string AlignArray[] = {"fasta","phylip","clustal","nexus", "template", "ksize", "align", "match", "mismatch", "gapopen", "gapextend"}; + commandParameters["align.seqs"] = addParameters(AlignArray, sizeof(AlignArray)/sizeof(string)); + string quitArray[] = {}; commandParameters["quit"] = addParameters(quitArray, sizeof(quitArray)/sizeof(string));