372E12700F26365B0095CF7E /* readotucommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E126F0F26365B0095CF7E /* readotucommand.cpp */; };
372E12960F263D5A0095CF7E /* readdistcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E12950F263D5A0095CF7E /* readdistcommand.cpp */; };
372E12ED0F264D320095CF7E /* commandfactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 372E12EC0F264D320095CF7E /* commandfactory.cpp */; };
+ 373C68A40FC1C07D00137ACD /* alignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68A00FC1C07D00137ACD /* alignment.cpp */; };
+ 373C68A50FC1C07D00137ACD /* alignmentcell.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68A20FC1C07D00137ACD /* alignmentcell.cpp */; };
+ 373C68BC0FC1C1A600137ACD /* gotohoverlap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */; };
+ 373C68C50FC1C25F00137ACD /* overlap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68C30FC1C25F00137ACD /* overlap.cpp */; };
+ 373C68CF0FC1C2D900137ACD /* needlemanoverlap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68CD0FC1C2D900137ACD /* needlemanoverlap.cpp */; };
+ 373C68DD0FC1C38D00137ACD /* blastalign.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68DB0FC1C38D00137ACD /* blastalign.cpp */; };
+ 373C68E50FC1C4A500137ACD /* noalign.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68E30FC1C4A500137ACD /* noalign.cpp */; };
+ 373C68F90FC1C6A800137ACD /* suffixdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68F30FC1C6A800137ACD /* suffixdb.cpp */; };
+ 373C68FA0FC1C6A800137ACD /* suffixnodes.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68F50FC1C6A800137ACD /* suffixnodes.cpp */; };
+ 373C68FB0FC1C6A800137ACD /* suffixtree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C68F70FC1C6A800137ACD /* suffixtree.cpp */; };
+ 373C69180FC1C8AF00137ACD /* blastdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C69160FC1C8AF00137ACD /* blastdb.cpp */; };
+ 373C691F0FC1C98600137ACD /* nast.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C691D0FC1C98600137ACD /* nast.cpp */; };
+ 373C692B0FC1C9EB00137ACD /* nastreport.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C69290FC1C9EB00137ACD /* nastreport.cpp */; };
+ 373C69340FC1CA9E00137ACD /* distancedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 373C69320FC1CA9E00137ACD /* distancedb.cpp */; };
374610780F40645300460C57 /* unifracweightedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374610770F40645300460C57 /* unifracweightedcommand.cpp */; };
3746107E0F4064D100460C57 /* weighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3746107D0F4064D100460C57 /* weighted.cpp */; };
374610830F40652400460C57 /* unweighted.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 374610820F40652400460C57 /* unweighted.cpp */; };
378C1B0A0FB0644E004D63F5 /* sequencedb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378C1AFD0FB0644D004D63F5 /* sequencedb.cpp */; };
378C1B0B0FB0644E004D63F5 /* sharedjackknife.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378C1AFF0FB0644D004D63F5 /* sharedjackknife.cpp */; };
378C1B0C0FB0644E004D63F5 /* sharedmarczewski.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378C1B010FB0644D004D63F5 /* sharedmarczewski.cpp */; };
+ 378DC5CF0FBDE1C8003B8607 /* aligncommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 378DC5CE0FBDE1C8003B8607 /* aligncommand.cpp */; };
379293C30F2DE73400B9034A /* treemap.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379293C20F2DE73400B9034A /* treemap.cpp */; };
379294700F2E191800B9034A /* parsimonycommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 3792946F0F2E191800B9034A /* parsimonycommand.cpp */; };
3792948A0F2E258500B9034A /* parsimony.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 379294890F2E258500B9034A /* parsimony.cpp */; };
372E12940F263D5A0095CF7E /* readdistcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readdistcommand.h; sourceTree = "<group>"; };
372E12950F263D5A0095CF7E /* readdistcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readdistcommand.cpp; sourceTree = "<group>"; };
372E12EC0F264D320095CF7E /* commandfactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = commandfactory.cpp; sourceTree = "<group>"; };
+ 373C68A00FC1C07D00137ACD /* alignment.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignment.cpp; sourceTree = "<group>"; };
+ 373C68A10FC1C07D00137ACD /* alignment.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = alignment.hpp; sourceTree = "<group>"; };
+ 373C68A20FC1C07D00137ACD /* alignmentcell.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignmentcell.cpp; sourceTree = "<group>"; };
+ 373C68A30FC1C07D00137ACD /* alignmentcell.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = alignmentcell.hpp; sourceTree = "<group>"; };
+ 373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = gotohoverlap.cpp; sourceTree = "<group>"; };
+ 373C68BB0FC1C1A600137ACD /* gotohoverlap.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = gotohoverlap.hpp; sourceTree = "<group>"; };
+ 373C68C30FC1C25F00137ACD /* overlap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = overlap.cpp; sourceTree = "<group>"; };
+ 373C68C40FC1C25F00137ACD /* overlap.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = overlap.hpp; sourceTree = "<group>"; };
+ 373C68CD0FC1C2D900137ACD /* needlemanoverlap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = needlemanoverlap.cpp; sourceTree = "<group>"; };
+ 373C68CE0FC1C2D900137ACD /* needlemanoverlap.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = needlemanoverlap.hpp; sourceTree = "<group>"; };
+ 373C68DB0FC1C38D00137ACD /* blastalign.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = blastalign.cpp; sourceTree = "<group>"; };
+ 373C68DC0FC1C38D00137ACD /* blastalign.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = blastalign.hpp; sourceTree = "<group>"; };
+ 373C68E30FC1C4A500137ACD /* noalign.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = noalign.cpp; sourceTree = "<group>"; };
+ 373C68E40FC1C4A500137ACD /* noalign.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = noalign.hpp; sourceTree = "<group>"; };
+ 373C68F30FC1C6A800137ACD /* suffixdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixdb.cpp; sourceTree = "<group>"; };
+ 373C68F40FC1C6A800137ACD /* suffixdb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixdb.hpp; sourceTree = "<group>"; };
+ 373C68F50FC1C6A800137ACD /* suffixnodes.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixnodes.cpp; sourceTree = "<group>"; };
+ 373C68F60FC1C6A800137ACD /* suffixnodes.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixnodes.hpp; sourceTree = "<group>"; };
+ 373C68F70FC1C6A800137ACD /* suffixtree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = suffixtree.cpp; sourceTree = "<group>"; };
+ 373C68F80FC1C6A800137ACD /* suffixtree.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = suffixtree.hpp; sourceTree = "<group>"; };
+ 373C69160FC1C8AF00137ACD /* blastdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = blastdb.cpp; sourceTree = "<group>"; };
+ 373C69170FC1C8AF00137ACD /* blastdb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = blastdb.hpp; sourceTree = "<group>"; };
+ 373C691D0FC1C98600137ACD /* nast.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = nast.cpp; sourceTree = "<group>"; };
+ 373C691E0FC1C98600137ACD /* nast.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = nast.hpp; sourceTree = "<group>"; };
+ 373C69290FC1C9EB00137ACD /* nastreport.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = nastreport.cpp; sourceTree = "<group>"; };
+ 373C692A0FC1C9EB00137ACD /* nastreport.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = nastreport.hpp; sourceTree = "<group>"; };
+ 373C69320FC1CA9E00137ACD /* distancedb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = distancedb.cpp; sourceTree = "<group>"; };
+ 373C69330FC1CA9E00137ACD /* distancedb.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = distancedb.hpp; sourceTree = "<group>"; };
374610760F40645300460C57 /* unifracweightedcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = unifracweightedcommand.h; sourceTree = "<group>"; };
374610770F40645300460C57 /* unifracweightedcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = unifracweightedcommand.cpp; sourceTree = "<group>"; };
3746107C0F4064D100460C57 /* weighted.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = weighted.h; sourceTree = "<group>"; };
378C1B000FB0644D004D63F5 /* sharedjackknife.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedjackknife.h; sourceTree = "<group>"; };
378C1B010FB0644D004D63F5 /* sharedmarczewski.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedmarczewski.cpp; sourceTree = "<group>"; };
378C1B020FB0644D004D63F5 /* sharedmarczewski.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sharedmarczewski.h; sourceTree = "<group>"; };
+ 378DC5CD0FBDE1C8003B8607 /* aligncommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = aligncommand.h; sourceTree = "<group>"; };
+ 378DC5CE0FBDE1C8003B8607 /* aligncommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = aligncommand.cpp; sourceTree = "<group>"; };
379293C10F2DE73400B9034A /* treemap.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treemap.h; sourceTree = "<group>"; };
379293C20F2DE73400B9034A /* treemap.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = treemap.cpp; sourceTree = "<group>"; };
3792946E0F2E191800B9034A /* parsimonycommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = parsimonycommand.h; sourceTree = "<group>"; };
isa = PBXGroup;
children = (
37D927BA0F21331F001D4494 /* averagelinkage.cpp */,
+ 373C68DC0FC1C38D00137ACD /* blastalign.hpp */,
+ 373C68DB0FC1C38D00137ACD /* blastalign.cpp */,
37D928A60F2133C0001D4494 /* calculators */,
37D927C20F21331F001D4494 /* cluster.hpp */,
37D927C10F21331F001D4494 /* cluster.cpp */,
37D927D10F21331F001D4494 /* commandoptionparser.hpp */,
37D927D00F21331F001D4494 /* commandoptionparser.cpp */,
37D927D20F21331F001D4494 /* completelinkage.cpp */,
- 37D927D40F21331F001D4494 /* database.hpp */,
- 37D927D30F21331F001D4494 /* database.cpp */,
7E4130F70F8E58FA00381DD0 /* dlibshuff.h */,
7E4130F60F8E58FA00381DD0 /* dlibshuff.cpp */,
37D927D60F21331F001D4494 /* display.h */,
37D927DD0F21331F001D4494 /* fileoutput.cpp */,
37D927E00F21331F001D4494 /* globaldata.hpp */,
37D927DF0F21331F001D4494 /* globaldata.cpp */,
+ 373C68BB0FC1C1A600137ACD /* gotohoverlap.hpp */,
+ 373C68BA0FC1C1A600137ACD /* gotohoverlap.cpp */,
375873ED0F7D646F0040F377 /* heatmap.h */,
375873EE0F7D646F0040F377 /* heatmap.cpp */,
37D927E60F21331F001D4494 /* inputdata.h */,
37D927E50F21331F001D4494 /* inputdata.cpp */,
- 37D927EA0F21331F001D4494 /* kmer.hpp */,
- 37D927E90F21331F001D4494 /* kmer.cpp */,
- 37D927EC0F21331F001D4494 /* kmerdb.hpp */,
- 37D927EB0F21331F001D4494 /* kmerdb.cpp */,
7E412F420F8D213C00381DD0 /* libshuff.h */,
7E412FE90F8D3E2C00381DD0 /* libshuff.cpp */,
37E3ED800F4D9D0D00B5DF02 /* mothur.h */,
37D927EF0F21331F001D4494 /* mothur.cpp */,
37D927F10F21331F001D4494 /* nameassignment.hpp */,
37D927F00F21331F001D4494 /* nameassignment.cpp */,
+ 373C691E0FC1C98600137ACD /* nast.hpp */,
+ 373C691D0FC1C98600137ACD /* nast.cpp */,
+ 373C692A0FC1C9EB00137ACD /* nastreport.hpp */,
+ 373C69290FC1C9EB00137ACD /* nastreport.cpp */,
+ 373C68CE0FC1C2D900137ACD /* needlemanoverlap.hpp */,
+ 373C68CD0FC1C2D900137ACD /* needlemanoverlap.cpp */,
+ 373C68E40FC1C4A500137ACD /* noalign.hpp */,
+ 373C68E30FC1C4A500137ACD /* noalign.cpp */,
37D927F60F21331F001D4494 /* observable.h */,
+ 373C68C40FC1C25F00137ACD /* overlap.hpp */,
+ 373C68C30FC1C25F00137ACD /* overlap.cpp */,
37D927FC0F21331F001D4494 /* progress.hpp */,
37D927FB0F21331F001D4494 /* progress.cpp */,
37D928040F21331F001D4494 /* raredisplay.h */,
isa = PBXGroup;
children = (
37D927CD0F21331F001D4494 /* command.hpp */,
+ 378DC5CD0FBDE1C8003B8607 /* aligncommand.h */,
+ 378DC5CE0FBDE1C8003B8607 /* aligncommand.cpp */,
37C1D9710F86506E0059E3F0 /* binsequencecommand.h */,
37C1D9720F86506E0059E3F0 /* binsequencecommand.cpp */,
21DDC0190F97A8FE0060691C /* bootstrapsharedcommand.h */,
37D928AC0F213420001D4494 /* containers */ = {
isa = PBXGroup;
children = (
+ 373C68A10FC1C07D00137ACD /* alignment.hpp */,
+ 373C68A00FC1C07D00137ACD /* alignment.cpp */,
+ 373C68A30FC1C07D00137ACD /* alignmentcell.hpp */,
+ 373C68A20FC1C07D00137ACD /* alignmentcell.cpp */,
+ 373C69170FC1C8AF00137ACD /* blastdb.hpp */,
+ 373C69160FC1C8AF00137ACD /* blastdb.cpp */,
+ 37D927D40F21331F001D4494 /* database.hpp */,
+ 37D927D30F21331F001D4494 /* database.cpp */,
37D927D50F21331F001D4494 /* datavector.hpp */,
+ 373C69330FC1CA9E00137ACD /* distancedb.hpp */,
+ 373C69320FC1CA9E00137ACD /* distancedb.cpp */,
37D927DC0F21331F001D4494 /* fastamap.h */,
37D927DB0F21331F001D4494 /* fastamap.cpp */,
375873EA0F7D64520040F377 /* fullmatrix.h */,
375873EB0F7D64520040F377 /* fullmatrix.cpp */,
37D927E20F21331F001D4494 /* groupmap.h */,
37D927E10F21331F001D4494 /* groupmap.cpp */,
+ 37D927EA0F21331F001D4494 /* kmer.hpp */,
+ 37D927E90F21331F001D4494 /* kmer.cpp */,
+ 37D927EC0F21331F001D4494 /* kmerdb.hpp */,
+ 37D927EB0F21331F001D4494 /* kmerdb.cpp */,
37D927EE0F21331F001D4494 /* listvector.hpp */,
37D927ED0F21331F001D4494 /* listvector.cpp */,
37D927F80F21331F001D4494 /* ordervector.hpp */,
37D928320F21331F001D4494 /* sharedsabundvector.cpp */,
37D928450F21331F001D4494 /* sparsematrix.hpp */,
37D928440F21331F001D4494 /* sparsematrix.cpp */,
+ 373C68F40FC1C6A800137ACD /* suffixdb.hpp */,
+ 373C68F30FC1C6A800137ACD /* suffixdb.cpp */,
+ 373C68F60FC1C6A800137ACD /* suffixnodes.hpp */,
+ 373C68F50FC1C6A800137ACD /* suffixnodes.cpp */,
+ 373C68F80FC1C6A800137ACD /* suffixtree.hpp */,
+ 373C68F70FC1C6A800137ACD /* suffixtree.cpp */,
37AD4DB90F28E2FE00AA2D49 /* tree.h */,
37AD4DBA0F28E2FE00AA2D49 /* tree.cpp */,
379293C10F2DE73400B9034A /* treemap.h */,
378C1B0C0FB0644E004D63F5 /* sharedmarczewski.cpp in Sources */,
37C753CE0FB3415200DBD02E /* distancecommand.cpp in Sources */,
379643ED0FB9B5A80081FDB6 /* readseqs.cpp in Sources */,
+ 378DC5CF0FBDE1C8003B8607 /* aligncommand.cpp in Sources */,
+ 373C68A40FC1C07D00137ACD /* alignment.cpp in Sources */,
+ 373C68A50FC1C07D00137ACD /* alignmentcell.cpp in Sources */,
+ 373C68BC0FC1C1A600137ACD /* gotohoverlap.cpp in Sources */,
+ 373C68C50FC1C25F00137ACD /* overlap.cpp in Sources */,
+ 373C68CF0FC1C2D900137ACD /* needlemanoverlap.cpp in Sources */,
+ 373C68DD0FC1C38D00137ACD /* blastalign.cpp in Sources */,
+ 373C68E50FC1C4A500137ACD /* noalign.cpp in Sources */,
+ 373C68F90FC1C6A800137ACD /* suffixdb.cpp in Sources */,
+ 373C68FA0FC1C6A800137ACD /* suffixnodes.cpp in Sources */,
+ 373C68FB0FC1C6A800137ACD /* suffixtree.cpp in Sources */,
+ 373C69180FC1C8AF00137ACD /* blastdb.cpp in Sources */,
+ 373C691F0FC1C98600137ACD /* nast.cpp in Sources */,
+ 373C692B0FC1C9EB00137ACD /* nastreport.cpp in Sources */,
+ 373C69340FC1CA9E00137ACD /* distancedb.cpp in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
--- /dev/null
+/*
+ * aligncommand.cpp
+ * Mothur
+ *
+ * Created by Sarah Westcott on 5/15/09.
+ * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ * This version of nast does everything I think that the greengenes nast server does and then some. I have added the
+ * feature of allowing users to define their database, kmer size for searching, alignment penalty values and alignment
+ * method. This latter feature is perhaps most significant. nastPlus enables a user to use either a Needleman-Wunsch
+ * (non-affine gap penalty) or Gotoh (affine gap penalty) pairwise alignment algorithm. This is significant because it
+ * allows for a global alignment and not the local alignment provided by bLAst. Furthermore, it has the potential to
+ * provide a better alignment because of the banding method employed by blast (I'm not sure about this).
+ *
+ * to compile type:
+ * make
+ *
+ * for basic instructions on how to run nastPlus type:
+ * ./nastPlus
+ */
+
+#include "aligncommand.h"
+#include "sequence.hpp"
+
+#include "alignment.hpp"
+#include "gotohoverlap.hpp"
+#include "needlemanoverlap.hpp"
+#include "blastalign.hpp"
+#include "noalign.hpp"
+
+#include "database.hpp"
+#include "kmerdb.hpp"
+#include "suffixdb.hpp"
+#include "blastdb.hpp"
+#include "distancedb.hpp"
+
+#include "nast.hpp"
+#include "nastreport.hpp"
+
+
+//**********************************************************************************************************************
+AlignCommand::AlignCommand(){
+ try {
+ globaldata = GlobalData::getInstance();
+ candidateFileName = globaldata->inputFileName;
+ templateFileName = globaldata->getTemplateFile();
+ openInputFile(candidateFileName, in);
+ convert(globaldata->getKSize(), kmerSize);
+ convert(globaldata->getMatch(), match);
+ convert(globaldata->getMismatch(), misMatch);
+ convert(globaldata->getGapopen(), gapOpen);
+ convert(globaldata->getGapextend(), gapExtend);
+ distanceFileName = "????";
+ }
+ catch(exception& e) {
+ cout << "Standard Error: " << e.what() << " has occurred in the AlignCommand class Function AlignCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+ exit(1);
+ }
+ catch(...) {
+ cout << "An unknown error has occurred in the AlignCommand class function AlignCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
+
+AlignCommand::~AlignCommand(){
+
+}
+
+//**********************************************************************************************************************
+
+int AlignCommand::execute(){
+ try {
+ srand( (unsigned)time( NULL ) ); //needed to assign names to temporary files
+
+ Database* templateDB;
+ if(globaldata->getSearch() == "kmer") { templateDB = new KmerDB(templateFileName, kmerSize); }
+ else if(globaldata->getSearch() == "suffix") { templateDB = new SuffixDB(templateFileName); }
+ else if(globaldata->getSearch() == "blast") { templateDB = new BlastDB(templateFileName, gapOpen, gapExtend, match, misMatch); }
+ else if(globaldata->getSearch() == "distance") { templateDB = new DistanceDB(templateFileName, distanceFileName); }
+ else { cout << globaldata->getSearch() << " is not a valid search option. I will run the command using suffix." << endl;
+ templateDB = new SuffixDB(templateFileName); }
+
+ Alignment* alignment;
+ if(globaldata->getAlign() == "gotoh") { alignment = new GotohOverlap(gapOpen, gapExtend, match, misMatch, 3000); }
+ else if(globaldata->getAlign() == "needleman") { alignment = new NeedlemanOverlap(gapOpen, match, misMatch, 3000); }
+ else if(globaldata->getAlign() == "blast") { alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); }
+ else if(globaldata->getAlign() == "noalign") { alignment = new NoAlign(); }
+ else { cout << globaldata->getAlign() << " is not a valid alignment option. I will run the command using blast." << endl;
+ alignment = new BlastAlignment(gapOpen, gapExtend, match, misMatch); }
+
+ int numFastaSeqs=count(istreambuf_iterator<char>(in),istreambuf_iterator<char>(), '>');
+ in.seekg(0);
+
+ string candidateAligngmentFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + globaldata->getSearch() + '.' + globaldata->getAlign() + ".nast.align";
+ ofstream candidateAlignmentFile;
+ openOutputFile(candidateAligngmentFName, candidateAlignmentFile);
+
+ string candidateReportFName = candidateFileName.substr(0,candidateFileName.find_last_of(".")+1) + globaldata->getSearch() + '.' + globaldata->getAlign() + ".nast.report";
+ NastReport report(candidateReportFName);
+
+ cout << "We are going to align the " << numFastaSeqs << " sequences in " << candidateFileName << "..." << endl;
+ cout.flush();
+
+ int start = time(NULL);
+
+ for(int i=0;i<numFastaSeqs;i++){
+
+ Sequence* candidateSeq = new Sequence(in);
+ report.setCandidate(candidateSeq);
+
+ Sequence* templateSeq = templateDB->findClosestSequence(candidateSeq);
+ report.setTemplate(templateSeq);
+ report.setSearchParameters(globaldata->getSearch(), templateDB->getSearchScore());
+
+ Nast nast(alignment, candidateSeq, templateSeq);
+ report.setAlignmentParameters(globaldata->getAlign(), alignment);
+ report.setNastParameters(nast);
+
+ candidateAlignmentFile << '>' << candidateSeq->getName() << '\n' << candidateSeq->getAligned() << endl;
+ candidateAlignmentFile.flush();
+
+ if((i+1) % 100 == 0){
+ cout << "It has taken " << time(NULL) - start << " secs to align " << i+1 << " sequences" << endl;
+ }
+ report.print();
+
+ delete candidateSeq;
+ }
+
+ cout << "It took " << time(NULL) - start << " secs to align " << numFastaSeqs << " sequences" << endl;
+ cout << endl;
+
+ delete templateDB;
+ delete alignment;
+
+
+ return 0;
+ }
+ catch(exception& e) {
+ cout << "Standard Error: " << e.what() << " has occurred in the AlignCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+ exit(1);
+ }
+ catch(...) {
+ cout << "An unknown error has occurred in the AlignCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
+ exit(1);
+ }
+}
+
+//**********************************************************************************************************************
\ No newline at end of file
--- /dev/null
+#ifndef ALIGNCOMMAND_H
+#define ALIGNCOMMAND_H
+
+/*
+ * aligncommand.h
+ * Mothur
+ *
+ * Created by Sarah Westcott on 5/15/09.
+ * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
+ *
+ */
+
+#include "command.hpp"
+#include "globaldata.hpp"
+
+class AlignCommand : public Command {
+
+ public:
+ AlignCommand();
+ ~AlignCommand();
+ int execute();
+
+ private:
+ GlobalData* globaldata;
+ string candidateFileName, templateFileName, distanceFileName;
+ int kmerSize;
+ float match, misMatch, gapOpen, gapExtend;
+ ofstream out;
+ ifstream in;
+
+};
+
+
+
+#endif
\ No newline at end of file
--- /dev/null
+/*
+ * alignment.cpp
+ *
+ * Created by Pat Schloss on 12/15/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This is a class for an abstract datatype for classes that implement various types of alignment algorithms.
+ * As of 12/18/08 these included alignments based on blastn, needleman-wunsch, and the Gotoh algorithms
+ *
+ */
+
+using namespace std;
+
+#include <string>
+#include <vector>
+
+#include "alignmentcell.hpp"
+#include "alignment.hpp"
+
+#include <iostream>
+
+/**************************************************************************************************/
+
+Alignment::Alignment() { /* do nothing */ }
+
+/**************************************************************************************************/
+
+Alignment::Alignment(int A) : nCols(A), nRows(A) {
+
+ alignment.resize(nRows); // For the Gotoh and Needleman-Wunsch we initialize the dynamic programming
+ for(int i=0;i<nRows;i++){ // matrix by initializing a matrix that is A x A. By default we will set A
+ alignment[i].resize(nCols); // at 2000 for 16S rRNA gene sequences
+ }
+
+}
+
+/**************************************************************************************************/
+
+void Alignment::traceBack(){ // This traceback routine is used by the dynamic programming algorithms
+ // to fill the values of seqAaln and seqBaln
+ seqAaln = "";
+ seqBaln = "";
+ int row = lB-1;
+ int column = lA-1;
+// seqAstart = 1;
+// seqAend = column;
+
+ AlignmentCell currentCell = alignment[row][column]; // Start the traceback from the bottom-right corner of the
+ // matrix
+
+ if(currentCell.prevCell == 'x'){ seqAaln = seqBaln = "NOALIGNMENT"; }//If there's an 'x' in the bottom-
+ else{ // right corner bail out because it means nothing got aligned
+ while(currentCell.prevCell != 'x'){ // while the previous cell isn't an 'x', keep going...
+
+ if(currentCell.prevCell == 'u'){ // if the pointer to the previous cell is 'u', go up in the
+ seqAaln = '-' + seqAaln; // matrix. this indicates that we need to insert a gap in
+ seqBaln = seqB[row] + seqBaln; // seqA and a base in seqB
+ currentCell = alignment[--row][column];
+ }
+ else if(currentCell.prevCell == 'l'){ // if the pointer to the previous cell is 'l', go to the left
+ seqBaln = '-' + seqBaln; // in the matrix. this indicates that we need to insert a gap
+ seqAaln = seqA[column] + seqAaln; // in seqB and a base in seqA
+ currentCell = alignment[row][--column];
+ }
+ else{
+ seqAaln = seqA[column] + seqAaln; // otherwise we need to go diagonally up and to the left,
+ seqBaln = seqB[row] + seqBaln; // here we add a base to both alignments
+ currentCell = alignment[--row][--column];
+ }
+ }
+ }
+
+ pairwiseLength = seqAaln.length();
+ seqAstart = 1; seqAend = 0;
+ seqBstart = 1; seqBend = 0;
+
+ for(int i=0;i<seqAaln.length();i++){
+ if(seqAaln[i] != '-' && seqBaln[i] == '-') { seqAstart++; }
+ else if(seqAaln[i] == '-' && seqBaln[i] != '-') { seqBstart++; }
+ else { break; }
+ }
+
+ pairwiseLength -= (seqAstart + seqBstart - 2);
+
+ for(int i=seqAaln.length()-1; i>=0;i--){
+ if(seqAaln[i] != '-' && seqBaln[i] == '-') { seqAend++; }
+ else if(seqAaln[i] == '-' && seqBaln[i] != '-') { seqBend++; }
+ else { break; }
+ }
+ pairwiseLength -= (seqAend + seqBend);
+
+ seqAend = seqA.length() - seqAend - 1;
+ seqBend = seqB.length() - seqBend - 1;
+
+}
+
+/**************************************************************************************************/
+
+string Alignment::getSeqAAln(){
+ return seqAaln; // this is called to get the alignment of seqA
+}
+
+/**************************************************************************************************/
+
+string Alignment::getSeqBAln(){
+ return seqBaln; // this is called to get the alignment of seqB
+}
+
+/**************************************************************************************************/
+
+int Alignment::getCandidateStartPos(){
+ return seqAstart; // this is called to report the quality of the alignment
+}
+
+/**************************************************************************************************/
+
+int Alignment::getCandidateEndPos(){
+ return seqAend; // this is called to report the quality of the alignment
+}
+
+/**************************************************************************************************/
+
+int Alignment::getTemplateStartPos(){
+ return seqBstart; // this is called to report the quality of the alignment
+}
+
+/**************************************************************************************************/
+
+int Alignment::getTemplateEndPos(){
+ return seqBend; // this is called to report the quality of the alignment
+}
+
+/**************************************************************************************************/
+
+int Alignment::getPairwiseLength(){
+ return pairwiseLength; // this is the pairwise alignment length
+}
+
+/**************************************************************************************************/
+
+//int Alignment::getLongestTemplateGap(){
+//
+// int length = seqBaln.length();
+// int longGap = 0;
+// int gapLength = 0;
+//
+// int start = seqAstart;
+// if(seqAstart < seqBstart){ start = seqBstart; }
+// for(int i=seqAstart;i<length;i++){
+// if(seqBaln[i] == '-'){
+// gapLength++;
+// }
+// else{
+// if(gapLength > 0){
+// if(gapLength > longGap){ longGap = gapLength; }
+// }
+// gapLength = 0;
+// }
+// }
+// return longGap;
+//}
+
+/**************************************************************************************************/
--- /dev/null
+#ifndef DPALIGNMENT_H
+#define DPALIGNMENT_H
+
+/*
+ * dpalignment.h
+ *
+ *
+ * Created by Pat Schloss on 12/15/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This is a class for an abstract datatype for classes that implement various types of alignment algorithms.
+ * As of 12/18/08 these included alignments based on blastn, needleman-wunsch, and the Gotoh algorithms
+ *
+ */
+using namespace std;
+
+#include "mothur.h"
+#include "alignmentcell.hpp"
+
+/**************************************************************************************************/
+
+class Alignment {
+
+public:
+ Alignment(int);
+ Alignment();
+ virtual void align(string, string) = 0;
+
+ float getAlignmentScore();
+ string getSeqAAln();
+ string getSeqBAln();
+ int getCandidateStartPos();
+ int getCandidateEndPos();
+ int getTemplateStartPos();
+ int getTemplateEndPos();
+
+ int getPairwiseLength();
+// int getLongestTemplateGap();
+
+protected:
+ void traceBack();
+ string seqA, seqAaln;
+ string seqB, seqBaln;
+ int seqAstart, seqAend;
+ int seqBstart, seqBend;
+ int pairwiseLength;
+ int nRows, nCols, lA, lB;
+ vector<vector<AlignmentCell> > alignment;
+};
+
+/**************************************************************************************************/
+
+#endif
--- /dev/null
+/*
+ * alignmentcell.cpp
+ *
+ * Created by Pat Schloss on 12/15/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This class is pretty basic. Each AlignmentCell object contains a pointer to the previous cell and different values
+ * used to calcualte the alignment. Initially everything is set to zero and all pointers are set to 'x'
+ *
+ */
+
+#include "alignmentcell.hpp"
+
+//********************************************************************************************************************
+
+AlignmentCell::AlignmentCell() : prevCell('x'), cValue(0.0000), dValue(0.0000), iValue(0.0000) {};
+
+//********************************************************************************************************************
--- /dev/null
+#ifndef ALIGNMENTCELL_H
+#define ALIGNMENTCELL_H
+
+/*
+ * alignmentcell.hpp
+ *
+ *
+ * Created by Pat Schloss on 12/15/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This class is pretty basic. Each AlignmentCell object contains a pointer to the previous cell and different values
+ * used to calcualte the alignment. Initially everything is set to zero and all pointers are set to 'x'
+ *
+ */
+
+//********************************************************************************************************************
+
+class AlignmentCell {
+
+public:
+ AlignmentCell();
+ char prevCell;
+ float cValue;
+ float dValue;
+ float iValue;
+};
+
+//********************************************************************************************************************
+
+#endif
--- /dev/null
+/*
+ * blastalign.cpp
+ *
+ *
+ * Created by Pat Schloss on 12/16/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This is a basic alignment method that gets the blast program to do the heavy lifting. In the future, we should
+ * probably incorporate NCBI's library so that we don't have to call on a user-supplied executable. This is a child
+ * of the Alignment class, which requires a constructor and align method.
+ *
+ */
+
+using namespace std;
+
+#include "alignment.hpp"
+#include "blastalign.hpp"
+
+
+//**************************************************************************************************/
+
+BlastAlignment::BlastAlignment(float go, float ge, float m, float mm) :
+ match(m), // This is the score to award for two nucleotides matching (match >= 0)
+ mismatch(mm) // This is the penalty to assess for a mismatch (mismatch <= 0)
+{
+ gapOpen = abs(go); // This is the penalty to assess for opening a gap (gapOpen >= 0)
+ gapExtend = abs(ge); // This is the penalty to assess for extending a gap (gapExtend >= 0)
+
+ int randNumber = rand();
+ candidateFileName = toString(randNumber) + ".candidate";
+ templateFileName = toString(randNumber) + ".template";
+ blastFileName = toString(randNumber) + ".pairwise";
+}
+
+//**************************************************************************************************/
+
+BlastAlignment::~BlastAlignment(){ // The desctructor should clean up by removing the temporary
+ remove(candidateFileName.c_str()); // files used to run bl2seq
+ remove(templateFileName.c_str());
+ remove(blastFileName.c_str());
+}
+
+//**************************************************************************************************/
+
+void BlastAlignment::align(string seqA, string seqB){ //Use blastn to align the two sequences
+
+ ofstream candidateFile(candidateFileName.c_str()); // Write the sequence to be aligned to a temporary candidate seq file
+ candidateFile << ">candidate" << endl << seqA << endl;
+ candidateFile.close();
+
+ ofstream templateFile(templateFileName.c_str()); // Write the unaligned template sequence to a temporary candidate seq file
+ templateFile << ">template" << endl << seqB << endl;
+ templateFile.close();
+
+ // The blastCommand assumes that we have DNA sequences (blastn) and that they are fairly similar (-e 0.001) and
+ // that we don't want to apply any kind of complexity filtering (-F F)
+ string blastCommand = "~/Pipeline/src/cpp/production/blast/bin/bl2seq -p blastn -i " + candidateFileName + " -j " + templateFileName + " -e 0.0001 -F F -o " + blastFileName + " -W 11";
+ blastCommand += " -r " + toString(match) + " -q " + toString(mismatch);
+ blastCommand += " -G " + toString(gapOpen) + " -E " + toString(gapExtend);
+
+ system(blastCommand.c_str()); // Here we assume that "bl2seq" is in the users path or in the same folder as
+ // this executable
+ setPairwiseSeqs();
+}
+
+/**************************************************************************************************/
+
+void BlastAlignment::setPairwiseSeqs(){ // This method call assigns the blast generated alignment
+ // to the pairwise entry in the Sequence class for the
+ // candidate and template Sequence objects
+ ifstream blastFile;
+ openInputFile(blastFileName, blastFile);
+
+ seqAaln = "";
+ seqBaln = "";
+
+ int candidateLength, templateLength;
+ char d;
+
+ string candidateName, templateName;
+
+ while(d=blastFile.get() != '='){};
+ blastFile >> candidateName; // Get the candidate sequence name from flatfile
+
+ while(d=blastFile.get() != '('){};
+ blastFile >> candidateLength; // Get the candidate sequence length from flatfile
+
+ while(d=blastFile.get()){
+ if(d == '>'){
+ blastFile >> templateName; // Get the template sequence name from flatfile
+ break;
+ }
+ else if(d == '*'){ // We go here if there is no significant match
+
+ seqAstart = 0;
+ seqBstart = 0;
+ seqAend = 0;
+ seqBend = 0;
+ pairwiseLength = 0;
+
+// string dummy;
+// while(dummy != "query:"){ cout << dummy << endl;blastFile >> dummy; }
+// blastFile >> seqBend;
+// cout << seqBend << endl;
+// for(int i=0;i<seqBend;i++){
+// seqAaln += 'Z';
+// seqBaln += 'X';
+// }
+// pairwiseLength = 0;
+ return;
+ }
+ }
+
+ while(d=blastFile.get() != '='){};
+ blastFile >> templateLength; // Get the template sequence length from flatfile
+
+ while(d=blastFile.get() != 'Q'){}; // Suck up everything else until we get to the start of the alignment
+ int queryStart, sbjctStart, queryEnd, sbjctEnd;
+ string queryLabel, sbjctLabel, query, sbjct;
+
+ blastFile >> queryLabel; queryLabel = 'Q' + queryLabel;
+
+
+ while(queryLabel == "Query:"){
+ blastFile >> queryStart >> query >> queryEnd;
+
+ while(d=blastFile.get() != 'S'){};
+
+ blastFile >> sbjctLabel >> sbjctStart >> sbjct >> sbjctEnd;
+
+ if(seqAaln == ""){
+ seqAstart = queryStart;
+ seqBstart = sbjctStart;
+ }
+
+ seqAaln += query; // concatenate each line of the sequence to what we already have
+ seqBaln += sbjct; // for the query and template (subject) sequence
+
+ blastFile >> queryLabel;
+ }
+ seqAend = queryEnd;
+ seqBend = sbjctEnd;
+ pairwiseLength = seqAaln.length();
+
+ for(int i=1;i<seqBstart;i++){ // Since the alignments don't always start at (1, 1), we need to pad
+ seqAaln = 'Z' + seqAaln; // the sequences so that they start at the same point
+ seqBaln = 'X' + seqBaln;
+ }
+
+ for(int i=seqBend+1;i<=templateLength;i++){ // since the sequences don't necessarily end at the same point, we
+ seqAaln += 'Z'; // again need ot pad the sequences so that they extend to the length
+ seqBaln += 'X'; // of the template sequence
+ }
+}
+
+//**************************************************************************************************/
--- /dev/null
+/*
+ * blastalign.hpp
+ *
+ *
+ * Created by Pat Schloss on 12/16/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This is a basic alignment method that gets the blast program to do the heavy lifting. In the future, we should
+ * probably incorporate NCBI's library so that we don't have to call on a user-supplied executable. This is a child
+ * of the Alignment class, which requires a constructor and align method.
+ *
+ */
+
+#include "mothur.h"
+
+class BlastAlignment : public Alignment {
+
+public:
+ BlastAlignment(float, float, float, float);
+ ~BlastAlignment();
+ void align(string, string);
+private:
+
+ string candidateFileName;
+ string templateFileName;
+ string blastFileName;
+
+ void setPairwiseSeqs();
+ float match;
+ float mismatch;
+ float gapOpen;
+ float gapExtend;
+};
+
--- /dev/null
+/*
+ * blastdb.cpp
+ *
+ *
+ * Created by Pat Schloss on 12/22/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+using namespace std;
+
+
+#include "database.hpp"
+#include "sequence.hpp"
+#include "blastdb.hpp"
+
+/**************************************************************************************************/
+
+BlastDB::BlastDB(string fastaFileName, float gO, float gE, float m, float mM) : Database(fastaFileName),
+gapOpen(gO), gapExtend(gE), match(m), misMatch(mM) {
+
+ cout << "Generating the temporary BLAST database...\t"; cout.flush();
+
+ int randNumber = rand();
+ dbFileName = toString(randNumber) + ".template.unaligned.fasta";
+ queryFileName = toString(randNumber) + ".candidate.unaligned.fasta";
+ blastFileName = toString(randNumber) + ".blast";
+
+
+ ofstream unalignedFastaFile;
+ openOutputFile(dbFileName, unalignedFastaFile);
+
+ for(int i=0;i<numSeqs;i++){ // generating a fasta file with unaligned template
+ unalignedFastaFile << '>' << i << endl; // sequences, which will be input to formatdb
+ unalignedFastaFile << templateSequences[i]->getUnaligned() << endl;
+ }
+ unalignedFastaFile.close();
+
+ string formatdbCommand = "~/Pipeline/src/cpp/production/blast/bin/formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability
+ system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F
+ // option tells formatdb that seqs are DNA, not prot
+ cout << "DONE." << endl << endl; cout.flush();
+ emptySequence = new Sequence();
+ emptySequence->setName("no_match");
+ emptySequence->setUnaligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
+ emptySequence->setAligned("XXXXXXXXXXXXXXXXXXXXXXXXXXXXX");
+
+
+}
+
+/**************************************************************************************************/
+
+BlastDB::~BlastDB(){
+ remove(queryFileName.c_str()); // let's clean stuff up and remove the temp files
+ remove(dbFileName.c_str()); // let's clean stuff up and remove the temp files
+ remove(blastFileName.c_str()); // let's clean stuff up and remove the temp files
+}
+
+/**************************************************************************************************/
+
+Sequence* BlastDB::findClosestSequence(Sequence* candidate){
+
+ ofstream queryFile;
+ openOutputFile(queryFileName, queryFile);
+ queryFile << '>' << candidate->getName() << endl;
+ queryFile << candidate->getUnaligned() << endl;
+ queryFile.close();
+
+
+// the goal here is to quickly survey the database to find the closest match. To do this we are using the default
+// wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too
+// long. With this setting, it seems comparable in speed to the suffix tree approach.
+
+ string blastCommand = "~/Pipeline/src/cpp/production/blast/bin/blastall -p blastn -d " + dbFileName + " -b 1 -m 8 -W 28";
+ blastCommand += (" -i " + queryFileName + " -o " + blastFileName);
+ system(blastCommand.c_str());
+
+ ifstream m8FileHandle;
+ openInputFile(blastFileName, m8FileHandle);
+
+ string dummy;
+ int templateAccession;
+ gobble(m8FileHandle);
+ if(!m8FileHandle.eof()){
+ m8FileHandle >> dummy >> templateAccession >> searchScore;
+ m8FileHandle.close();
+ return templateSequences[templateAccession];
+ }
+ else{
+ searchScore = 0.00;
+ return emptySequence;
+ }
+}
+
+/**************************************************************************************************/
--- /dev/null
+#ifndef BLASTDB_HPP
+#define BLASTDB_HPP
+
+
+/*
+ * blastdb.hpp
+ *
+ *
+ * Created by Pat Schloss on 12/22/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+
+class BlastDB : public Database {
+public:
+ BlastDB(string, float, float, float, float);
+ ~BlastDB();
+ Sequence* findClosestSequence(Sequence*);
+
+private:
+ string dbFileName;
+ string queryFileName;
+ string blastFileName;
+
+ float gapOpen;
+ float gapExtend;
+ float match;
+ float misMatch;
+ Sequence* emptySequence;
+};
+
+#endif
#include "bootstrapsharedcommand.h"
#include "concensuscommand.h"
#include "distancecommand.h"
+#include "aligncommand.h"
/***********************************************************/
else if(commandName == "bootstrap.shared") { command = new BootSharedCommand(); }
else if(commandName == "concensus") { command = new ConcensusCommand(); }
else if(commandName == "dist.seqs") { command = new DistanceCommand(); }
+ else if(commandName == "align.seqs") { command = new AlignCommand(); }
else { command = new NoCommand(); }
return command;
/**************************************************************************************************/
-Database::Database(string fastaFileName){
-
- ifstream fastaFile(fastaFileName.c_str());
- if(!fastaFile) {
- cerr << "Error: Could not open " << fastaFileName << endl;
- exit(1);
- }
+Database::Database(string fastaFileName){ // This assumes that the template database is in fasta format, may
+ // need to alter this in the future?
+
+ ifstream fastaFile;
+ openInputFile(fastaFileName, fastaFile);
+
cout << endl << "Reading in the " << fastaFileName << " template sequences...\t"; cout.flush();
- numSeqs=count(istreambuf_iterator<char>(fastaFile),istreambuf_iterator<char>(), '>');
- fastaFile.seekg(0);
+ numSeqs=count(istreambuf_iterator<char>(fastaFile),istreambuf_iterator<char>(), '>'); // count the number of
+ fastaFile.seekg(0); // sequences
templateSequences.resize(numSeqs);
string seqName, sequence;
for(int i=0;i<numSeqs;i++){
- templateSequences[i] = new Sequence();
-
+ templateSequences[i] = new Sequence(); // We're storing the aligned template sequences as a vector of
+ // pointers to Sequence objects
fastaFile >> seqName;
templateSequences[i]->setName(seqName);
while(fastaFile && (letter=fastaFile.get()) != '>'){
if(isprint(letter)){
letter = toupper(letter);
+ if(letter == 'U'){letter = 'T';}
aligned += letter;
}
}
- templateSequences[i]->setAligned(aligned);
- templateSequences[i]->setUnaligned(aligned);
- fastaFile.putback(letter);
+ templateSequences[i]->setAligned(aligned); // Obviously, we need the fully aligned template sequence
+ templateSequences[i]->setUnaligned(aligned);// We will also need the unaligned sequence for pairwise alignments
+ fastaFile.putback(letter); // and database searches
}
fastaFile.close();
}
/**************************************************************************************************/
+
+float Database::getSearchScore() { return searchScore; } // we're assuming that the search is already done
+
+/**************************************************************************************************/
#include "mothur.h"
+class Sequence;
+
class Database {
public:
Database(string);
virtual Sequence* findClosestSequence(Sequence*) = 0;
-
+ virtual float getSearchScore();
protected:
int numSeqs;
- vector<Sequence*> templateSequences;
+ float searchScore;
+ vector<Sequence*> templateSequences;
};
#endif
remove(distFile.c_str());
- // # if defined (WIN_VERSION)
+ //# if defined (_WIN32)
+ //figure out how to implement the fork and wait commands in windows
// driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff);
- // # endif
+ //# endif
- # if defined (__APPLE__) || (__MACH__)
+ # if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
if(processors == 1){
driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff);
}
wait(NULL);
}
wait(NULL);
+ # else
+ driver(distCalculator, seqDB, 0, numSeqs, distFile, cutoff);
# endif
delete distCalculator;
--- /dev/null
+/*
+ * distancedb.cpp
+ *
+ *
+ * Created by Pat Schloss on 12/29/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+
+using namespace std;
+
+
+#include "database.hpp"
+#include "sequence.hpp"
+#include "distancedb.hpp"
+
+
+/**************************************************************************************************/
+
+DistanceDB::DistanceDB(string fastaFileName, string distanceFileName) : Database(fastaFileName) {
+
+ ifstream inputData;
+ openInputFile(distanceFileName, inputData);
+ int numCandSeqs=count(istreambuf_iterator<char>(inputData),istreambuf_iterator<char>(), '\n'); // count the number of
+ inputData.seekg(0); // sequences
+
+ hit closestMatch;
+ string candidateSeqName;
+ string junk;
+
+ mostSimSequenceVector.resize(numCandSeqs);
+
+ for(int i=0;i<numCandSeqs;i++){
+ inputData >> candidateSeqName >> closestMatch.seqName >> closestMatch.indexNumber >> closestMatch.simScore;
+// getline(inputData, junk);
+ mostSimSequenceVector[i] = closestMatch;
+ }
+ cout << numCandSeqs << endl;
+ searchIndex = 0;
+ inputData.close();
+}
+
+/**************************************************************************************************/
+
+Sequence* DistanceDB::findClosestSequence(Sequence* candidateSeq){
+
+ hit simAccession = mostSimSequenceVector[searchIndex];
+// string candidateSeqName, closestMatchSeqName, junk;
+// int closestMatchIndexNumber;
+// float closestMatchSimScore;
+//
+// inputData >> candidateSeqName >> closestMatchSeqName >> closestMatchIndexNumber >> closestMatchSimScore;
+// getline(inputData, junk);
+
+// searchScore = 100. * closestMatchSimScore;
+
+ searchScore = 100. * simAccession.simScore;
+ searchIndex++;
+// return templateSequences[closestMatchIndexNumber];
+ return templateSequences[simAccession.indexNumber];
+
+}
+
+/**************************************************************************************************/
--- /dev/null
+#ifndef DISTANCEDB_HPP
+#define DISTANCEDB_HPP
+
+/*
+ * distancedb.hpp
+ *
+ *
+ * Created by Pat Schloss on 12/29/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+
+using namespace std;
+
+#include "mothur.h"
+
+class DistanceDB : public Database {
+
+public:
+ DistanceDB(string, string);
+ Sequence* findClosestSequence(Sequence*);
+
+private:
+
+ struct hit{
+ string seqName;
+ int indexNumber;
+ float simScore;
+ };
+ vector<hit> mostSimSequenceVector;
+// ifstream inputData;
+ int searchIndex;
+};
+
+#endif
if (parameter == "scale" ) { scale = value; }
if (parameter == "ends" ) { ends = value; }
if (parameter == "processors" ) { processors = value; }
-
+ if (parameter == "template") { templatefile = value; }
+ if (parameter == "search") { search = value; }
+ if (parameter == "ksize") { ksize = value; }
+ if (parameter == "align") { align = value; }
+ if (parameter == "match") { match = value; }
+ if (parameter == "mismatch") { mismatch = value; }
+ if (parameter == "gapopen") { gapopen = value; }
+ if (parameter == "gapextend" ) { gapextend = value; }
}
//gets the last parameter and value
if (parameter == "scale" ) { scale = value; }
if (parameter == "ends" ) { ends = value; }
if (parameter == "processors" ) { processors = value; }
+ if (parameter == "template") { templatefile = value; }
+ if (parameter == "search") { search = value; }
+ if (parameter == "ksize") { ksize = value; }
+ if (parameter == "align") { align = value; }
+ if (parameter == "match") { match = value; }
+ if (parameter == "mismatch") { mismatch = value; }
+ if (parameter == "gapopen") { gapopen = value; }
+ if (parameter == "gapextend" ) { gapextend = value; }
}
}
}
}
- if ((commandName == "filter.seqs") || (commandName == "dist.seqs")) {
+ if ((commandName == "filter.seqs") || (commandName == "dist.seqs") || (commandName == "align.seqs")) {
if ((fastafile == "") && (nexusfile == "") && (clustalfile == "") && (phylipfile == "")) {
- cout << "You must read either a fasta, nexus, clustal, or phylip file before you can use the filter.seqs command." << endl; return false;
+ cout << "You must enter either a fasta, nexus, clustal, or phylip file before you can use the filter.seqs, dist.seqs or align.seqs command." << endl; return false;
+ }else if ((commandName == "align.seqs") && (templatefile == "")) {
+ cout << "You must enter a template to use the align.seqs command." << endl; return false;
}
validateSeqsFiles();
}
errorFree = false;
}
}
-
+ }else if (templatefile != "") {
+ ableToOpen = openInputFile(templatefile, filehandle);
+ filehandle.close();
+ if (ableToOpen == 1) { //unable to open
+ errorFree = false;
+ }
}
+
}
catch(exception& e) {
cout << "Standard Error: " << e.what() << " has occurred in the ErrorCheck class Function validateSeqsFiles. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
fastafile = "";
nexusfile = "";
clustalfile = "";
+ templatefile = "";
line = "";
label = "";
method = "furthest";
void refresh();
string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, fastafile, nexusfile, clustalfile, treefile, sharedfile, cutoff, format;
string precision, method, fileroot, label, line, iters, jumble, freq, single, rarefaction, shared, summary, randomtree, abund, sorted, trump, soft, filter, scale, ends, processors;
-
+ string templatefile, search, ksize, align, match, mismatch, gapopen, gapextend;
string commandName, optionText;
bool errorFree;
if (key == "scale") { scale = value; }
if (key == "ends" ) { ends = value; }
if (key == "processors" ) { processors = value; }
-
-
-
-
+ if (key == "template") { templatefile = value; }
+ if (key == "search") { search = value; }
+ if (key == "ksize") { ksize = value; }
+ if (key == "align") { align = value; }
+ if (key == "match") { match = value; }
+ if (key == "mismatch") { mismatch = value; }
+ if (key == "gapopen") { gapopen = value; }
+ if (key == "gapextend" ) { gapextend = value; }
if (key == "line") {//stores lines to be used in a set
lines.clear();
if (key == "scale") { scale = value; }
if (key == "ends" ) { ends = value; }
if (key == "processors" ) { processors = value; }
-
+ if (key == "template") { templatefile = value; }
+ if (key == "search") { search = value; }
+ if (key == "ksize") { ksize = value; }
+ if (key == "align") { align = value; }
+ if (key == "match") { match = value; }
+ if (key == "mismatch") { mismatch = value; }
+ if (key == "gapopen") { gapopen = value; }
+ if (key == "gapextend" ) { gapextend = value; }
if (key == "line") {//stores lines to be used in a vector
lines.clear();
string GlobalData::getScale() { return scale; }
string GlobalData::getEnds() { return ends; }
string GlobalData::getProcessors() { return processors; }
+string GlobalData::getTemplateFile() { return templatefile;}
+string GlobalData::getSearch() { return search; }
+string GlobalData::getKSize() { return ksize; }
+string GlobalData::getAlign() { return align; }
+string GlobalData::getMatch() { return match; }
+string GlobalData::getMismatch() { return mismatch; }
+string GlobalData::getGapopen() { return gapopen; }
+string GlobalData::getGapextend() { return gapextend; }
void GlobalData::setListFile(string file) { listfile = file; inputFileName = file;}
void GlobalData::setGroupFile(string file) { groupfile = file; }
clustalfile = "";
treefile = "";
sharedfile = "";
+ templatefile = "";
cutoff = "10.00";
format = "";
precision = "100";
scale = "log10";
ends = "T"; //yes
processors = "1";
+ search = "suffix";
+ ksize = "7";
+ align = "blast";
+ match = "1.0";
+ mismatch = "-1.0";
+ gapopen = "-5.0";
+ gapextend = "-2.0";
+
}
form = "integral";
ends = "T";
processors = "1";
+ search = "suffix";
+ ksize = "7";
+ align = "blast";
+ match = "1.0";
+ mismatch = "-1.0";
+ gapopen = "-5.0";
+ gapextend = "-2.0";
}
/*******************************************************/
string getSorted();
string getEnds();
string getProcessors();
-
+ string getTemplateFile();
+ string getSearch();
+ string getKSize();
+ string getAlign();
+ string getMatch();
+ string getMismatch();
+ string getGapopen();
+ string getGapextend();
string getTrump();
string getSoft();
string getFilter();
private:
string phylipfile, columnfile, listfile, rabundfile, sabundfile, namefile, groupfile, orderfile, fastafile, nexusfile, clustalfile, treefile, sharedfile, line, label, randomtree, groups;
- string cutoff, format, precision, method, fileroot, iters, jumble, freq, calc, abund, step, form, sorted, trump, soft, filter, scale, ends, processors;
+ string cutoff, format, precision, method, fileroot, iters, jumble, freq, calc, abund, step, form, sorted, trump, soft, filter, scale, ends, processors, templatefile, search, ksize, align, match;
+ string mismatch, gapopen, gapextend;
static GlobalData* _uniqueInstance;
--- /dev/null
+/*
+ * gotohoverlap.cpp
+ *
+ *
+ * Created by Pat Schloss on 12/15/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This class is an Alignment child class that implements the Gotoh pairwise alignment algorithm as described in:
+ *
+ * Gotoh O. 1982. An improved algorithm for matching biological sequences. J. Mol. Biol. 162:705-8.
+ * Myers, EW & Miller, W. 1988. Optimal alignments in linear space. Comput Appl Biosci. 4:11-7.
+ *
+ * This method is nice because it allows for an affine gap penalty to be assessed, which is analogous to what is used
+ * in blast and is an alternative to Needleman-Wunsch, which only charges the same penalty for each gap position.
+ * Because this method typically has problems at the ends when two sequences do not full overlap, we employ a separate
+ * method to fix the ends (see Overlap class documentation)
+ *
+ */
+
+using namespace std;
+
+#include "alignmentcell.hpp"
+#include "overlap.hpp"
+#include "alignment.hpp"
+#include "gotohoverlap.hpp"
+
+/**************************************************************************************************/
+
+GotohOverlap::GotohOverlap(float gO, float gE, float m, float mm, int r) :
+ gapOpen(gO), gapExtend(gE), match(m), mismatch(mm), Alignment(r) {
+
+ for(int i=1;i<nCols;i++){ // we initialize the dynamic programming matrix by setting the pointers in
+ alignment[0][i].prevCell = 'l'; // the first row to the left
+ alignment[0][i].cValue = 0;
+ alignment[0][i].dValue = 0;
+ }
+
+ for(int i=1;i<nRows;i++){ // we initialize the dynamic programming matrix by setting the pointers in
+ alignment[i][0].prevCell = 'u'; // the first column upward
+ alignment[i][0].cValue = 0;
+ alignment[i][0].iValue = 0;
+ }
+}
+
+/**************************************************************************************************/
+
+void GotohOverlap::align(string A, string B){
+
+ seqA = ' ' + A; lA = seqA.length(); // the algorithm requires that the first character be a dummy value
+ seqB = ' ' + B; lB = seqB.length(); // the algorithm requires that the first character be a dummy value
+
+ for(int i=1;i<lB;i++){ // the recursion here is shown in Webb and Miller, Fig. 1A. Note that
+ for(int j=1;j<lA;j++){ // if we need to conserve on space we should see Fig. 1B, which is linear
+ // in space, which I think is unnecessary
+ float diagonal;
+ if(seqB[i] == seqA[j]) { diagonal = alignment[i-1][j-1].cValue + match; }
+ else { diagonal = alignment[i-1][j-1].cValue + mismatch; }
+
+ alignment[i][j].iValue = max(alignment[i][j-1].iValue, alignment[i][j-1].cValue + gapOpen) + gapExtend;
+ alignment[i][j].dValue = max(alignment[i-1][j].dValue, alignment[i-1][j].cValue + gapOpen) + gapExtend;
+
+ if(alignment[i][j].iValue > alignment[i][j].dValue){
+ if(alignment[i][j].iValue > diagonal){
+ alignment[i][j].cValue = alignment[i][j].iValue;
+ alignment[i][j].prevCell = 'l';
+ }
+ else{
+ alignment[i][j].cValue = diagonal;
+ alignment[i][j].prevCell = 'd';
+ }
+ }
+ else{
+ if(alignment[i][j].dValue > diagonal){
+ alignment[i][j].cValue = alignment[i][j].dValue;
+ alignment[i][j].prevCell = 'u';
+ }
+ else{
+ alignment[i][j].cValue = diagonal;
+ alignment[i][j].prevCell = 'd';
+ }
+ }
+
+ }
+ }
+ Overlap over;
+ over.setOverlap(alignment, lA, lB, 0); // Fix the gaps at the ends of the sequences
+ traceBack(); // Construct the alignment and set seqAaln and seqBaln
+}
+
+/**************************************************************************************************/
--- /dev/null
+#ifndef GOTOHOVERLAP_H
+#define GOTOHOVERLAP_H
+
+/*
+ * gotohoverlap.h
+ *
+ *
+ * Created by Pat Schloss on 12/15/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This class is an Alignment child class that implements the Gotoh pairwise alignment algorithm as described in:
+ *
+ * Gotoh O. 1982. An improved algorithm for matching biological sequences. J. Mol. Biol. 162:705-8.
+ * Myers, EW & Miller, W. 1988. Optimal alignments in linear space. Comput Appl Biosci. 4:11-7.
+ *
+ * This method is nice because it allows for an affine gap penalty to be assessed, which is analogous to what is used
+ * in blast and is an alternative to Needleman-Wunsch, which only charges the same penalty for each gap position.
+ * Because this method typically has problems at the ends when two sequences do not full overlap, we employ a separate
+ * method to fix the ends (see Overlap class documentation)
+ *
+ */
+
+#include "mothur.h"
+#include "alignment.hpp"
+
+/**************************************************************************************************/
+
+class GotohOverlap : public Alignment {
+
+public:
+ GotohOverlap(float, float, float, float, int);
+ void align(string, string);
+
+private:
+ float gapOpen;
+ float gapExtend;
+ float match;
+ float mismatch;
+};
+
+/**************************************************************************************************/
+
+#endif
/**************************************************************************************************/
-Kmer::Kmer(int size) : kmerSize(size) {
-
- int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 };
- maxKmer = power4s[kmerSize]+1;// (int)pow(4.,k)+1;
+Kmer::Kmer(int size) : kmerSize(size) { // The constructor sets the size of the kmer
+ int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+ // No reason to waste the time of recalculating
+ maxKmer = power4s[kmerSize]+1;// (int)pow(4.,k)+1; // powers of 4 everytime through. We need an
+ // extra kmer if we get a non-ATGCU base
}
/**************************************************************************************************/
-string Kmer::getKmerString(string sequence){
- int length = sequence.length();
- int nKmers = length - kmerSize + 1;
+string Kmer::getKmerString(string sequence){ // Calculate kmer for each position in the sequence, count the freq
+ int length = sequence.length(); // of each kmer, and convert it to an ascii character with base '!'.
+ int nKmers = length - kmerSize + 1; // Export the string of characters as a string
vector<int> counts(maxKmer, 0);
- for(int i=0;i<nKmers;i++){
- int kmerNumber = getKmerNumber(sequence, i);
+ for(int i=0;i<nKmers;i++){ // Go though sequence and get the number between 0 and maxKmer for that
+ int kmerNumber = getKmerNumber(sequence, i);// kmer. Increase the count for the kmer in the counts vector
counts[kmerNumber]++;
}
- string kmerString = "";
- for(int i=0;i<maxKmer;i++){
- kmerString += getASCII(counts[i]);
+ string kmerString = "";
+ for(int i=0;i<maxKmer;i++){ // Scan through the vector of counts and convert each element of the
+ kmerString += getASCII(counts[i]); // vector to an ascii character that is equal to or greater than '!'
}
return kmerString;
/**************************************************************************************************/
int Kmer::getKmerNumber(string sequence, int index){
- int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 };
+
+// Here we convert a kmer to a number between 0 and maxKmer. For example, AAAA would equal 0 and TTTT would equal 255.
+// If there's an N in the kmer, it is set to 256 (if we are looking at 4mers). The largest we can look at are 8mers,
+// this could be easily increased.
+//
+// Example: ATGCAT (kSize = 6)
+// i: 012345
+//
+// Score = 0*4^(6-0-1) + 3*4^(6-1-1) + 2*4^(6-2-1) + 1*4^(6-3-1) + 0*4^(6-4-1) + 3*4^(6-5-1)
+// = 0*4^5 + 3*4^4 + 2*4^3 + 1*4^2 + 0*4^1 + 3*4^0
+// = 0 + 3*256 + 2*64 + 1*16 + 0*4 + 3*1
+// = 0 + 768 + 128 + 16 + 0 + 3
+// = 915
+
+ int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
+
int kmer = 0;
+
for(int i=0;i<kmerSize;i++){
if(toupper(sequence[i+index]) == 'A') { kmer += (0 * power4s[kmerSize-i-1]); }
else if(toupper(sequence[i+index]) == 'C') { kmer += (1 * power4s[kmerSize-i-1]); }
/**************************************************************************************************/
string Kmer::getKmerBases(int kmerNumber){
- int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 };
+
+// Here we convert the kmer number into the kmer in terms of bases.
+//
+// Example: Score = 915 (for a 6-mer)
+// Base6 = (915 / 4^0) % 4 = 915 % 4 = 3 => T [T]
+// Base5 = (915 / 4^1) % 4 = 228 % 4 = 0 => A [AT]
+// Base4 = (915 / 4^2) % 4 = 57 % 4 = 1 => C [CAT]
+// Base3 = (915 / 4^3) % 4 = 14 % 4 = 2 => G [GCAT]
+// Base2 = (915 / 4^4) % 4 = 3 % 4 = 3 => T [TGCAT]
+// Base1 = (915 / 4^5) % 4 = 0 % 4 = 0 => A [ATGCAT] -> this checks out with the previous method
+
+ int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
string kmer = "";
- if(kmerNumber == power4s[kmerSize]){//pow(4.,7)){
- for(int i=0;i<kmerSize;i++){
+ if(kmerNumber == power4s[kmerSize]){//pow(4.,7)){ // if the kmer number is the same as the maxKmer then it must
+ for(int i=0;i<kmerSize;i++){ // have had an N in it and so we'll just call it N x kmerSize
kmer += 'N';
}
}
else{
for(int i=0;i<kmerSize;i++){
- int nt = (int)(kmerNumber / (float)power4s[i]) % 4;
- if(nt == 0) { kmer = 'A' + kmer; }
+ int nt = (int)(kmerNumber / (float)power4s[i]) % 4; // the '%' operator returns the remainder
+ if(nt == 0) { kmer = 'A' + kmer; } // from int-based division ]
else if(nt == 1){ kmer = 'C' + kmer; }
else if(nt == 2){ kmer = 'G' + kmer; }
else if(nt == 3){ kmer = 'T' + kmer; }
/**************************************************************************************************/
-char Kmer::getASCII(int number) { return (char)(33+number); }
-
+char Kmer::getASCII(int number) { return (char)(33+number); } // '!' is the first printable char and
+ // has the int value of 33
/**************************************************************************************************/
-int Kmer::getNumber(char character) { return ((int)(character-'!')); }
+int Kmer::getNumber(char character) { return ((int)(character-'!')); } // '!' has the value of 33
/**************************************************************************************************/
Kmer(int);
string getKmerString(string);
int getKmerNumber(string, int);
+ string getKmerBases(int);
private:
- string getKmerBases(int);
-
char getASCII(int);
int getNumber(char);
int kmerSize;
/**************************************************************************************************/
+
#endif
* Created by Pat Schloss on 12/16/08.
* Copyright 2008 Patrick D. Schloss. All rights reserved.
*
+ * This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
+ * a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
+ * kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
+ * different number of kmers and each column contains the index to sequences that use that kmer.
+ *
+ * Construction of an object of this type will first look for an appropriately named database file and if it is found
+ * then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
+ * (generateKmerDB)
+ *
+ * The search method used here is roughly the same as that used in the SimRank program that is found at the
+ * greengenes website. The default kmer size is 7. The speed complexity is between O(L) and O(LN). When I use 7mers
+ * on average a kmer is found in ~100 other sequences with a database of ~5000 sequences. If this is the case then the
+ * time would be on the order of O(0.1LN) -> fast.
+ *
+
*/
using namespace std;
string kmerDBName = fastaFileName.substr(0,fastaFileName.find_last_of(".")+1) + char('0'+ kmerSize) + "mer";
ifstream kmerFileTest(kmerDBName.c_str());
- int power4s[9] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536 };
+ int power4s[14] = { 1, 4, 16, 64, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216, 67108864 };
maxKmer = power4s[kmerSize];
kmerLocations.resize(maxKmer+1);
- if(!kmerFileTest){
+ if(!kmerFileTest){ // if we can open the kmer db file, then read it in...
cout << "Generating the " << kmerDBName << " database...\t"; cout.flush();
generateKmerDB(kmerDBName);
}
- else{
+ else{ // ...otherwise generate it.
cout << "Reading in the " << kmerDBName << " database...\t"; cout.flush();
readKmerDB(kmerDBName, kmerFileTest);
}
/**************************************************************************************************/
Sequence* KmerDB::findClosestSequence(Sequence* candidateSeq){
+
+ vector<int> matches(numSeqs, 0); // a record of the sequences with shared kmers
+ vector<int> timesKmerFound(kmerLocations.size()+1, 0); // a record of the kmers that we have already found
- vector<int> matches(numSeqs, 0);
- vector<int> timesKmerFound(kmerLocations.size()+1, 0);
-
- int maxMatches = 0;
+ searchScore = 0;
int maxSequence = 0;
- string query = candidateSeq->getUnaligned();
-
- int numKmers = query.length() - kmerSize + 1;
+ string query = candidateSeq->getUnaligned(); // we want to search using an unaligned dna sequence
+
+ int numKmers = query.length() - kmerSize + 1;
Kmer kmer(kmerSize);
for(int i=0;i<numKmers;i++){
-
- int kmerNumber = kmer.getKmerNumber(query, i);
-
- if(timesKmerFound[kmerNumber] == 0){
- for(int j=0;j<kmerLocations[kmerNumber].size();j++){
- matches[kmerLocations[kmerNumber][j]]++;
+ int kmerNumber = kmer.getKmerNumber(query, i); // go through the query sequence and get a kmer number
+ if(timesKmerFound[kmerNumber] == 0){ // if we haven't seen it before...
+ for(int j=0;j<kmerLocations[kmerNumber].size();j++){//increase the count for each sequence that also has
+ matches[kmerLocations[kmerNumber][j]]++; // that kmer
}
}
- timesKmerFound[kmerNumber] = 1;
-
+ timesKmerFound[kmerNumber] = 1; // ok, we've seen the kmer now
}
- for(int i=0;i<numSeqs;i++){
- if(matches[i] > maxMatches){
- maxMatches = matches[i];
+
+ for(int i=0;i<numSeqs;i++){ // find the db sequence that shares the most kmers with
+ if(matches[i] > searchScore){ // the query sequence
+ searchScore = matches[i];
maxSequence = i;
}
}
- return templateSequences[maxSequence];
-
+ searchScore = 100 * searchScore / (float)numKmers;
+ return templateSequences[maxSequence]; // return the Sequence object corresponding to the db
+ // sequence with the most shared kmers.
}
/**************************************************************************************************/
void KmerDB::generateKmerDB(string kmerDBName){
-
Kmer kmer(kmerSize);
- for(int i=0;i<numSeqs;i++){
+ for(int i=0;i<numSeqs;i++){ // for all of the template sequences...
- string seq = templateSequences[i]->getUnaligned();
+ string seq = templateSequences[i]->getUnaligned(); // ...take the unaligned sequence...
int numKmers = seq.length() - kmerSize + 1;
- for(int j=0;j<numKmers;j++){
+ vector<int> seenBefore(maxKmer+1,0);
+ for(int j=0;j<numKmers;j++){ // ...step though the sequence and get each kmer...
int kmerNumber = kmer.getKmerNumber(seq, j);
- kmerLocations[kmerNumber].push_back(i);
- }
+ if(seenBefore[kmerNumber] == 0){
+ kmerLocations[kmerNumber].push_back(i); // ...insert the sequence index into kmerLocations for
+ } // the appropriate kmer number
+ seenBefore[kmerNumber] = 1;
+ }
}
- ofstream kmerFile(kmerDBName.c_str(), ios::trunc);
- if(!kmerFile) {
- cerr << "Error: Could not open " << kmerDBName << endl;
- exit(1);
- }
+ ofstream kmerFile; // once we have the kmerLocations folder print it out
+ openOutputFile(kmerDBName, kmerFile); // to a file
- for(int i=0;i<maxKmer;i++){
- kmerFile << i << ' ' << kmerLocations[i].size();
- for(int j=0;j<kmerLocations[i].size();j++){
- kmerFile << ' ' << kmerLocations[i][j];
+ for(int i=0;i<maxKmer;i++){ // step through all of the possible kmer numbers
+ kmerFile << i << ' ' << kmerLocations[i].size(); // print the kmer number and the number of sequences with
+ for(int j=0;j<kmerLocations[i].size();j++){ // that kmer. then print out the indices of the sequences
+ kmerFile << ' ' << kmerLocations[i][j]; // with that kmer.
}
kmerFile << endl;
}
void KmerDB::readKmerDB(string kmerDBName, ifstream& kmerDBFile){
- kmerDBFile.seekg(0);
+ kmerDBFile.seekg(0); // start at the beginning of the file
string seqName;
int seqNumber;
- for(int i=0;i<numSeqs;i++){
- int numValues;
+ for(int i=0;i<maxKmer;i++){
+ int numValues;
kmerDBFile >> seqName >> numValues;
- for(int j=0;j<numValues;j++){
- kmerDBFile >> seqNumber;
- kmerLocations[i].push_back(seqNumber);
+ for(int j=0;j<numValues;j++){ // for each kmer number get the...
+ kmerDBFile >> seqNumber; // 1. number of sequences with the kmer number
+ kmerLocations[i].push_back(seqNumber); // 2. sequence indices
}
}
kmerDBFile.close();
* Created by Pat Schloss on 12/16/08.
* Copyright 2008 Patrick D. Schloss. All rights reserved.
*
+ * This class is a child class of the Database class, which stores the template sequences as a kmer table and provides
+ * a method of searching the kmer table for the sequence with the most kmers in common with a query sequence.
+ * kmerLocations is the primary storage variable that is a two-dimensional vector where each row represents the
+ * different number of kmers and each column contains the index to sequences that use that kmer.
+ *
+ * Construction of an object of this type will first look for an appropriately named database file and if it is found
+ * then will read in the database file (readKmerDB), otherwise it will generate one and store the data in memory
+ * (generateKmerDB)
+
*/
#include "mothur.h"
--- /dev/null
+/*
+ * nast.cpp
+ *
+ *
+ * Created by Pat Schloss on 12/17/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This is my implementation of the NAST (nearest alignment space termination) algorithm as described in:
+ *
+ * DeSantis TZ, Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, & Anderson GL. 2006. NAST: a multiple
+ * sequence alignment server for comparative analysis of 16S rRNA genes. Nucleic Acids Research. 34:W394-9.
+ *
+ * To construct an object one needs to provide a method of getting a pairwise alignment (alignment) and the template
+ * and candidate sequence that are to be aligned to each other.
+ *
+ */
+
+using namespace std;
+
+#include "sequence.hpp"
+#include "alignment.hpp"
+#include "nast.hpp"
+
+/**************************************************************************************************/
+
+Nast::Nast(Alignment* method, Sequence* cand, Sequence* temp) : alignment(method), candidateSeq(cand), templateSeq(temp) {
+ maxInsertLength = 0;
+ pairwiseAlignSeqs(); // This is part A in Fig. 2 of DeSantis et al.
+ regapSequences(); // This is parts B-F in Fig. 2 of DeSantis et al.
+}
+
+/**************************************************************************************************/
+
+void Nast::pairwiseAlignSeqs(){ // Here we call one of the pairwise alignment methods to align our unaligned candidate
+ // and template sequences
+ alignment->align(candidateSeq->getUnaligned(), templateSeq->getUnaligned());
+
+ string candAln = alignment->getSeqAAln();
+ string tempAln = alignment->getSeqBAln();
+
+ if(candAln == ""){
+ candidateSeq->setPairwise("");
+ templateSeq->setPairwise(templateSeq->getUnaligned());
+ }
+ else{
+ if(tempAln[0] == '-'){
+ int pairwiseAlignmentLength = tempAln.length(); // we need to make sure that the candidate sequence alignment
+ for(int i=0;i<pairwiseAlignmentLength;i++){ // starts where the template sequence alignment starts, if it
+ if(isalpha(tempAln[i])){ // starts early, we nuke the beginning of the candidate
+ candAln = candAln.substr(i); // sequence
+ tempAln = tempAln.substr(i);
+ break;
+ }
+ }
+ }
+
+ int pairwiseAlignmentLength = tempAln.length();
+ if(tempAln[pairwiseAlignmentLength-1] == '-'){ // we need to make sure that the candidate sequence alignment
+ for(int i=pairwiseAlignmentLength-1; i>=0; i--){// ends where the template sequence alignment ends, if it runs
+ if(isalpha(tempAln[i])){ // long, we nuke the end of the candidate sequence
+ candAln = candAln.substr(0,i+1);
+ tempAln = tempAln.substr(0,i+1);
+ break;
+ }
+ }
+ }
+ }
+ candidateSeq->setPairwise(candAln); // set the pairwise sequences in the Sequence objects for
+ templateSeq->setPairwise(tempAln); // the candidate and template sequences
+
+}
+
+/**************************************************************************************************/
+
+void Nast::removeExtraGaps(string& candAln, string tempAln, string newTemplateAlign){
+
+// here we do steps C-F of Fig. 2 from DeSantis et al.
+
+ int longAlignmentLength = newTemplateAlign.length();
+
+ for(int i=0; i<longAlignmentLength; i++){ // use the long alignment as the standard
+ int rightIndex, rightRoom, leftIndex, leftRoom;
+
+ // Part C of Fig. 2 from DeSantis et al.
+ if((isalpha(newTemplateAlign[i]) != isalpha(tempAln[i]))){ //if there is a discrepancy between the regapped
+
+// cout << i << '\t';cout.flush();
+
+ // Part D of Fig. 2 from DeSantis et al. // template sequence and the official template sequence
+ for(leftIndex=i-1;leftIndex>=0;leftIndex--){ // then we've got problems...
+ if(!isalpha(candAln[leftIndex])){
+ leftRoom = 1; //count how far it is to the nearest gap on the LEFT side of the anomaly
+ while(leftIndex-leftRoom>=0 && !isalpha(candAln[leftIndex-leftRoom])) { leftRoom++; }
+ break;
+ }
+ }
+// cout << leftIndex << '\t' << leftRoom << endl;
+
+ for(rightIndex=i+1;rightIndex<longAlignmentLength;rightIndex++){
+ if(!isalpha(candAln[rightIndex])){
+ rightRoom = 1; //count how far it is to the nearest gap on the RIGHT side of the anomaly
+ while(rightIndex+rightRoom<longAlignmentLength && !isalpha(candAln[rightIndex+rightRoom])) { rightRoom++; }
+ break;
+ }
+ }
+
+ int insertLength = 0; // figure out how long the anomaly is
+ while(!isalpha(newTemplateAlign[i + insertLength])) { insertLength++; }
+ if(insertLength > maxInsertLength){ maxInsertLength = insertLength; }
+
+ if((leftRoom + rightRoom) >= insertLength){
+ // Parts D & E from Fig. 2 of DeSantis et al.
+ if((i-leftIndex) <= (rightIndex-i)){ // the left gap is closer - > move stuff left there's
+ if(leftRoom >= insertLength){ // enough room to the left to move
+ string leftTemplateString = newTemplateAlign.substr(0,i);
+ string rightTemplateString = newTemplateAlign.substr(i+insertLength);
+ newTemplateAlign = leftTemplateString + rightTemplateString;
+ longAlignmentLength = newTemplateAlign.length();
+
+ string leftCandidateString = candAln.substr(0,leftIndex-insertLength+1);
+ string rightCandidateString = candAln.substr(leftIndex+1);
+ candAln = leftCandidateString + rightCandidateString;
+ }
+ else{ // not enough room to the left, have to steal some space to
+ string leftTemplateString = newTemplateAlign.substr(0,i); // the right
+ string rightTemplateString = newTemplateAlign.substr(i+insertLength);
+ newTemplateAlign = leftTemplateString + rightTemplateString;
+ longAlignmentLength = newTemplateAlign.length();
+
+ string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1);
+ string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
+ string rightCandidateString = candAln.substr(rightIndex+(insertLength-leftRoom));
+ candAln = leftCandidateString + insertString + rightCandidateString;
+ }
+ }
+ else{ // the right gap is closer - > move stuff right there's
+ if(rightRoom >= insertLength){ // enough room to the right to move
+ string leftTemplateString = newTemplateAlign.substr(0,i);
+ string rightTemplateString = newTemplateAlign.substr(i+insertLength);
+ newTemplateAlign = leftTemplateString + rightTemplateString;
+ longAlignmentLength = newTemplateAlign.length();
+
+ string leftCandidateString = candAln.substr(0,rightIndex);
+ string rightCandidateString = candAln.substr(rightIndex+insertLength);
+ candAln = leftCandidateString + rightCandidateString;
+
+ }
+ else{ // not enough room to the right, have to steal some
+ // space to the left lets move left and then right...
+ string leftTemplateString = newTemplateAlign.substr(0,i);
+ string rightTemplateString = newTemplateAlign.substr(i+insertLength);
+ newTemplateAlign = leftTemplateString + rightTemplateString;
+ longAlignmentLength = newTemplateAlign.length();
+
+ string leftCandidateString = candAln.substr(0,leftIndex-(insertLength-rightRoom)+1);
+ string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
+ string rightCandidateString = candAln.substr(rightIndex+rightRoom);
+ candAln = leftCandidateString + insertString + rightCandidateString;
+
+ }
+ }
+ }
+ else{ // there could be a case where there isn't enough room in
+ string leftTemplateString = newTemplateAlign.substr(0,i); // either direction to move stuff
+ string rightTemplateString = newTemplateAlign.substr(i+leftRoom+rightRoom);
+ newTemplateAlign = leftTemplateString + rightTemplateString;
+ longAlignmentLength = newTemplateAlign.length();
+
+ string leftCandidateString = candAln.substr(0,leftIndex-leftRoom+1);
+ string insertString = candAln.substr(leftIndex+1,rightIndex-leftIndex-1);
+ string rightCandidateString = candAln.substr(rightIndex+rightRoom);
+ candAln = leftCandidateString + insertString + rightCandidateString;
+ }
+
+ i -= insertLength;
+ }
+ }
+// cout << candAln << endl << tempAln << endl << newTemplateAlign << endl;
+}
+
+/**************************************************************************************************/
+
+void Nast::regapSequences(){ //This is essentially part B in Fig 2. of DeSantis et al.
+
+ string candPair = candidateSeq->getPairwise();
+ string candAln = "";
+
+ string tempPair = templateSeq->getPairwise();
+ string tempAln = templateSeq->getAligned(); // we use the template aligned sequence as our guide
+
+ int pairwiseLength = candPair.length();
+ int fullAlignLength = tempAln.length();
+
+ if(candPair == ""){
+ for(int i=0;i<fullAlignLength;i++) { candAln += '.'; }
+ candidateSeq->setAligned(candAln);
+ return;
+ }
+
+ int fullAlignIndex = 0;
+ int pairwiseAlignIndex = 0;
+ string newTemplateAlign = ""; // this is going to be messy so we want a temporary template
+ // alignment string
+ while(tempAln[fullAlignIndex] == '.' || tempAln[fullAlignIndex] == '-'){
+ candAln += '.'; // add the initial '-' and '.' to the candidate and template
+ newTemplateAlign += tempAln[fullAlignIndex];// pairwise sequences
+ fullAlignIndex++;
+ }
+
+ string lastLoop = "";
+
+ while(pairwiseAlignIndex<pairwiseLength){
+ if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
+ && isalpha(candPair[pairwiseAlignIndex])){
+ // the template and candidate pairwise and template aligned have characters
+ // need to add character onto the candidatSeq.aligned sequence
+
+ candAln += candPair[pairwiseAlignIndex];
+ newTemplateAlign += tempPair[pairwiseAlignIndex];//
+
+ pairwiseAlignIndex++;
+ fullAlignIndex++;
+ }
+ else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
+ && isalpha(candPair[pairwiseAlignIndex])){
+ // the template pairwise and candidate pairwise are characters and the template aligned is a gap
+ // need to insert gaps into the candidateSeq.aligned sequence
+
+ candAln += '-';
+ newTemplateAlign += '-';//
+ fullAlignIndex++;
+ }
+ else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
+ && isalpha(candPair[pairwiseAlignIndex])){
+ // the template pairwise is a gap and the template aligned and pairwise sequences have characters
+ // this is the alpha scenario. add character to the candidateSeq.aligned sequence without progressing
+ // further through the tempAln sequence.
+
+ candAln += candPair[pairwiseAlignIndex];
+ newTemplateAlign += '-';//
+ pairwiseAlignIndex++;
+ }
+ else if(isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
+ && !isalpha(candPair[pairwiseAlignIndex])){
+ // the template pairwise and full alignment are characters and the candidate sequence has a gap
+ // should not be a big deal, just add the gap position to the candidateSeq.aligned sequence;
+
+ candAln += candPair[pairwiseAlignIndex];
+ newTemplateAlign += tempAln[fullAlignIndex];//
+ fullAlignIndex++;
+ pairwiseAlignIndex++;
+ }
+ else if(!isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
+ && isalpha(candPair[pairwiseAlignIndex])){
+ // the template pairwise and aligned are gaps while the candidate pairwise has a character
+ // this would be an insertion, go ahead and add the character->seems to be the opposite of the alpha scenario
+
+ candAln += candPair[pairwiseAlignIndex];
+ newTemplateAlign += tempAln[fullAlignIndex];//
+ pairwiseAlignIndex++;
+ fullAlignIndex++;
+ }
+ else if(isalpha(tempPair[pairwiseAlignIndex]) && !isalpha(tempAln[fullAlignIndex])
+ && !isalpha(candPair[pairwiseAlignIndex])){
+ // template pairwise has a character, but its full aligned sequence and candidate sequence have gaps
+ // this would happen like we need to add a gap. basically the opposite of the alpha situation
+
+ newTemplateAlign += tempAln[fullAlignIndex];//
+ candAln += "-";
+ fullAlignIndex++;
+ }
+ else if(!isalpha(tempPair[pairwiseAlignIndex]) && isalpha(tempAln[fullAlignIndex])
+ && !isalpha(candPair[pairwiseAlignIndex])){
+ // template and candidate pairwise are gaps and the template aligned is not a gap this should not be possible
+ // would skip the gaps and not progress through full alignment sequence
+ // not tested yet
+
+ cout << "We're into D" << ' ' << fullAlignIndex << ' ' << pairwiseAlignIndex << endl;
+ pairwiseAlignIndex++;
+ }
+ else{
+ // everything has a gap - not possible
+ // not tested yet
+
+ cout << "We're into F" << ' ' << fullAlignIndex << ' ' << pairwiseAlignIndex << endl;
+ pairwiseAlignIndex++;
+ fullAlignIndex++;
+ }
+ }
+
+ for(int i=fullAlignIndex;i<fullAlignLength;i++){
+ candAln += '.';
+ newTemplateAlign += tempAln[i];//
+ }
+
+ int start, end;
+ for(int i=0;i<candAln.length();i++){
+ if(candAln[i] == 'Z' || !isalnum(candAln[i])) { candAln[i] = '.'; } // if we padded the alignemnt from
+ else{ start = i; break; } // blast with Z's, change them to
+ } // '.' characters
+
+ for(int i=candAln.length()-1;i>=0;i--){ // ditto.
+ if(candAln[i] == 'Z' || !isalnum(candAln[i])) { candAln[i] = '.'; }
+ else{ end = i; break; }
+ }
+
+ for(int i=start;i<=end;i++){ // go through the candidate alignment sequence and make sure that
+ candAln[i] = toupper(candAln[i]); // everything is upper case
+ }
+
+// cout << candAln << endl;
+// cout << tempAln << endl;
+// cout << newTemplateAlign << endl;
+
+ if(candAln.length() != tempAln.length()){ // if the regapped candidate sequence is longer than the official
+ removeExtraGaps(candAln, tempAln, newTemplateAlign);// template alignment then we need to do steps C-F in Fig.
+ } // 2 of Desantis et al.
+
+// cout << candAln << endl;
+// cout << tempAln << endl;
+// cout << newTemplateAlign << endl;
+
+ candidateSeq->setAligned(candAln);
+}
+
+/**************************************************************************************************/
+
+float Nast::getSimilarityScore(){
+
+ string cand = candidateSeq->getAligned();
+ string temp = templateSeq->getAligned();
+ int alignmentLength = temp.length();
+ int mismatch = 0;
+ int denominator = 0;
+
+ for(int i=0;i<alignmentLength;i++){
+ if(cand[i] == '-' && temp[i] == '-'){
+
+ }
+ else if(cand[i] != '.' && temp[i] != '.'){
+ denominator++;
+
+ if(cand[i] != temp[i]){
+ mismatch++;
+ }
+ }
+ }
+ float similarity = 100 * (1. - mismatch / (float)denominator);
+ if(denominator == 0){ similarity = 0.0000; }
+
+ return similarity;
+}
+
+/**************************************************************************************************/
+
+int Nast::getMaxInsertLength(){
+
+ return maxInsertLength;
+
+}
+
+/**************************************************************************************************/
--- /dev/null
+#ifndef NAST_HPP
+#define NAST_HPP
+
+/*
+ * nast.hpp
+ *
+ *
+ * Created by Pat Schloss on 12/17/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This is my implementation of the NAST (nearest alignment space termination) algorithm as described in:
+ *
+ * DeSantis TZ, Hugenholtz P, Keller K, Brodie EL, Larsen N, Piceno YM, Phan R, & Anderson GL. 2006. NAST: a multiple
+ * sequence alignment server for comparative analysis of 16S rRNA genes. Nucleic Acids Research. 34:W394-9.
+ *
+ * To construct an object one needs to provide a method of getting a pairwise alignment (alignment) and the template
+ * and candidate sequence that are to be aligned to each other.
+ *
+ */
+
+#include "mothur.h"
+
+class Alignment;
+class Sequence;
+
+/**************************************************************************************************/
+
+class Nast {
+
+public:
+ Nast(Alignment*, Sequence*, Sequence*);
+ float getSimilarityScore();
+ int getMaxInsertLength();
+
+private:
+ void pairwiseAlignSeqs();
+ void regapSequences();
+ void removeExtraGaps(string&, string, string);
+
+ Alignment* alignment;
+ Sequence* candidateSeq;
+ Sequence* templateSeq;
+
+ int maxInsertLength;
+};
+
+/**************************************************************************************************/
+
+#endif
--- /dev/null
+/*
+ * nastreport.cpp
+ *
+ *
+ * Created by Pat Schloss on 12/19/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+using namespace std;
+
+#include "sequence.hpp"
+#include "nast.hpp"
+#include "alignment.hpp"
+#include "nastreport.hpp"
+
+/******************************************************************************************************************/
+
+NastReport::NastReport(string candidateReportFName) {
+ openOutputFile(candidateReportFName, candidateReportFile);
+
+ candidateReportFile << "QueryName\tQueryLength\tTemplateName\tTemplateLength\t";
+ candidateReportFile << "SearchMethod\tSearchScore\t";
+ candidateReportFile << "AlignmentMethod\tQueryStart\tQueryEnd\tTemplateStart\tTemplateEnd\t";
+ candidateReportFile << "PairwiseAlignmentLength\tGapsInQuery\tGapsInTemplate\t";
+ candidateReportFile << "LongestInsert\t";
+ candidateReportFile << "SimBtwnQuery&Template" << endl;
+}
+
+/******************************************************************************************************************/
+
+void NastReport::print(){
+
+ candidateReportFile << queryName << '\t' << queryLength << '\t' << templateName << '\t' << templateLength << '\t';
+ candidateReportFile << searchMethod << '\t' << setprecision(2) << fixed << searchScore << '\t';
+
+ candidateReportFile << alignmentMethod << '\t' << candidateStartPosition << "\t" << candidateEndPosition << '\t';
+ candidateReportFile << templateStartPosition << "\t" << templateEndPosition << '\t';
+ candidateReportFile << pairwiseAlignmentLength << '\t' << totalGapsInQuery << '\t' << totalGapsInTemplate << '\t';
+ candidateReportFile << longestInsert << '\t';;
+ candidateReportFile << setprecision(2) << similarityToTemplate;
+
+ candidateReportFile << endl;
+ candidateReportFile.flush();
+}
+
+/******************************************************************************************************************/
+
+void NastReport::setCandidate(Sequence* candSeq){
+ queryName = candSeq->getName();
+ queryLength = candSeq->getUnalignLength();
+}
+
+/******************************************************************************************************************/
+
+void NastReport::setTemplate(Sequence* tempSeq){
+ templateName = tempSeq->getName();
+ templateLength = tempSeq->getUnalignLength();
+}
+
+/******************************************************************************************************************/
+
+void NastReport::setSearchParameters(string method, float score){
+ searchMethod = method;
+ searchScore = score;
+}
+
+/******************************************************************************************************************/
+
+void NastReport::setAlignmentParameters(string method, Alignment* align){
+ alignmentMethod = method;
+
+ candidateStartPosition = align->getCandidateStartPos();
+ candidateEndPosition = align->getCandidateEndPos();
+ templateStartPosition = align->getTemplateStartPos();
+ templateEndPosition = align->getTemplateEndPos();
+ pairwiseAlignmentLength = align->getPairwiseLength();
+
+ totalGapsInQuery = pairwiseAlignmentLength - (candidateEndPosition - candidateStartPosition + 1);
+ totalGapsInTemplate = pairwiseAlignmentLength - (templateEndPosition - templateStartPosition + 1);
+}
+
+/******************************************************************************************************************/
+
+void NastReport::setNastParameters(Nast nast){
+
+ longestInsert = nast.getMaxInsertLength();
+ similarityToTemplate = nast.getSimilarityScore();
+
+}
+
+/******************************************************************************************************************/
--- /dev/null
+#ifndef NASTREPORT_HPP
+#define NASTREPORT_HPP
+
+
+/*
+ * nastreport.hpp
+ *
+ *
+ * Created by Pat Schloss on 12/19/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ */
+
+#include "mothur.h"
+
+using namespace std;
+
+/******************************************************************************************************************/
+
+class NastReport {
+
+public:
+ NastReport(string);
+ void setCandidate(Sequence*);
+ void setTemplate(Sequence*);
+ void setSearchParameters(string, float);
+ void setAlignmentParameters(string, Alignment*);
+ void setNastParameters(Nast);
+ void print();
+
+private:
+ string queryName;
+ int queryLength;
+ string templateName;
+ int templateLength;
+ string searchMethod;
+ float searchScore;
+ string alignmentMethod;
+ int candidateStartPosition, candidateEndPosition;
+ int templateStartPosition, templateEndPosition;
+
+ int pairwiseAlignmentLength;
+ int longestInsert;
+ int totalGapsInQuery, totalGapsInTemplate;
+ float similarityToTemplate;
+ ofstream candidateReportFile;
+};
+
+/******************************************************************************************************************/
+
+#endif
--- /dev/null
+/*
+ * needleman.cpp
+ *
+ *
+ * Created by Pat Schloss on 12/15/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This class is an Alignment child class that implements the Gotoh pairwise alignment algorithm as described in:
+ *
+ * Gotoh O. 1982. An improved algorithm for matching biological sequences. J. Mol. Biol. 162:705-8.
+ * Myers, EW & Miller, W. 1988. Optimal alignments in linear space. Comput Appl Biosci. 4:11-7.
+ *
+ * This method is nice because it allows for an affine gap penalty to be assessed, which is analogous to what is used
+ * in blast and is an alternative to Needleman-Wunsch, which only charges the same penalty for each gap position.
+ * Because this method typically has problems at the ends when two sequences do not full overlap, we employ a separate
+ * method to fix the ends (see Overlap class documentation)
+ *
+ */
+
+using namespace std;
+
+#include "alignmentcell.hpp"
+#include "alignment.hpp"
+#include "overlap.hpp"
+#include "needlemanoverlap.hpp"
+
+/**************************************************************************************************/
+
+NeedlemanOverlap::NeedlemanOverlap(float gO, float m, float mm, int r) :// note that we don't have a gap extend
+gap(gO), match(m), mismatch(mm), Alignment(r) { // the gap openning penalty is assessed for
+ // every gapped position
+ for(int i=1;i<nCols;i++){
+ alignment[0][i].prevCell = 'l'; // initialize first row by pointing all poiters to the left
+ alignment[0][i].cValue = 0; // and the score to zero
+ }
+
+ for(int i=1;i<nRows;i++){
+ alignment[i][0].prevCell = 'u'; // initialize first column by pointing all poiters upwards
+ alignment[i][0].cValue = 0; // and the score to zero
+ }
+}
+
+/**************************************************************************************************/
+
+NeedlemanOverlap::~NeedlemanOverlap(){ /* do nothing */ }
+
+/**************************************************************************************************/
+
+void NeedlemanOverlap::align(string A, string B){
+
+ seqA = ' ' + A; lA = seqA.length(); // algorithm requires a dummy space at the beginning of each string
+ seqB = ' ' + B; lB = seqB.length(); // algorithm requires a dummy space at the beginning of each string
+
+ for(int i=1;i<lB;i++){ // This code was largely translated from Perl code provided in Ex 3.1
+ for(int j=1;j<lA;j++){ // of the O'Reilly BLAST book. I found that the example output had a
+ // number of errors
+ float diagonal;
+ if(seqB[i] == seqA[j]) { diagonal = alignment[i-1][j-1].cValue + match; }
+ else { diagonal = alignment[i-1][j-1].cValue + mismatch; }
+
+ float up = alignment[i-1][j].cValue + gap;
+ float left = alignment[i][j-1].cValue + gap;
+
+ if(diagonal >= up){
+ if(diagonal >= left){
+ alignment[i][j].cValue = diagonal;
+ alignment[i][j].prevCell = 'd';
+ }
+ else{
+ alignment[i][j].cValue = left;
+ alignment[i][j].prevCell = 'l';
+ }
+ }
+ else{
+ if(up >= left){
+ alignment[i][j].cValue = up;
+ alignment[i][j].prevCell = 'u';
+ }
+ else{
+ alignment[i][j].cValue = left;
+ alignment[i][j].prevCell = 'l';
+ }
+ }
+ }
+ }
+ Overlap over;
+ over.setOverlap(alignment, lA, lB, 0); // Fix gaps at the beginning and end of the sequences
+ traceBack(); // Traceback the alignment to populate seqAaln and seqBaln
+}
+
+/**************************************************************************************************/
+
--- /dev/null
+#ifndef NEEDLEMAN_H
+#define NEEDLEMAN_H
+
+/*
+ * needleman.h
+ *
+ *
+ * Created by Pat Schloss on 12/15/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This class is an Alignment child class that implements the Needleman-Wunsch pairwise alignment algorithm as
+ * described in:
+ *
+ * Needleman SB & Wunsch CD. 1970. A general method applicable to the search for similarities in the amino acid
+ * sequence of two proteins. J Mol Biol. 48:443-53.
+ * Korf I, Yandell M, & Bedell J. 2003. BLAST. O'Reilly & Associates. Sebastopol, CA.
+ *
+ * This method is simple as it assesses a consistent penalty for each gap position. Because this method typically has
+ * problems at the ends when two sequences do not full overlap, we employ a separate method to fix the ends (see
+ * Overlap class documentation)
+ *
+ */
+
+#include "mothur.h"
+
+/**************************************************************************************************/
+
+class NeedlemanOverlap : public Alignment {
+
+public:
+ NeedlemanOverlap(float, float, float, int);
+ ~NeedlemanOverlap();
+ void align(string, string);
+
+private:
+ float gap;
+ float match;
+ float mismatch;
+};
+
+/**************************************************************************************************/
+
+#endif
--- /dev/null
+/*
+ * noalign.cpp
+ *
+ *
+ * Created by Pat Schloss on 2/19/09.
+ * Copyright 2009 __MyCompanyName__. All rights reserved.
+ *
+ */
+
+#include "alignment.hpp"
+#include "noalign.hpp"
+
+/**************************************************************************************************/
+
+NoAlign::NoAlign(){ /* do nothing */ }
+
+/**************************************************************************************************/
+
+NoAlign::~NoAlign(){ /* do nothing */ }
+
+/**************************************************************************************************/
+
+void NoAlign::align(string A, string B){ }
+
+/**************************************************************************************************/
--- /dev/null
+#ifndef NOALIGN_HPP
+#define NOALIGN_HPP
+
+/*
+ * noalign.hpp
+ *
+ *
+ * Created by Pat Schloss on 2/19/09.
+ * Copyright 2009 __MyCompanyName__. All rights reserved.
+ *
+ */
+using namespace std;
+
+#include "mothur.h"
+
+/**************************************************************************************************/
+
+class NoAlign : public Alignment {
+
+public:
+ NoAlign();
+ ~NoAlign();
+ void align(string, string);
+
+private:
+};
+
+/**************************************************************************************************/
+
+
+#endif
--- /dev/null
+/*
+ * overlap.cpp
+ *
+ *
+ * Created by Pat Schloss on 12/15/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This class cleans up the alignment at the 3' end of the alignments. Because the Gotoh and Needleman-Wunsch
+ * algorithms start the traceback from the lower-right corner of the dynamic programming matrix, there may be a lot of
+ * scattered bases in the alignment near the 3' end of the alignment. Here we basically look for the largest score
+ * in the last column and row to determine whether there should be exta gaps in sequence A or sequence B. The gap
+ * issues at the 5' end of the alignment seem to take care of themselves in the traceback.
+ *
+ */
+using namespace std;
+
+#include "alignmentcell.hpp"
+#include "overlap.hpp"
+
+
+/**************************************************************************************************/
+
+int Overlap::maxRow(vector<vector<AlignmentCell> >& alignment, const int band){
+
+ float max = -100;
+ int end = lA - 1;
+ int index = end;
+
+ for(int i=band;i<lB;i++){ // find the row where the right most column has the highest alignment
+ if(alignment[i][end].cValue >= max){ // score.
+ index = i;
+ max = alignment[i][end].cValue;
+ }
+ }
+ return index;
+}
+
+/**************************************************************************************************/
+
+int Overlap::maxColumn(vector<vector<AlignmentCell> >& alignment, const int band){
+
+ float max = -100;
+ int end = lB - 1;
+ int index = end;
+
+ for(int i=band;i<lA;i++){ // find the column where the bottom most column has the highest
+ if(alignment[end][i].cValue >= max){ // alignment score.
+ index = i;
+ max = alignment[end][i].cValue;
+ }
+ }
+ return index;
+}
+
+/**************************************************************************************************/
+
+void Overlap::setOverlap(vector<vector<AlignmentCell> >& alignment, const int nA, const int nB, const int band=0){
+
+ lA = nA;
+ lB = nB;
+
+ int rowIndex = maxRow(alignment, band); // get the index for the row with the highest right hand side score
+ int colIndex = maxColumn(alignment, band); // get the index for the column with the highest bottom row score
+
+ int row = lB-1;
+ int column = lA-1;
+
+ if(colIndex == column && rowIndex == row){} // if the max values are the lower right corner, then we're good
+ else if(alignment[row][colIndex].cValue < alignment[rowIndex][column].cValue){
+ for(int i=rowIndex+1;i<lB;i++){ // decide whether sequence A or B needs the gaps at the end either set
+ alignment[i][column].prevCell = 'u';// the pointer upwards or...
+ }
+
+ }
+ else {
+ for(int i=colIndex+1;i<lA;i++){
+ alignment[row][i].prevCell = 'l'; // ...to the left
+ }
+ }
+} // the traceback should take care of the gaps at the 5' end
+
+/**************************************************************************************************/
+
--- /dev/null
+#ifndef OVERLAP_H
+#define OVERLAP_H
+
+/*
+ * overlap.hpp
+ *
+ *
+ * Created by Pat Schloss on 12/15/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This class cleans up the alignment at the 3' end of the alignments. Because the Gotoh and Needleman-Wunsch
+ * algorithms start the traceback from the lower-right corner of the dynamic programming matrix, there may be a lot of
+ * scattered bases in the alignment near the 3' end of the alignment. Here we basically look for the largest score
+ * in the last column and row to determine whether there should be exta gaps in sequence A or sequence B. The gap
+ * issues at the 5' end of the alignment seem to take care of themselves in the traceback.
+ *
+ */
+
+#include "mothur.h"
+
+/**************************************************************************************************/
+
+class Overlap {
+
+public:
+ Overlap(){};
+ ~Overlap(){};
+ void setOverlap(vector<vector<AlignmentCell> >&, const int, const int, const int);
+private:
+ int maxRow(vector<vector<AlignmentCell> >&, const int);
+ int maxColumn(vector<vector<AlignmentCell> >&, const int);
+ int lA, lB;
+};
+
+/**************************************************************************************************/
+
+#endif
//randomize the groups
random_shuffle(lookup.begin(), lookup.end());
+ //make merge the size of lookup[0]
+ SharedRAbundVector* merge = new SharedRAbundVector(lookup[0]->size());
+
+ //make copy of lookup zero
+ for(int i = 0; i<lookup[0]->size(); i++) {
+ merge->set(i, lookup[0]->getAbundance(i), "merge");
+ }
+
vector<SharedRAbundVector*> subset;
//send each group one at a time
for (int k = 0; k < lookup.size(); k++) {
subset.clear(); //clears out old pair of sharedrabunds
//add in new pair of sharedrabunds
- subset.push_back(lookup[0]); subset.push_back(lookup[k]);
+ subset.push_back(merge); subset.push_back(lookup[k]);
rcd->updateSharedData(subset, k+1, numGroupComb);
- mergeVectors(lookup[0], lookup[k]);
+ mergeVectors(merge, lookup[k]);
}
//resets output files
try{
for (int k = 0; k < shared1->size(); k++) {
//merge new species into shared1
- if ((shared1->getAbundance(k) == 0) && (shared2->getAbundance(k) != 0)) {
- shared1->set(k, shared2->getAbundance(k), "combo"); //set to 'combo' since this vector now contains multiple groups
- }
+ shared1->set(k, (shared1->getAbundance(k) + shared2->getAbundance(k)), "combo"); //set to 'combo' since this vector now contains multiple groups
}
}
catch(exception& e) {
}
setUnaligned(sequence);
}
+//********************************************************************************************************************
+
+Sequence::Sequence(ifstream& fastaFile){
+
+ string accession; // provided a file handle to a fasta-formatted sequence file, read in the next
+ fastaFile >> accession; // accession number and sequence we find...
+ setName(accession);
+
+ char letter;
+ string sequence;
+
+ while(fastaFile){
+ letter= fastaFile.get();
+ if(letter == '>'){
+ fastaFile.putback(letter);
+ break;
+ }
+ else if(isprint(letter)){
+ letter = toupper(letter);
+ if(letter == 'U'){letter = 'T';}
+ sequence += letter;
+ }
+
+ }
+
+ if(sequence.find_first_of('-') != string::npos){ // if there are any gaps in the sequence, assume that it is
+ setAligned(sequence); // an alignment file
+ }
+ setUnaligned(sequence); // also set the unaligned sequence file
+}
//********************************************************************************************************************
}
//********************************************************************************************************************
+
+int Sequence::getUnalignLength(){
+ return unaligned.length();
+}
+
+//********************************************************************************************************************
+
+int Sequence::getAlignLength(){
+ return aligned.length();
+}
+
+//********************************************************************************************************************
+
+
+
* Created by Pat Schloss on 12/15/08.
* Copyright 2008 Patrick D. Schloss. All rights reserved.
*
+ * A sequence object has three components: i) an accession number / name, ii) the unaligned primary sequence, iii) a
+ * pairwise aligned sequence, and iv) a sequence that is aligned to a reference alignment. This class has methods
+ * to set and get these values for the other classes where they are needed. *
+ *
*/
using namespace std;
public:
Sequence();
Sequence(string, string);
+ Sequence(ifstream&);
void setName(string);
void setUnaligned(string);
string getAligned();
string getPairwise();
string getUnaligned();
- int getLength();
+ int getLength(); //the greater of the lengths of unaligned and aligned
+ int getUnalignLength();
+ int getAlignLength();
void printSequence(ostream&);
private:
--- /dev/null
+/*
+ * suffixdb.cpp
+ *
+ *
+ * Created by Pat Schloss on 12/16/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This is a child class of the Database abstract datatype. The class is basically a database of suffix trees and an
+ * encapsulation of the method for finding the most similar tree to an inputted sequence. the suffixForest objecct
+ * is a vector of SuffixTrees, with each template sequence being represented by a different SuffixTree. The class also
+ * provides a method to take an unaligned sequence and find the closest sequence in the suffixForest. The search
+ * method is inspired by the article and Perl source code provided at http://www.ddj.com/web-development/184416093. I
+ * would estimate that the time complexity is O(LN) for each search, which is slower than the kmer searching, but
+ * faster than blast
+ *
+ */
+
+using namespace std;
+
+#include "database.hpp"
+#include "sequence.hpp"
+#include "suffixtree.hpp"
+#include "suffixdb.hpp"
+
+/**************************************************************************************************/
+
+SuffixDB::SuffixDB(string fastaFileName) : Database(fastaFileName) {
+
+ suffixForest.resize(numSeqs);
+ cout << "Generating the suffix tree database...\t"; cout.flush();
+ for(int i=0;i<numSeqs;i++){ // The parent class' constructor generates the vector of
+ suffixForest[i].loadSequence(templateSequences[i]); // template Sequence objects. Here each of these objects
+ } // is used to generate a suffix tree, aka the suffix forest
+ cout << "DONE." << endl << endl; cout.flush();
+
+}
+
+/**************************************************************************************************/
+
+Sequence* SuffixDB::findClosestSequence(Sequence* candidateSeq){
+
+ int minValue = 2000;
+ int closestSequenceNo = 0;
+ string processedSeq = candidateSeq->convert2ints(); // the candidate sequence needs to be a string of ints
+ for(int i=0;i<suffixForest.size();i++){ // scan through the forest and see what the minimum
+ int count = suffixForest[i].countSuffixes(processedSeq, minValue); // return score is and keep track of the
+ if(count == minValue){ // template sequence index that corresponds to that score
+ closestSequenceNo = i;
+ }
+ }
+ searchScore = 100 * (1. - minValue / (float)processedSeq.length());
+ return templateSequences[closestSequenceNo]; // return the Sequence object that has the minimum score
+
+}
+
+/**************************************************************************************************/
--- /dev/null
+#ifndef SUFFIXDB_HPP
+#define SUFFIXDB_HPP
+
+/*
+ * suffixdb.hpp
+ *
+ *
+ * Created by Pat Schloss on 12/16/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This is a child class of the Database abstract datatype. The class is basically a database of suffix trees and an
+ * encapsulation of the method for finding the most similar tree to an inputted sequence. the suffixForest objecct
+ * is a vector of SuffixTrees, with each template sequence being represented by a different SuffixTree. The class also
+ * provides a method to take an unaligned sequence and find the closest sequence in the suffixForest. The search
+ * method is inspired by the article and Perl source code provided at http://www.ddj.com/web-development/184416093. I
+ * would estimate that the time complexity is O(LN) for each search, which is slower than the kmer searching, but
+ * faster than blast
+ *
+ */
+
+#include "mothur.h"
+#include "database.hpp"
+
+class SuffixTree;
+
+class SuffixDB : public Database {
+
+public:
+ SuffixDB(string);
+ Sequence* findClosestSequence(Sequence*);
+
+private:
+ vector<SuffixTree> suffixForest;
+};
+
+#endif
--- /dev/null
+/*
+ * SuffixNodes.cpp
+ *
+ *
+ * Created by Pat Schloss on 12/15/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * There are two types of nodes in a suffix tree as I have implemented it. First, there are the internal nodes that
+ * have children, these are the SuffixBranch objects. There are also the terminal nodes, which are the suffixBranches.
+ * I divided them into two groups to save on memory. A SuffixTree object will be a vector of SuffixNodes; therefore,
+ * the values of parentNode, children nodes, and suffix nodes are stored as ints that correspond to indices in the
+ * vector
+ *
+ */
+
+#include "suffixnodes.hpp"
+
+using namespace std;
+
+//********************************************************************************************************************
+
+inline char deCodeSequence(char code){
+
+ if(code == '0') { return 'a'; } // this method allows us to go from the int string to a char string;
+ else if(code == '1') { return 'c'; } // it's only really useful if we want to print out the tree
+ else if(code == '2') { return 'g'; }
+ else if(code == '3') { return 't'; }
+ else if(code == '4') { return 'n'; }
+ else { return '$'; }
+
+}
+
+//********************************************************************************************************************
+
+SuffixNode::SuffixNode(int parent, int start, int end) :
+ parentNode(parent), // we store the parent node as an int
+ startCharPosition(start), // the suffix tree class will hold the sequence that the startCharPosition and
+ endCharPosition(end) // endCharPosition indices correspond to
+ { /* do nothing */ }
+
+
+void SuffixNode::setChildren(char, int) { /* do nothing */ } // there's no children in a leaf
+int SuffixNode::getNumChildren() { return 0; } // ditto
+void SuffixNode::eraseChild(char) { /* do nothing */ } // ditto
+int SuffixNode::getChild(char) { return -1; } // ditto
+void SuffixNode::setSuffixNode(int) { /* do nothing */ } // there's no suffix node in a leaf
+int SuffixNode::getSuffixNode() { return -1; } // ditto
+int SuffixNode::getParentNode() { return parentNode; }
+void SuffixNode::setParentNode(int number) { parentNode = number; }
+int SuffixNode::getStartCharPos() { return startCharPosition; }
+void SuffixNode::setStartCharPos(int start) { startCharPosition = start; }
+int SuffixNode::getEndCharPos() { return endCharPosition; }
+
+//********************************************************************************************************************
+
+SuffixLeaf::SuffixLeaf(int parent, int start, int end) : SuffixNode(parent, start, end) { /* do nothing */ }
+
+
+void SuffixLeaf::print(string sequence, int nodeNumber){
+
+ cout << this << '\t' << parentNode << '\t' << nodeNumber << '\t' <<
+ -1 << '\t' << startCharPosition << '\t' << endCharPosition << '\t';
+
+ cout << '\'';
+ for(int i=startCharPosition;i<=endCharPosition;i++){
+ cout << deCodeSequence(sequence[i]);
+ }
+ cout << '\'' << endl;
+}
+
+//********************************************************************************************************************
+
+SuffixBranch::SuffixBranch(int parent, int start, int end) : SuffixNode(parent, start, end), suffixNode(-1){
+ childNodes.assign(6, -1);
+}
+
+void SuffixBranch::print(string sequence, int nodeNumber){ // this method is different that than
+ cout << this << '\t' << parentNode << '\t' << nodeNumber << '\t' << // of a leaf because it prints out a
+ suffixNode << '\t' << startCharPosition << '\t' << endCharPosition << '\t'; // value for the suffix node
+
+ cout << '\'';
+ for(int i=startCharPosition;i<=endCharPosition;i++){
+ cout << deCodeSequence(sequence[i]);
+ }
+ cout << '\'' << endl;
+}
+
+// we can access the children by subtracting '0' from the the char value from the string, the difference is an int
+// value and the index we need to access.
+void SuffixBranch::eraseChild(char base) { childNodes[base - '0'] = -1; } //to erase set the child index to -1
+void SuffixBranch::setChildren(char base, int nodeIndex){ childNodes[base - '0'] = nodeIndex; }
+void SuffixBranch::setSuffixNode(int nodeIndex){ suffixNode = nodeIndex; }
+int SuffixBranch::getSuffixNode() { return suffixNode; }
+int SuffixBranch::getChild(char base) { return childNodes[base - '0']; }
+
+//********************************************************************************************************************
--- /dev/null
+#ifndef SUFFIXNODES_H
+#define SUFFIXNODES_H
+
+/*
+ * SuffixNodes.h
+ *
+ *
+ * Created by Pat Schloss on 12/15/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * There are two types of nodes in a suffix tree as I have implemented it. First, there are the internal nodes that
+ * have children, these are the SuffixBranch objects. There are also the terminal nodes, which are the suffixBranches.
+ * I divided them into two groups to save on memory. A SuffixTree object will be a vector of SuffixNodes; therefore,
+ * the values of parentNode, children nodes, and suffix nodes are stored as ints that correspond to indices in the
+ * vector
+ *
+ */
+
+#include "mothur.h"
+
+using namespace std;
+
+//********************************************************************************************************************
+
+class SuffixNode {
+
+public:
+ SuffixNode(int, int, int);
+ virtual void print(string, int) = 0;
+ virtual void setChildren(char, int);
+ virtual int getNumChildren();
+ virtual void eraseChild(char);
+ virtual void setSuffixNode(int);
+ virtual int getSuffixNode();
+ virtual int getChild(char);
+ int getParentNode();
+ void setParentNode(int);
+ int getStartCharPos();
+ void setStartCharPos(int start);
+ int getEndCharPos();
+
+protected:
+ int parentNode;
+ int startCharPosition;
+ int endCharPosition;
+};
+
+//********************************************************************************************************************
+
+class SuffixLeaf : public SuffixNode { // most of the methods are already set in the parent class
+
+public:
+ SuffixLeaf(int, int, int); // we just need to define a constructor and
+ void print(string, int); // print method
+};
+
+//********************************************************************************************************************
+
+class SuffixBranch : public SuffixNode {
+
+public:
+ SuffixBranch(int, int, int);
+ void print(string, int); // need a special method for printing the node because there are children
+ void eraseChild(char); // need a special method for erasing the children
+ void setChildren(char, int); // need a special method for setting children
+ void setSuffixNode(int); // need a special method for setting the suffix node
+ int getSuffixNode(); // need a special method for returning the suffix node
+ int getChild(char); // need a special method for return children
+
+private:
+ vector<int> childNodes; // a suffix branch is unique because it has children and a suffixNode. The
+ int suffixNode; // are stored in a vector for super-fast lookup. If the alphabet were bigger, this
+}; // might not be practical. Since we only have 5 possible letters, it makes sense
+
+//********************************************************************************************************************
+
+#endif
--- /dev/null
+/*
+ * suffixtree.cpp
+ *
+ *
+ * Created by Pat Schloss on 12/15/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This is my half-assed attempt to implement a suffix tree. This is a cobbled together algorithm using materials that
+ * I found at http://marknelson.us/1996/08/01/suffix-trees/ and:
+ *
+ * Ukkonen E. (1995). On-line construction of suffix trees. Algorithmica 14 (3): 249--260
+ * Gusfield, Dan (1999). Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology.
+ * USA: Cambridge University Press
+ *
+ * The Ukkonen paper is the seminal paper describing the on-line method of constructing a suffix tree.
+ *
+ * I have chosen to store the nodes of the tree as a vector of pointers to SuffixNode objects. The root is stored at
+ * nodeVector[0]. Each tree also stores the sequence name and the string that corresponds to the actual sequence.
+ * Finally, this class provides a way of counting the number of suffixes that are needed in one tree to generate a new
+ * sequence (countSuffixes). This method is used to determine similarity between sequences and was inspired by the
+ * article and Perl source code provided at http://www.ddj.com/web-development/184416093.
+ *
+ */
+
+#include "sequence.hpp"
+#include "suffixnodes.hpp"
+#include "suffixtree.hpp"
+
+using namespace std;
+
+//********************************************************************************************************************
+
+inline bool compareParents(SuffixNode* left, SuffixNode* right){// this is necessary to print the tree and to sort the
+ return (left->getParentNode() < right->getParentNode()); // nodes in order of their parent
+}
+
+//********************************************************************************************************************
+
+SuffixTree::SuffixTree(){}
+
+//********************************************************************************************************************
+
+SuffixTree::~SuffixTree(){
+ for(int i=0;i<nodeVector.size();i++){ delete nodeVector[i]; }
+}
+
+//********************************************************************************************************************
+
+void SuffixTree::loadSequence(Sequence* seq){
+ nodeCounter = 0; // initially there are 0 nodes in the tree
+ activeStartPosition = 0;
+ activeEndPosition = -1;
+ seqName = seq->getName();
+ sequence = seq->convert2ints();
+ sequence += '5'; // this essentially concatenates a '$' to the end of the sequence to
+ int seqLength = sequence.length(); // make it a cononical suffix tree
+
+ nodeVector.push_back(new SuffixBranch(-1, 0, -1)); // enter the root of the suffix tree
+
+ activeNode = root = 0;
+ string hold;
+ for(int i=0;i<seqLength;i++){
+ addPrefix(i); // step through the sequence adding each prefix
+ }
+}
+
+//********************************************************************************************************************
+
+string SuffixTree::getSeqName() {
+ return seqName;
+}
+
+//********************************************************************************************************************
+
+void SuffixTree::print(){
+ vector<SuffixNode*> hold = nodeVector;
+ sort(hold.begin(), hold.end(), compareParents);
+ cout << "Address\t\tParent\tNode\tSuffix\tStartC\tEndC\tSuffix" << endl;
+ for(int i=1;i<=nodeCounter;i++){
+ hold[i]->print(sequence, i);
+ }
+}
+
+//********************************************************************************************************************
+
+int SuffixTree::countSuffixes(string compareSequence, int& minValue){ // here we count the number of suffix parts
+ // we need to rewrite a user supplied sequence. if the
+ int numSuffixes = 0; // count exceeds the supplied minValue, bail out. The
+ int seqLength = compareSequence.length(); // time complexity should be O(L)
+ int position = 0;
+
+ int presentNode = 0;
+
+ while(position < seqLength){ // while the position in the query sequence isn't at the end...
+
+ if(numSuffixes > minValue) { return 1000000; } // bail if the count gets too high
+
+ int newNode = nodeVector[presentNode]->getChild(compareSequence[position]); // see if the current node has a
+ // child that matches the next character in the query
+ if(newNode == -1){
+ if(presentNode == 0){ position++; } // if not, go back to the root and increase the count
+ numSuffixes++; // by one.
+ presentNode = 0;
+ }
+ else{ // if there is, move to that node and see how far down
+ presentNode = newNode; // it we can get
+
+ for(int i=nodeVector[newNode]->getStartCharPos(); i<=nodeVector[newNode]->getEndCharPos(); i++){
+ if(compareSequence[position] == sequence[i]){
+ position++; // as long as the query and branch agree, keep going
+ }
+ else{
+ numSuffixes++; // if there is a mismatch, increase the number of
+ presentNode = 0; // suffixes and go back to the root
+ break;
+ }
+ }
+ }
+ // if we get all the way through the node we'll go to the top of the while loop and find the child node
+ // that corresponds to what we are interested in
+ }
+ numSuffixes--; // the method puts an extra count on numSuffixes
+
+ if(numSuffixes < minValue) { minValue = numSuffixes; } // if the count is less than the previous minValue,
+ return numSuffixes; // change the value and return the number of suffixes
+
+}
+
+//********************************************************************************************************************
+
+void SuffixTree::canonize(){ // if you have to ask how this works, you don't really want to know and this really
+ // isn't the place to ask.
+ if ( isExplicit() == 0 ) { // if the node has no children...
+
+ int tempNodeIndex = nodeVector[activeNode]->getChild(sequence[activeStartPosition]);
+ SuffixNode* tempNode = nodeVector[tempNodeIndex];
+
+ int span = tempNode->getEndCharPos() - tempNode->getStartCharPos();
+
+ while ( span <= ( activeEndPosition - activeStartPosition ) ) {
+
+ activeStartPosition = activeStartPosition + span + 1;
+
+ activeNode = tempNodeIndex;
+
+ if ( activeStartPosition <= activeEndPosition ) {
+ tempNodeIndex = nodeVector[tempNodeIndex]->getChild(sequence[activeStartPosition]);
+ tempNode = nodeVector[tempNodeIndex];
+ span = tempNode->getEndCharPos() - tempNode->getStartCharPos();
+ }
+
+ }
+ }
+}
+
+//********************************************************************************************************************
+
+int SuffixTree::split(int nodeIndex, int position){ // leaves stay leaves, etc, to split a leaf we make a new interior
+ // node and reconnect everything
+ SuffixNode* node = nodeVector[nodeIndex]; // get the node that needs to be split
+ SuffixNode* parentNode = nodeVector[node->getParentNode()]; // get it's parent node
+
+ parentNode->eraseChild(sequence[node->getStartCharPos()]); // erase the present node from the registry of its parent
+
+ nodeCounter++;
+ SuffixNode* newNode = new SuffixBranch(node->getParentNode(), node->getStartCharPos(), node->getStartCharPos() + activeEndPosition - activeStartPosition); // create a new node that will link the parent with the old child
+ parentNode->setChildren(sequence[newNode->getStartCharPos()], nodeCounter);// give the parent the new child
+ nodeVector.push_back(newNode);
+
+ node->setParentNode(nodeCounter); // give the original node the new node as its parent
+ newNode->setChildren(sequence[node->getStartCharPos() + activeEndPosition - activeStartPosition + 1], nodeIndex);
+ // put the original node in the registry of the new node's children
+ newNode->setSuffixNode(activeNode);//link the new node with the old active node
+
+ // recalculate the startCharPosition of the outermost node
+ node->setStartCharPos(node->getStartCharPos() + activeEndPosition - activeStartPosition + 1 );
+
+ return node->getParentNode();
+}
+
+//********************************************************************************************************************
+
+void SuffixTree::makeSuffixLink(int& previous, int present){
+
+// here we link the nodes that are suffixes of one another to rapidly speed through the tree
+ if ( previous > 0 ) { nodeVector[previous]->setSuffixNode(present); }
+ else { /* do nothing */ }
+
+ previous = present;
+}
+
+//********************************************************************************************************************
+
+void SuffixTree::addPrefix(int prefixPosition){
+
+ int lastParentNode = -1; // we need to place a new prefix in the suffix tree
+ int parentNode = 0;
+
+ while(1){
+
+ parentNode = activeNode;
+
+ if(isExplicit() == 1){ // if the node is explicit (has kids), try to follow it down the branch if its there...
+ if(nodeVector[activeNode]->getChild(sequence[prefixPosition]) != -1){ // break out and get next prefix...
+ break;
+ }
+ else{ // ...otherwise continue, we'll need to make a new node later on...
+ }
+ }
+ else{ // if it's not explicit (no kids), read through and see if all of the chars agree...
+ int tempNode = nodeVector[activeNode]->getChild(sequence[activeStartPosition]);
+ int span = activeEndPosition - activeStartPosition;
+
+ if(sequence[nodeVector[tempNode]->getStartCharPos() + span + 1] == sequence[prefixPosition] ){
+ break; // if the existing suffix agrees with the new one, grab a new prefix...
+ }
+ else{
+ parentNode = split(tempNode, prefixPosition); // ... otherwise we need to split the node
+ }
+
+ }
+
+ nodeCounter++; // we need to generate a new node here if the kid didn't exist, or we split a node
+ SuffixNode* newSuffixLeaf = new SuffixLeaf(parentNode, prefixPosition, sequence.length()-1);
+ nodeVector[parentNode]->setChildren(sequence[prefixPosition], nodeCounter);
+ nodeVector.push_back(newSuffixLeaf);
+
+ makeSuffixLink( lastParentNode, parentNode ); // make a suffix link for the parent node
+
+ if(nodeVector[activeNode]->getParentNode() == -1){ // move along the start position for the tree
+ activeStartPosition++;
+ }
+ else {
+ activeNode = nodeVector[activeNode]->getSuffixNode();
+ }
+ canonize(); // frankly, i'm not entirely clear on what canonize does.
+ }
+
+ makeSuffixLink( lastParentNode, parentNode );
+ activeEndPosition++; // move along the end position for the tree
+
+ canonize(); // frankly, i'm not entirely clear on what canonize does.
+
+}
+
+//********************************************************************************************************************
+
--- /dev/null
+#ifndef SUFFIXTREE_H
+#define SUFFIXTREE_H
+
+/*
+ * suffixtree.h
+ *
+ *
+ * Created by Pat Schloss on 12/15/08.
+ * Copyright 2008 Patrick D. Schloss. All rights reserved.
+ *
+ * This is my half-assed attempt to implement a suffix tree. This is a cobbled together algorithm using materials that
+ * I found at http://marknelson.us/1996/08/01/suffix-trees/ and:
+ *
+ * Ukkonen E. (1995). On-line construction of suffix trees. Algorithmica 14 (3): 249--260
+ * Gusfield, Dan (1999). Algorithms on Strings, Trees and Sequences: Computer Science and Computational Biology.
+ * USA: Cambridge University Press
+ *
+ * The Ukkonen paper is the seminal paper describing the on-line method of constructing a suffix tree.
+ *
+ * I have chosen to store the nodes of the tree as a vector of pointers to SuffixNode objects. The root is stored at
+ * nodeVector[0]. Each tree also stores the sequence name and the string that corresponds to the actual sequence.
+ * Finally, this class provides a way of counting the number of suffixes that are needed in one tree to generate a new
+ * sequence (countSuffixes). This method is used to determine similarity between sequences and was inspired by the
+ * article and Perl source code provided at http://www.ddj.com/web-development/184416093.
+ *
+ */
+
+#include "mothur.h"
+
+using namespace std;
+
+class SuffixNode;
+
+//********************************************************************************************************************
+
+class SuffixTree {
+
+public:
+ SuffixTree();
+ ~SuffixTree();
+// SuffixTree(string, string);
+
+ void loadSequence(Sequence*);
+ string getSeqName();
+ void print();
+ int countSuffixes(string, int&);
+
+private:
+ void addPrefix(int);
+ void canonize();
+ int split(int, int);
+ void makeSuffixLink(int&, int);
+ bool isExplicit(){ return activeStartPosition > activeEndPosition; }
+
+ int activeStartPosition;
+ int activeEndPosition;
+
+ vector<SuffixNode*> nodeVector;
+ int root;
+ int activeNode;
+ int nodeCounter;
+ string seqName;
+ string sequence;
+
+};
+
+//********************************************************************************************************************
+
+#endif
commands["concensus"] = "concensus";
commands["help"] = "help";
commands["filter.seqs"] = "filter.seqs";
+ commands["align.seqs"] = "align.seqs";
commands["quit"] = "quit";
//{"parameter1","parameter2",...,"last parameter"};
string readdistArray[] = {"phylip","column", "name","cutoff","precision", "group"};
-
commandParameters["read.dist"] = addParameters(readdistArray, sizeof(readdistArray)/sizeof(string));
string readotuArray[] = {"list","order","shared", "line", "label","group","sabund", "rabund"};
string distanceArray[] = {"fasta","phylip","clustal","nexus", "calc", "ends", "cutoff", "processors"};
commandParameters["dist.seqs"] = addParameters(distanceArray, sizeof(distanceArray)/sizeof(string));
+ string AlignArray[] = {"fasta","phylip","clustal","nexus", "template", "ksize", "align", "match", "mismatch", "gapopen", "gapextend"};
+ commandParameters["align.seqs"] = addParameters(AlignArray, sizeof(AlignArray)/sizeof(string));
+
string quitArray[] = {};
commandParameters["quit"] = addParameters(quitArray, sizeof(quitArray)/sizeof(string));