From: westcott Date: Mon, 16 May 2011 19:22:44 +0000 (+0000) Subject: added chimera.uchime X-Git-Url: https://git.donarmstrong.com/?p=mothur.git;a=commitdiff_plain;h=a150db3c2bfd7b76420048a1e94ebe397f2c6045 added chimera.uchime --- diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index 49ad9cc..e80c733 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -15,6 +15,29 @@ A71CB160130B04A2001E7287 /* anosimcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71CB15E130B04A2001E7287 /* anosimcommand.cpp */; }; A71FE12C12EDF72400963CA7 /* mergegroupscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */; }; A727864412E9E28C00F86ABA /* removerarecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727864312E9E28C00F86ABA /* removerarecommand.cpp */; }; + A74D3687137DAB8300332B0C /* addtargets2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3655137DAB8300332B0C /* addtargets2.cpp */; }; + A74D3688137DAB8400332B0C /* alignchime.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3656137DAB8300332B0C /* alignchime.cpp */; }; + A74D3689137DAB8400332B0C /* alignchimel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3657137DAB8300332B0C /* alignchimel.cpp */; }; + A74D368A137DAB8400332B0C /* alnparams.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D365A137DAB8300332B0C /* alnparams.cpp */; }; + A74D368B137DAB8400332B0C /* alpha.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D365C137DAB8300332B0C /* alpha.cpp */; }; + A74D368C137DAB8400332B0C /* alpha2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D365E137DAB8300332B0C /* alpha2.cpp */; }; + A74D368D137DAB8400332B0C /* fractid.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3664137DAB8300332B0C /* fractid.cpp */; }; + A74D368E137DAB8400332B0C /* getparents.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3665137DAB8300332B0C /* 
getparents.cpp */; }; + A74D368F137DAB8400332B0C /* globalalign2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3666137DAB8300332B0C /* globalalign2.cpp */; }; + A74D3690137DAB8400332B0C /* make3way.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D366A137DAB8300332B0C /* make3way.cpp */; }; + A74D3691137DAB8400332B0C /* mx.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D366B137DAB8300332B0C /* mx.cpp */; }; + A74D3692137DAB8400332B0C /* myutils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D366E137DAB8300332B0C /* myutils.cpp */; }; + A74D3693137DAB8400332B0C /* path.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3672137DAB8300332B0C /* path.cpp */; }; + A74D3694137DAB8400332B0C /* searchchime.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3674137DAB8300332B0C /* searchchime.cpp */; }; + A74D3695137DAB8400332B0C /* seqdb.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3676137DAB8300332B0C /* seqdb.cpp */; }; + A74D3696137DAB8400332B0C /* setnucmx.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3678137DAB8300332B0C /* setnucmx.cpp */; }; + A74D3697137DAB8400332B0C /* sfasta.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3679137DAB8300332B0C /* sfasta.cpp */; }; + A74D3698137DAB8400332B0C /* tracebackbit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D367F137DAB8300332B0C /* tracebackbit.cpp */; }; + A74D3699137DAB8400332B0C /* uchime_main.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3681137DAB8300332B0C /* uchime_main.cpp */; }; + A74D369A137DAB8400332B0C /* usort.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3683137DAB8300332B0C /* usort.cpp */; }; + A74D369B137DAB8400332B0C /* viterbifast.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3684137DAB8300332B0C /* viterbifast.cpp */; }; + A74D369C137DAB8400332B0C /* writechhit.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D3686137DAB8300332B0C /* writechhit.cpp */; }; + A74D36B8137DAFAA00332B0C /* 
chimerauchimecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */; }; A75790591301749D00A30DAB /* homovacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A75790581301749D00A30DAB /* homovacommand.cpp */; }; A778FE6B134CA6CA00C0BA33 /* getcommandinfocommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A778FE6A134CA6CA00C0BA33 /* getcommandinfocommand.cpp */; }; A799F5B91309A3E000AEEFA0 /* makefastqcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A799F5B81309A3E000AEEFA0 /* makefastqcommand.cpp */; }; @@ -323,6 +346,58 @@ A71FE12B12EDF72400963CA7 /* mergegroupscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mergegroupscommand.cpp; sourceTree = ""; }; A727864212E9E28C00F86ABA /* removerarecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removerarecommand.h; sourceTree = ""; }; A727864312E9E28C00F86ABA /* removerarecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removerarecommand.cpp; sourceTree = ""; }; + A74D3655137DAB8300332B0C /* addtargets2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = addtargets2.cpp; sourceTree = ""; }; + A74D3656137DAB8300332B0C /* alignchime.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignchime.cpp; sourceTree = ""; }; + A74D3657137DAB8300332B0C /* alignchimel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alignchimel.cpp; sourceTree = ""; }; + A74D3658137DAB8300332B0C /* allocs.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = allocs.h; sourceTree = ""; }; + A74D3659137DAB8300332B0C /* alnheuristics.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = 
alnheuristics.h; sourceTree = ""; }; + A74D365A137DAB8300332B0C /* alnparams.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alnparams.cpp; sourceTree = ""; }; + A74D365B137DAB8300332B0C /* alnparams.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alnparams.h; sourceTree = ""; }; + A74D365C137DAB8300332B0C /* alpha.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alpha.cpp; sourceTree = ""; }; + A74D365D137DAB8300332B0C /* alpha.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = alpha.h; sourceTree = ""; }; + A74D365E137DAB8300332B0C /* alpha2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = alpha2.cpp; sourceTree = ""; }; + A74D365F137DAB8300332B0C /* chainer.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chainer.h; sourceTree = ""; }; + A74D3660137DAB8300332B0C /* chime.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chime.h; sourceTree = ""; }; + A74D3661137DAB8300332B0C /* diagbox.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = diagbox.h; sourceTree = ""; }; + A74D3662137DAB8300332B0C /* dp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = dp.h; sourceTree = ""; }; + A74D3663137DAB8300332B0C /* evalue.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = evalue.h; sourceTree = ""; }; + A74D3664137DAB8300332B0C /* fractid.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = fractid.cpp; sourceTree = ""; }; + A74D3665137DAB8300332B0C /* getparents.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getparents.cpp; sourceTree 
= ""; }; + A74D3666137DAB8300332B0C /* globalalign2.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = globalalign2.cpp; sourceTree = ""; }; + A74D3667137DAB8300332B0C /* help.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = help.h; sourceTree = ""; }; + A74D3668137DAB8300332B0C /* hsp.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hsp.h; sourceTree = ""; }; + A74D3669137DAB8300332B0C /* hspfinder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hspfinder.h; sourceTree = ""; }; + A74D366A137DAB8300332B0C /* make3way.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = make3way.cpp; sourceTree = ""; }; + A74D366B137DAB8300332B0C /* mx.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = mx.cpp; sourceTree = ""; }; + A74D366C137DAB8300332B0C /* mx.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = mx.h; sourceTree = ""; }; + A74D366D137DAB8300332B0C /* myopts.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = myopts.h; sourceTree = ""; }; + A74D366E137DAB8300332B0C /* myutils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = myutils.cpp; sourceTree = ""; }; + A74D366F137DAB8300332B0C /* myutils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = myutils.h; sourceTree = ""; }; + A74D3670137DAB8300332B0C /* orf.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = orf.h; sourceTree = ""; }; + A74D3671137DAB8300332B0C /* out.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = out.h; sourceTree = ""; }; + A74D3672137DAB8300332B0C /* path.cpp */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = path.cpp; sourceTree = ""; }; + A74D3673137DAB8300332B0C /* path.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = path.h; sourceTree = ""; }; + A74D3674137DAB8300332B0C /* searchchime.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = searchchime.cpp; sourceTree = ""; }; + A74D3675137DAB8300332B0C /* seq.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seq.h; sourceTree = ""; }; + A74D3676137DAB8300332B0C /* seqdb.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = seqdb.cpp; sourceTree = ""; }; + A74D3677137DAB8300332B0C /* seqdb.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = seqdb.h; sourceTree = ""; }; + A74D3678137DAB8300332B0C /* setnucmx.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = setnucmx.cpp; sourceTree = ""; }; + A74D3679137DAB8300332B0C /* sfasta.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sfasta.cpp; sourceTree = ""; }; + A74D367A137DAB8300332B0C /* sfasta.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sfasta.h; sourceTree = ""; }; + A74D367B137DAB8300332B0C /* svnmods.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = svnmods.h; sourceTree = ""; }; + A74D367C137DAB8300332B0C /* svnversion.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = svnversion.h; sourceTree = ""; }; + A74D367D137DAB8300332B0C /* timers.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = timers.h; sourceTree = ""; }; + A74D367E137DAB8300332B0C /* timing.h */ = {isa = PBXFileReference; fileEncoding = 
4; lastKnownFileType = sourcecode.c.h; path = timing.h; sourceTree = ""; }; + A74D367F137DAB8300332B0C /* tracebackbit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = tracebackbit.cpp; sourceTree = ""; }; + A74D3680137DAB8300332B0C /* uc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = uc.h; sourceTree = ""; }; + A74D3681137DAB8300332B0C /* uchime_main.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = uchime_main.cpp; sourceTree = ""; }; + A74D3682137DAB8300332B0C /* ultra.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ultra.h; sourceTree = ""; }; + A74D3683137DAB8300332B0C /* usort.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = usort.cpp; sourceTree = ""; }; + A74D3684137DAB8300332B0C /* viterbifast.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = viterbifast.cpp; sourceTree = ""; }; + A74D3685137DAB8300332B0C /* windex.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = windex.h; sourceTree = ""; }; + A74D3686137DAB8300332B0C /* writechhit.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = writechhit.cpp; sourceTree = ""; }; + A74D36B6137DAFAA00332B0C /* chimerauchimecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimerauchimecommand.h; sourceTree = ""; }; + A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = chimerauchimecommand.cpp; sourceTree = ""; }; A75790571301749D00A30DAB /* homovacommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = homovacommand.h; sourceTree = ""; }; 
A75790581301749D00A30DAB /* homovacommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = homovacommand.cpp; sourceTree = ""; }; A778FE69134CA6CA00C0BA33 /* getcommandinfocommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getcommandinfocommand.h; sourceTree = ""; }; @@ -1032,6 +1107,63 @@ name = Products; sourceTree = ""; }; + A74D3644137DA7CE00332B0C /* uchime */ = { + isa = PBXGroup; + children = ( + A74D3655137DAB8300332B0C /* addtargets2.cpp */, + A74D3656137DAB8300332B0C /* alignchime.cpp */, + A74D3657137DAB8300332B0C /* alignchimel.cpp */, + A74D3658137DAB8300332B0C /* allocs.h */, + A74D3659137DAB8300332B0C /* alnheuristics.h */, + A74D365A137DAB8300332B0C /* alnparams.cpp */, + A74D365B137DAB8300332B0C /* alnparams.h */, + A74D365C137DAB8300332B0C /* alpha.cpp */, + A74D365D137DAB8300332B0C /* alpha.h */, + A74D365E137DAB8300332B0C /* alpha2.cpp */, + A74D365F137DAB8300332B0C /* chainer.h */, + A74D3660137DAB8300332B0C /* chime.h */, + A74D3661137DAB8300332B0C /* diagbox.h */, + A74D3662137DAB8300332B0C /* dp.h */, + A74D3663137DAB8300332B0C /* evalue.h */, + A74D3664137DAB8300332B0C /* fractid.cpp */, + A74D3665137DAB8300332B0C /* getparents.cpp */, + A74D3666137DAB8300332B0C /* globalalign2.cpp */, + A74D3667137DAB8300332B0C /* help.h */, + A74D3668137DAB8300332B0C /* hsp.h */, + A74D3669137DAB8300332B0C /* hspfinder.h */, + A74D366A137DAB8300332B0C /* make3way.cpp */, + A74D366B137DAB8300332B0C /* mx.cpp */, + A74D366C137DAB8300332B0C /* mx.h */, + A74D366D137DAB8300332B0C /* myopts.h */, + A74D366E137DAB8300332B0C /* myutils.cpp */, + A74D366F137DAB8300332B0C /* myutils.h */, + A74D3670137DAB8300332B0C /* orf.h */, + A74D3671137DAB8300332B0C /* out.h */, + A74D3672137DAB8300332B0C /* path.cpp */, + A74D3673137DAB8300332B0C /* path.h */, + A74D3674137DAB8300332B0C /* searchchime.cpp */, + A74D3675137DAB8300332B0C /* seq.h */, + A74D3676137DAB8300332B0C 
/* seqdb.cpp */, + A74D3677137DAB8300332B0C /* seqdb.h */, + A74D3678137DAB8300332B0C /* setnucmx.cpp */, + A74D3679137DAB8300332B0C /* sfasta.cpp */, + A74D367A137DAB8300332B0C /* sfasta.h */, + A74D367B137DAB8300332B0C /* svnmods.h */, + A74D367C137DAB8300332B0C /* svnversion.h */, + A74D367D137DAB8300332B0C /* timers.h */, + A74D367E137DAB8300332B0C /* timing.h */, + A74D367F137DAB8300332B0C /* tracebackbit.cpp */, + A74D3680137DAB8300332B0C /* uc.h */, + A74D3681137DAB8300332B0C /* uchime_main.cpp */, + A74D3682137DAB8300332B0C /* ultra.h */, + A74D3683137DAB8300332B0C /* usort.cpp */, + A74D3684137DAB8300332B0C /* viterbifast.cpp */, + A74D3685137DAB8300332B0C /* windex.h */, + A74D3686137DAB8300332B0C /* writechhit.cpp */, + ); + name = uchime; + sourceTree = ""; + }; A7E9BA3812D3956100DA6239 /* commands */ = { isa = PBXGroup; children = ( @@ -1058,6 +1190,8 @@ A7E9B68212D37EC400DA6239 /* chimerapintailcommand.cpp */, A7E9B68B12D37EC400DA6239 /* chimeraslayercommand.h */, A7E9B68A12D37EC400DA6239 /* chimeraslayercommand.cpp */, + A74D36B6137DAFAA00332B0C /* chimerauchimecommand.h */, + A74D36B7137DAFAA00332B0C /* chimerauchimecommand.cpp */, A7E9B68D12D37EC400DA6239 /* chopseqscommand.h */, A7E9B68C12D37EC400DA6239 /* chopseqscommand.cpp */, A7E9B69112D37EC400DA6239 /* classifyotucommand.h */, @@ -1493,6 +1627,7 @@ A7E9BA4512D3965600DA6239 /* chimera */ = { isa = PBXGroup; children = ( + A74D3644137DA7CE00332B0C /* uchime */, A7E9B65C12D37EC300DA6239 /* bellerophon.cpp */, A7E9B65D12D37EC300DA6239 /* bellerophon.h */, A7E9B67412D37EC400DA6239 /* ccode.cpp */, @@ -1937,6 +2072,29 @@ A7FE7C401330EA1000F7B327 /* getcurrentcommand.cpp in Sources */, A7FE7E6D13311EA400F7B327 /* setcurrentcommand.cpp in Sources */, A778FE6B134CA6CA00C0BA33 /* getcommandinfocommand.cpp in Sources */, + A74D3687137DAB8300332B0C /* addtargets2.cpp in Sources */, + A74D3688137DAB8400332B0C /* alignchime.cpp in Sources */, + A74D3689137DAB8400332B0C /* alignchimel.cpp in Sources */, + 
A74D368A137DAB8400332B0C /* alnparams.cpp in Sources */, + A74D368B137DAB8400332B0C /* alpha.cpp in Sources */, + A74D368C137DAB8400332B0C /* alpha2.cpp in Sources */, + A74D368D137DAB8400332B0C /* fractid.cpp in Sources */, + A74D368E137DAB8400332B0C /* getparents.cpp in Sources */, + A74D368F137DAB8400332B0C /* globalalign2.cpp in Sources */, + A74D3690137DAB8400332B0C /* make3way.cpp in Sources */, + A74D3691137DAB8400332B0C /* mx.cpp in Sources */, + A74D3692137DAB8400332B0C /* myutils.cpp in Sources */, + A74D3693137DAB8400332B0C /* path.cpp in Sources */, + A74D3694137DAB8400332B0C /* searchchime.cpp in Sources */, + A74D3695137DAB8400332B0C /* seqdb.cpp in Sources */, + A74D3696137DAB8400332B0C /* setnucmx.cpp in Sources */, + A74D3697137DAB8400332B0C /* sfasta.cpp in Sources */, + A74D3698137DAB8400332B0C /* tracebackbit.cpp in Sources */, + A74D3699137DAB8400332B0C /* uchime_main.cpp in Sources */, + A74D369A137DAB8400332B0C /* usort.cpp in Sources */, + A74D369B137DAB8400332B0C /* viterbifast.cpp in Sources */, + A74D369C137DAB8400332B0C /* writechhit.cpp in Sources */, + A74D36B8137DAFAA00332B0C /* chimerauchimecommand.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -1977,6 +2135,9 @@ ARCHS = "$(ARCHS_STANDARD_64_BIT)"; DEPLOYMENT_LOCATION = NO; GCC_C_LANGUAGE_STANDARD = gnu99; + GCC_ENABLE_SSE3_EXTENSIONS = NO; + GCC_ENABLE_SSE41_EXTENSIONS = NO; + GCC_ENABLE_SSE42_EXTENSIONS = NO; GCC_OPTIMIZATION_LEVEL = 0; GCC_PREPROCESSOR_DEFINITIONS = ( "MOTHUR_FILES=\"\\\"../release\\\"\"", diff --git a/addtargets2.cpp b/addtargets2.cpp new file mode 100644 index 0000000..4e0dbd1 --- /dev/null +++ b/addtargets2.cpp @@ -0,0 +1,38 @@ +//#if UCHIMES + +#include "myutils.h" +#include "chime.h" +#include "ultra.h" +#include + +const float MAX_WORD_COUNT_DROP = 1; + +void SortDescending(const vector &Values, vector &Order); +bool GlobalAlign(const SeqData &Query, const SeqData &Target, string &Path); +double GetFractIdGivenPath(const byte *A, const 
byte *B, const char *Path); +void USort(const SeqData &Query, const SeqDB &DB, vector &WordCounts, + vector &Order); + +void AddTargets(SeqDB &DB, const SeqData &Query, set &TargetIndexes) + { + const unsigned SeqCount = DB.GetSeqCount(); + if (SeqCount == 0) + return; + + vector WordCounts; + vector Order; + USort(Query, DB, WordCounts, Order); + asserta(SIZE(Order) == SeqCount); + unsigned TopSeqIndex = Order[0]; + float TopWordCount = WordCounts[TopSeqIndex]; + for (unsigned i = 0; i < SeqCount; ++i) + { + unsigned SeqIndex = Order[i]; + float WordCount = WordCounts[SeqIndex]; + if (TopWordCount - WordCount > MAX_WORD_COUNT_DROP) + return; + TargetIndexes.insert(SeqIndex); + } + } + +//#endif diff --git a/alignchime.cpp b/alignchime.cpp new file mode 100644 index 0000000..d7b05a8 --- /dev/null +++ b/alignchime.cpp @@ -0,0 +1,649 @@ +#include "myutils.h" +#include "seq.h" +#include "chime.h" +#include "dp.h" + +#define TRACE 0 +#define TRACE_BS 0 + +void Make3Way(const SeqData &SDQ, const SeqData &SDA, const SeqData &SDB, + const string &PathQA, const string &PathQB, + string &Q3, string &A3, string &B3); + +void AlignChimeLocal3(const string &Q3, const string &A3, const string &B3, + const string &QLabel, const string &ALabel, const string &BLabel, + ChimeHit2 &Hit); + +double GetScore2(double Y, double N, double A) + { + return Y/(opt_xn*(N + opt_dn) + opt_xa*A); + } + +void AlignChimeGlobal3(const string &Q3, const string &A3, const string &B3, + const string &QLabel, const string &ALabel, const string &BLabel, + ChimeHit2 &Hit) + { + Hit.Clear(); + Hit.QLabel = QLabel; + + const byte *Q3Seq = (const byte *) Q3.c_str(); + const byte *A3Seq = (const byte *) A3.c_str(); + const byte *B3Seq = (const byte *) B3.c_str(); + + const unsigned ColCount = SIZE(Q3); + asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount); + +#if TRACE + Log("Q %5u %*.*s\n", ColCount, ColCount, ColCount, Q3Seq); + Log("A %5u %*.*s\n", ColCount, ColCount, ColCount, A3Seq); + Log("B %5u 
%*.*s\n", ColCount, ColCount, ColCount, B3Seq); +#endif + +// Discard terminal gaps + unsigned ColLo = UINT_MAX; + unsigned ColHi = UINT_MAX; + for (unsigned Col = 2; Col + 2 < ColCount; ++Col) + { + char q = Q3Seq[Col]; + char a = A3Seq[Col]; + char b = B3Seq[Col]; + + if (isacgt(q) && isacgt(a) && isacgt(b)) + { + if (ColLo == UINT_MAX) + ColLo = Col; + ColHi = Col; + } + } + + if (ColLo == UINT_MAX) + return; + + unsigned QPos = 0; + unsigned APos = 0; + unsigned BPos = 0; + unsigned DiffCount = 0; + + vector ColToQPos(ColLo, UINT_MAX); + vector AccumCount(ColLo, UINT_MAX); + vector AccumSameA(ColLo, UINT_MAX); + vector AccumSameB(ColLo, UINT_MAX); + vector AccumForA(ColLo, UINT_MAX); + vector AccumForB(ColLo, UINT_MAX); + vector AccumAbstain(ColLo, UINT_MAX); + vector AccumAgainst(ColLo, UINT_MAX); + + unsigned SumSameA = 0; + unsigned SumSameB = 0; + unsigned SumSameAB = 0; + unsigned Sum = 0; + unsigned SumForA = 0; + unsigned SumForB = 0; + unsigned SumAbstain = 0; + unsigned SumAgainst = 0; + for (unsigned Col = ColLo; Col <= ColHi; ++Col) + { + char q = Q3Seq[Col]; + char a = A3Seq[Col]; + char b = B3Seq[Col]; + + if (isacgt(q) && isacgt(a) && isacgt(b)) + { + if (q == a) + ++SumSameA; + if (q == b) + ++SumSameB; + if (a == b) + ++SumSameAB; + if (q == a && q != b) + ++SumForA; + if (q == b && q != a) + ++SumForB; + if (a == b && q != a) + ++SumAgainst; + if (q != a && q != b) + ++SumAbstain; + ++Sum; + } + + ColToQPos.push_back(QPos); + AccumSameA.push_back(SumSameA); + AccumSameB.push_back(SumSameB); + AccumCount.push_back(Sum); + AccumForA.push_back(SumForA); + AccumForB.push_back(SumForB); + AccumAbstain.push_back(SumAbstain); + AccumAgainst.push_back(SumAgainst); + + if (q != '-') + ++QPos; + if (a != '-') + ++APos; + if (b != '-') + ++BPos; + } + + asserta(SIZE(ColToQPos) == ColHi+1); + asserta(SIZE(AccumSameA) == ColHi+1); + asserta(SIZE(AccumSameB) == ColHi+1); + asserta(SIZE(AccumAbstain) == ColHi+1); + asserta(SIZE(AccumAgainst) == ColHi+1); + + 
double IdQA = double(SumSameA)/Sum; + double IdQB = double(SumSameB)/Sum; + double IdAB = double(SumSameAB)/Sum; + double MaxId = max(IdQA, IdQB); + +#if TRACE + Log("IdQA=%.1f%% IdQB=%.1f%% IdAB=%.1f\n", IdQA*100.0, IdQB*100.0, IdAB*100.0); + Log("\n"); + Log(" x AQB IdAL IdBL IdAR IdBR DivAB DivBA YAL YBL YAR YBR AbL AbR ScoreAB ScoreAB XLo Xhi\n"); + Log("----- --- ----- ----- ----- ----- ------ ------ ----- ----- ----- ----- ----- ----- ------- ------- ----- -----\n"); +#endif + unsigned BestXLo = UINT_MAX; + unsigned BestXHi = UINT_MAX; + double BestDiv = 0.0; + double BestIdQM = 0.0; + double BestScore = 0.0; + +// Find range of cols BestXLo..BestXHi that maximizes score + bool FirstA = false; + +// NOTE: Must be < ColHi not <= because use Col+1 below + for (unsigned Col = ColLo; Col < ColHi; ++Col) + { + char q = Q3Seq[Col]; + char a = A3Seq[Col]; + char b = B3Seq[Col]; + + unsigned SameAL = AccumSameA[Col]; + unsigned SameBL = AccumSameB[Col]; + unsigned SameAR = SumSameA - AccumSameA[Col]; + unsigned SameBR = SumSameB - AccumSameB[Col]; + + double IdAB = double(SameAL + SameBR)/Sum; + double IdBA = double(SameBL + SameAR)/Sum; + + unsigned ForAL = AccumForA[Col]; + unsigned ForBL = AccumForB[Col]; + unsigned ForAR = SumForA - AccumForA[Col+1]; + unsigned ForBR = SumForB - AccumForB[Col+1]; + unsigned AbL = AccumAbstain[Col]; + unsigned AbR = SumAbstain - AccumAbstain[Col+1]; + + double ScoreAB = GetScore2(ForAL, ForBL, AbL)*GetScore2(ForBR, ForAR, AbR); + double ScoreBA = GetScore2(ForBL, ForAL, AbL)*GetScore2(ForAR, ForBR, AbR); + + double DivAB = IdAB/MaxId; + double DivBA = IdBA/MaxId; + double MaxDiv = max(DivAB, DivBA); + + //if (MaxDiv > BestDiv) + // { + // BestDiv = MaxDiv; + // BestXLo = Col; + // BestXHi = Col; + // FirstA = (DivAB > DivBA); + // if (FirstA) + // BestIdQM = IdAB; + // else + // BestIdQM = IdBA; + // } + //else if (MaxDiv == BestDiv) + // BestXHi = Col; + + double MaxScore = max(ScoreAB, ScoreBA); + if (MaxScore > BestScore) + { + 
BestScore = MaxScore; + BestXLo = Col; + BestXHi = Col; + FirstA = (ScoreAB > ScoreBA); + if (FirstA) + BestIdQM = IdAB; + else + BestIdQM = IdBA; + if (MaxDiv > BestDiv) + BestDiv = MaxDiv; + } + else if (MaxScore == BestScore) + { + BestXHi = Col; + if (MaxDiv > BestDiv) + BestDiv = MaxDiv; + } + +#if TRACE + { + Log("%5u", Col); + char q = Q3Seq[Col]; + char a = A3Seq[Col]; + char b = B3Seq[Col]; + Log(" %c%c%c", a, q, b); + Log(" %5u", SameAL); + Log(" %5u", SameBL); + Log(" %5u", SameAR); + Log(" %5u", SameBR); + Log(" %5.4f", DivAB); + Log(" %5.4f", DivBA); + Log(" %5u", ForAL); + Log(" %5u", ForBL); + Log(" %5u", ForAR); + Log(" %5u", ForBR); + Log(" %5u", AbL); + Log(" %5u", AbR); + Log(" %7.4f", ScoreAB); + Log(" %7.4f", ScoreBA); + if (BestXLo != UINT_MAX) + Log(" %5u", BestXLo); + if (BestXHi != UINT_MAX) + Log(" %5u", BestXHi); + Log("\n"); + } +#endif + } + + if (BestXLo == UINT_MAX) + { +#if TRACE + Log("\n"); + Log("No crossover found.\n"); +#endif + return; + } +#if TRACE + Log("BestX col %u - %u\n", BestXLo, BestXHi); +#endif + +// Find maximum region of identity within BestXLo..BestXHi + unsigned ColXLo = (BestXLo + BestXHi)/2; + unsigned ColXHi = ColXLo; + unsigned SegLo = UINT_MAX; + unsigned SegHi = UINT_MAX; + for (unsigned Col = BestXLo; Col <= BestXHi; ++Col) + { + char q = Q3Seq[Col]; + char a = A3Seq[Col]; + char b = B3Seq[Col]; + + if (q == a && q == b) + { + if (SegLo == UINT_MAX) + SegLo = Col; + SegHi = Col; + } + else + { + unsigned SegLength = SegHi - SegLo + 1; + unsigned BestSegLength = ColXHi - ColXLo + 1; + if (SegLength > BestSegLength) + { + ColXLo = SegLo; + ColXHi = SegHi; + } + SegLo = UINT_MAX; + SegHi = UINT_MAX; + } + } + unsigned SegLength = SegHi - SegLo + 1; + unsigned BestSegLength = ColXHi - ColXLo + 1; + if (SegLength > BestSegLength) + { + ColXLo = SegLo; + ColXHi = SegHi; + } + + QPos = 0; + for (unsigned x = 0; x < ColCount; ++x) + { + if (x == ColXLo) + Hit.QXLo = QPos; + else if (x == ColXHi) + { + Hit.QXHi = 
QPos; + break; + } + char q = Q3Seq[x]; + if (q != '-') + ++QPos; + } + + Hit.ColXLo = ColXLo; + Hit.ColXHi = ColXHi; + + //if (FirstA) + // { + // Hit.LY = AccumForA[ColXLo]; + // Hit.LN = AccumForB[ColXLo]; + + // Hit.RY = SumForB - AccumForB[ColXHi]; + // Hit.RN = SumForA - AccumForA[ColXHi]; + // } + //else + // { + // Hit.LY = AccumForB[ColXLo]; + // Hit.LN = AccumForA[ColXLo]; + // Hit.RY = SumForA - AccumForA[ColXHi]; + // Hit.RN = SumForB - AccumForB[ColXHi]; + // } + + //Hit.LA = AccumAgainst[ColXLo]; + //Hit.LD = AccumAbstain[ColXLo]; + + //Hit.RA = SumAgainst - AccumAgainst[ColXHi]; + //Hit.RD = SumAbstain - AccumAbstain[ColXHi]; + + Hit.PctIdAB = IdAB*100.0; + Hit.PctIdQM = BestIdQM*100.0; + + Hit.Div = (BestDiv - 1.0)*100.0; + + //Hit.QSD = QSD; + Hit.Q3 = Q3; + Hit.QLabel = QLabel; + if (FirstA) + { + //Hit.ASD = ASD; + //Hit.BSD = BSD; + //Hit.PathQA = PathQA; + //Hit.PathQB = PathQB; + Hit.A3 = A3; + Hit.B3 = B3; + Hit.ALabel = ALabel; + Hit.BLabel = BLabel; + Hit.PctIdQA = IdQA*100.0; + Hit.PctIdQB = IdQB*100.0; + } + else + { + Hit.A3 = B3; + Hit.B3 = A3; + Hit.ALabel = BLabel; + Hit.BLabel = ALabel; + Hit.PctIdQA = IdQB*100.0; + Hit.PctIdQB = IdQA*100.0; + } + +// CS SNPs + Hit.CS_LY = 0; + Hit.CS_LN = 0; + Hit.CS_RY = 0; + Hit.CS_RN = 0; + Hit.CS_LA = 0; + Hit.CS_RA = 0; + + //vector Cons; + //for (unsigned Col = 0; Col < ColCount; ++Col) + // { + // char q = Q3Seq[Col]; + // char a = A3Seq[Col]; + // char b = B3Seq[Col]; + // if (q == a && q == b && a == b) + // { + // Cons.push_back(1.0f); + // continue; + // } + + // bool gapq = isgap(q); + // bool gapa = isgap(a); + // bool gapb = isgap(b); + + // if (!gapq && !gapa && !gapb) + // { + // if (q == a || q == b || a == b) + // Cons.push_back(0.75); + // else + // Cons.push_back(0.5); + // } + // else + // { + // if (!gapa && (a == b || a == q)) + // Cons.push_back(0.5f); + // else if (!gapb && b == q) + // Cons.push_back(0.5f); + // else + // Cons.push_back(0.0f); + // } + // } + + //float fLY 
= 0.0f; + //float fLN = 0.0f; + //float fLA = 0.0f; + //float fRY = 0.0f; + //float fRN = 0.0f; + //float fRA = 0.0f; + for (unsigned Col = ColLo; Col <= ColHi; ++Col) + { + char q = Q3Seq[Col]; + char a = A3Seq[Col]; + char b = B3Seq[Col]; + if (q == a && q == b && a == b) + continue; + + unsigned ngaps = 0; + if (isgap(q)) + ++ngaps; + if (isgap(a)) + ++ngaps; + if (isgap(b)) + ++ngaps; + + if (opt_skipgaps) + { + if (ngaps == 3) + continue; + } + else + { + if (ngaps == 2) + continue; + } + + if (!FirstA) + swap(a, b); + + //float AvgCons = (Cons[Col-2] + Cons[Col-1] + Cons[Col+1] + Cons[Col+2])/4; + //if (Col < ColXLo) + // { + // if (q == a && q != b) + // fLY += AvgCons; + // else if (q == b && q != a) + // fLN += AvgCons; + // else + // fLA += AvgCons; + // } + //else if (Col > ColXHi) + // { + // if (q == b && q != a) + // fRY += AvgCons; + // else if (q == a && q != b) + // fRN += AvgCons; + // else + // fRA += AvgCons; + // } + + if (opt_skipgaps2) + { + if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1]))) + continue; + if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1]))) + continue; + } + + //if (Col > 0 && isgap(Q3Seq[Col-1])) + //continue; + //if (Col + 1 < ColCount && isgap(Q3Seq[Col+1])) + // continue; + + if (Col < ColXLo) + { + if (q == a && q != b) + ++Hit.CS_LY; + else if (q == b && q != a) + ++Hit.CS_LN; + else + ++Hit.CS_LA; + } + else if (Col > ColXHi) + { + if (q == b && q != a) + ++Hit.CS_RY; + else if (q == a && q != b) + ++Hit.CS_RN; + else + ++Hit.CS_RA; + } + } + + double ScoreL = GetScore2(Hit.CS_LY, Hit.CS_LN, Hit.CS_LA); + double ScoreR = GetScore2(Hit.CS_RY, Hit.CS_RN, Hit.CS_RA); + Hit.Score = ScoreL*ScoreR; + + extern bool g_UchimeDeNovo; + + //if (0)//g_UchimeDeNovo) + // { + // double AbQ = GetAbFromLabel(QLabel.c_str()); + // double AbA = GetAbFromLabel(ALabel.c_str()); + // double AbB = GetAbFromLabel(BLabel.c_str()); + // if (AbQ > 0.0 && AbA > 0.0 && AbB 
> 0.0)
+	//		{
+	//		double MinAb = min(AbA, AbB);
+	//		double Ratio = MinAb/AbQ;
+	//		double t = Ratio - opt_abx;
+	//	//	double Factor = 2.0/(1.0 + exp(-t));
+	//		double Factor = min(Ratio, opt_abx)/opt_abx;
+	//		if (opt_verbose)
+	//			Log("Score %.4f Ab factor %.4f >%s\n", Hit.Score, Factor, QLabel.c_str());
+	//		Hit.Score *= Factor;
+	//		}
+	//	}
+
+	extern FILE *g_fUChimeAlns;
+	if (g_fUChimeAlns != 0 && Hit.Div > 0.0)
+		{
+		void WriteChimeHitX(FILE *f, const ChimeHit2 &Hit);
+		WriteChimeHitX(g_fUChimeAlns, Hit);
+		}
+	}
+
+// Score a three-way alignment (query Q3, parents A3 and B3, one character
+// per column) into Hit. Dispatches to the local variant when opt_ucl is
+// set, otherwise to the global variant.
+void AlignChime3(const string &Q3, const string &A3, const string &B3,
+  const string &QLabel, const string &ALabel, const string &BLabel,
+  ChimeHit2 &Hit)
+	{
+	if (opt_ucl)
+		AlignChimeLocal3(Q3, A3, B3, QLabel, ALabel, BLabel, Hit);
+	else
+		AlignChimeGlobal3(Q3, A3, B3, QLabel, ALabel, BLabel, Hit);
+	}
+
+// Replace s with the L characters of Seq, skipping every character for
+// which isgap() is true.
+static void StripGaps(const byte *Seq, unsigned L, string &s)
+	{
+	s.clear();
+	for (unsigned i = 0; i < L; ++i)
+		{
+		char c = Seq[i];
+		if (!isgap(c))
+			s.push_back(c);
+		}
+	}
+
+// Copy SDIn into SDOut, then point SDOut.Seq at a freshly allocated buffer
+// holding the upper-cased, gap-free residues of SDIn and set SDOut.L to the
+// number of residues kept. The buffer comes from myalloc(); the caller must
+// release SDOut.Seq with myfree().
+static void StripGapsAlloc(const SeqData &SDIn, SeqData &SDOut)
+	{
+	SDOut = SDIn;
+	byte *s = myalloc(byte, SDIn.L);
+	unsigned k = 0;
+	for (unsigned i = 0; i < SDIn.L; ++i)
+		{
+		char c = SDIn.Seq[i];
+		if (!isgap(c))
+			// <cctype> functions require a value representable as unsigned char.
+			s[k++] = toupper((unsigned char) c);
+		}
+	SDOut.Seq = s;
+	SDOut.L = k;
+	}
+
+// Build the three-way alignment of Q, A and B from the two pairwise paths
+// (Q vs A, Q vs B) with Make3Way() and score it with AlignChime3().
+void AlignChime(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD,
+  const string &PathQA, const string &PathQB, ChimeHit2 &Hit)
+	{
+	//if (opt_ucl)
+	//	{
+	//	AlignChimeLocal(QSD, ASD, BSD, PathQA, PathQB, Hit);
+	//	return;
+	//	}
+
+	string Q3;
+	string A3;
+	string B3;
+	Make3Way(QSD, ASD, BSD, PathQA, PathQB, Q3, A3, B3);
+
+	AlignChime3(Q3, A3, B3, QSD.Label, ASD.Label, BSD.Label, Hit);
+	}
+
+// Discard the supplied three-way alignment: strip the gaps from all three
+// sequences, globally re-align Q against A and Q against B, and score the
+// result with AlignChime(). If either alignment is not found, Hit is
+// cleared and only Hit.QLabel is set.
+void AlignChime3SDRealign(const SeqData &QSD3, const SeqData &ASD3, const SeqData &BSD3,
+  ChimeHit2 &Hit)
+	{
+	SeqData QSD;
+	SeqData ASD;
+	SeqData BSD;
+	StripGapsAlloc(QSD3, QSD);
+	StripGapsAlloc(ASD3, ASD);
+	StripGapsAlloc(BSD3, BSD);
+
+	string PathQA;
+	string PathQB;
+	// Both alignments are always attempted, as before.
+	bool FoundQA = GlobalAlign(QSD, ASD, PathQA);
+	bool FoundQB = GlobalAlign(QSD, BSD, PathQB);
+	if (FoundQA && FoundQB)
+		AlignChime(QSD, ASD, BSD, PathQA, PathQB, Hit);
+	else
+		{
+		Hit.Clear();
+		Hit.QLabel = QSD3.Label;
+		}
+
+	// Release the gap-stripped copies on every path; the failure branch used
+	// to return before reaching these calls and leaked all three buffers.
+	myfree((void *) QSD.Seq);
+	myfree((void *) ASD.Seq);
+	myfree((void *) BSD.Seq);
+	}
+
+// Score three sequences that are already aligned to each other (all three
+// must have the same length, one character per column). With opt_realign
+// set the alignment is recomputed by AlignChime3SDRealign(); otherwise the
+// columns are upper-cased, columns where all three are gaps are dropped,
+// and the result is passed to AlignChime3().
+void AlignChime3SD(const SeqData &QSD3, const SeqData &ASD3, const SeqData &BSD3,
+  ChimeHit2 &Hit)
+	{
+	if (opt_realign)
+		{
+		AlignChime3SDRealign(QSD3, ASD3, BSD3, Hit);
+		return;
+		}
+
+	string Q3;
+	string A3;
+	string B3;
+
+	const unsigned ColCount = QSD3.L;
+	asserta(ASD3.L == ColCount && BSD3.L == ColCount);
+
+	Q3.reserve(ColCount);
+	A3.reserve(ColCount);
+	B3.reserve(ColCount);
+
+	const byte *QS = QSD3.Seq;
+	const byte *AS = ASD3.Seq;
+	const byte *BS = BSD3.Seq;
+	for (unsigned Col = 0; Col < ColCount; ++Col)
+		{
+		byte q = toupper(QS[Col]);
+		byte a = toupper(AS[Col]);
+		byte b = toupper(BS[Col]);
+
+		if (isgap(q) && isgap(a) && isgap(b))
+			continue;
+
+		Q3.push_back(q);
+		A3.push_back(a);
+		B3.push_back(b);
+		}
+
+	AlignChime3(Q3, A3, B3, QSD3.Label, ASD3.Label, BSD3.Label, Hit);
+	}
diff --git a/alignchimel.cpp b/alignchimel.cpp
new file mode 100644
index 0000000..ae152af
--- /dev/null
+++ b/alignchimel.cpp
@@ -0,0 +1,417 @@
+#include "myutils.h"
+#include "seq.h"
+#include "chime.h"
+
+#define TRACE 0
+
+/***
+Let:
+	S[i] = Score of col i: 0=no SNP, +1 = Y, -3 = N or A.
+
+	V[k] = Best segment score from j, j+1 ..
k for all possible j
+		max(j) Sum i=j..k S[i]
+
+Recursion relation:
+	V[k] = S[k] + max (V[k-1], 0)
+***/
+
+void AlignChimeGlobal3(const string &Q3, const string &A3, const string &B3,
+  const string &QLabel, const string &ALabel, const string &BLabel,
+  ChimeHit2 &Hit);
+
+void Make3Way(const SeqData &SDQ, const SeqData &SDA, const SeqData &SDB,
+  const string &PathQA, const string &PathQB,
+  string &Q3, string &A3, string &B3);
+
+double GetScore2(double Y, double N, double A);
+
+// Local variant of the three-way chimera scoring.
+//
+// Q3, A3 and B3 are the query and the two candidate parents, one character
+// per alignment column; all three must have the same length. Each column
+// gets a score for "query follows A" and one for "query follows B"
+// (+1 when the query matches that parent only, -opt_xn for any other
+// difference, 0 when all three agree or a gap is in or next to the column).
+// Best-segment scores ending at each column (LVA/LVB) and starting at each
+// column (RVA/RVB) are built with the recursion described above, the
+// crossover column ColX maximising left+right is chosen, the segment
+// [ColLo, ColHi] is traced back, and that sub-alignment is handed to
+// AlignChimeGlobal3().
+//
+// Hit is cleared on entry and stays cleared when the function returns
+// early (no crossover found, or the segment covers less than
+// opt_queryfract of the query).
+void AlignChimeLocal3(const string &Q3, const string &A3, const string &B3,
+  const string &QLabel, const string &ALabel, const string &BLabel,
+  ChimeHit2 &Hit)
+	{
+	Hit.Clear();
+
+	const byte *Q3Seq = (const byte *) Q3.c_str();
+	const byte *A3Seq = (const byte *) A3.c_str();
+	const byte *B3Seq = (const byte *) B3.c_str();
+
+	const unsigned ColCount = SIZE(Q3);
+	asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount);
+
+	// A crossover needs at least one column on either side of ColX. With
+	// fewer than three columns the searches below never select a column,
+	// and with zero columns LVA[0] and ColCount-1 would be out of range, so
+	// stop here with Hit cleared (the same result the short inputs produced
+	// before, now without touching empty vectors).
+	if (ColCount < 3)
+		return;
+
+	// Element type restored: the values stored and the max(..., 0.0f)
+	// calls below require float.
+	vector<float> ColScoresA(ColCount, 0.0f);
+	vector<float> ColScoresB(ColCount, 0.0f);
+
+	float ScoreN = -(float) opt_xn;
+	unsigned QL = 0;
+	for (unsigned Col = 0; Col < ColCount; ++Col)
+		{
+		char q = Q3Seq[Col];
+		char a = A3Seq[Col];
+		char b = B3Seq[Col];
+
+		if (!isgap(q))
+			++QL;
+
+		if (q == a && q == b && a == b)
+			continue;
+
+		if (isgap(q) || isgap(a) || isgap(b))
+			continue;
+
+		if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])))
+			continue;
+
+		if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])))
+			continue;
+
+		if (q == a && q != b)
+			ColScoresA[Col] = 1;
+		else
+			ColScoresA[Col] = ScoreN;
+
+		if (q == b && q != a)
+			ColScoresB[Col] = 1;
+		else
+			ColScoresB[Col] = ScoreN;
+		}
+
+	// LVx[k]: best score of a segment ending at column k.
+	vector<float> LVA(ColCount, 0.0f);
+	vector<float> LVB(ColCount, 0.0f);
+
+	LVA[0] = ColScoresA[0];
+	LVB[0] = ColScoresB[0];
+	for (unsigned Col = 1; Col < ColCount; ++Col)
+		{
+		LVA[Col] = max(LVA[Col-1], 0.0f) + ColScoresA[Col];
+		LVB[Col] = max(LVB[Col-1], 0.0f) + ColScoresB[Col];
+		}
+
+	// RVx[k]: best score of a segment starting at column k.
+	vector<float> RVA(ColCount, 0.0f);
+	vector<float> RVB(ColCount, 0.0f);
+
+	RVA[ColCount-1] = ColScoresA[ColCount-1];
+	RVB[ColCount-1] = ColScoresB[ColCount-1];
+	for (int Col = ColCount-2; Col >= 0; --Col)
+		{
+		RVA[Col] = max(RVA[Col+1], 0.0f) + ColScoresA[Col];
+		RVB[Col] = max(RVB[Col+1], 0.0f) + ColScoresB[Col];
+		}
+
+	// Pick the crossover column: A on the left / B on the right first,
+	// then the mirrored arrangement; only a strictly better sum replaces
+	// the current choice.
+	bool FirstA = true;
+	float MaxSum = 0.0;
+	unsigned ColX = UINT_MAX;
+	for (unsigned Col = 1; Col < ColCount-1; ++Col)
+		{
+		float Sum = LVA[Col] + RVB[Col+1];
+		if (Sum > MaxSum)
+			{
+			FirstA = true;
+			MaxSum = Sum;
+			ColX = Col;
+			}
+		}
+
+	for (unsigned Col = 1; Col < ColCount-1; ++Col)
+		{
+		float Sum = LVB[Col] + RVA[Col+1];
+		if (Sum > MaxSum)
+			{
+			FirstA = false;
+			MaxSum = Sum;
+			ColX = Col;
+			}
+		}
+	if (ColX == UINT_MAX)
+		return;
+
+	// Trace back the segment ends by re-accumulating column scores until
+	// the stored best-segment score is reached.
+	// NOTE(review): the search above pairs LVx[Col] with RVy[Col+1], but the
+	// right-hand thresholds below are RVB[ColX] / RVA[ColX] while the sums
+	// start at ColX+1 -- confirm whether [ColX+1] was intended.
+	unsigned ColLo = UINT_MAX;
+	unsigned ColHi = UINT_MAX;
+	if (FirstA)
+		{
+		float Sum = 0.0f;
+		for (int Col = ColX; Col >= 0; --Col)
+			{
+			Sum += ColScoresA[Col];
+			if (Sum >= LVA[ColX])
+				{
+				ColLo = Col;
+				break;
+				}
+			}
+		asserta(Sum >= LVA[ColX]);
+		Sum = 0.0f;
+		for (unsigned Col = ColX+1; Col < ColCount; ++Col)
+			{
+			Sum += ColScoresB[Col];
+			if (Sum >= RVB[ColX])
+				{
+				ColHi = Col;
+				break;
+				}
+			}
+		asserta(Sum >= RVB[ColX]);
+		}
+	else
+		{
+		float Sum = 0.0f;
+		for (int Col = ColX; Col >= 0; --Col)
+			{
+			Sum += ColScoresB[Col];
+			if (Sum >= LVB[ColX])
+				{
+				ColLo = Col;
+				break;
+				}
+			}
+		asserta(Sum >= LVB[ColX]);
+		Sum = 0.0f;
+		for (unsigned Col = ColX+1; Col < ColCount; ++Col)
+			{
+			Sum += ColScoresA[Col];
+			if (Sum >= RVA[ColX])
+				{
+				ColHi = Col;
+				break;
+				}
+			}
+		asserta(Sum >= RVA[ColX]);
+		}
+
+	// Extend the crossover to the run of ungapped columns around ColX in
+	// which all three sequences agree.
+	unsigned ColXHi = ColX;
+	for (unsigned Col = ColX + 1; Col < ColCount; ++Col)
+		{
+		char q = Q3Seq[Col];
+		char a = A3Seq[Col];
+		char b = B3Seq[Col];
+
+		if (q == a && q == b && !isgap(q))
+			ColXHi = Col;
+		else
+			break;
+		}
+
+	unsigned ColXLo = ColX;
+	for (int Col = (int) ColX - 1; Col >= 0; --Col)
+		{
+		char q = Q3Seq[Col];
+		char a = A3Seq[Col];
+		char b = B3Seq[Col];
+
+		if (q == a && q == b && !isgap(q))
+			ColXLo = Col;
+		else
+			break;
+		}
+
+	// Pairwise identities over the whole alignment (columns where both
+	// members of the pair are ungapped).
+	unsigned IdQA = 0;
+	unsigned IdQB = 0;
+	unsigned IdAB = 0;
+	unsigned NQA = 0;
+	unsigned NQB = 0;
+	unsigned NAB = 0;
+	for (unsigned Col = 0; Col < ColCount; ++Col)
+		{
+		char q = Q3Seq[Col];
+		char a = A3Seq[Col];
+		char b = B3Seq[Col];
+
+		if (!isgap(q) && !isgap(a))
+			{
+			++NQA;
+			if (q == a)
+				++IdQA;
+			}
+
+		if (!isgap(q) && !isgap(b))
+			{
+			++NQB;
+			if (q == b)
+				++IdQB;
+			}
+
+		if (!isgap(a) && !isgap(b))
+			{
+			++NAB;
+			if (a == b)
+				++IdAB;
+			}
+		}
+
+	Hit.PctIdQA = Pct(IdQA, NQA);
+	Hit.PctIdQB = Pct(IdQB, NQB);
+	Hit.PctIdAB = Pct(IdAB, NAB);
+
+	// Identity counts left and right of the crossover; they feed the
+	// IdDiff values below and the TRACE output.
+	unsigned LIdQA = 0;
+	unsigned LIdQB = 0;
+	for (unsigned Col = ColLo; Col < ColXLo; ++Col)
+		{
+		char q = Q3Seq[Col];
+		char a = A3Seq[Col];
+		char b = B3Seq[Col];
+
+		if (!isgap(q) && !isgap(a))
+			{
+			if (q == a)
+				++LIdQA;
+			}
+
+		if (!isgap(q) && !isgap(b))
+			{
+			if (q == b)
+				++LIdQB;
+			}
+		}
+
+	unsigned RIdQA = 0;
+	unsigned RIdQB = 0;
+	for (unsigned Col = ColXHi+1; Col <= ColHi; ++Col)
+		{
+		char q = Q3Seq[Col];
+		char a = A3Seq[Col];
+		char b = B3Seq[Col];
+
+		if (!isgap(q) && !isgap(a))
+			{
+			if (q == a)
+				++RIdQA;
+			}
+
+		if (!isgap(q) && !isgap(b))
+			{
+			if (q == b)
+				++RIdQB;
+			}
+		}
+
+	unsigned IdDiffL = max(LIdQA, LIdQB) - min(LIdQA, LIdQB);
+	unsigned IdDiffR = max(RIdQA, RIdQB) - min(RIdQA, RIdQB);
+	unsigned MinIdDiff = min(IdDiffL, IdDiffR);
+	unsigned ColRange = ColHi - ColLo + 1;
+	if (opt_queryfract > 0.0f && float(ColRange)/float(QL) < opt_queryfract)
+		return;
+
+//	double Div = Pct(MinIdDiff, QSD.L);
+
+#if	TRACE
+	{
+	Log(" Col A Q B ScoreA ScoreB LVA LVB RVA RVB\n");
+	Log("----- - - - ------- ------- ------- ------- ------- -------\n");
+	for (unsigned Col = 0; Col < ColCount; ++Col)
+		{
+		if (ColScoresA[Col] == 0.0 && ColScoresB[Col] == 0.0)
+			continue;
+
+		char q = Q3Seq[Col];
+		char a = A3Seq[Col];
+		char b = B3Seq[Col];
+		Log("%5u %c %c %c", Col, a, q, b);
+
+		if (ColScoresA[Col] == 0.0)
+			Log(" %7.7s", "");
+		else
+			Log(" %7.1f", ColScoresA[Col]);
+
+		if (ColScoresB[Col] == 0.0)
+			Log(" %7.7s", "");
+		else
+			Log(" %7.1f", ColScoresB[Col]);
+
+		Log(" %7.1f %7.1f %7.1f %7.1f", LVA[Col], LVB[Col], RVA[Col], RVB[Col]);
+
+		Log("\n");
+		}
+	Log("\n");
+	Log("MaxSum %.1f, ColLo %u, ColXLo %u, ColX %u, ColXHi %u, ColHi %u, AF %c\n",
+	  MaxSum, ColLo, ColXLo, ColX, ColXHi, ColHi, tof(FirstA));
+	Log(" LIdQA %u, LIdQB %u, RIdQA %u, RIdQB %u\n", LIdQA, LIdQB, RIdQA, RIdQB);
+	}
+#endif
+
+	// Cut out columns [ColLo, ColHi] and score that sub-alignment globally.
+	string Q3L;
+	string A3L;
+	string B3L;
+	for (unsigned Col = ColLo; Col <= ColHi; ++Col)
+		{
+		char q = Q3[Col];
+		char a = A3[Col];
+		char b = B3[Col];
+
+		Q3L += q;
+		A3L += a;
+		B3L += b;
+		}
+
+	AlignChimeGlobal3(Q3L, A3L, B3L, QLabel, ALabel, BLabel, Hit);
+
+#if	0
+// CS SNPs
+	Hit.CS_LY = 0;
+	Hit.CS_LN = 0;
+	Hit.CS_RY = 0;
+	Hit.CS_RN = 0;
+	Hit.CS_LA = 0;
+	Hit.CS_RA = 0;
+	for (unsigned Col = ColLo; Col <= ColHi; ++Col)
+		{
+		char q = Q3Seq[Col];
+		char a = A3Seq[Col];
+		char b = B3Seq[Col];
+		if (q == a && q == b && a == b)
+			continue;
+		if (isgap(q) || isgap(a) || isgap(b))
+			continue;
+		if (Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])))
+			continue;
+		if (Col + 1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])))
+			continue;
+
+		if (!FirstA)
+			swap(a, b);
+
+		if (Col < ColXLo)
+			{
+			if (q == a && q != b)
+				++Hit.CS_LY;
+			else if (q == b && q != a)
+				++Hit.CS_LN;
+			else
+				++Hit.CS_LA;
+			}
+		else if (Col > ColXHi)
+			{
+			if (q == b && q != a)
+				++Hit.CS_RY;
+			else if (q == a && q != b)
+				++Hit.CS_RN;
+			else
+				++Hit.CS_RA;
+			}
+		}
+
+	double ScoreL = GetScore2(Hit.CS_LY, Hit.CS_LN, Hit.CS_LA);
+	double ScoreR = GetScore2(Hit.CS_RY, Hit.CS_RN, Hit.CS_RA);
+	Hit.Score = ScoreL*ScoreR;
+
+	//Hit.QSD = QSD;
+	//if (FirstA)
+	//	{
+	//	Hit.ASD = ASD;
+	//	Hit.BSD = BSD;
+	//	Hit.PathQA = PathQA;
+	//	Hit.PathQB = PathQB;
+	//	}
+	//else
+	//	{
+	//	Hit.ASD = BSD;
+	//	Hit.BSD = ASD;
+	//	}
+
+	//Hit.ColLo = ColLo;
+	//Hit.ColXLo = ColXLo;
+	//Hit.ColXHi = ColXHi;
+	//Hit.ColHi = ColHi;
+	//Hit.Div = Div;
+
+//	Hit.LogMe();
+#endif
+	}
diff --git a/allocs.h b/allocs.h
new file mode 100644
index 0000000..157d03e
--- /dev/null
+++ b/allocs.h
@@ -0,0 +1,24 @@
+A(Alpha)
+A(Mx)
+A(ChainBrute)
+A(Chainer)
+A(Test)
+A(CompressPath)
+A(HSPFinder)
+A(Main)
+A(Clumps)
+A(Path)
+A(SeqDB)
+A(SFasta)
+A(SWUngapped)
+A(AllocBit)
+A(Ultra)
+A(UPGMA)
+A(Windex)
+A(XDropBwd)
+A(Xlat)
+A(MPath)
+A(ScoreCache)
+A(TargetHits)
+A(Out)
+A(Hashdex)
diff --git a/alnheuristics.h b/alnheuristics.h
new file mode 100644
index 0000000..9a8d283
--- /dev/null
+++ b/alnheuristics.h
@@ -0,0 +1,29 @@
+#ifndef alnheuristics_h
+#define alnheuristics_h
+
+struct AlnParams;
+
+struct AlnHeuristics
+	{
+	unsigned BandRadius;
+	unsigned HSPFinderWordLength;
+	float SeedT;
+
+	float XDropG; // GappedBlast default
+	float XDropU; // UngappedBlast default
+	float XDropUG; // UngappedBlast called by GappedBlast
+
+	unsigned MinGlobalHSPLength;
+
+	AlnHeuristics();
+	void InitFromCmdLine(const AlnParams &AP);
+	void InitGlobalFull();
+
+	bool IsGlobalFull() const
+		{
+		return MinGlobalHSPLength == 0 && BandRadius == 0;
+		}
+
+	};
+
+#endif // alnheuristics_h
diff --git a/alnparams.cpp b/alnparams.cpp
new file mode 100644
index 0000000..d1b9036
--- /dev/null
+++ b/alnparams.cpp
@@ -0,0 +1,414 @@
+#include "myutils.h"
+#include <float.h> // for FLT_MAX
+#include "mx.h"
+#include "alnparams.h"
+#include "hsp.h"
+
+#define TEST 0
+
+void SetBLOSUM62();
+void SetNucSubstMx(double Match, double Mismatch);
+void ReadSubstMx(const string &FileName, Mx &Mxf);
+
+extern Mx g_SubstMxf;
+extern float **g_SubstMx;
+
+void AlnParams::Clear()
+	{
+	SubstMxName = 0;
+	LocalOpen = OBVIOUSLY_WRONG_PENALTY;
+	LocalExt = OBVIOUSLY_WRONG_PENALTY;
+	OpenA = OBVIOUSLY_WRONG_PENALTY;
+	OpenB = OBVIOUSLY_WRONG_PENALTY;
+	ExtA = OBVIOUSLY_WRONG_PENALTY;
+	ExtB = OBVIOUSLY_WRONG_PENALTY;
+	LOpenA = OBVIOUSLY_WRONG_PENALTY;
+	LOpenB = OBVIOUSLY_WRONG_PENALTY;
+	ROpenA = OBVIOUSLY_WRONG_PENALTY;
+	ROpenB =
OBVIOUSLY_WRONG_PENALTY;
+	LExtA = OBVIOUSLY_WRONG_PENALTY;
+	LExtB = OBVIOUSLY_WRONG_PENALTY;
+	RExtA = OBVIOUSLY_WRONG_PENALTY;
+	RExtB = OBVIOUSLY_WRONG_PENALTY;
+	Nucleo = false;
+	NucleoSet = false;
+	}
+
+// True when one open score and one extend score are used for every gap
+// position (internal, left-terminal and right-terminal, in both sequences).
+bool AlnParams::Is2() const
+	{
+	float g = OpenA;
+	float e = ExtA;
+	if (OpenB != g || LOpenA != g || LOpenB != g || ROpenA != g || ROpenB != g)
+		return false;
+	if (ExtB != e || LExtA != e || LExtB != e || RExtA != e || RExtB != e)
+		return false;
+	return true;
+	}
+
+// True when the scores reduce to four values: internal open/extend shared
+// by A and B, and terminal open/extend shared by all four ends.
+// tg and te are read from LOpenA / LExtA, so the comparisons of those two
+// members against tg / te are trivially false; they are kept as written.
+bool AlnParams::Is4() const
+	{
+	float g = OpenA;
+	float tg = LOpenA;
+	float e = ExtA;
+	float te = LExtA;
+	if (OpenB != g || LOpenA != tg || LOpenB != tg || ROpenA != tg || ROpenB != tg)
+		return false;
+	if (ExtB != e || LExtA != te || LExtB != te || RExtA != te || RExtB != te)
+		return false;
+	return true;
+	}
+
+// "2", "4" or "12": the number of distinct gap scores in use, as
+// classified by Is2() and Is4().
+const char *AlnParams::GetType() const
+	{
+	if (Is2())
+		return "2";
+	else if (Is4())
+		return "4";
+	return "12";
+	}
+
+// Set the substitution matrix and use Open / Ext for every global gap score.
+void AlnParams::Init2(const float * const *Mx, float Open, float Ext)
+	{
+	SubstMx = Mx;
+	OpenA = OpenB = LOpenA = LOpenB = ROpenA = ROpenB = Open;
+	ExtA = ExtB = LExtA = LExtB = RExtA = RExtB = Ext;
+	}
+
+// Set the gap scores used for local alignment.
+void AlnParams::SetLocal(float Open, float Ext)
+	{
+	LocalOpen = Open;
+	LocalExt = Ext;
+	}
+
+// Set the substitution matrix, Open / Ext for internal gaps and
+// TermOpen / TermExt for all terminal gaps.
+void AlnParams::Init4(const float * const *Mx, float Open, float Ext,
+  float TermOpen, float TermExt)
+	{
+	SubstMx = Mx;
+	OpenA = OpenB = Open;
+	LOpenA = LOpenB = ROpenA = ROpenB = TermOpen;
+	ExtA = ExtB = Ext;
+	LExtA = LExtB = RExtA = RExtB = TermExt;
+	}
+
+// Derive parameters from AP for aligning around HSP. Internal scores and
+// the matrix are copied from AP. For each of the four ends, AP's terminal
+// scores are used when the corresponding HSP test (LeftA, LeftB,
+// RightA(LA), RightB(LB)) is true, and AP's internal scores otherwise.
+void AlnParams::Init(const AlnParams &AP, const HSPData &HSP,
+  unsigned LA, unsigned LB)
+	{
+	SubstMx = AP.SubstMx;
+	OpenA = AP.OpenA;
+	OpenB = AP.OpenB;
+	ExtA = AP.ExtA;
+	ExtB = AP.ExtB;
+
+	if (HSP.LeftA())
+		{
+		LOpenA = AP.LOpenA;
+		LExtA = AP.LExtA;
+		}
+	else
+		{
+		LOpenA = AP.OpenA;
+		LExtA = AP.ExtA;
+		}
+
+	if (HSP.LeftB())
+		{
+		LOpenB = AP.LOpenB;
+		LExtB = AP.LExtB;
+		}
+	else
+		{
+		LOpenB = AP.OpenB;
+		LExtB = AP.ExtB;
+		}
+
+	if (HSP.RightA(LA))
+		{
+		ROpenA = AP.ROpenA;
+		RExtA = AP.RExtA;
+		}
+	else
+		{
+		ROpenA = AP.OpenA;
+		RExtA = AP.ExtA;
+		}
+
+	if (HSP.RightB(LB))
+		{
+		ROpenB = AP.ROpenB;
+		RExtB = AP.RExtB;
+		}
+	else
+		{
+		ROpenB = AP.OpenB;
+		RExtB = AP.ExtB;
+		}
+	}
+
+// Write the gap scores to the log in the shortest form that describes them
+// (2-, 4- or 12-value, see GetType()).
+void AlnParams::LogMe() const
+	{
+	Log("AlnParams(%s)", GetType());
+	if (Is2())
+		Log(" g=%.1f e=%.1f", -OpenA, -ExtA);
+	else if (Is4())
+		// Arguments follow the labels: g, tg, e, te. The terminal open and
+		// internal extend values were previously swapped.
+		Log(" g=%.1f tg=%.1f e=%.1f te=%.1f", -OpenA, -LOpenA, -ExtA, -LExtA);
+	else
+		Log(
+" gA=%.1f gB=%.1f gAL=%.1f gBL=%.1f gAR=%.1f gBR=%.1f eA=%.1f eB=%.1f eAL=%.1f eBL=%.1f eAR=%.1f eBR=%.1f",
+		  OpenA, OpenB, LOpenA, LOpenB, ROpenA, ROpenB, ExtA, ExtB, LExtA, LExtB, RExtA, RExtB);
+	Log("\n");
+	}
+
+/***
+Open/Ext format string is one or more entries separated by '/'.
+Each entry is a value together with zero or more flag letters,
+for example 17I/0.5E.
+
+Value is (positive) penalty or * (disabled).
+Flag is:
+	Q		Query.
+	T		Target sequence.
+	I		Internal gaps (default internal and terminal).
+	E		End gaps (default internal and terminal).
+	L		Left end.
+	R		Right end.
+***/
+
+// Parse a gap-penalty string (format above) and store the negated value of
+// each entry into the outputs its flags select: QI/QL/QR for the query's
+// internal/left/right gaps, TI/TL/TR for the target's. Outputs not selected
+// by any entry are left untouched; an empty string changes nothing.
+// Malformed input is reported through Die().
+static void ParseGapStr(const string &s,
+  float &QI, float &QL, float &QR,
+  float &TI, float &TL, float &TR)
+	{
+	if (s.empty())
+		return;
+
+	bool Q = false;
+	bool T = false;
+	bool I = false;
+	bool E = false;
+	bool L = false;
+	bool R = false;
+
+	const unsigned K = SIZE(s);
+	unsigned Dec = 0;
+	float Value = FLT_MAX;
+	// i runs to K inclusive so the terminating NUL closes the last entry.
+	for (unsigned i = 0; i <= K; ++i)
+		{
+		char c = s.c_str()[i];
+		if (c == 0 || c == '/')
+			{
+			if (Value == FLT_MAX)
+				Die("Invalid gap penalty string, missing penalty '%s'", s.c_str());
+			if (!Q && !T && !I && !E && !L && !R)
+				{
+				Q = true;
+				T = true;
+				L = true;
+				R = true;
+				I = true;
+				}
+
+			if (!E && !I && !L && !R)
+				{
+				E = false;
+				I = true;
+				L = true;
+				R = true;
+				}
+
+			if (E)
+				{
+				if (L || R)
+					Die("Invalid gap penalty string (E and L or R) '%s'", s.c_str());
+				L = true;
+				R = true;
+				}
+
+			if (!Q && !T)
+				{
+				Q = true;
+				T = true;
+				}
+
+			if (Q && L)
+				QL = -Value;
+			if (Q && R)
+				QR = -Value;
+			if (Q && I)
+				QI = -Value;
+			if (T && L)
+				TL = -Value;
+			if (T && R)
+				TR = -Value;
+			if (T && I)
+				TI = -Value;
+
+			// Reset the per-entry state for the next entry.
+			Value = FLT_MAX;
+			Dec = 0;
+			Q = false;
+			T = false;
+			I = false;
+			E = false;
+			L = false;
+			R = false;
+			}
+		else if (c == '*')
+			{
+			if (Value != FLT_MAX)
+				Die("Invalid gap penalty (* in floating point number) '%s'", s.c_str());
+			Value = -MINUS_INFINITY;
+			}
+		// <cctype> functions require a value representable as unsigned char.
+		else if (isdigit((unsigned char) c))
+			{
+			if (Value == -MINUS_INFINITY)
+				Die("Invalid gap penalty (* in floating point number) '%s'", s.c_str());
+			if (Value == FLT_MAX)
+				Value = 0.0;
+			// Dec is the divisor for the current fractional digit once a
+			// decimal point has been seen, 0 before that.
+			if (Dec > 0)
+				{
+				Dec *= 10;
+				Value += float(c - '0')/Dec;
+				}
+			else
+				Value = Value*10 + (c - '0');
+			}
+		else if (c == '.')
+			{
+			if (Dec > 0)
+				Die("Invalid gap penalty (two decimal points) '%s'", s.c_str());
+			Dec = 1;
+			}
+		else
+			{
+			switch (c)
+				{
+			case 'Q':
+				Q = true;
+				break;
+			case 'T':
+				T = true;
+				break;
+			case 'I':
+				I = true;
+				break;
+			case 'L':
+				L = true;
+				break;
+			case 'R':
+				R = true;
+				break;
+			case 'E':
+				E = true;
+				break;
+			default:
+				Die("Invalid char '%c' in gap penalty string '%s'", c, s.c_str());
+				}
+			}
+		}
+	}
+
+// Apply gap-penalty strings to the global open and extend scores; the
+// query outputs of ParseGapStr map to the A members, the target outputs to
+// the B members.
+void AlnParams::SetPenalties(const string &OpenStr, const string &ExtStr)
+	{
+	ParseGapStr(OpenStr, OpenA, LOpenA, ROpenA, OpenB, LOpenB, ROpenB);
+	ParseGapStr(ExtStr, ExtA, LExtA, RExtA, ExtB, LExtB, RExtB);
+	}
+
+// Select the substitution matrix: the nucleotide matrix built from
+// opt_match / opt_mismatch, or for amino acids BLOSUM62 unless opt_matrix
+// names a file to read.
+void AlnParams::SetMxFromCmdLine(bool IsNucleo)
+	{
+	if (IsNucleo)
+		SetNucSubstMx(opt_match, opt_mismatch);
+	else
+		{
+		if (opt_matrix == "")
+			{
+			SubstMxName = "BLOSUM62";
+			SetBLOSUM62();
+			}
+		else
+			{
+			ReadSubstMx(opt_matrix, g_SubstMxf);
+			g_SubstMx = g_SubstMxf.GetData();
+			g_SubstMxf.LogMe();
+			SubstMxName = opt_matrix.c_str();
+			}
+		}
+	SubstMx = g_SubstMx;
+	asserta(SubstMx != 0);
+	}
+
+// Reset, then initialise matrix, local gap scores and global gap scores
+// from the command-line options, falling back to built-in defaults.
+void AlnParams::InitFromCmdLine(bool IsNucleo)
+	{
+	Clear();
+	Nucleo = IsNucleo;
+	NucleoSet = true;
+
+	SetMxFromCmdLine(IsNucleo);
+
+// Local
+	if (optset_lopen || optset_lext)
+		{
+		if (!optset_lopen || !optset_lext)
+			Die("Must set both --lopen and --lext");
+		if (opt_lopen < 0.0 || opt_lext < 0.0)
+			Die("Invalid --lopen/--lext, gap penalties must be >= 0");
+		SetLocal(float(-opt_lopen), float(-opt_lext));
+		}
+	else
+		{
+		// Same penalties, if-statement to note could differ.
+		if (IsNucleo)
+			SetLocal(-10.0f, -1.0f);
+		else
+			SetLocal(-10.0f, -1.0f);
+		}
+
+// Global
+	if (IsNucleo)
+		Init4(g_SubstMx, -10.0, -1.0, -0.5, -0.5);
+	else
+		Init4(g_SubstMx, -17.0, -1.0, -0.5, -0.5);
+	SetPenalties(opt_gapopen, opt_gapext);
+	}
+
+float AlnParams::GetLocalOpen() const
+	{
+	return LocalOpen;
+	}
+
+float AlnParams::GetLocalExt() const
+	{
+	return LocalExt;
+	}
+
+// Asserts that the nucleotide flag has been set (see InitFromCmdLine)
+// before returning it.
+bool AlnParams::GetIsNucleo() const
+	{
+	asserta(NucleoSet);
+	return Nucleo;
+	}
+
+// Word length for the word index: opt_w when given on the command line,
+// otherwise 8 for nucleotides and 5 for amino acids.
+unsigned GetWindexWordLength(bool Nucleo)
+	{
+	if (optset_w)
+		return opt_w;
+
+	if (Nucleo)
+		return 8;
+	else
+		return 5;
+	}
+
+#if	TEST
+static void Test1(const string &os, const string &es)
+	{
+	AlnParams AP;
+	Log("\n");
+	Log("OpenStr %s\n", os.c_str());
+	Log(" ExtStr %s\n", es.c_str());
+	AP.SetPenalties(os, es);
+	AP.LogMe();
+	}
+
+void TestGapStr()
+	{
+	Test1("17I/0.5E", "1I/0.5E");
+	Test1("17I/0.5L/0.4R", "1Q/2T");
+	Test1("1QL/2QR/3QI/4TL/5TR/6TI", ".1QL/.2QR/.3QI/.4TL/.5TR/.6TI");
+	}
+#endif // TEST
diff --git a/alnparams.h b/alnparams.h
new file mode 100644
index 0000000..4037912
--- /dev/null
+++ b/alnparams.h
@@ -0,0 +1,59 @@
+#ifndef alnparams_h
+#define alnparams_h
+
+struct HSPData;
+
+// Gap penalty scores are negative
+// (i.e., are scores, not penalties).
+struct AlnParams + { + const char *SubstMxName; + const float * const *SubstMx; + + bool Nucleo; + bool NucleoSet; + +// Local gaps + float LocalOpen; + float LocalExt; + +// Global internal gaps + float OpenA; + float OpenB; + + float ExtA; + float ExtB; + +// Global terminal gaps + float LOpenA; + float LOpenB; + float ROpenA; + float ROpenB; + + float LExtA; + float LExtB; + float RExtA; + float RExtB; + + void Clear(); + void SetLocal(float Open, float Ext); + void Init2(const float * const *Mx, float Open, float Ext); + void Init4(const float * const *Mx, float Open, float Ext, float TermOpen, float TermExt); + void Init(const AlnParams &AP, const HSPData &HSP, unsigned LA, unsigned LB); + void InitFromCmdLine(bool Nucleo); + void SetMxFromCmdLine(bool Nucleo); + void SetPenalties(const string &OpenStr, const string &ExtStr); + float GetLocalOpen() const; + float GetLocalExt() const; + bool GetIsNucleo() const; + + bool Is2() const; + bool Is4() const; + const char *GetType() const; + + void LogMe() const; + }; + +const float OBVIOUSLY_WRONG_PENALTY = 1000.0; + +#endif // alnparams_h diff --git a/alpha.cpp b/alpha.cpp new file mode 100644 index 0000000..0efca3b --- /dev/null +++ b/alpha.cpp @@ -0,0 +1,2761 @@ +// Generated by /p/py/alphac.py +#include "alpha.h" + +unsigned g_CharToLetterAminoStop[256] = + { + INVALID_LETTER, // [ 0] 0x00 + INVALID_LETTER, // [ 1] 0x01 + INVALID_LETTER, // [ 2] 0x02 + INVALID_LETTER, // [ 3] 0x03 + INVALID_LETTER, // [ 4] 0x04 + INVALID_LETTER, // [ 5] 0x05 + INVALID_LETTER, // [ 6] 0x06 + INVALID_LETTER, // [ 7] 0x07 + INVALID_LETTER, // [ 8] 0x08 + INVALID_LETTER, // [ 9] 0x09 + INVALID_LETTER, // [ 10] 0x0a + INVALID_LETTER, // [ 11] 0x0b + INVALID_LETTER, // [ 12] 0x0c + INVALID_LETTER, // [ 13] 0x0d + INVALID_LETTER, // [ 14] 0x0e + INVALID_LETTER, // [ 15] 0x0f + INVALID_LETTER, // [ 16] 0x10 + INVALID_LETTER, // [ 17] 0x11 + INVALID_LETTER, // [ 18] 0x12 + INVALID_LETTER, // [ 19] 0x13 + INVALID_LETTER, // [ 20] 0x14 + 
INVALID_LETTER, // [ 21] 0x15 + INVALID_LETTER, // [ 22] 0x16 + INVALID_LETTER, // [ 23] 0x17 + INVALID_LETTER, // [ 24] 0x18 + INVALID_LETTER, // [ 25] 0x19 + INVALID_LETTER, // [ 26] 0x1a + INVALID_LETTER, // [ 27] 0x1b + INVALID_LETTER, // [ 28] 0x1c + INVALID_LETTER, // [ 29] 0x1d + INVALID_LETTER, // [ 30] 0x1e + INVALID_LETTER, // [ 31] 0x1f + INVALID_LETTER, // [ 32] ' ' + INVALID_LETTER, // [ 33] '!' + INVALID_LETTER, // [ 34] '"' + INVALID_LETTER, // [ 35] '#' + INVALID_LETTER, // [ 36] '$' + INVALID_LETTER, // [ 37] '%' + INVALID_LETTER, // [ 38] '&' + INVALID_LETTER, // [ 39] ''' + INVALID_LETTER, // [ 40] '(' + INVALID_LETTER, // [ 41] ')' + 20 , // [ 42] '*' = STP + INVALID_LETTER, // [ 43] '+' + INVALID_LETTER, // [ 44] ',' + INVALID_LETTER, // [ 45] '-' + INVALID_LETTER, // [ 46] '.' + INVALID_LETTER, // [ 47] '/' + INVALID_LETTER, // [ 48] '0' + INVALID_LETTER, // [ 49] '1' + INVALID_LETTER, // [ 50] '2' + INVALID_LETTER, // [ 51] '3' + INVALID_LETTER, // [ 52] '4' + INVALID_LETTER, // [ 53] '5' + INVALID_LETTER, // [ 54] '6' + INVALID_LETTER, // [ 55] '7' + INVALID_LETTER, // [ 56] '8' + INVALID_LETTER, // [ 57] '9' + INVALID_LETTER, // [ 58] ':' + INVALID_LETTER, // [ 59] ';' + INVALID_LETTER, // [ 60] '<' + INVALID_LETTER, // [ 61] '=' + INVALID_LETTER, // [ 62] '>' + INVALID_LETTER, // [ 63] '?' 
+ INVALID_LETTER, // [ 64] '@' + 0 , // [ 65] 'A' = Ala + INVALID_LETTER, // [ 66] 'B' + 1 , // [ 67] 'C' = Cys + 2 , // [ 68] 'D' = Asp + 3 , // [ 69] 'E' = Glu + 4 , // [ 70] 'F' = Phe + 5 , // [ 71] 'G' = Gly + 6 , // [ 72] 'H' = His + 7 , // [ 73] 'I' = Ile + INVALID_LETTER, // [ 74] 'J' + 8 , // [ 75] 'K' = Lys + 9 , // [ 76] 'L' = Leu + 10 , // [ 77] 'M' = Met + 11 , // [ 78] 'N' = Asn + INVALID_LETTER, // [ 79] 'O' + 12 , // [ 80] 'P' = Pro + 13 , // [ 81] 'Q' = Gln + 14 , // [ 82] 'R' = Arg + 15 , // [ 83] 'S' = Ser + 16 , // [ 84] 'T' = Thr + INVALID_LETTER, // [ 85] 'U' + 17 , // [ 86] 'V' = Val + 18 , // [ 87] 'W' = Trp + INVALID_LETTER, // [ 88] 'X' + 19 , // [ 89] 'Y' = Tyr + INVALID_LETTER, // [ 90] 'Z' + INVALID_LETTER, // [ 91] '[' + INVALID_LETTER, // [ 92] '\' + INVALID_LETTER, // [ 93] ']' + INVALID_LETTER, // [ 94] '^' + INVALID_LETTER, // [ 95] '_' + INVALID_LETTER, // [ 96] '`' + 0 , // [ 97] 'a' = Ala + INVALID_LETTER, // [ 98] 'b' + 1 , // [ 99] 'c' = Cys + 2 , // [100] 'd' = Asp + 3 , // [101] 'e' = Glu + 4 , // [102] 'f' = Phe + 5 , // [103] 'g' = Gly + 6 , // [104] 'h' = His + 7 , // [105] 'i' = Ile + INVALID_LETTER, // [106] 'j' + 8 , // [107] 'k' = Lys + 9 , // [108] 'l' = Leu + 10 , // [109] 'm' = Met + 11 , // [110] 'n' = Asn + INVALID_LETTER, // [111] 'o' + 12 , // [112] 'p' = Pro + 13 , // [113] 'q' = Gln + 14 , // [114] 'r' = Arg + 15 , // [115] 's' = Ser + 16 , // [116] 't' = Thr + INVALID_LETTER, // [117] 'u' + 17 , // [118] 'v' = Val + 18 , // [119] 'w' = Trp + INVALID_LETTER, // [120] 'x' + 19 , // [121] 'y' = Tyr + INVALID_LETTER, // [122] 'z' + INVALID_LETTER, // [123] '{' + INVALID_LETTER, // [124] '|' + INVALID_LETTER, // [125] '}' + INVALID_LETTER, // [126] '~' + INVALID_LETTER, // [127] 0x7f + INVALID_LETTER, // [128] 0x80 + INVALID_LETTER, // [129] 0x81 + INVALID_LETTER, // [130] 0x82 + INVALID_LETTER, // [131] 0x83 + INVALID_LETTER, // [132] 0x84 + INVALID_LETTER, // [133] 0x85 + INVALID_LETTER, // [134] 0x86 + 
INVALID_LETTER, // [135] 0x87 + INVALID_LETTER, // [136] 0x88 + INVALID_LETTER, // [137] 0x89 + INVALID_LETTER, // [138] 0x8a + INVALID_LETTER, // [139] 0x8b + INVALID_LETTER, // [140] 0x8c + INVALID_LETTER, // [141] 0x8d + INVALID_LETTER, // [142] 0x8e + INVALID_LETTER, // [143] 0x8f + INVALID_LETTER, // [144] 0x90 + INVALID_LETTER, // [145] 0x91 + INVALID_LETTER, // [146] 0x92 + INVALID_LETTER, // [147] 0x93 + INVALID_LETTER, // [148] 0x94 + INVALID_LETTER, // [149] 0x95 + INVALID_LETTER, // [150] 0x96 + INVALID_LETTER, // [151] 0x97 + INVALID_LETTER, // [152] 0x98 + INVALID_LETTER, // [153] 0x99 + INVALID_LETTER, // [154] 0x9a + INVALID_LETTER, // [155] 0x9b + INVALID_LETTER, // [156] 0x9c + INVALID_LETTER, // [157] 0x9d + INVALID_LETTER, // [158] 0x9e + INVALID_LETTER, // [159] 0x9f + INVALID_LETTER, // [160] 0xa0 + INVALID_LETTER, // [161] 0xa1 + INVALID_LETTER, // [162] 0xa2 + INVALID_LETTER, // [163] 0xa3 + INVALID_LETTER, // [164] 0xa4 + INVALID_LETTER, // [165] 0xa5 + INVALID_LETTER, // [166] 0xa6 + INVALID_LETTER, // [167] 0xa7 + INVALID_LETTER, // [168] 0xa8 + INVALID_LETTER, // [169] 0xa9 + INVALID_LETTER, // [170] 0xaa + INVALID_LETTER, // [171] 0xab + INVALID_LETTER, // [172] 0xac + INVALID_LETTER, // [173] 0xad + INVALID_LETTER, // [174] 0xae + INVALID_LETTER, // [175] 0xaf + INVALID_LETTER, // [176] 0xb0 + INVALID_LETTER, // [177] 0xb1 + INVALID_LETTER, // [178] 0xb2 + INVALID_LETTER, // [179] 0xb3 + INVALID_LETTER, // [180] 0xb4 + INVALID_LETTER, // [181] 0xb5 + INVALID_LETTER, // [182] 0xb6 + INVALID_LETTER, // [183] 0xb7 + INVALID_LETTER, // [184] 0xb8 + INVALID_LETTER, // [185] 0xb9 + INVALID_LETTER, // [186] 0xba + INVALID_LETTER, // [187] 0xbb + INVALID_LETTER, // [188] 0xbc + INVALID_LETTER, // [189] 0xbd + INVALID_LETTER, // [190] 0xbe + INVALID_LETTER, // [191] 0xbf + INVALID_LETTER, // [192] 0xc0 + INVALID_LETTER, // [193] 0xc1 + INVALID_LETTER, // [194] 0xc2 + INVALID_LETTER, // [195] 0xc3 + INVALID_LETTER, // [196] 0xc4 + INVALID_LETTER, 
// [197] 0xc5 + INVALID_LETTER, // [198] 0xc6 + INVALID_LETTER, // [199] 0xc7 + INVALID_LETTER, // [200] 0xc8 + INVALID_LETTER, // [201] 0xc9 + INVALID_LETTER, // [202] 0xca + INVALID_LETTER, // [203] 0xcb + INVALID_LETTER, // [204] 0xcc + INVALID_LETTER, // [205] 0xcd + INVALID_LETTER, // [206] 0xce + INVALID_LETTER, // [207] 0xcf + INVALID_LETTER, // [208] 0xd0 + INVALID_LETTER, // [209] 0xd1 + INVALID_LETTER, // [210] 0xd2 + INVALID_LETTER, // [211] 0xd3 + INVALID_LETTER, // [212] 0xd4 + INVALID_LETTER, // [213] 0xd5 + INVALID_LETTER, // [214] 0xd6 + INVALID_LETTER, // [215] 0xd7 + INVALID_LETTER, // [216] 0xd8 + INVALID_LETTER, // [217] 0xd9 + INVALID_LETTER, // [218] 0xda + INVALID_LETTER, // [219] 0xdb + INVALID_LETTER, // [220] 0xdc + INVALID_LETTER, // [221] 0xdd + INVALID_LETTER, // [222] 0xde + INVALID_LETTER, // [223] 0xdf + INVALID_LETTER, // [224] 0xe0 + INVALID_LETTER, // [225] 0xe1 + INVALID_LETTER, // [226] 0xe2 + INVALID_LETTER, // [227] 0xe3 + INVALID_LETTER, // [228] 0xe4 + INVALID_LETTER, // [229] 0xe5 + INVALID_LETTER, // [230] 0xe6 + INVALID_LETTER, // [231] 0xe7 + INVALID_LETTER, // [232] 0xe8 + INVALID_LETTER, // [233] 0xe9 + INVALID_LETTER, // [234] 0xea + INVALID_LETTER, // [235] 0xeb + INVALID_LETTER, // [236] 0xec + INVALID_LETTER, // [237] 0xed + INVALID_LETTER, // [238] 0xee + INVALID_LETTER, // [239] 0xef + INVALID_LETTER, // [240] 0xf0 + INVALID_LETTER, // [241] 0xf1 + INVALID_LETTER, // [242] 0xf2 + INVALID_LETTER, // [243] 0xf3 + INVALID_LETTER, // [244] 0xf4 + INVALID_LETTER, // [245] 0xf5 + INVALID_LETTER, // [246] 0xf6 + INVALID_LETTER, // [247] 0xf7 + INVALID_LETTER, // [248] 0xf8 + INVALID_LETTER, // [249] 0xf9 + INVALID_LETTER, // [250] 0xfa + INVALID_LETTER, // [251] 0xfb + INVALID_LETTER, // [252] 0xfc + INVALID_LETTER, // [253] 0xfd + INVALID_LETTER, // [254] 0xfe + INVALID_LETTER, // [255] 0xff + }; +unsigned g_CharToLetterAmino[256] = + { + INVALID_LETTER, // [ 0] 0x00 + INVALID_LETTER, // [ 1] 0x01 + INVALID_LETTER, // 
[ 2] 0x02 + INVALID_LETTER, // [ 3] 0x03 + INVALID_LETTER, // [ 4] 0x04 + INVALID_LETTER, // [ 5] 0x05 + INVALID_LETTER, // [ 6] 0x06 + INVALID_LETTER, // [ 7] 0x07 + INVALID_LETTER, // [ 8] 0x08 + INVALID_LETTER, // [ 9] 0x09 + INVALID_LETTER, // [ 10] 0x0a + INVALID_LETTER, // [ 11] 0x0b + INVALID_LETTER, // [ 12] 0x0c + INVALID_LETTER, // [ 13] 0x0d + INVALID_LETTER, // [ 14] 0x0e + INVALID_LETTER, // [ 15] 0x0f + INVALID_LETTER, // [ 16] 0x10 + INVALID_LETTER, // [ 17] 0x11 + INVALID_LETTER, // [ 18] 0x12 + INVALID_LETTER, // [ 19] 0x13 + INVALID_LETTER, // [ 20] 0x14 + INVALID_LETTER, // [ 21] 0x15 + INVALID_LETTER, // [ 22] 0x16 + INVALID_LETTER, // [ 23] 0x17 + INVALID_LETTER, // [ 24] 0x18 + INVALID_LETTER, // [ 25] 0x19 + INVALID_LETTER, // [ 26] 0x1a + INVALID_LETTER, // [ 27] 0x1b + INVALID_LETTER, // [ 28] 0x1c + INVALID_LETTER, // [ 29] 0x1d + INVALID_LETTER, // [ 30] 0x1e + INVALID_LETTER, // [ 31] 0x1f + INVALID_LETTER, // [ 32] ' ' + INVALID_LETTER, // [ 33] '!' + INVALID_LETTER, // [ 34] '"' + INVALID_LETTER, // [ 35] '#' + INVALID_LETTER, // [ 36] '$' + INVALID_LETTER, // [ 37] '%' + INVALID_LETTER, // [ 38] '&' + INVALID_LETTER, // [ 39] ''' + INVALID_LETTER, // [ 40] '(' + INVALID_LETTER, // [ 41] ')' + INVALID_LETTER, // [ 42] '*' + INVALID_LETTER, // [ 43] '+' + INVALID_LETTER, // [ 44] ',' + INVALID_LETTER, // [ 45] '-' + INVALID_LETTER, // [ 46] '.' + INVALID_LETTER, // [ 47] '/' + INVALID_LETTER, // [ 48] '0' + INVALID_LETTER, // [ 49] '1' + INVALID_LETTER, // [ 50] '2' + INVALID_LETTER, // [ 51] '3' + INVALID_LETTER, // [ 52] '4' + INVALID_LETTER, // [ 53] '5' + INVALID_LETTER, // [ 54] '6' + INVALID_LETTER, // [ 55] '7' + INVALID_LETTER, // [ 56] '8' + INVALID_LETTER, // [ 57] '9' + INVALID_LETTER, // [ 58] ':' + INVALID_LETTER, // [ 59] ';' + INVALID_LETTER, // [ 60] '<' + INVALID_LETTER, // [ 61] '=' + INVALID_LETTER, // [ 62] '>' + INVALID_LETTER, // [ 63] '?' 
+ INVALID_LETTER, // [ 64] '@' + 0 , // [ 65] 'A' = Ala + INVALID_LETTER, // [ 66] 'B' + 1 , // [ 67] 'C' = Cys + 2 , // [ 68] 'D' = Asp + 3 , // [ 69] 'E' = Glu + 4 , // [ 70] 'F' = Phe + 5 , // [ 71] 'G' = Gly + 6 , // [ 72] 'H' = His + 7 , // [ 73] 'I' = Ile + INVALID_LETTER, // [ 74] 'J' + 8 , // [ 75] 'K' = Lys + 9 , // [ 76] 'L' = Leu + 10 , // [ 77] 'M' = Met + 11 , // [ 78] 'N' = Asn + INVALID_LETTER, // [ 79] 'O' + 12 , // [ 80] 'P' = Pro + 13 , // [ 81] 'Q' = Gln + 14 , // [ 82] 'R' = Arg + 15 , // [ 83] 'S' = Ser + 16 , // [ 84] 'T' = Thr + INVALID_LETTER, // [ 85] 'U' + 17 , // [ 86] 'V' = Val + 18 , // [ 87] 'W' = Trp + INVALID_LETTER, // [ 88] 'X' + 19 , // [ 89] 'Y' = Tyr + INVALID_LETTER, // [ 90] 'Z' + INVALID_LETTER, // [ 91] '[' + INVALID_LETTER, // [ 92] '\' + INVALID_LETTER, // [ 93] ']' + INVALID_LETTER, // [ 94] '^' + INVALID_LETTER, // [ 95] '_' + INVALID_LETTER, // [ 96] '`' + 0 , // [ 97] 'a' = Ala + INVALID_LETTER, // [ 98] 'b' + 1 , // [ 99] 'c' = Cys + 2 , // [100] 'd' = Asp + 3 , // [101] 'e' = Glu + 4 , // [102] 'f' = Phe + 5 , // [103] 'g' = Gly + 6 , // [104] 'h' = His + 7 , // [105] 'i' = Ile + INVALID_LETTER, // [106] 'j' + 8 , // [107] 'k' = Lys + 9 , // [108] 'l' = Leu + 10 , // [109] 'm' = Met + 11 , // [110] 'n' = Asn + INVALID_LETTER, // [111] 'o' + 12 , // [112] 'p' = Pro + 13 , // [113] 'q' = Gln + 14 , // [114] 'r' = Arg + 15 , // [115] 's' = Ser + 16 , // [116] 't' = Thr + INVALID_LETTER, // [117] 'u' + 17 , // [118] 'v' = Val + 18 , // [119] 'w' = Trp + INVALID_LETTER, // [120] 'x' + 19 , // [121] 'y' = Tyr + INVALID_LETTER, // [122] 'z' + INVALID_LETTER, // [123] '{' + INVALID_LETTER, // [124] '|' + INVALID_LETTER, // [125] '}' + INVALID_LETTER, // [126] '~' + INVALID_LETTER, // [127] 0x7f + INVALID_LETTER, // [128] 0x80 + INVALID_LETTER, // [129] 0x81 + INVALID_LETTER, // [130] 0x82 + INVALID_LETTER, // [131] 0x83 + INVALID_LETTER, // [132] 0x84 + INVALID_LETTER, // [133] 0x85 + INVALID_LETTER, // [134] 0x86 + 
INVALID_LETTER, // [135] 0x87 + INVALID_LETTER, // [136] 0x88 + INVALID_LETTER, // [137] 0x89 + INVALID_LETTER, // [138] 0x8a + INVALID_LETTER, // [139] 0x8b + INVALID_LETTER, // [140] 0x8c + INVALID_LETTER, // [141] 0x8d + INVALID_LETTER, // [142] 0x8e + INVALID_LETTER, // [143] 0x8f + INVALID_LETTER, // [144] 0x90 + INVALID_LETTER, // [145] 0x91 + INVALID_LETTER, // [146] 0x92 + INVALID_LETTER, // [147] 0x93 + INVALID_LETTER, // [148] 0x94 + INVALID_LETTER, // [149] 0x95 + INVALID_LETTER, // [150] 0x96 + INVALID_LETTER, // [151] 0x97 + INVALID_LETTER, // [152] 0x98 + INVALID_LETTER, // [153] 0x99 + INVALID_LETTER, // [154] 0x9a + INVALID_LETTER, // [155] 0x9b + INVALID_LETTER, // [156] 0x9c + INVALID_LETTER, // [157] 0x9d + INVALID_LETTER, // [158] 0x9e + INVALID_LETTER, // [159] 0x9f + INVALID_LETTER, // [160] 0xa0 + INVALID_LETTER, // [161] 0xa1 + INVALID_LETTER, // [162] 0xa2 + INVALID_LETTER, // [163] 0xa3 + INVALID_LETTER, // [164] 0xa4 + INVALID_LETTER, // [165] 0xa5 + INVALID_LETTER, // [166] 0xa6 + INVALID_LETTER, // [167] 0xa7 + INVALID_LETTER, // [168] 0xa8 + INVALID_LETTER, // [169] 0xa9 + INVALID_LETTER, // [170] 0xaa + INVALID_LETTER, // [171] 0xab + INVALID_LETTER, // [172] 0xac + INVALID_LETTER, // [173] 0xad + INVALID_LETTER, // [174] 0xae + INVALID_LETTER, // [175] 0xaf + INVALID_LETTER, // [176] 0xb0 + INVALID_LETTER, // [177] 0xb1 + INVALID_LETTER, // [178] 0xb2 + INVALID_LETTER, // [179] 0xb3 + INVALID_LETTER, // [180] 0xb4 + INVALID_LETTER, // [181] 0xb5 + INVALID_LETTER, // [182] 0xb6 + INVALID_LETTER, // [183] 0xb7 + INVALID_LETTER, // [184] 0xb8 + INVALID_LETTER, // [185] 0xb9 + INVALID_LETTER, // [186] 0xba + INVALID_LETTER, // [187] 0xbb + INVALID_LETTER, // [188] 0xbc + INVALID_LETTER, // [189] 0xbd + INVALID_LETTER, // [190] 0xbe + INVALID_LETTER, // [191] 0xbf + INVALID_LETTER, // [192] 0xc0 + INVALID_LETTER, // [193] 0xc1 + INVALID_LETTER, // [194] 0xc2 + INVALID_LETTER, // [195] 0xc3 + INVALID_LETTER, // [196] 0xc4 + INVALID_LETTER, 
// [197] 0xc5 + INVALID_LETTER, // [198] 0xc6 + INVALID_LETTER, // [199] 0xc7 + INVALID_LETTER, // [200] 0xc8 + INVALID_LETTER, // [201] 0xc9 + INVALID_LETTER, // [202] 0xca + INVALID_LETTER, // [203] 0xcb + INVALID_LETTER, // [204] 0xcc + INVALID_LETTER, // [205] 0xcd + INVALID_LETTER, // [206] 0xce + INVALID_LETTER, // [207] 0xcf + INVALID_LETTER, // [208] 0xd0 + INVALID_LETTER, // [209] 0xd1 + INVALID_LETTER, // [210] 0xd2 + INVALID_LETTER, // [211] 0xd3 + INVALID_LETTER, // [212] 0xd4 + INVALID_LETTER, // [213] 0xd5 + INVALID_LETTER, // [214] 0xd6 + INVALID_LETTER, // [215] 0xd7 + INVALID_LETTER, // [216] 0xd8 + INVALID_LETTER, // [217] 0xd9 + INVALID_LETTER, // [218] 0xda + INVALID_LETTER, // [219] 0xdb + INVALID_LETTER, // [220] 0xdc + INVALID_LETTER, // [221] 0xdd + INVALID_LETTER, // [222] 0xde + INVALID_LETTER, // [223] 0xdf + INVALID_LETTER, // [224] 0xe0 + INVALID_LETTER, // [225] 0xe1 + INVALID_LETTER, // [226] 0xe2 + INVALID_LETTER, // [227] 0xe3 + INVALID_LETTER, // [228] 0xe4 + INVALID_LETTER, // [229] 0xe5 + INVALID_LETTER, // [230] 0xe6 + INVALID_LETTER, // [231] 0xe7 + INVALID_LETTER, // [232] 0xe8 + INVALID_LETTER, // [233] 0xe9 + INVALID_LETTER, // [234] 0xea + INVALID_LETTER, // [235] 0xeb + INVALID_LETTER, // [236] 0xec + INVALID_LETTER, // [237] 0xed + INVALID_LETTER, // [238] 0xee + INVALID_LETTER, // [239] 0xef + INVALID_LETTER, // [240] 0xf0 + INVALID_LETTER, // [241] 0xf1 + INVALID_LETTER, // [242] 0xf2 + INVALID_LETTER, // [243] 0xf3 + INVALID_LETTER, // [244] 0xf4 + INVALID_LETTER, // [245] 0xf5 + INVALID_LETTER, // [246] 0xf6 + INVALID_LETTER, // [247] 0xf7 + INVALID_LETTER, // [248] 0xf8 + INVALID_LETTER, // [249] 0xf9 + INVALID_LETTER, // [250] 0xfa + INVALID_LETTER, // [251] 0xfb + INVALID_LETTER, // [252] 0xfc + INVALID_LETTER, // [253] 0xfd + INVALID_LETTER, // [254] 0xfe + INVALID_LETTER, // [255] 0xff + }; + +unsigned char g_LetterToCharAmino[256] = + { + 'A', // [0] + 'C', // [1] + 'D', // [2] + 'E', // [3] + 'F', // [4] + 'G', 
// [5] + 'H', // [6] + 'I', // [7] + 'K', // [8] + 'L', // [9] + 'M', // [10] + 'N', // [11] + 'P', // [12] + 'Q', // [13] + 'R', // [14] + 'S', // [15] + 'T', // [16] + 'V', // [17] + 'W', // [18] + 'Y', // [19] + '*', // [20] + INVALID_CHAR, // [21] + INVALID_CHAR, // [22] + INVALID_CHAR, // [23] + INVALID_CHAR, // [24] + INVALID_CHAR, // [25] + INVALID_CHAR, // [26] + INVALID_CHAR, // [27] + INVALID_CHAR, // [28] + INVALID_CHAR, // [29] + INVALID_CHAR, // [30] + INVALID_CHAR, // [31] + INVALID_CHAR, // [32] + INVALID_CHAR, // [33] + INVALID_CHAR, // [34] + INVALID_CHAR, // [35] + INVALID_CHAR, // [36] + INVALID_CHAR, // [37] + INVALID_CHAR, // [38] + INVALID_CHAR, // [39] + INVALID_CHAR, // [40] + INVALID_CHAR, // [41] + INVALID_CHAR, // [42] + INVALID_CHAR, // [43] + INVALID_CHAR, // [44] + INVALID_CHAR, // [45] + INVALID_CHAR, // [46] + INVALID_CHAR, // [47] + INVALID_CHAR, // [48] + INVALID_CHAR, // [49] + INVALID_CHAR, // [50] + INVALID_CHAR, // [51] + INVALID_CHAR, // [52] + INVALID_CHAR, // [53] + INVALID_CHAR, // [54] + INVALID_CHAR, // [55] + INVALID_CHAR, // [56] + INVALID_CHAR, // [57] + INVALID_CHAR, // [58] + INVALID_CHAR, // [59] + INVALID_CHAR, // [60] + INVALID_CHAR, // [61] + INVALID_CHAR, // [62] + INVALID_CHAR, // [63] + INVALID_CHAR, // [64] + INVALID_CHAR, // [65] + INVALID_CHAR, // [66] + INVALID_CHAR, // [67] + INVALID_CHAR, // [68] + INVALID_CHAR, // [69] + INVALID_CHAR, // [70] + INVALID_CHAR, // [71] + INVALID_CHAR, // [72] + INVALID_CHAR, // [73] + INVALID_CHAR, // [74] + INVALID_CHAR, // [75] + INVALID_CHAR, // [76] + INVALID_CHAR, // [77] + INVALID_CHAR, // [78] + INVALID_CHAR, // [79] + INVALID_CHAR, // [80] + INVALID_CHAR, // [81] + INVALID_CHAR, // [82] + INVALID_CHAR, // [83] + INVALID_CHAR, // [84] + INVALID_CHAR, // [85] + INVALID_CHAR, // [86] + INVALID_CHAR, // [87] + INVALID_CHAR, // [88] + INVALID_CHAR, // [89] + INVALID_CHAR, // [90] + INVALID_CHAR, // [91] + INVALID_CHAR, // [92] + INVALID_CHAR, // [93] + INVALID_CHAR, // 
[94] + INVALID_CHAR, // [95] + INVALID_CHAR, // [96] + INVALID_CHAR, // [97] + INVALID_CHAR, // [98] + INVALID_CHAR, // [99] + INVALID_CHAR, // [100] + INVALID_CHAR, // [101] + INVALID_CHAR, // [102] + INVALID_CHAR, // [103] + INVALID_CHAR, // [104] + INVALID_CHAR, // [105] + INVALID_CHAR, // [106] + INVALID_CHAR, // [107] + INVALID_CHAR, // [108] + INVALID_CHAR, // [109] + INVALID_CHAR, // [110] + INVALID_CHAR, // [111] + INVALID_CHAR, // [112] + INVALID_CHAR, // [113] + INVALID_CHAR, // [114] + INVALID_CHAR, // [115] + INVALID_CHAR, // [116] + INVALID_CHAR, // [117] + INVALID_CHAR, // [118] + INVALID_CHAR, // [119] + INVALID_CHAR, // [120] + INVALID_CHAR, // [121] + INVALID_CHAR, // [122] + INVALID_CHAR, // [123] + INVALID_CHAR, // [124] + INVALID_CHAR, // [125] + INVALID_CHAR, // [126] + INVALID_CHAR, // [127] + INVALID_CHAR, // [128] + INVALID_CHAR, // [129] + INVALID_CHAR, // [130] + INVALID_CHAR, // [131] + INVALID_CHAR, // [132] + INVALID_CHAR, // [133] + INVALID_CHAR, // [134] + INVALID_CHAR, // [135] + INVALID_CHAR, // [136] + INVALID_CHAR, // [137] + INVALID_CHAR, // [138] + INVALID_CHAR, // [139] + INVALID_CHAR, // [140] + INVALID_CHAR, // [141] + INVALID_CHAR, // [142] + INVALID_CHAR, // [143] + INVALID_CHAR, // [144] + INVALID_CHAR, // [145] + INVALID_CHAR, // [146] + INVALID_CHAR, // [147] + INVALID_CHAR, // [148] + INVALID_CHAR, // [149] + INVALID_CHAR, // [150] + INVALID_CHAR, // [151] + INVALID_CHAR, // [152] + INVALID_CHAR, // [153] + INVALID_CHAR, // [154] + INVALID_CHAR, // [155] + INVALID_CHAR, // [156] + INVALID_CHAR, // [157] + INVALID_CHAR, // [158] + INVALID_CHAR, // [159] + INVALID_CHAR, // [160] + INVALID_CHAR, // [161] + INVALID_CHAR, // [162] + INVALID_CHAR, // [163] + INVALID_CHAR, // [164] + INVALID_CHAR, // [165] + INVALID_CHAR, // [166] + INVALID_CHAR, // [167] + INVALID_CHAR, // [168] + INVALID_CHAR, // [169] + INVALID_CHAR, // [170] + INVALID_CHAR, // [171] + INVALID_CHAR, // [172] + INVALID_CHAR, // [173] + INVALID_CHAR, // [174] 
+ INVALID_CHAR, // [175] + INVALID_CHAR, // [176] + INVALID_CHAR, // [177] + INVALID_CHAR, // [178] + INVALID_CHAR, // [179] + INVALID_CHAR, // [180] + INVALID_CHAR, // [181] + INVALID_CHAR, // [182] + INVALID_CHAR, // [183] + INVALID_CHAR, // [184] + INVALID_CHAR, // [185] + INVALID_CHAR, // [186] + INVALID_CHAR, // [187] + INVALID_CHAR, // [188] + INVALID_CHAR, // [189] + INVALID_CHAR, // [190] + INVALID_CHAR, // [191] + INVALID_CHAR, // [192] + INVALID_CHAR, // [193] + INVALID_CHAR, // [194] + INVALID_CHAR, // [195] + INVALID_CHAR, // [196] + INVALID_CHAR, // [197] + INVALID_CHAR, // [198] + INVALID_CHAR, // [199] + INVALID_CHAR, // [200] + INVALID_CHAR, // [201] + INVALID_CHAR, // [202] + INVALID_CHAR, // [203] + INVALID_CHAR, // [204] + INVALID_CHAR, // [205] + INVALID_CHAR, // [206] + INVALID_CHAR, // [207] + INVALID_CHAR, // [208] + INVALID_CHAR, // [209] + INVALID_CHAR, // [210] + INVALID_CHAR, // [211] + INVALID_CHAR, // [212] + INVALID_CHAR, // [213] + INVALID_CHAR, // [214] + INVALID_CHAR, // [215] + INVALID_CHAR, // [216] + INVALID_CHAR, // [217] + INVALID_CHAR, // [218] + INVALID_CHAR, // [219] + INVALID_CHAR, // [220] + INVALID_CHAR, // [221] + INVALID_CHAR, // [222] + INVALID_CHAR, // [223] + INVALID_CHAR, // [224] + INVALID_CHAR, // [225] + INVALID_CHAR, // [226] + INVALID_CHAR, // [227] + INVALID_CHAR, // [228] + INVALID_CHAR, // [229] + INVALID_CHAR, // [230] + INVALID_CHAR, // [231] + INVALID_CHAR, // [232] + INVALID_CHAR, // [233] + INVALID_CHAR, // [234] + INVALID_CHAR, // [235] + INVALID_CHAR, // [236] + INVALID_CHAR, // [237] + INVALID_CHAR, // [238] + INVALID_CHAR, // [239] + INVALID_CHAR, // [240] + INVALID_CHAR, // [241] + INVALID_CHAR, // [242] + INVALID_CHAR, // [243] + INVALID_CHAR, // [244] + INVALID_CHAR, // [245] + INVALID_CHAR, // [246] + INVALID_CHAR, // [247] + INVALID_CHAR, // [248] + INVALID_CHAR, // [249] + INVALID_CHAR, // [250] + INVALID_CHAR, // [251] + INVALID_CHAR, // [252] + INVALID_CHAR, // [253] + INVALID_CHAR, // [254] 
+ INVALID_CHAR, // [255] + }; + +unsigned g_CharToLetterNucleo[256] = + { + INVALID_LETTER, // [ 0] = 0x00 + INVALID_LETTER, // [ 1] = 0x01 + INVALID_LETTER, // [ 2] = 0x02 + INVALID_LETTER, // [ 3] = 0x03 + INVALID_LETTER, // [ 4] = 0x04 + INVALID_LETTER, // [ 5] = 0x05 + INVALID_LETTER, // [ 6] = 0x06 + INVALID_LETTER, // [ 7] = 0x07 + INVALID_LETTER, // [ 8] = 0x08 + INVALID_LETTER, // [ 9] = 0x09 + INVALID_LETTER, // [ 10] = 0x0a + INVALID_LETTER, // [ 11] = 0x0b + INVALID_LETTER, // [ 12] = 0x0c + INVALID_LETTER, // [ 13] = 0x0d + INVALID_LETTER, // [ 14] = 0x0e + INVALID_LETTER, // [ 15] = 0x0f + INVALID_LETTER, // [ 16] = 0x10 + INVALID_LETTER, // [ 17] = 0x11 + INVALID_LETTER, // [ 18] = 0x12 + INVALID_LETTER, // [ 19] = 0x13 + INVALID_LETTER, // [ 20] = 0x14 + INVALID_LETTER, // [ 21] = 0x15 + INVALID_LETTER, // [ 22] = 0x16 + INVALID_LETTER, // [ 23] = 0x17 + INVALID_LETTER, // [ 24] = 0x18 + INVALID_LETTER, // [ 25] = 0x19 + INVALID_LETTER, // [ 26] = 0x1a + INVALID_LETTER, // [ 27] = 0x1b + INVALID_LETTER, // [ 28] = 0x1c + INVALID_LETTER, // [ 29] = 0x1d + INVALID_LETTER, // [ 30] = 0x1e + INVALID_LETTER, // [ 31] = 0x1f + INVALID_LETTER, // [ 32] = 32 + INVALID_LETTER, // [ 33] = 33 + INVALID_LETTER, // [ 34] = 34 + INVALID_LETTER, // [ 35] = 35 + INVALID_LETTER, // [ 36] = 36 + INVALID_LETTER, // [ 37] = 37 + INVALID_LETTER, // [ 38] = 38 + INVALID_LETTER, // [ 39] = 39 + INVALID_LETTER, // [ 40] = 40 + INVALID_LETTER, // [ 41] = 41 + INVALID_LETTER, // [ 42] = 42 + INVALID_LETTER, // [ 43] = 43 + INVALID_LETTER, // [ 44] = 44 + INVALID_LETTER, // [ 45] = 45 + INVALID_LETTER, // [ 46] = 46 + INVALID_LETTER, // [ 47] = 47 + INVALID_LETTER, // [ 48] = 48 + INVALID_LETTER, // [ 49] = 49 + INVALID_LETTER, // [ 50] = 50 + INVALID_LETTER, // [ 51] = 51 + INVALID_LETTER, // [ 52] = 52 + INVALID_LETTER, // [ 53] = 53 + INVALID_LETTER, // [ 54] = 54 + INVALID_LETTER, // [ 55] = 55 + INVALID_LETTER, // [ 56] = 56 + INVALID_LETTER, // [ 57] = 57 + 
INVALID_LETTER, // [ 58] = 58 + INVALID_LETTER, // [ 59] = 59 + INVALID_LETTER, // [ 60] = 60 + INVALID_LETTER, // [ 61] = 61 + INVALID_LETTER, // [ 62] = 62 + INVALID_LETTER, // [ 63] = 63 + INVALID_LETTER, // [ 64] = 64 + 0 , // [ 65] = A (Nucleotide) + INVALID_LETTER, // [ 66] = 66 + 1 , // [ 67] = C (Nucleotide) + INVALID_LETTER, // [ 68] = 68 + INVALID_LETTER, // [ 69] = 69 + INVALID_LETTER, // [ 70] = 70 + 2 , // [ 71] = G (Nucleotide) + INVALID_LETTER, // [ 72] = 72 + INVALID_LETTER, // [ 73] = 73 + INVALID_LETTER, // [ 74] = 74 + INVALID_LETTER, // [ 75] = 75 + INVALID_LETTER, // [ 76] = 76 + INVALID_LETTER, // [ 77] = 77 + INVALID_LETTER, // [ 78] = 78 + INVALID_LETTER, // [ 79] = 79 + INVALID_LETTER, // [ 80] = 80 + INVALID_LETTER, // [ 81] = 81 + INVALID_LETTER, // [ 82] = 82 + INVALID_LETTER, // [ 83] = 83 + 3 , // [ 84] = T (Nucleotide) + 3 , // [ 85] = U (Nucleotide) + INVALID_LETTER, // [ 86] = 86 + INVALID_LETTER, // [ 87] = 87 + INVALID_LETTER, // [ 88] = 88 + INVALID_LETTER, // [ 89] = 89 + INVALID_LETTER, // [ 90] = 90 + INVALID_LETTER, // [ 91] = 91 + INVALID_LETTER, // [ 92] = 92 + INVALID_LETTER, // [ 93] = 93 + INVALID_LETTER, // [ 94] = 94 + INVALID_LETTER, // [ 95] = 95 + INVALID_LETTER, // [ 96] = 96 + 0 , // [ 97] = a (Nucleotide) + INVALID_LETTER, // [ 98] = 98 + 1 , // [ 99] = c (Nucleotide) + INVALID_LETTER, // [100] = 100 + INVALID_LETTER, // [101] = 101 + INVALID_LETTER, // [102] = 102 + 2 , // [103] = g (Nucleotide) + INVALID_LETTER, // [104] = 104 + INVALID_LETTER, // [105] = 105 + INVALID_LETTER, // [106] = 106 + INVALID_LETTER, // [107] = 107 + INVALID_LETTER, // [108] = 108 + INVALID_LETTER, // [109] = 109 + INVALID_LETTER, // [110] = 110 + INVALID_LETTER, // [111] = 111 + INVALID_LETTER, // [112] = 112 + INVALID_LETTER, // [113] = 113 + INVALID_LETTER, // [114] = 114 + INVALID_LETTER, // [115] = 115 + 3 , // [116] = t (Nucleotide) + 3 , // [117] = u (Nucleotide) + INVALID_LETTER, // [118] = 118 + INVALID_LETTER, // [119] = 119 
+ INVALID_LETTER, // [120] = 120 + INVALID_LETTER, // [121] = 121 + INVALID_LETTER, // [122] = 122 + INVALID_LETTER, // [123] = 123 + INVALID_LETTER, // [124] = 124 + INVALID_LETTER, // [125] = 125 + INVALID_LETTER, // [126] = 126 + INVALID_LETTER, // [127] = 0x7f + INVALID_LETTER, // [128] = 0x80 + INVALID_LETTER, // [129] = 0x81 + INVALID_LETTER, // [130] = 0x82 + INVALID_LETTER, // [131] = 0x83 + INVALID_LETTER, // [132] = 0x84 + INVALID_LETTER, // [133] = 0x85 + INVALID_LETTER, // [134] = 0x86 + INVALID_LETTER, // [135] = 0x87 + INVALID_LETTER, // [136] = 0x88 + INVALID_LETTER, // [137] = 0x89 + INVALID_LETTER, // [138] = 0x8a + INVALID_LETTER, // [139] = 0x8b + INVALID_LETTER, // [140] = 0x8c + INVALID_LETTER, // [141] = 0x8d + INVALID_LETTER, // [142] = 0x8e + INVALID_LETTER, // [143] = 0x8f + INVALID_LETTER, // [144] = 0x90 + INVALID_LETTER, // [145] = 0x91 + INVALID_LETTER, // [146] = 0x92 + INVALID_LETTER, // [147] = 0x93 + INVALID_LETTER, // [148] = 0x94 + INVALID_LETTER, // [149] = 0x95 + INVALID_LETTER, // [150] = 0x96 + INVALID_LETTER, // [151] = 0x97 + INVALID_LETTER, // [152] = 0x98 + INVALID_LETTER, // [153] = 0x99 + INVALID_LETTER, // [154] = 0x9a + INVALID_LETTER, // [155] = 0x9b + INVALID_LETTER, // [156] = 0x9c + INVALID_LETTER, // [157] = 0x9d + INVALID_LETTER, // [158] = 0x9e + INVALID_LETTER, // [159] = 0x9f + INVALID_LETTER, // [160] = 0xa0 + INVALID_LETTER, // [161] = 0xa1 + INVALID_LETTER, // [162] = 0xa2 + INVALID_LETTER, // [163] = 0xa3 + INVALID_LETTER, // [164] = 0xa4 + INVALID_LETTER, // [165] = 0xa5 + INVALID_LETTER, // [166] = 0xa6 + INVALID_LETTER, // [167] = 0xa7 + INVALID_LETTER, // [168] = 0xa8 + INVALID_LETTER, // [169] = 0xa9 + INVALID_LETTER, // [170] = 0xaa + INVALID_LETTER, // [171] = 0xab + INVALID_LETTER, // [172] = 0xac + INVALID_LETTER, // [173] = 0xad + INVALID_LETTER, // [174] = 0xae + INVALID_LETTER, // [175] = 0xaf + INVALID_LETTER, // [176] = 0xb0 + INVALID_LETTER, // [177] = 0xb1 + INVALID_LETTER, // [178] = 0xb2 
+ INVALID_LETTER, // [179] = 0xb3 + INVALID_LETTER, // [180] = 0xb4 + INVALID_LETTER, // [181] = 0xb5 + INVALID_LETTER, // [182] = 0xb6 + INVALID_LETTER, // [183] = 0xb7 + INVALID_LETTER, // [184] = 0xb8 + INVALID_LETTER, // [185] = 0xb9 + INVALID_LETTER, // [186] = 0xba + INVALID_LETTER, // [187] = 0xbb + INVALID_LETTER, // [188] = 0xbc + INVALID_LETTER, // [189] = 0xbd + INVALID_LETTER, // [190] = 0xbe + INVALID_LETTER, // [191] = 0xbf + INVALID_LETTER, // [192] = 0xc0 + INVALID_LETTER, // [193] = 0xc1 + INVALID_LETTER, // [194] = 0xc2 + INVALID_LETTER, // [195] = 0xc3 + INVALID_LETTER, // [196] = 0xc4 + INVALID_LETTER, // [197] = 0xc5 + INVALID_LETTER, // [198] = 0xc6 + INVALID_LETTER, // [199] = 0xc7 + INVALID_LETTER, // [200] = 0xc8 + INVALID_LETTER, // [201] = 0xc9 + INVALID_LETTER, // [202] = 0xca + INVALID_LETTER, // [203] = 0xcb + INVALID_LETTER, // [204] = 0xcc + INVALID_LETTER, // [205] = 0xcd + INVALID_LETTER, // [206] = 0xce + INVALID_LETTER, // [207] = 0xcf + INVALID_LETTER, // [208] = 0xd0 + INVALID_LETTER, // [209] = 0xd1 + INVALID_LETTER, // [210] = 0xd2 + INVALID_LETTER, // [211] = 0xd3 + INVALID_LETTER, // [212] = 0xd4 + INVALID_LETTER, // [213] = 0xd5 + INVALID_LETTER, // [214] = 0xd6 + INVALID_LETTER, // [215] = 0xd7 + INVALID_LETTER, // [216] = 0xd8 + INVALID_LETTER, // [217] = 0xd9 + INVALID_LETTER, // [218] = 0xda + INVALID_LETTER, // [219] = 0xdb + INVALID_LETTER, // [220] = 0xdc + INVALID_LETTER, // [221] = 0xdd + INVALID_LETTER, // [222] = 0xde + INVALID_LETTER, // [223] = 0xdf + INVALID_LETTER, // [224] = 0xe0 + INVALID_LETTER, // [225] = 0xe1 + INVALID_LETTER, // [226] = 0xe2 + INVALID_LETTER, // [227] = 0xe3 + INVALID_LETTER, // [228] = 0xe4 + INVALID_LETTER, // [229] = 0xe5 + INVALID_LETTER, // [230] = 0xe6 + INVALID_LETTER, // [231] = 0xe7 + INVALID_LETTER, // [232] = 0xe8 + INVALID_LETTER, // [233] = 0xe9 + INVALID_LETTER, // [234] = 0xea + INVALID_LETTER, // [235] = 0xeb + INVALID_LETTER, // [236] = 0xec + INVALID_LETTER, // [237] 
= 0xed + INVALID_LETTER, // [238] = 0xee + INVALID_LETTER, // [239] = 0xef + INVALID_LETTER, // [240] = 0xf0 + INVALID_LETTER, // [241] = 0xf1 + INVALID_LETTER, // [242] = 0xf2 + INVALID_LETTER, // [243] = 0xf3 + INVALID_LETTER, // [244] = 0xf4 + INVALID_LETTER, // [245] = 0xf5 + INVALID_LETTER, // [246] = 0xf6 + INVALID_LETTER, // [247] = 0xf7 + INVALID_LETTER, // [248] = 0xf8 + INVALID_LETTER, // [249] = 0xf9 + INVALID_LETTER, // [250] = 0xfa + INVALID_LETTER, // [251] = 0xfb + INVALID_LETTER, // [252] = 0xfc + INVALID_LETTER, // [253] = 0xfd + INVALID_LETTER, // [254] = 0xfe + INVALID_LETTER, // [255] = 0xff + }; + +unsigned char g_LetterToCharNucleo[256] = + { + 'A', // [0] + 'C', // [1] + 'G', // [2] + 'T', // [3] + INVALID_CHAR, // [4] + INVALID_CHAR, // [5] + INVALID_CHAR, // [6] + INVALID_CHAR, // [7] + INVALID_CHAR, // [8] + INVALID_CHAR, // [9] + INVALID_CHAR, // [10] + INVALID_CHAR, // [11] + INVALID_CHAR, // [12] + INVALID_CHAR, // [13] + INVALID_CHAR, // [14] + INVALID_CHAR, // [15] + INVALID_CHAR, // [16] + INVALID_CHAR, // [17] + INVALID_CHAR, // [18] + INVALID_CHAR, // [19] + INVALID_CHAR, // [20] + INVALID_CHAR, // [21] + INVALID_CHAR, // [22] + INVALID_CHAR, // [23] + INVALID_CHAR, // [24] + INVALID_CHAR, // [25] + INVALID_CHAR, // [26] + INVALID_CHAR, // [27] + INVALID_CHAR, // [28] + INVALID_CHAR, // [29] + INVALID_CHAR, // [30] + INVALID_CHAR, // [31] + INVALID_CHAR, // [32] + INVALID_CHAR, // [33] + INVALID_CHAR, // [34] + INVALID_CHAR, // [35] + INVALID_CHAR, // [36] + INVALID_CHAR, // [37] + INVALID_CHAR, // [38] + INVALID_CHAR, // [39] + INVALID_CHAR, // [40] + INVALID_CHAR, // [41] + INVALID_CHAR, // [42] + INVALID_CHAR, // [43] + INVALID_CHAR, // [44] + INVALID_CHAR, // [45] + INVALID_CHAR, // [46] + INVALID_CHAR, // [47] + INVALID_CHAR, // [48] + INVALID_CHAR, // [49] + INVALID_CHAR, // [50] + INVALID_CHAR, // [51] + INVALID_CHAR, // [52] + INVALID_CHAR, // [53] + INVALID_CHAR, // [54] + INVALID_CHAR, // [55] + INVALID_CHAR, // [56] + 
INVALID_CHAR, // [57] + INVALID_CHAR, // [58] + INVALID_CHAR, // [59] + INVALID_CHAR, // [60] + INVALID_CHAR, // [61] + INVALID_CHAR, // [62] + INVALID_CHAR, // [63] + INVALID_CHAR, // [64] + INVALID_CHAR, // [65] + INVALID_CHAR, // [66] + INVALID_CHAR, // [67] + INVALID_CHAR, // [68] + INVALID_CHAR, // [69] + INVALID_CHAR, // [70] + INVALID_CHAR, // [71] + INVALID_CHAR, // [72] + INVALID_CHAR, // [73] + INVALID_CHAR, // [74] + INVALID_CHAR, // [75] + INVALID_CHAR, // [76] + INVALID_CHAR, // [77] + INVALID_CHAR, // [78] + INVALID_CHAR, // [79] + INVALID_CHAR, // [80] + INVALID_CHAR, // [81] + INVALID_CHAR, // [82] + INVALID_CHAR, // [83] + INVALID_CHAR, // [84] + INVALID_CHAR, // [85] + INVALID_CHAR, // [86] + INVALID_CHAR, // [87] + INVALID_CHAR, // [88] + INVALID_CHAR, // [89] + INVALID_CHAR, // [90] + INVALID_CHAR, // [91] + INVALID_CHAR, // [92] + INVALID_CHAR, // [93] + INVALID_CHAR, // [94] + INVALID_CHAR, // [95] + INVALID_CHAR, // [96] + INVALID_CHAR, // [97] + INVALID_CHAR, // [98] + INVALID_CHAR, // [99] + INVALID_CHAR, // [100] + INVALID_CHAR, // [101] + INVALID_CHAR, // [102] + INVALID_CHAR, // [103] + INVALID_CHAR, // [104] + INVALID_CHAR, // [105] + INVALID_CHAR, // [106] + INVALID_CHAR, // [107] + INVALID_CHAR, // [108] + INVALID_CHAR, // [109] + INVALID_CHAR, // [110] + INVALID_CHAR, // [111] + INVALID_CHAR, // [112] + INVALID_CHAR, // [113] + INVALID_CHAR, // [114] + INVALID_CHAR, // [115] + INVALID_CHAR, // [116] + INVALID_CHAR, // [117] + INVALID_CHAR, // [118] + INVALID_CHAR, // [119] + INVALID_CHAR, // [120] + INVALID_CHAR, // [121] + INVALID_CHAR, // [122] + INVALID_CHAR, // [123] + INVALID_CHAR, // [124] + INVALID_CHAR, // [125] + INVALID_CHAR, // [126] + INVALID_CHAR, // [127] + INVALID_CHAR, // [128] + INVALID_CHAR, // [129] + INVALID_CHAR, // [130] + INVALID_CHAR, // [131] + INVALID_CHAR, // [132] + INVALID_CHAR, // [133] + INVALID_CHAR, // [134] + INVALID_CHAR, // [135] + INVALID_CHAR, // [136] + INVALID_CHAR, // [137] + INVALID_CHAR, // 
[138] + INVALID_CHAR, // [139] + INVALID_CHAR, // [140] + INVALID_CHAR, // [141] + INVALID_CHAR, // [142] + INVALID_CHAR, // [143] + INVALID_CHAR, // [144] + INVALID_CHAR, // [145] + INVALID_CHAR, // [146] + INVALID_CHAR, // [147] + INVALID_CHAR, // [148] + INVALID_CHAR, // [149] + INVALID_CHAR, // [150] + INVALID_CHAR, // [151] + INVALID_CHAR, // [152] + INVALID_CHAR, // [153] + INVALID_CHAR, // [154] + INVALID_CHAR, // [155] + INVALID_CHAR, // [156] + INVALID_CHAR, // [157] + INVALID_CHAR, // [158] + INVALID_CHAR, // [159] + INVALID_CHAR, // [160] + INVALID_CHAR, // [161] + INVALID_CHAR, // [162] + INVALID_CHAR, // [163] + INVALID_CHAR, // [164] + INVALID_CHAR, // [165] + INVALID_CHAR, // [166] + INVALID_CHAR, // [167] + INVALID_CHAR, // [168] + INVALID_CHAR, // [169] + INVALID_CHAR, // [170] + INVALID_CHAR, // [171] + INVALID_CHAR, // [172] + INVALID_CHAR, // [173] + INVALID_CHAR, // [174] + INVALID_CHAR, // [175] + INVALID_CHAR, // [176] + INVALID_CHAR, // [177] + INVALID_CHAR, // [178] + INVALID_CHAR, // [179] + INVALID_CHAR, // [180] + INVALID_CHAR, // [181] + INVALID_CHAR, // [182] + INVALID_CHAR, // [183] + INVALID_CHAR, // [184] + INVALID_CHAR, // [185] + INVALID_CHAR, // [186] + INVALID_CHAR, // [187] + INVALID_CHAR, // [188] + INVALID_CHAR, // [189] + INVALID_CHAR, // [190] + INVALID_CHAR, // [191] + INVALID_CHAR, // [192] + INVALID_CHAR, // [193] + INVALID_CHAR, // [194] + INVALID_CHAR, // [195] + INVALID_CHAR, // [196] + INVALID_CHAR, // [197] + INVALID_CHAR, // [198] + INVALID_CHAR, // [199] + INVALID_CHAR, // [200] + INVALID_CHAR, // [201] + INVALID_CHAR, // [202] + INVALID_CHAR, // [203] + INVALID_CHAR, // [204] + INVALID_CHAR, // [205] + INVALID_CHAR, // [206] + INVALID_CHAR, // [207] + INVALID_CHAR, // [208] + INVALID_CHAR, // [209] + INVALID_CHAR, // [210] + INVALID_CHAR, // [211] + INVALID_CHAR, // [212] + INVALID_CHAR, // [213] + INVALID_CHAR, // [214] + INVALID_CHAR, // [215] + INVALID_CHAR, // [216] + INVALID_CHAR, // [217] + INVALID_CHAR, // 
[218] + INVALID_CHAR, // [219] + INVALID_CHAR, // [220] + INVALID_CHAR, // [221] + INVALID_CHAR, // [222] + INVALID_CHAR, // [223] + INVALID_CHAR, // [224] + INVALID_CHAR, // [225] + INVALID_CHAR, // [226] + INVALID_CHAR, // [227] + INVALID_CHAR, // [228] + INVALID_CHAR, // [229] + INVALID_CHAR, // [230] + INVALID_CHAR, // [231] + INVALID_CHAR, // [232] + INVALID_CHAR, // [233] + INVALID_CHAR, // [234] + INVALID_CHAR, // [235] + INVALID_CHAR, // [236] + INVALID_CHAR, // [237] + INVALID_CHAR, // [238] + INVALID_CHAR, // [239] + INVALID_CHAR, // [240] + INVALID_CHAR, // [241] + INVALID_CHAR, // [242] + INVALID_CHAR, // [243] + INVALID_CHAR, // [244] + INVALID_CHAR, // [245] + INVALID_CHAR, // [246] + INVALID_CHAR, // [247] + INVALID_CHAR, // [248] + INVALID_CHAR, // [249] + INVALID_CHAR, // [250] + INVALID_CHAR, // [251] + INVALID_CHAR, // [252] + INVALID_CHAR, // [253] + INVALID_CHAR, // [254] + INVALID_CHAR, // [255] + }; + +unsigned g_CodonWordToAminoLetter[4*4*4] = + { + 8 , // [ 0] = AAA K (Lys) + 11, // [ 1] = AAC N (Asn) + 8 , // [ 2] = AAG K (Lys) + 11, // [ 3] = AAT N (Asn) + 16, // [ 4] = ACA T (Thr) + 16, // [ 5] = ACC T (Thr) + 16, // [ 6] = ACG T (Thr) + 16, // [ 7] = ACT T (Thr) + 14, // [ 8] = AGA R (Arg) + 15, // [ 9] = AGC S (Ser) + 14, // [10] = AGG R (Arg) + 15, // [11] = AGT S (Ser) + 7 , // [12] = ATA I (Ile) + 7 , // [13] = ATC I (Ile) + 10, // [14] = ATG M (Met) + 7 , // [15] = ATT I (Ile) + 13, // [16] = CAA Q (Gln) + 6 , // [17] = CAC H (His) + 13, // [18] = CAG Q (Gln) + 6 , // [19] = CAT H (His) + 12, // [20] = CCA P (Pro) + 12, // [21] = CCC P (Pro) + 12, // [22] = CCG P (Pro) + 12, // [23] = CCT P (Pro) + 14, // [24] = CGA R (Arg) + 14, // [25] = CGC R (Arg) + 14, // [26] = CGG R (Arg) + 14, // [27] = CGT R (Arg) + 9 , // [28] = CTA L (Leu) + 9 , // [29] = CTC L (Leu) + 9 , // [30] = CTG L (Leu) + 9 , // [31] = CTT L (Leu) + 3 , // [32] = GAA E (Glu) + 2 , // [33] = GAC D (Asp) + 3 , // [34] = GAG E (Glu) + 2 , // [35] = GAT D (Asp) + 0 , 
// [36] = GCA A (Ala) + 0 , // [37] = GCC A (Ala) + 0 , // [38] = GCG A (Ala) + 0 , // [39] = GCT A (Ala) + 5 , // [40] = GGA G (Gly) + 5 , // [41] = GGC G (Gly) + 5 , // [42] = GGG G (Gly) + 5 , // [43] = GGT G (Gly) + 17, // [44] = GTA V (Val) + 17, // [45] = GTC V (Val) + 17, // [46] = GTG V (Val) + 17, // [47] = GTT V (Val) + 20, // [48] = TAA * (STP) + 19, // [49] = TAC Y (Tyr) + 20, // [50] = TAG * (STP) + 19, // [51] = TAT Y (Tyr) + 15, // [52] = TCA S (Ser) + 15, // [53] = TCC S (Ser) + 15, // [54] = TCG S (Ser) + 15, // [55] = TCT S (Ser) + 20, // [56] = TGA * (STP) + 1 , // [57] = TGC C (Cys) + 18, // [58] = TGG W (Trp) + 1 , // [59] = TGT C (Cys) + 9 , // [60] = TTA L (Leu) + 4 , // [61] = TTC F (Phe) + 9 , // [62] = TTG L (Leu) + 4 , // [63] = TTT F (Phe) + }; + +char g_CodonWordToAminoChar[4*4*4] = + { + 'K', // [ 0] = AAA (Lys) + 'N', // [ 1] = AAC (Asn) + 'K', // [ 2] = AAG (Lys) + 'N', // [ 3] = AAT (Asn) + 'T', // [ 4] = ACA (Thr) + 'T', // [ 5] = ACC (Thr) + 'T', // [ 6] = ACG (Thr) + 'T', // [ 7] = ACT (Thr) + 'R', // [ 8] = AGA (Arg) + 'S', // [ 9] = AGC (Ser) + 'R', // [10] = AGG (Arg) + 'S', // [11] = AGT (Ser) + 'I', // [12] = ATA (Ile) + 'I', // [13] = ATC (Ile) + 'M', // [14] = ATG (Met) + 'I', // [15] = ATT (Ile) + 'Q', // [16] = CAA (Gln) + 'H', // [17] = CAC (His) + 'Q', // [18] = CAG (Gln) + 'H', // [19] = CAT (His) + 'P', // [20] = CCA (Pro) + 'P', // [21] = CCC (Pro) + 'P', // [22] = CCG (Pro) + 'P', // [23] = CCT (Pro) + 'R', // [24] = CGA (Arg) + 'R', // [25] = CGC (Arg) + 'R', // [26] = CGG (Arg) + 'R', // [27] = CGT (Arg) + 'L', // [28] = CTA (Leu) + 'L', // [29] = CTC (Leu) + 'L', // [30] = CTG (Leu) + 'L', // [31] = CTT (Leu) + 'E', // [32] = GAA (Glu) + 'D', // [33] = GAC (Asp) + 'E', // [34] = GAG (Glu) + 'D', // [35] = GAT (Asp) + 'A', // [36] = GCA (Ala) + 'A', // [37] = GCC (Ala) + 'A', // [38] = GCG (Ala) + 'A', // [39] = GCT (Ala) + 'G', // [40] = GGA (Gly) + 'G', // [41] = GGC (Gly) + 'G', // [42] = GGG (Gly) + 'G', // 
[43] = GGT (Gly) + 'V', // [44] = GTA (Val) + 'V', // [45] = GTC (Val) + 'V', // [46] = GTG (Val) + 'V', // [47] = GTT (Val) + '*', // [48] = TAA (STP) + 'Y', // [49] = TAC (Tyr) + '*', // [50] = TAG (STP) + 'Y', // [51] = TAT (Tyr) + 'S', // [52] = TCA (Ser) + 'S', // [53] = TCC (Ser) + 'S', // [54] = TCG (Ser) + 'S', // [55] = TCT (Ser) + '*', // [56] = TGA (STP) + 'C', // [57] = TGC (Cys) + 'W', // [58] = TGG (Trp) + 'C', // [59] = TGT (Cys) + 'L', // [60] = TTA (Leu) + 'F', // [61] = TTC (Phe) + 'L', // [62] = TTG (Leu) + 'F', // [63] = TTT (Phe) + }; + +unsigned char g_CharToCompChar[256] = + { + INVALID_CHAR, // [ 0] + INVALID_CHAR, // [ 1] + INVALID_CHAR, // [ 2] + INVALID_CHAR, // [ 3] + INVALID_CHAR, // [ 4] + INVALID_CHAR, // [ 5] + INVALID_CHAR, // [ 6] + INVALID_CHAR, // [ 7] + INVALID_CHAR, // [ 8] + INVALID_CHAR, // [ 9] + INVALID_CHAR, // [ 10] + INVALID_CHAR, // [ 11] + INVALID_CHAR, // [ 12] + INVALID_CHAR, // [ 13] + INVALID_CHAR, // [ 14] + INVALID_CHAR, // [ 15] + INVALID_CHAR, // [ 16] + INVALID_CHAR, // [ 17] + INVALID_CHAR, // [ 18] + INVALID_CHAR, // [ 19] + INVALID_CHAR, // [ 20] + INVALID_CHAR, // [ 21] + INVALID_CHAR, // [ 22] + INVALID_CHAR, // [ 23] + INVALID_CHAR, // [ 24] + INVALID_CHAR, // [ 25] + INVALID_CHAR, // [ 26] + INVALID_CHAR, // [ 27] + INVALID_CHAR, // [ 28] + INVALID_CHAR, // [ 29] + INVALID_CHAR, // [ 30] + INVALID_CHAR, // [ 31] + INVALID_CHAR, // [ 32] + INVALID_CHAR, // [ 33] + INVALID_CHAR, // [ 34] + INVALID_CHAR, // [ 35] + INVALID_CHAR, // [ 36] + INVALID_CHAR, // [ 37] + INVALID_CHAR, // [ 38] + INVALID_CHAR, // [ 39] + INVALID_CHAR, // [ 40] + INVALID_CHAR, // [ 41] + INVALID_CHAR, // [ 42] + INVALID_CHAR, // [ 43] + INVALID_CHAR, // [ 44] + INVALID_CHAR, // [ 45] + INVALID_CHAR, // [ 46] + INVALID_CHAR, // [ 47] + INVALID_CHAR, // [ 48] + INVALID_CHAR, // [ 49] + INVALID_CHAR, // [ 50] + INVALID_CHAR, // [ 51] + INVALID_CHAR, // [ 52] + INVALID_CHAR, // [ 53] + INVALID_CHAR, // [ 54] + INVALID_CHAR, // [ 55] + 
INVALID_CHAR, // [ 56] + INVALID_CHAR, // [ 57] + INVALID_CHAR, // [ 58] + INVALID_CHAR, // [ 59] + INVALID_CHAR, // [ 60] + INVALID_CHAR, // [ 61] + INVALID_CHAR, // [ 62] + INVALID_CHAR, // [ 63] + INVALID_CHAR, // [ 64] + 'T', // [ 65] A -> T + INVALID_CHAR, // [ 66] + 'G', // [ 67] C -> G + INVALID_CHAR, // [ 68] + INVALID_CHAR, // [ 69] + INVALID_CHAR, // [ 70] + 'C', // [ 71] G -> C + INVALID_CHAR, // [ 72] + INVALID_CHAR, // [ 73] + INVALID_CHAR, // [ 74] + INVALID_CHAR, // [ 75] + INVALID_CHAR, // [ 76] + INVALID_CHAR, // [ 77] + INVALID_CHAR, // [ 78] + INVALID_CHAR, // [ 79] + INVALID_CHAR, // [ 80] + INVALID_CHAR, // [ 81] + INVALID_CHAR, // [ 82] + INVALID_CHAR, // [ 83] + 'A', // [ 84] T -> A + 'A', // [ 85] U -> A + INVALID_CHAR, // [ 86] + INVALID_CHAR, // [ 87] + INVALID_CHAR, // [ 88] + INVALID_CHAR, // [ 89] + INVALID_CHAR, // [ 90] + INVALID_CHAR, // [ 91] + INVALID_CHAR, // [ 92] + INVALID_CHAR, // [ 93] + INVALID_CHAR, // [ 94] + INVALID_CHAR, // [ 95] + INVALID_CHAR, // [ 96] + 'T', // [ 97] a -> T + INVALID_CHAR, // [ 98] + 'G', // [ 99] c -> G + INVALID_CHAR, // [100] + INVALID_CHAR, // [101] + INVALID_CHAR, // [102] + 'C', // [103] g -> C + INVALID_CHAR, // [104] + INVALID_CHAR, // [105] + INVALID_CHAR, // [106] + INVALID_CHAR, // [107] + INVALID_CHAR, // [108] + INVALID_CHAR, // [109] + INVALID_CHAR, // [110] + INVALID_CHAR, // [111] + INVALID_CHAR, // [112] + INVALID_CHAR, // [113] + INVALID_CHAR, // [114] + INVALID_CHAR, // [115] + 'A', // [116] t -> A + 'A', // [117] u -> A + INVALID_CHAR, // [118] + INVALID_CHAR, // [119] + INVALID_CHAR, // [120] + INVALID_CHAR, // [121] + INVALID_CHAR, // [122] + INVALID_CHAR, // [123] + INVALID_CHAR, // [124] + INVALID_CHAR, // [125] + INVALID_CHAR, // [126] + INVALID_CHAR, // [127] + INVALID_CHAR, // [128] + INVALID_CHAR, // [129] + INVALID_CHAR, // [130] + INVALID_CHAR, // [131] + INVALID_CHAR, // [132] + INVALID_CHAR, // [133] + INVALID_CHAR, // [134] + INVALID_CHAR, // [135] + INVALID_CHAR, // 
[136] + INVALID_CHAR, // [137] + INVALID_CHAR, // [138] + INVALID_CHAR, // [139] + INVALID_CHAR, // [140] + INVALID_CHAR, // [141] + INVALID_CHAR, // [142] + INVALID_CHAR, // [143] + INVALID_CHAR, // [144] + INVALID_CHAR, // [145] + INVALID_CHAR, // [146] + INVALID_CHAR, // [147] + INVALID_CHAR, // [148] + INVALID_CHAR, // [149] + INVALID_CHAR, // [150] + INVALID_CHAR, // [151] + INVALID_CHAR, // [152] + INVALID_CHAR, // [153] + INVALID_CHAR, // [154] + INVALID_CHAR, // [155] + INVALID_CHAR, // [156] + INVALID_CHAR, // [157] + INVALID_CHAR, // [158] + INVALID_CHAR, // [159] + INVALID_CHAR, // [160] + INVALID_CHAR, // [161] + INVALID_CHAR, // [162] + INVALID_CHAR, // [163] + INVALID_CHAR, // [164] + INVALID_CHAR, // [165] + INVALID_CHAR, // [166] + INVALID_CHAR, // [167] + INVALID_CHAR, // [168] + INVALID_CHAR, // [169] + INVALID_CHAR, // [170] + INVALID_CHAR, // [171] + INVALID_CHAR, // [172] + INVALID_CHAR, // [173] + INVALID_CHAR, // [174] + INVALID_CHAR, // [175] + INVALID_CHAR, // [176] + INVALID_CHAR, // [177] + INVALID_CHAR, // [178] + INVALID_CHAR, // [179] + INVALID_CHAR, // [180] + INVALID_CHAR, // [181] + INVALID_CHAR, // [182] + INVALID_CHAR, // [183] + INVALID_CHAR, // [184] + INVALID_CHAR, // [185] + INVALID_CHAR, // [186] + INVALID_CHAR, // [187] + INVALID_CHAR, // [188] + INVALID_CHAR, // [189] + INVALID_CHAR, // [190] + INVALID_CHAR, // [191] + INVALID_CHAR, // [192] + INVALID_CHAR, // [193] + INVALID_CHAR, // [194] + INVALID_CHAR, // [195] + INVALID_CHAR, // [196] + INVALID_CHAR, // [197] + INVALID_CHAR, // [198] + INVALID_CHAR, // [199] + INVALID_CHAR, // [200] + INVALID_CHAR, // [201] + INVALID_CHAR, // [202] + INVALID_CHAR, // [203] + INVALID_CHAR, // [204] + INVALID_CHAR, // [205] + INVALID_CHAR, // [206] + INVALID_CHAR, // [207] + INVALID_CHAR, // [208] + INVALID_CHAR, // [209] + INVALID_CHAR, // [210] + INVALID_CHAR, // [211] + INVALID_CHAR, // [212] + INVALID_CHAR, // [213] + INVALID_CHAR, // [214] + INVALID_CHAR, // [215] + INVALID_CHAR, // 
[216] + INVALID_CHAR, // [217] + INVALID_CHAR, // [218] + INVALID_CHAR, // [219] + INVALID_CHAR, // [220] + INVALID_CHAR, // [221] + INVALID_CHAR, // [222] + INVALID_CHAR, // [223] + INVALID_CHAR, // [224] + INVALID_CHAR, // [225] + INVALID_CHAR, // [226] + INVALID_CHAR, // [227] + INVALID_CHAR, // [228] + INVALID_CHAR, // [229] + INVALID_CHAR, // [230] + INVALID_CHAR, // [231] + INVALID_CHAR, // [232] + INVALID_CHAR, // [233] + INVALID_CHAR, // [234] + INVALID_CHAR, // [235] + INVALID_CHAR, // [236] + INVALID_CHAR, // [237] + INVALID_CHAR, // [238] + INVALID_CHAR, // [239] + INVALID_CHAR, // [240] + INVALID_CHAR, // [241] + INVALID_CHAR, // [242] + INVALID_CHAR, // [243] + INVALID_CHAR, // [244] + INVALID_CHAR, // [245] + INVALID_CHAR, // [246] + INVALID_CHAR, // [247] + INVALID_CHAR, // [248] + INVALID_CHAR, // [249] + INVALID_CHAR, // [250] + INVALID_CHAR, // [251] + INVALID_CHAR, // [252] + INVALID_CHAR, // [253] + INVALID_CHAR, // [254] + INVALID_CHAR, // [255] +}; + +unsigned g_CharToCompLetter[256] = + { + INVALID_LETTER, // [ 0] + INVALID_LETTER, // [ 1] + INVALID_LETTER, // [ 2] + INVALID_LETTER, // [ 3] + INVALID_LETTER, // [ 4] + INVALID_LETTER, // [ 5] + INVALID_LETTER, // [ 6] + INVALID_LETTER, // [ 7] + INVALID_LETTER, // [ 8] + INVALID_LETTER, // [ 9] + INVALID_LETTER, // [ 10] + INVALID_LETTER, // [ 11] + INVALID_LETTER, // [ 12] + INVALID_LETTER, // [ 13] + INVALID_LETTER, // [ 14] + INVALID_LETTER, // [ 15] + INVALID_LETTER, // [ 16] + INVALID_LETTER, // [ 17] + INVALID_LETTER, // [ 18] + INVALID_LETTER, // [ 19] + INVALID_LETTER, // [ 20] + INVALID_LETTER, // [ 21] + INVALID_LETTER, // [ 22] + INVALID_LETTER, // [ 23] + INVALID_LETTER, // [ 24] + INVALID_LETTER, // [ 25] + INVALID_LETTER, // [ 26] + INVALID_LETTER, // [ 27] + INVALID_LETTER, // [ 28] + INVALID_LETTER, // [ 29] + INVALID_LETTER, // [ 30] + INVALID_LETTER, // [ 31] + INVALID_LETTER, // [ 32] + INVALID_LETTER, // [ 33] + INVALID_LETTER, // [ 34] + INVALID_LETTER, // [ 35] + 
INVALID_LETTER, // [ 36] + INVALID_LETTER, // [ 37] + INVALID_LETTER, // [ 38] + INVALID_LETTER, // [ 39] + INVALID_LETTER, // [ 40] + INVALID_LETTER, // [ 41] + INVALID_LETTER, // [ 42] + INVALID_LETTER, // [ 43] + INVALID_LETTER, // [ 44] + INVALID_LETTER, // [ 45] + INVALID_LETTER, // [ 46] + INVALID_LETTER, // [ 47] + INVALID_LETTER, // [ 48] + INVALID_LETTER, // [ 49] + INVALID_LETTER, // [ 50] + INVALID_LETTER, // [ 51] + INVALID_LETTER, // [ 52] + INVALID_LETTER, // [ 53] + INVALID_LETTER, // [ 54] + INVALID_LETTER, // [ 55] + INVALID_LETTER, // [ 56] + INVALID_LETTER, // [ 57] + INVALID_LETTER, // [ 58] + INVALID_LETTER, // [ 59] + INVALID_LETTER, // [ 60] + INVALID_LETTER, // [ 61] + INVALID_LETTER, // [ 62] + INVALID_LETTER, // [ 63] + INVALID_LETTER, // [ 64] + 3, // [ 65] A -> T + INVALID_LETTER, // [ 66] + 2, // [ 67] C -> G + INVALID_LETTER, // [ 68] + INVALID_LETTER, // [ 69] + INVALID_LETTER, // [ 70] + 1, // [ 71] G -> C + INVALID_LETTER, // [ 72] + INVALID_LETTER, // [ 73] + INVALID_LETTER, // [ 74] + INVALID_LETTER, // [ 75] + INVALID_LETTER, // [ 76] + INVALID_LETTER, // [ 77] + INVALID_LETTER, // [ 78] + INVALID_LETTER, // [ 79] + INVALID_LETTER, // [ 80] + INVALID_LETTER, // [ 81] + INVALID_LETTER, // [ 82] + INVALID_LETTER, // [ 83] + 0, // [ 84] T -> A + 0, // [ 85] U -> A + INVALID_LETTER, // [ 86] + INVALID_LETTER, // [ 87] + INVALID_LETTER, // [ 88] + INVALID_LETTER, // [ 89] + INVALID_LETTER, // [ 90] + INVALID_LETTER, // [ 91] + INVALID_LETTER, // [ 92] + INVALID_LETTER, // [ 93] + INVALID_LETTER, // [ 94] + INVALID_LETTER, // [ 95] + INVALID_LETTER, // [ 96] + 3, // [ 97] a -> T + INVALID_LETTER, // [ 98] + 2, // [ 99] c -> G + INVALID_LETTER, // [100] + INVALID_LETTER, // [101] + INVALID_LETTER, // [102] + 1, // [103] g -> C + INVALID_LETTER, // [104] + INVALID_LETTER, // [105] + INVALID_LETTER, // [106] + INVALID_LETTER, // [107] + INVALID_LETTER, // [108] + INVALID_LETTER, // [109] + INVALID_LETTER, // [110] + INVALID_LETTER, // 
[111] + INVALID_LETTER, // [112] + INVALID_LETTER, // [113] + INVALID_LETTER, // [114] + INVALID_LETTER, // [115] + 0, // [116] t -> A + 0, // [117] u -> A + INVALID_LETTER, // [118] + INVALID_LETTER, // [119] + INVALID_LETTER, // [120] + INVALID_LETTER, // [121] + INVALID_LETTER, // [122] + INVALID_LETTER, // [123] + INVALID_LETTER, // [124] + INVALID_LETTER, // [125] + INVALID_LETTER, // [126] + INVALID_LETTER, // [127] + INVALID_LETTER, // [128] + INVALID_LETTER, // [129] + INVALID_LETTER, // [130] + INVALID_LETTER, // [131] + INVALID_LETTER, // [132] + INVALID_LETTER, // [133] + INVALID_LETTER, // [134] + INVALID_LETTER, // [135] + INVALID_LETTER, // [136] + INVALID_LETTER, // [137] + INVALID_LETTER, // [138] + INVALID_LETTER, // [139] + INVALID_LETTER, // [140] + INVALID_LETTER, // [141] + INVALID_LETTER, // [142] + INVALID_LETTER, // [143] + INVALID_LETTER, // [144] + INVALID_LETTER, // [145] + INVALID_LETTER, // [146] + INVALID_LETTER, // [147] + INVALID_LETTER, // [148] + INVALID_LETTER, // [149] + INVALID_LETTER, // [150] + INVALID_LETTER, // [151] + INVALID_LETTER, // [152] + INVALID_LETTER, // [153] + INVALID_LETTER, // [154] + INVALID_LETTER, // [155] + INVALID_LETTER, // [156] + INVALID_LETTER, // [157] + INVALID_LETTER, // [158] + INVALID_LETTER, // [159] + INVALID_LETTER, // [160] + INVALID_LETTER, // [161] + INVALID_LETTER, // [162] + INVALID_LETTER, // [163] + INVALID_LETTER, // [164] + INVALID_LETTER, // [165] + INVALID_LETTER, // [166] + INVALID_LETTER, // [167] + INVALID_LETTER, // [168] + INVALID_LETTER, // [169] + INVALID_LETTER, // [170] + INVALID_LETTER, // [171] + INVALID_LETTER, // [172] + INVALID_LETTER, // [173] + INVALID_LETTER, // [174] + INVALID_LETTER, // [175] + INVALID_LETTER, // [176] + INVALID_LETTER, // [177] + INVALID_LETTER, // [178] + INVALID_LETTER, // [179] + INVALID_LETTER, // [180] + INVALID_LETTER, // [181] + INVALID_LETTER, // [182] + INVALID_LETTER, // [183] + INVALID_LETTER, // [184] + INVALID_LETTER, // [185] + 
INVALID_LETTER, // [186] + INVALID_LETTER, // [187] + INVALID_LETTER, // [188] + INVALID_LETTER, // [189] + INVALID_LETTER, // [190] + INVALID_LETTER, // [191] + INVALID_LETTER, // [192] + INVALID_LETTER, // [193] + INVALID_LETTER, // [194] + INVALID_LETTER, // [195] + INVALID_LETTER, // [196] + INVALID_LETTER, // [197] + INVALID_LETTER, // [198] + INVALID_LETTER, // [199] + INVALID_LETTER, // [200] + INVALID_LETTER, // [201] + INVALID_LETTER, // [202] + INVALID_LETTER, // [203] + INVALID_LETTER, // [204] + INVALID_LETTER, // [205] + INVALID_LETTER, // [206] + INVALID_LETTER, // [207] + INVALID_LETTER, // [208] + INVALID_LETTER, // [209] + INVALID_LETTER, // [210] + INVALID_LETTER, // [211] + INVALID_LETTER, // [212] + INVALID_LETTER, // [213] + INVALID_LETTER, // [214] + INVALID_LETTER, // [215] + INVALID_LETTER, // [216] + INVALID_LETTER, // [217] + INVALID_LETTER, // [218] + INVALID_LETTER, // [219] + INVALID_LETTER, // [220] + INVALID_LETTER, // [221] + INVALID_LETTER, // [222] + INVALID_LETTER, // [223] + INVALID_LETTER, // [224] + INVALID_LETTER, // [225] + INVALID_LETTER, // [226] + INVALID_LETTER, // [227] + INVALID_LETTER, // [228] + INVALID_LETTER, // [229] + INVALID_LETTER, // [230] + INVALID_LETTER, // [231] + INVALID_LETTER, // [232] + INVALID_LETTER, // [233] + INVALID_LETTER, // [234] + INVALID_LETTER, // [235] + INVALID_LETTER, // [236] + INVALID_LETTER, // [237] + INVALID_LETTER, // [238] + INVALID_LETTER, // [239] + INVALID_LETTER, // [240] + INVALID_LETTER, // [241] + INVALID_LETTER, // [242] + INVALID_LETTER, // [243] + INVALID_LETTER, // [244] + INVALID_LETTER, // [245] + INVALID_LETTER, // [246] + INVALID_LETTER, // [247] + INVALID_LETTER, // [248] + INVALID_LETTER, // [249] + INVALID_LETTER, // [250] + INVALID_LETTER, // [251] + INVALID_LETTER, // [252] + INVALID_LETTER, // [253] + INVALID_LETTER, // [254] + INVALID_LETTER, // [255] +}; + +bool g_IsAminoChar[256] = + { + false, // [ 0] 0x00 + false, // [ 1] 0x01 + false, // [ 2] 0x02 + false, 
// [ 3] 0x03 + false, // [ 4] 0x04 + false, // [ 5] 0x05 + false, // [ 6] 0x06 + false, // [ 7] 0x07 + false, // [ 8] 0x08 + false, // [ 9] 0x09 + false, // [ 10] 0x0a + false, // [ 11] 0x0b + false, // [ 12] 0x0c + false, // [ 13] 0x0d + false, // [ 14] 0x0e + false, // [ 15] 0x0f + false, // [ 16] 0x10 + false, // [ 17] 0x11 + false, // [ 18] 0x12 + false, // [ 19] 0x13 + false, // [ 20] 0x14 + false, // [ 21] 0x15 + false, // [ 22] 0x16 + false, // [ 23] 0x17 + false, // [ 24] 0x18 + false, // [ 25] 0x19 + false, // [ 26] 0x1a + false, // [ 27] 0x1b + false, // [ 28] 0x1c + false, // [ 29] 0x1d + false, // [ 30] 0x1e + false, // [ 31] 0x1f + false, // [ 32] ' ' + false, // [ 33] '!' + false, // [ 34] '"' + false, // [ 35] '#' + false, // [ 36] '$' + false, // [ 37] '%' + false, // [ 38] '&' + false, // [ 39] ''' + false, // [ 40] '(' + false, // [ 41] ')' + true, // [ 42] '*' = STP + false, // [ 43] '+' + false, // [ 44] ',' + false, // [ 45] '-' + false, // [ 46] '.' + false, // [ 47] '/' + false, // [ 48] '0' + false, // [ 49] '1' + false, // [ 50] '2' + false, // [ 51] '3' + false, // [ 52] '4' + false, // [ 53] '5' + false, // [ 54] '6' + false, // [ 55] '7' + false, // [ 56] '8' + false, // [ 57] '9' + false, // [ 58] ':' + false, // [ 59] ';' + false, // [ 60] '<' + false, // [ 61] '=' + false, // [ 62] '>' + false, // [ 63] '?' 
+ false, // [ 64] '@' + true, // [ 65] 'A' = Ala + false, // [ 66] 'B' + true, // [ 67] 'C' = Cys + true, // [ 68] 'D' = Asp + true, // [ 69] 'E' = Glu + true, // [ 70] 'F' = Phe + true, // [ 71] 'G' = Gly + true, // [ 72] 'H' = His + true, // [ 73] 'I' = Ile + false, // [ 74] 'J' + true, // [ 75] 'K' = Lys + true, // [ 76] 'L' = Leu + true, // [ 77] 'M' = Met + true, // [ 78] 'N' = Asn + false, // [ 79] 'O' + true, // [ 80] 'P' = Pro + true, // [ 81] 'Q' = Gln + true, // [ 82] 'R' = Arg + true, // [ 83] 'S' = Ser + true, // [ 84] 'T' = Thr + false, // [ 85] 'U' + true, // [ 86] 'V' = Val + true, // [ 87] 'W' = Trp + false, // [ 88] 'X' + true, // [ 89] 'Y' = Tyr + false, // [ 90] 'Z' + false, // [ 91] '[' + false, // [ 92] '\' + false, // [ 93] ']' + false, // [ 94] '^' + false, // [ 95] '_' + false, // [ 96] '`' + true, // [ 97] 'A' = Ala + false, // [ 98] 'B' + true, // [ 99] 'C' = Cys + true, // [100] 'D' = Asp + true, // [101] 'E' = Glu + true, // [102] 'F' = Phe + true, // [103] 'G' = Gly + true, // [104] 'H' = His + true, // [105] 'I' = Ile + false, // [106] 'J' + true, // [107] 'K' = Lys + true, // [108] 'L' = Leu + true, // [109] 'M' = Met + true, // [110] 'N' = Asn + false, // [111] 'O' + true, // [112] 'P' = Pro + true, // [113] 'Q' = Gln + true, // [114] 'R' = Arg + true, // [115] 'S' = Ser + true, // [116] 'T' = Thr + false, // [117] 'U' + true, // [118] 'V' = Val + true, // [119] 'W' = Trp + false, // [120] 'X' + true, // [121] 'Y' = Tyr + false, // [122] 'Z' + false, // [123] '{' + false, // [124] '|' + false, // [125] '}' + false, // [126] '~' + false, // [127] 0x7f + false, // [128] 0x80 + false, // [129] 0x81 + false, // [130] 0x82 + false, // [131] 0x83 + false, // [132] 0x84 + false, // [133] 0x85 + false, // [134] 0x86 + false, // [135] 0x87 + false, // [136] 0x88 + false, // [137] 0x89 + false, // [138] 0x8a + false, // [139] 0x8b + false, // [140] 0x8c + false, // [141] 0x8d + false, // [142] 0x8e + false, // [143] 0x8f + false, // [144] 0x90 
+ false, // [145] 0x91 + false, // [146] 0x92 + false, // [147] 0x93 + false, // [148] 0x94 + false, // [149] 0x95 + false, // [150] 0x96 + false, // [151] 0x97 + false, // [152] 0x98 + false, // [153] 0x99 + false, // [154] 0x9a + false, // [155] 0x9b + false, // [156] 0x9c + false, // [157] 0x9d + false, // [158] 0x9e + false, // [159] 0x9f + false, // [160] 0xa0 + false, // [161] 0xa1 + false, // [162] 0xa2 + false, // [163] 0xa3 + false, // [164] 0xa4 + false, // [165] 0xa5 + false, // [166] 0xa6 + false, // [167] 0xa7 + false, // [168] 0xa8 + false, // [169] 0xa9 + false, // [170] 0xaa + false, // [171] 0xab + false, // [172] 0xac + false, // [173] 0xad + false, // [174] 0xae + false, // [175] 0xaf + false, // [176] 0xb0 + false, // [177] 0xb1 + false, // [178] 0xb2 + false, // [179] 0xb3 + false, // [180] 0xb4 + false, // [181] 0xb5 + false, // [182] 0xb6 + false, // [183] 0xb7 + false, // [184] 0xb8 + false, // [185] 0xb9 + false, // [186] 0xba + false, // [187] 0xbb + false, // [188] 0xbc + false, // [189] 0xbd + false, // [190] 0xbe + false, // [191] 0xbf + false, // [192] 0xc0 + false, // [193] 0xc1 + false, // [194] 0xc2 + false, // [195] 0xc3 + false, // [196] 0xc4 + false, // [197] 0xc5 + false, // [198] 0xc6 + false, // [199] 0xc7 + false, // [200] 0xc8 + false, // [201] 0xc9 + false, // [202] 0xca + false, // [203] 0xcb + false, // [204] 0xcc + false, // [205] 0xcd + false, // [206] 0xce + false, // [207] 0xcf + false, // [208] 0xd0 + false, // [209] 0xd1 + false, // [210] 0xd2 + false, // [211] 0xd3 + false, // [212] 0xd4 + false, // [213] 0xd5 + false, // [214] 0xd6 + false, // [215] 0xd7 + false, // [216] 0xd8 + false, // [217] 0xd9 + false, // [218] 0xda + false, // [219] 0xdb + false, // [220] 0xdc + false, // [221] 0xdd + false, // [222] 0xde + false, // [223] 0xdf + false, // [224] 0xe0 + false, // [225] 0xe1 + false, // [226] 0xe2 + false, // [227] 0xe3 + false, // [228] 0xe4 + false, // [229] 0xe5 + false, // [230] 0xe6 + false, // [231] 
0xe7 + false, // [232] 0xe8 + false, // [233] 0xe9 + false, // [234] 0xea + false, // [235] 0xeb + false, // [236] 0xec + false, // [237] 0xed + false, // [238] 0xee + false, // [239] 0xef + false, // [240] 0xf0 + false, // [241] 0xf1 + false, // [242] 0xf2 + false, // [243] 0xf3 + false, // [244] 0xf4 + false, // [245] 0xf5 + false, // [246] 0xf6 + false, // [247] 0xf7 + false, // [248] 0xf8 + false, // [249] 0xf9 + false, // [250] 0xfa + false, // [251] 0xfb + false, // [252] 0xfc + false, // [253] 0xfd + false, // [254] 0xfe + false, // [255] 0xff + }; + +bool g_IsNucleoChar[256] = + { + false, // [ 0] 0x00 + false, // [ 1] 0x01 + false, // [ 2] 0x02 + false, // [ 3] 0x03 + false, // [ 4] 0x04 + false, // [ 5] 0x05 + false, // [ 6] 0x06 + false, // [ 7] 0x07 + false, // [ 8] 0x08 + false, // [ 9] 0x09 + false, // [ 10] 0x0a + false, // [ 11] 0x0b + false, // [ 12] 0x0c + false, // [ 13] 0x0d + false, // [ 14] 0x0e + false, // [ 15] 0x0f + false, // [ 16] 0x10 + false, // [ 17] 0x11 + false, // [ 18] 0x12 + false, // [ 19] 0x13 + false, // [ 20] 0x14 + false, // [ 21] 0x15 + false, // [ 22] 0x16 + false, // [ 23] 0x17 + false, // [ 24] 0x18 + false, // [ 25] 0x19 + false, // [ 26] 0x1a + false, // [ 27] 0x1b + false, // [ 28] 0x1c + false, // [ 29] 0x1d + false, // [ 30] 0x1e + false, // [ 31] 0x1f + false, // [ 32] ' ' + false, // [ 33] '!' + false, // [ 34] '"' + false, // [ 35] '#' + false, // [ 36] '$' + false, // [ 37] '%' + false, // [ 38] '&' + false, // [ 39] ''' + false, // [ 40] '(' + false, // [ 41] ')' + false, // [ 42] '*' + false, // [ 43] '+' + false, // [ 44] ',' + false, // [ 45] '-' + false, // [ 46] '.' 
+ false, // [ 47] '/' + false, // [ 48] '0' + false, // [ 49] '1' + false, // [ 50] '2' + false, // [ 51] '3' + false, // [ 52] '4' + false, // [ 53] '5' + false, // [ 54] '6' + false, // [ 55] '7' + false, // [ 56] '8' + false, // [ 57] '9' + false, // [ 58] ':' + false, // [ 59] ';' + false, // [ 60] '<' + false, // [ 61] '=' + false, // [ 62] '>' + false, // [ 63] '?' + false, // [ 64] '@' + true, // [ 65] 'A' (Nucleotide) + false, // [ 66] 'B' + true, // [ 67] 'C' (Nucleotide) + false, // [ 68] 'D' + false, // [ 69] 'E' + false, // [ 70] 'F' + true, // [ 71] 'G' (Nucleotide) + false, // [ 72] 'H' + false, // [ 73] 'I' + false, // [ 74] 'J' + false, // [ 75] 'K' + false, // [ 76] 'L' + false, // [ 77] 'M' + true, // [ 78] 'N' (Nucleotide) + false, // [ 79] 'O' + false, // [ 80] 'P' + false, // [ 81] 'Q' + false, // [ 82] 'R' + false, // [ 83] 'S' + true, // [ 84] 'T' (Nucleotide) + true, // [ 85] 'U' (Nucleotide) + false, // [ 86] 'V' + false, // [ 87] 'W' + false, // [ 88] 'X' + false, // [ 89] 'Y' + false, // [ 90] 'Z' + false, // [ 91] '[' + false, // [ 92] '\' + false, // [ 93] ']' + false, // [ 94] '^' + false, // [ 95] '_' + false, // [ 96] '`' + true, // [ 97] 'A' (Nucleotide) + false, // [ 98] 'B' + true, // [ 99] 'C' (Nucleotide) + false, // [100] 'D' + false, // [101] 'E' + false, // [102] 'F' + true, // [103] 'G' (Nucleotide) + false, // [104] 'H' + false, // [105] 'I' + false, // [106] 'J' + false, // [107] 'K' + false, // [108] 'L' + false, // [109] 'M' + true, // [110] 'N' (Nucleotide) + false, // [111] 'O' + false, // [112] 'P' + false, // [113] 'Q' + false, // [114] 'R' + false, // [115] 'S' + true, // [116] 'T' (Nucleotide) + true, // [117] 'U' (Nucleotide) + false, // [118] 'V' + false, // [119] 'W' + false, // [120] 'X' + false, // [121] 'Y' + false, // [122] 'Z' + false, // [123] '{' + false, // [124] '|' + false, // [125] '}' + false, // [126] '~' + false, // [127] 0x7f + false, // [128] 0x80 + false, // [129] 0x81 + false, // [130] 0x82 + 
false, // [131] 0x83 + false, // [132] 0x84 + false, // [133] 0x85 + false, // [134] 0x86 + false, // [135] 0x87 + false, // [136] 0x88 + false, // [137] 0x89 + false, // [138] 0x8a + false, // [139] 0x8b + false, // [140] 0x8c + false, // [141] 0x8d + false, // [142] 0x8e + false, // [143] 0x8f + false, // [144] 0x90 + false, // [145] 0x91 + false, // [146] 0x92 + false, // [147] 0x93 + false, // [148] 0x94 + false, // [149] 0x95 + false, // [150] 0x96 + false, // [151] 0x97 + false, // [152] 0x98 + false, // [153] 0x99 + false, // [154] 0x9a + false, // [155] 0x9b + false, // [156] 0x9c + false, // [157] 0x9d + false, // [158] 0x9e + false, // [159] 0x9f + false, // [160] 0xa0 + false, // [161] 0xa1 + false, // [162] 0xa2 + false, // [163] 0xa3 + false, // [164] 0xa4 + false, // [165] 0xa5 + false, // [166] 0xa6 + false, // [167] 0xa7 + false, // [168] 0xa8 + false, // [169] 0xa9 + false, // [170] 0xaa + false, // [171] 0xab + false, // [172] 0xac + false, // [173] 0xad + false, // [174] 0xae + false, // [175] 0xaf + false, // [176] 0xb0 + false, // [177] 0xb1 + false, // [178] 0xb2 + false, // [179] 0xb3 + false, // [180] 0xb4 + false, // [181] 0xb5 + false, // [182] 0xb6 + false, // [183] 0xb7 + false, // [184] 0xb8 + false, // [185] 0xb9 + false, // [186] 0xba + false, // [187] 0xbb + false, // [188] 0xbc + false, // [189] 0xbd + false, // [190] 0xbe + false, // [191] 0xbf + false, // [192] 0xc0 + false, // [193] 0xc1 + false, // [194] 0xc2 + false, // [195] 0xc3 + false, // [196] 0xc4 + false, // [197] 0xc5 + false, // [198] 0xc6 + false, // [199] 0xc7 + false, // [200] 0xc8 + false, // [201] 0xc9 + false, // [202] 0xca + false, // [203] 0xcb + false, // [204] 0xcc + false, // [205] 0xcd + false, // [206] 0xce + false, // [207] 0xcf + false, // [208] 0xd0 + false, // [209] 0xd1 + false, // [210] 0xd2 + false, // [211] 0xd3 + false, // [212] 0xd4 + false, // [213] 0xd5 + false, // [214] 0xd6 + false, // [215] 0xd7 + false, // [216] 0xd8 + false, // [217] 0xd9 
+ false, // [218] 0xda + false, // [219] 0xdb + false, // [220] 0xdc + false, // [221] 0xdd + false, // [222] 0xde + false, // [223] 0xdf + false, // [224] 0xe0 + false, // [225] 0xe1 + false, // [226] 0xe2 + false, // [227] 0xe3 + false, // [228] 0xe4 + false, // [229] 0xe5 + false, // [230] 0xe6 + false, // [231] 0xe7 + false, // [232] 0xe8 + false, // [233] 0xe9 + false, // [234] 0xea + false, // [235] 0xeb + false, // [236] 0xec + false, // [237] 0xed + false, // [238] 0xee + false, // [239] 0xef + false, // [240] 0xf0 + false, // [241] 0xf1 + false, // [242] 0xf2 + false, // [243] 0xf3 + false, // [244] 0xf4 + false, // [245] 0xf5 + false, // [246] 0xf6 + false, // [247] 0xf7 + false, // [248] 0xf8 + false, // [249] 0xf9 + false, // [250] 0xfa + false, // [251] 0xfb + false, // [252] 0xfc + false, // [253] 0xfd + false, // [254] 0xfe + false, // [255] 0xff + }; + +bool g_IsACGTU[256] = + { + false, // [ 0] 0x00 + false, // [ 1] 0x01 + false, // [ 2] 0x02 + false, // [ 3] 0x03 + false, // [ 4] 0x04 + false, // [ 5] 0x05 + false, // [ 6] 0x06 + false, // [ 7] 0x07 + false, // [ 8] 0x08 + false, // [ 9] 0x09 + false, // [ 10] 0x0a + false, // [ 11] 0x0b + false, // [ 12] 0x0c + false, // [ 13] 0x0d + false, // [ 14] 0x0e + false, // [ 15] 0x0f + false, // [ 16] 0x10 + false, // [ 17] 0x11 + false, // [ 18] 0x12 + false, // [ 19] 0x13 + false, // [ 20] 0x14 + false, // [ 21] 0x15 + false, // [ 22] 0x16 + false, // [ 23] 0x17 + false, // [ 24] 0x18 + false, // [ 25] 0x19 + false, // [ 26] 0x1a + false, // [ 27] 0x1b + false, // [ 28] 0x1c + false, // [ 29] 0x1d + false, // [ 30] 0x1e + false, // [ 31] 0x1f + false, // [ 32] ' ' + false, // [ 33] '!' + false, // [ 34] '"' + false, // [ 35] '#' + false, // [ 36] '$' + false, // [ 37] '%' + false, // [ 38] '&' + false, // [ 39] ''' + false, // [ 40] '(' + false, // [ 41] ')' + false, // [ 42] '*' + false, // [ 43] '+' + false, // [ 44] ',' + false, // [ 45] '-' + false, // [ 46] '.' 
+ false, // [ 47] '/' + false, // [ 48] '0' + false, // [ 49] '1' + false, // [ 50] '2' + false, // [ 51] '3' + false, // [ 52] '4' + false, // [ 53] '5' + false, // [ 54] '6' + false, // [ 55] '7' + false, // [ 56] '8' + false, // [ 57] '9' + false, // [ 58] ':' + false, // [ 59] ';' + false, // [ 60] '<' + false, // [ 61] '=' + false, // [ 62] '>' + false, // [ 63] '?' + false, // [ 64] '@' + true, // [ 65] 'A' (ACGT) + false, // [ 66] 'B' + true, // [ 67] 'C' (ACGT) + false, // [ 68] 'D' + false, // [ 69] 'E' + false, // [ 70] 'F' + true, // [ 71] 'G' (ACGT) + false, // [ 72] 'H' + false, // [ 73] 'I' + false, // [ 74] 'J' + false, // [ 75] 'K' + false, // [ 76] 'L' + false, // [ 77] 'M' + false, // [ 78] 'N' + false, // [ 79] 'O' + false, // [ 80] 'P' + false, // [ 81] 'Q' + false, // [ 82] 'R' + false, // [ 83] 'S' + true, // [ 84] 'T' (ACGT) + true, // [ 85] 'U' (ACGT) + false, // [ 86] 'V' + false, // [ 87] 'W' + false, // [ 88] 'X' + false, // [ 89] 'Y' + false, // [ 90] 'Z' + false, // [ 91] '[' + false, // [ 92] '\' + false, // [ 93] ']' + false, // [ 94] '^' + false, // [ 95] '_' + false, // [ 96] '`' + true, // [ 97] 'A' (ACGT) + false, // [ 98] 'B' + true, // [ 99] 'C' (ACGT) + false, // [100] 'D' + false, // [101] 'E' + false, // [102] 'F' + true, // [103] 'G' (ACGT) + false, // [104] 'H' + false, // [105] 'I' + false, // [106] 'J' + false, // [107] 'K' + false, // [108] 'L' + false, // [109] 'M' + false, // [110] 'N' + false, // [111] 'O' + false, // [112] 'P' + false, // [113] 'Q' + false, // [114] 'R' + false, // [115] 'S' + true, // [116] 'T' (ACGT) + true, // [117] 'U' (ACGT) + false, // [118] 'V' + false, // [119] 'W' + false, // [120] 'X' + false, // [121] 'Y' + false, // [122] 'Z' + false, // [123] '{' + false, // [124] '|' + false, // [125] '}' + false, // [126] '~' + false, // [127] 0x7f + false, // [128] 0x80 + false, // [129] 0x81 + false, // [130] 0x82 + false, // [131] 0x83 + false, // [132] 0x84 + false, // [133] 0x85 + false, // [134] 
0x86 + false, // [135] 0x87 + false, // [136] 0x88 + false, // [137] 0x89 + false, // [138] 0x8a + false, // [139] 0x8b + false, // [140] 0x8c + false, // [141] 0x8d + false, // [142] 0x8e + false, // [143] 0x8f + false, // [144] 0x90 + false, // [145] 0x91 + false, // [146] 0x92 + false, // [147] 0x93 + false, // [148] 0x94 + false, // [149] 0x95 + false, // [150] 0x96 + false, // [151] 0x97 + false, // [152] 0x98 + false, // [153] 0x99 + false, // [154] 0x9a + false, // [155] 0x9b + false, // [156] 0x9c + false, // [157] 0x9d + false, // [158] 0x9e + false, // [159] 0x9f + false, // [160] 0xa0 + false, // [161] 0xa1 + false, // [162] 0xa2 + false, // [163] 0xa3 + false, // [164] 0xa4 + false, // [165] 0xa5 + false, // [166] 0xa6 + false, // [167] 0xa7 + false, // [168] 0xa8 + false, // [169] 0xa9 + false, // [170] 0xaa + false, // [171] 0xab + false, // [172] 0xac + false, // [173] 0xad + false, // [174] 0xae + false, // [175] 0xaf + false, // [176] 0xb0 + false, // [177] 0xb1 + false, // [178] 0xb2 + false, // [179] 0xb3 + false, // [180] 0xb4 + false, // [181] 0xb5 + false, // [182] 0xb6 + false, // [183] 0xb7 + false, // [184] 0xb8 + false, // [185] 0xb9 + false, // [186] 0xba + false, // [187] 0xbb + false, // [188] 0xbc + false, // [189] 0xbd + false, // [190] 0xbe + false, // [191] 0xbf + false, // [192] 0xc0 + false, // [193] 0xc1 + false, // [194] 0xc2 + false, // [195] 0xc3 + false, // [196] 0xc4 + false, // [197] 0xc5 + false, // [198] 0xc6 + false, // [199] 0xc7 + false, // [200] 0xc8 + false, // [201] 0xc9 + false, // [202] 0xca + false, // [203] 0xcb + false, // [204] 0xcc + false, // [205] 0xcd + false, // [206] 0xce + false, // [207] 0xcf + false, // [208] 0xd0 + false, // [209] 0xd1 + false, // [210] 0xd2 + false, // [211] 0xd3 + false, // [212] 0xd4 + false, // [213] 0xd5 + false, // [214] 0xd6 + false, // [215] 0xd7 + false, // [216] 0xd8 + false, // [217] 0xd9 + false, // [218] 0xda + false, // [219] 0xdb + false, // [220] 0xdc + false, // 
[221] 0xdd + false, // [222] 0xde + false, // [223] 0xdf + false, // [224] 0xe0 + false, // [225] 0xe1 + false, // [226] 0xe2 + false, // [227] 0xe3 + false, // [228] 0xe4 + false, // [229] 0xe5 + false, // [230] 0xe6 + false, // [231] 0xe7 + false, // [232] 0xe8 + false, // [233] 0xe9 + false, // [234] 0xea + false, // [235] 0xeb + false, // [236] 0xec + false, // [237] 0xed + false, // [238] 0xee + false, // [239] 0xef + false, // [240] 0xf0 + false, // [241] 0xf1 + false, // [242] 0xf2 + false, // [243] 0xf3 + false, // [244] 0xf4 + false, // [245] 0xf5 + false, // [246] 0xf6 + false, // [247] 0xf7 + false, // [248] 0xf8 + false, // [249] 0xf9 + false, // [250] 0xfa + false, // [251] 0xfb + false, // [252] 0xfc + false, // [253] 0xfd + false, // [254] 0xfe + false, // [255] 0xff + }; + +float g_AminoFreqs[20] = + { + 0.0777f, // 'A' = Ala + 0.0161f, // 'C' = Cys + 0.0527f, // 'D' = Asp + 0.0631f, // 'E' = Glu + 0.0417f, // 'F' = Phe + 0.0718f, // 'G' = Gly + 0.0238f, // 'H' = His + 0.0606f, // 'I' = Ile + 0.0601f, // 'K' = Lys + 0.0906f, // 'L' = Leu + 0.0233f, // 'M' = Met + 0.0439f, // 'N' = Asn + 0.0456f, // 'P' = Pro + 0.0368f, // 'Q' = Gln + 0.0526f, // 'R' = Arg + 0.0639f, // 'S' = Ser + 0.0570f, // 'T' = Thr + 0.0712f, // 'V' = Val + 0.0134f, // 'W' = Trp + 0.0339f, // 'Y' = Tyr + }; diff --git a/alpha.h b/alpha.h new file mode 100644 index 0000000..e021b7f --- /dev/null +++ b/alpha.h @@ -0,0 +1,50 @@ +#ifndef alpha_h +#define alpha_h + +#include +#include + +using namespace std; + +const unsigned INVALID_LETTER = 0; +const unsigned char INVALID_CHAR = '?'; + +extern unsigned g_CharToLetterAmino[]; +extern unsigned g_CharToLetterAminoStop[]; +extern unsigned char g_LetterToCharAmino[]; +extern unsigned g_CharToLetterNucleo[]; +extern unsigned char g_LetterToCharNucleo[]; +extern unsigned g_CodonWordToAminoLetter[]; +extern char g_CodonWordToAminoChar[]; +extern unsigned char g_CharToCompChar[]; +extern unsigned g_CharToCompLetter[]; +extern bool 
g_IsAminoChar[]; +extern bool g_IsNucleoChar[]; +extern bool g_IsACGTU[]; +extern float g_AminoFreqs[]; + +extern unsigned g_CharToLetterRed[]; +extern unsigned char g_LetterToCharRed[]; +extern unsigned g_RedAlphaSize; + +void LogRedAlphaRed(); +void ReadRedAlphaFromFile(const string &FileName); +unsigned char GetAminoCharFrom3NucChars(unsigned char c1, unsigned char c2, + unsigned char c3); + +static inline bool AminoLetterIsStartCodon(unsigned char Letter) + { + return Letter == 10; + } + +static inline bool AminoLetterIsStopCodon(unsigned char Letter) + { + return Letter == 20; + } + +const char *WordToStr(unsigned Word, unsigned WordLength, bool Nucleo); +const char *WordToStrNucleo(unsigned Word, unsigned WordLength); +const char *WordToStrAmino(unsigned Word, unsigned WordLength); +const char *WordToStrAmino2(unsigned Word, unsigned WordLength, char *Str); + +#endif // alpha_h diff --git a/alpha2.cpp b/alpha2.cpp new file mode 100644 index 0000000..26bc1c6 --- /dev/null +++ b/alpha2.cpp @@ -0,0 +1,100 @@ +#include "myutils.h" +#include "alpha.h" +#include "timing.h" + +bool isgap(byte c) + { + return c == '-' || c == '.'; + } + +const char *WordToStrAmino(unsigned Word, unsigned WordLength) + { + static char Str[32]; + for (unsigned i = 0; i < WordLength; ++i) + { + unsigned Letter = Word%20; + Str[WordLength-i-1] = g_LetterToCharAmino[Letter]; + Word /= 20; + } + Str[WordLength] = 0; + return Str; + } + +const char *WordToStrAmino2(unsigned Word, unsigned WordLength, char *Str) + { + for (unsigned i = 0; i < WordLength; ++i) + { + unsigned Letter = Word%20; + Str[WordLength-i-1] = g_LetterToCharAmino[Letter]; + Word /= 20; + } + Str[WordLength] = 0; + return Str; + } + +const char *WordToStrNucleo(unsigned Word, unsigned WordLength) + { + static char Str[32]; + for (unsigned i = 0; i < WordLength; ++i) + { + unsigned Letter = Word%4; + Str[WordLength-i-1] = g_LetterToCharNucleo[Letter]; + Word /= 4; + } + Str[WordLength] = 0; + return Str; + } + +const char 
*WordToStr(unsigned Word, unsigned WordLength, bool Nucleo) + { + return (Nucleo ? WordToStrNucleo : WordToStrAmino)(Word, WordLength); + } + +byte *RevCompAlloc(const byte *Seq, unsigned L) + { + byte *RCSeq = MYALLOC(byte, L, Alpha); + + for (unsigned i = 0; i < L; ++i) + RCSeq[L-i-1] = g_CharToCompChar[Seq[i]]; + + return RCSeq; + } + +void RevCompInPlace(byte *Seq, unsigned L) + { + unsigned L1 = L - 1; + unsigned L2 = L/2; + for (unsigned i = 0; i < L2; ++i) + { + unsigned j = L1 - i; + unsigned ci = Seq[i]; + unsigned cj = Seq[j]; + + unsigned ri = g_CharToCompChar[ci]; + unsigned rj = g_CharToCompChar[cj]; + + Seq[i] = rj; + Seq[j] = ri; + } + + if (L%2 == 1) + Seq[L2] = g_CharToCompChar[Seq[L2]]; + } + +void RevComp(const byte *Seq, unsigned L, byte *RCSeq) + { + for (unsigned i = 0; i < L; ++i) + RCSeq[L-i-1] = g_CharToCompChar[Seq[i]]; + } + +unsigned char GetAminoCharFrom3NucChars(unsigned char c1, unsigned char c2, + unsigned char c3) + { + unsigned Letter1 = g_CharToLetterNucleo[c1]; + unsigned Letter2 = g_CharToLetterNucleo[c2]; + unsigned Letter3 = g_CharToLetterNucleo[c3]; + unsigned Word = Letter1*(4*4) + Letter2*4 + Letter3; + + unsigned Letter = g_CodonWordToAminoLetter[Word]; + return g_LetterToCharAmino[Letter]; + } diff --git a/blastdb.cpp b/blastdb.cpp index 4f3047d..70349e5 100644 --- a/blastdb.cpp +++ b/blastdb.cpp @@ -77,8 +77,15 @@ vector BlastDB::findClosestSequences(Sequence* seq, int n) { // wordsize used in megablast. I'm sure we're sacrificing accuracy for speed, but anyother way would take way too // long. With this setting, it seems comparable in speed to the suffix tree approach. 
- string blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);; - blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName()); + string blastCommand; + #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) + + blastCommand = path + "blast/bin/blastall -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);; + blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName()); + #else + blastCommand = "\"" + path + "blast\\bin\\blastall\" -p blastn -d " + dbFileName + " -m 8 -W 28 -v " + toString(n) + " -b " + toString(n);; + blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName()); + #endif system(blastCommand.c_str()); ifstream m8FileHandle; @@ -133,7 +140,7 @@ vector BlastDB::findClosestMegaBlast(Sequence* seq, int n, int minPerID) { blastCommand = path + "blast/bin/megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName()); #else - blastCommand = path + "blast\\bin\\megablast -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn + blastCommand = "\"" + path + "blast\\bin\\megablast\" -e 1e-10 -d " + dbFileName + " -m 8 -b " + toString(n) + " -v " + toString(n); //-W 28 -p blastn blastCommand += (" -i " + (queryFileName+seq->getName()) + " -o " + blastFileName+seq->getName()); #endif @@ -206,7 +213,7 @@ void BlastDB::generateDB() { #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) formatdbCommand = path + "blast/bin/formatdb -p F -o T -i " + dbFileName; // format the database, -o option gives us the ability #else - formatdbCommand = path + "blast\\bin\\formatdb -p F -o T -i " + dbFileName; + formatdbCommand = "\"" + path + "blast\\bin\\formatdb\" 
-p F -o T -i " + dbFileName; #endif system(formatdbCommand.c_str()); // to get the right sequence names, i think. -p F // option tells formatdb that seqs are DNA, not prot diff --git a/chainer.h b/chainer.h new file mode 100644 index 0000000..a954dc0 --- /dev/null +++ b/chainer.h @@ -0,0 +1,79 @@ +#ifndef chainer_h +#define chainer_h + +#include "hsp.h" +#include "seq.h" +#include + +const float BAD_SCORE = -9e9f; + +struct TargetHit + { + unsigned TargetIndex; + unsigned TargetLo; + unsigned TargetHi; + int QueryFrame; + float RawScore; // SOMETIMES USED FOR BIT SCORE!!! +// unsigned TargetLength; + + void LogMe() const + { + Log("lo %u, hi %u, frame %d, score %.1f\n", + TargetLo, TargetHi, QueryFrame, RawScore); + } + }; + +struct ChainData + { + unsigned LastHSPIndex; + unsigned Ahi; + unsigned Bhi; + float Score; + }; + +class Chainer + { +public: + HSPData **m_HSPs; // memory owned elsewhere + unsigned m_HSPCount; + unsigned m_MaxHSPCount; + + BPData *m_BPs; + + unsigned *m_PrevHSPIndexes; // Predecessor in chain + float *m_HSPIndexToChainScore; + + list m_Chains; // Live HSP indexes + +public: + Chainer(); + ~Chainer(); + void Reset(); + void Clear(bool ctor = false); + float Chain(HSPData **HSPs, unsigned HSPCount, HSPData **OptChain, + unsigned &OptChainLength); + bool ResolveOverlaps(const SeqData &SA, const SeqData &SB, double MinScore, + const float * const *SubstMx, HSPData **InHSPs, unsigned InHSPCount, + HSPData **OutHSPs, unsigned &OutHSPCount); + void ResolveOverlap(HSPData &HSP1, HSPData &HSP2); + + float ChainBrute(HSPData **HSPs, unsigned HSPCount, HSPData **OptChain, + unsigned &OptChainLength); + void LogMe() const; + void LogHSPs(HSPData **HSPs, unsigned HSPCount) const; + void LogBPs() const; + + static bool IsValidChain(HSPData **HSPs, unsigned HSPCount); + static void AssertValidChain(HSPData **HSPs, unsigned HSPCount); + static void LogChain(HSPData **HSPs, unsigned HSPCount); + static void LogChain2(HSPData **HSPs, unsigned HSPCount); + 
static float GetChainScore(HSPData **HSPs, unsigned HSPCount); + +private: + void AllocHSPCount(unsigned MaxHSPCount); + void SetBPs(); + void SortBPs(); + unsigned FindBestChainLT(unsigned Ahi, unsigned Bhi); + }; + +#endif // chainer_h diff --git a/chime.h b/chime.h new file mode 100644 index 0000000..1b0662a --- /dev/null +++ b/chime.h @@ -0,0 +1,104 @@ +#ifndef chime_h +#define chime_h + +#include "seq.h" + +struct ChimeHit2 + { + string QLabel; + string ALabel; + string BLabel; + string Q3; + string A3; + string B3; + + //unsigned LY, LN, LA, LD; + //unsigned RY, RN, RA, RD; + double PctIdQT, PctIdQA, PctIdQB, PctIdQM, PctIdAB; + + unsigned ColLo; + unsigned ColXLo; + unsigned ColXHi; + unsigned ColHi; + unsigned QXLo; + unsigned QXHi; + + double Div; + double Score; + double H; + + unsigned CS_LY, CS_LN, CS_LA, CS_RY, CS_RN, CS_RA; + + float AbQ; + float AbA; + float AbB; + + ChimeHit2() + { + Clear(); + } + + void Clear() + { + Q3.clear(); + A3.clear(); + B3.clear(); + QLabel.clear(); + ALabel.clear(); + BLabel.clear(); + + //LY = LN = LA = LD = UINT_MAX; + //RY = RN = RA = RD = UINT_MAX; + ColLo = ColHi = QXLo = QXHi = ColXLo = ColXHi = UINT_MAX; + CS_LY = CS_LN = CS_LA = CS_RY = CS_RN = CS_RA = UINT_MAX; + PctIdQT = PctIdQA = PctIdQB = PctIdQM = PctIdAB = -1.0; + Div = -1.0; + H = -1.0; + Score = -1.0; + AbQ = AbA = AbB = -1.0f; + }; + + bool Accept() const + { + return Score >= opt_minh && Div >= opt_mindiv && CS_LY >= opt_mindiffs && CS_RY >= opt_mindiffs; + } + + void LogMe() const + { + Log("@L %c ", yon(Score >= 1.0 && Div >= 1.0)); + Log(" %.4f", Score); + Log(" LY %u LN %u LA %u", CS_LY, CS_LN, CS_LA); + Log(" RY %u RN %u RA %u", CS_RY, CS_RN, CS_RA); + Log(" Div %.1f%%", Div); + Log(" Q=%s", QLabel.c_str()); + Log(" A=%s", ALabel.c_str()); + Log(" B=%s", BLabel.c_str()); + Log(" QA %.1f%% QB=%.1f%% AB=%.1f%% QM=%.1f%%", PctIdQA, PctIdQB, PctIdAB, PctIdQM); + Log("\n"); + } + + bool operator<(const ChimeHit2 &rhs) const + { + if (Score == rhs.Score) 
+ return Div > rhs.Div; + return Score > rhs.Score; + } + }; + +static inline bool isacgt(char c) + { + return c == 'A' || c == 'C' || c == 'G' || c == 'T'; + } + +static bool inline isgap(char c) + { + return c == '-' || c == '.'; + } + +void GetChunkInfo(unsigned L, unsigned &Length, vector &Los); +float GetAbFromLabel(const string &Label); +void WriteChimeHitCS(FILE *f, const ChimeHit2 &Hit); +void WriteChimeHit(FILE *f, const ChimeHit2 &Hit); +void WriteChimeFileHdr(FILE *f); + +#endif // chime_h diff --git a/chimerauchimecommand.cpp b/chimerauchimecommand.cpp index 58e6781..abd699e 100644 --- a/chimerauchimecommand.cpp +++ b/chimerauchimecommand.cpp @@ -605,6 +605,9 @@ int ChimeraUchimeCommand::createProcesses(string outputFileName, string filename remove((accnos + toString(processIDS[i]) + ".temp").c_str()); } #endif + //get rid of the file pieces. + for (int i = 0; i < files.size(); i++) { remove(files[i].c_str()); } + return num; } catch(exception& e) { diff --git a/chimerauchimecommand.h b/chimerauchimecommand.h new file mode 100644 index 0000000..36e4a39 --- /dev/null +++ b/chimerauchimecommand.h @@ -0,0 +1,58 @@ +#ifndef CHIMERAUCHIMECOMMAND_H +#define CHIMERAUCHIMECOMMAND_H + + +/* + * chimerauchimecommand.h + * Mothur + * + * Created by westcott on 5/13/11. + * Copyright 2011 Schloss Lab. All rights reserved. 
+ * + */ + +#include "mothur.h" +#include "command.hpp" + +/***********************************************************/ + +class ChimeraUchimeCommand : public Command { +public: + ChimeraUchimeCommand(string); + ChimeraUchimeCommand(); + ~ChimeraUchimeCommand() {} + + vector setParameters(); + string getCommandName() { return "chimera.uchime"; } + string getCommandCategory() { return "Sequence Processing"; } + string getHelpString(); + string getCitation() { return "http://drive5.com/uchime/ \nhttp://www.mothur.org/wiki/Chimera.uchime"; } + + + int execute(); + void help() { m->mothurOut(getHelpString()); } + +private: + vector processIDS; //processid + int driver(string, string, string); + int createProcesses(string, string, string); + +#ifdef USE_MPI + int driverMPI(int, int, MPI_File&, MPI_File&, MPI_File&, MPI_File&, vector&); +#endif + + bool abort; + string fastafile, templatefile, outputDir, namefile; + int processors; + + vector outputNames; + vector fastaFileNames; + vector nameFileNames; + +}; + +/***********************************************************/ + +#endif + + diff --git a/collectsharedcommand.cpp b/collectsharedcommand.cpp index 46f49b2..d72dd71 100644 --- a/collectsharedcommand.cpp +++ b/collectsharedcommand.cpp @@ -57,7 +57,7 @@ vector CollectSharedCommand::setParameters(){ CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pfreq("freq", "Number", "", "100", "", "", "",false,false); parameters.push_back(pfreq); - CommandParameter pcalc("calc", "Multiple", 
"sharedchao-sharedsobs-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-whittaker-sharednseqs-ochiai-anderberg-skulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-structchi2-hamming-gower-memchi2-memchord-memeuclidean-mempearson", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan", "", "", "",true,false); parameters.push_back(pcalc); + CommandParameter pcalc("calc", "Multiple", "sharedchao-sharedsobs-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-whittaker-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-structchi2-hamming-gower-memchi2-memchord-memeuclidean-mempearson", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan", "", "", "",true,false); parameters.push_back(pcalc); CommandParameter pall("all", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pall); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); @@ -119,7 +119,7 @@ CollectSharedCommand::CollectSharedCommand(){ outputTypes["sharednseqs"] = tempOutNames; outputTypes["ochiai"] = tempOutNames; outputTypes["anderberg"] = tempOutNames; - outputTypes["skulczynski"] = tempOutNames; + outputTypes["kulczynski"] = tempOutNames; outputTypes["kulczynskicody"] = tempOutNames; outputTypes["lennon"] = tempOutNames; outputTypes["morisitahorn"] = tempOutNames; @@ -191,7 +191,7 @@ CollectSharedCommand::CollectSharedCommand(string option) { outputTypes["sharednseqs"] = tempOutNames; outputTypes["ochiai"] = 
tempOutNames; outputTypes["anderberg"] = tempOutNames; - outputTypes["skulczynski"] = tempOutNames; + outputTypes["kulczynski"] = tempOutNames; outputTypes["kulczynskicody"] = tempOutNames; outputTypes["lennon"] = tempOutNames; outputTypes["morisitahorn"] = tempOutNames; @@ -335,7 +335,7 @@ CollectSharedCommand::CollectSharedCommand(string option) { }else if (Estimators[i] == "anderberg") { cDisplays.push_back(new CollectDisplay(new Anderberg(), new SharedOneColumnFile(fileNameRoot+"anderberg"))); outputNames.push_back(fileNameRoot+"anderberg"); outputTypes["anderberg"].push_back(fileNameRoot+"anderberg"); - }else if (Estimators[i] == "skulczynski") { + }else if (Estimators[i] == "kulczynski") { cDisplays.push_back(new CollectDisplay(new Kulczynski(), new SharedOneColumnFile(fileNameRoot+"kulczynski"))); outputNames.push_back(fileNameRoot+"kulczynski"); outputTypes["kulczynski"].push_back(fileNameRoot+"kulczynski"); }else if (Estimators[i] == "kulczynskicody") { diff --git a/commandfactory.cpp b/commandfactory.cpp index 63e0690..681e3e5 100644 --- a/commandfactory.cpp +++ b/commandfactory.cpp @@ -69,6 +69,7 @@ #include "chimeraslayercommand.h" #include "chimerapintailcommand.h" #include "chimerabellerophoncommand.h" +#include "chimerauchimecommand.h" #include "setlogfilecommand.h" #include "phylodiversitycommand.h" #include "makegroupcommand.h" @@ -247,6 +248,7 @@ CommandFactory::CommandFactory(){ commands["chimera.ccode"] = "MPIEnabled"; commands["chimera.check"] = "MPIEnabled"; commands["chimera.slayer"] = "MPIEnabled"; + commands["chimera.uchime"] = "MPIEnabled"; commands["chimera.pintail"] = "MPIEnabled"; commands["chimera.bellerophon"] = "MPIEnabled"; commands["screen.seqs"] = "MPIEnabled"; @@ -351,6 +353,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString){ else if(commandName == "chimera.ccode") { command = new ChimeraCcodeCommand(optionString); } else if(commandName == "chimera.check") { command = new 
ChimeraCheckCommand(optionString); } else if(commandName == "chimera.slayer") { command = new ChimeraSlayerCommand(optionString); } + else if(commandName == "chimera.uchime") { command = new ChimeraUchimeCommand(optionString); } else if(commandName == "chimera.pintail") { command = new ChimeraPintailCommand(optionString); } else if(commandName == "chimera.bellerophon") { command = new ChimeraBellerophonCommand(optionString); } else if(commandName == "phylotype") { command = new PhylotypeCommand(optionString); } @@ -486,6 +489,7 @@ Command* CommandFactory::getCommand(string commandName, string optionString, str else if(commandName == "classify.seqs") { pipecommand = new ClassifySeqsCommand(optionString); } else if(commandName == "chimera.ccode") { pipecommand = new ChimeraCcodeCommand(optionString); } else if(commandName == "chimera.check") { pipecommand = new ChimeraCheckCommand(optionString); } + else if(commandName == "chimera.uchime") { pipecommand = new ChimeraUchimeCommand(optionString); } else if(commandName == "chimera.slayer") { pipecommand = new ChimeraSlayerCommand(optionString); } else if(commandName == "chimera.pintail") { pipecommand = new ChimeraPintailCommand(optionString); } else if(commandName == "chimera.bellerophon") { pipecommand = new ChimeraBellerophonCommand(optionString); } @@ -610,6 +614,7 @@ Command* CommandFactory::getCommand(string commandName){ else if(commandName == "chimera.ccode") { shellcommand = new ChimeraCcodeCommand(); } else if(commandName == "chimera.check") { shellcommand = new ChimeraCheckCommand(); } else if(commandName == "chimera.slayer") { shellcommand = new ChimeraSlayerCommand(); } + else if(commandName == "chimera.uchime") { shellcommand = new ChimeraUchimeCommand(); } else if(commandName == "chimera.pintail") { shellcommand = new ChimeraPintailCommand(); } else if(commandName == "chimera.bellerophon") { shellcommand = new ChimeraBellerophonCommand(); } else if(commandName == "phylotype") { shellcommand = new 
PhylotypeCommand(); } diff --git a/diagbox.h b/diagbox.h new file mode 100644 index 0000000..0c5846c --- /dev/null +++ b/diagbox.h @@ -0,0 +1,193 @@ +#ifndef diagbox_h +#define diagbox_h + +struct DiagBox; + +void GetDiagBox(unsigned LA, unsigned LB, unsigned DiagLo, unsigned DiagHi, DiagBox &Box); +void GetDiagRange(unsigned LA, unsigned LB, unsigned d, + unsigned &mini, unsigned &minj, unsigned &maxi, unsigned &maxj); +void GetDiagLoHi(unsigned LA, unsigned LB, const char *Path, + unsigned &dlo, unsigned &dhi); + +struct DiagBox + { + DiagBox() + { + } + + DiagBox(unsigned LA_, unsigned LB_, unsigned DiagLo, unsigned DiagHi) + { + //GetDiagBox(LA, LB, DiagLo, DiagHi, *this); + //Validate(); + Init(LA_, LB_, DiagLo, DiagHi); + } + + void Init(unsigned LA_, unsigned LB_, unsigned DiagLo, unsigned DiagHi) + { + GetDiagBox(LA_, LB_, DiagLo, DiagHi, *this); + Validate(); + } + + unsigned LA; + unsigned LB; + + unsigned dlo; + unsigned dhi; + + unsigned dlo_mini; + unsigned dlo_minj; + + unsigned dlo_maxi; + unsigned dlo_maxj; + + unsigned dhi_mini; + unsigned dhi_minj; + + unsigned dhi_maxi; + unsigned dhi_maxj; + + unsigned GetDiag(unsigned i, unsigned j) const + { + return LA - i + j; + } + +// i, j are positions 0..LA-1, 0..LB-1. + bool InBox(unsigned i, unsigned j) const + { + unsigned d = GetDiag(i, j); + return d >= dlo && d <= dhi; + } + +/*** +i, j are 0-based prefix lengths 0..LA, 0..LB. + +A full path is in the box iff all match pairs are in the box. + +A partial path that aligns a prefix of A to a prefix of B as +in D.P.) is in the box iff it is is the prefix of at least +one full path that is in the box. + +A D.P. matrix entry X[i][j] is in the box iff there is at +least one full path aligning the first i letters of A and +the first j letters of B ending in a column of type X, i.e. +if there exists a partial path in the box that ends in X. + +Assume terminals appear in all paths, and DI/ID forbidden. 
+ +Intuitively seems that by these definitions D is in box iff +DM or MD is in box, I is in box iff IM or MI is in box. +Don't have proof.. +***/ + bool InBoxDPM(unsigned i, unsigned j) const + { + // Special case for M[0][0] + if (i == 0 && j == 0) + return true; + if (i == 0 || j == 0) + return false; + unsigned d = GetDiag(i-1, j-1); + return d >= dlo && d <= dhi; + } + + bool InBoxDPD(unsigned i, unsigned j) const + { + bool MD = i == 0 ? false : InBoxDPM(i-1, j); + bool DM = (i == LA || j == LB) ? false : InBoxDPM(i+1, j+1); + return MD || DM; + } + + bool InBoxDPI(unsigned i, unsigned j) const + { + bool MI = j == 0 ? false : InBoxDPM(i, j-1); + bool IM = (i == LA || j == LB) ? false : InBoxDPM(i+1, j+1); + return MI || IM; + } + + // d = LA - i + j = 1 .. LA+LB-1 + void Validate() const + { + asserta(dlo <= dhi); + asserta(dlo >= GetDiag(LA-1, 0)); + asserta(dhi <= GetDiag(0, LB-1)); + + asserta(GetDiag(dlo_mini, dlo_minj) == dlo); + asserta(GetDiag(dlo_maxi, dlo_maxj) == dlo); + asserta(GetDiag(dhi_mini, dhi_minj) == dhi); + asserta(GetDiag(dhi_maxi, dhi_maxj) == dhi); + + asserta(dlo_mini >= dhi_mini); + asserta(dlo_minj <= dhi_minj); + asserta(dlo_maxi >= dhi_maxi); + asserta(dlo_maxj <= dhi_maxj); + } + + unsigned GetMini() const + { + return dhi_mini; + } + + unsigned GetMaxi() const + { + return dlo_maxi; + } + + unsigned GetMinj() const + { + return dlo_minj; + } + + unsigned GetMaxj() const + { + return dhi_maxj; + } +/*** + i = 0..LA-1 + j = 0..LB-1 + d = LA - i + j = 1 .. 
LA+LB-1 + j = d - LA + i + i = LA - d + j +***/ + void GetRange_j(unsigned i, unsigned &Startj, unsigned &Endj) const + { + // j = d - LA + i + if (dlo + i >= LA) + Startj = dlo + i - LA; + else + Startj = 0; + + if (Startj >= LB) + Startj = LB - 1; + + if (dhi + i + 1 >= LA) + Endj = dhi + i + 1 - LA; + else + Endj = 0; + + if (Endj > LB) + Endj = LB; + + asserta(Endj >= Startj); + } + + void LogMe() const + { + Log("LA=%u LB=%d dlo(%u): (%u,%u)-(%u,%u) dhi(%u): (%u,%u)-(%u,%u) i=[%u-%u] j=[%u-%u]\n", + LA, LB, + dlo, + dlo_mini, dlo_minj, + dlo_maxi, dlo_maxj, + dhi, + dhi_mini, dhi_minj, + dhi_maxi, dhi_maxj, + GetMini(), GetMaxi(), + GetMinj(), GetMaxj()); + } + }; + +typedef const char *(*NWDIAG)(const byte *A, unsigned LA, const byte *B, unsigned LB, + unsigned DiagLo, unsigned DiagHi, bool LeftTerm, bool RightTerm); + +const char *NWBandWrap(NWDIAG NW, const byte *A, unsigned LA, const byte *B, unsigned LB, + unsigned DiagLo, unsigned DiagHi, bool LeftTerm, bool RightTerm); + +#endif // diagbox_h diff --git a/dp.h b/dp.h new file mode 100644 index 0000000..c771538 --- /dev/null +++ b/dp.h @@ -0,0 +1,164 @@ +#ifndef dp_h +#define dp_h + +#define SAVE_FAST 0 + +#include "myutils.h" +#include "mx.h" +#include "seqdb.h" +#include "diagbox.h" +#include "path.h" +#include "alnparams.h" +#include "alnheuristics.h" +#include "hspfinder.h" + +typedef void (*OnPathFn)(const string &Path, bool Full); + +enum XType + { + XType_Full=1, + XType_Fwd=2, + XType_Bwd=3, + }; + +// public +float ViterbiBrute(const byte *A, unsigned LA, const byte *B, unsigned LB, + unsigned DiagLo, unsigned DiagHi, const AlnParams &AP, PathData &PD); + +float ViterbiSimple(const byte *A, unsigned LA, const byte *B, unsigned LB, + const AlnParams &AP, PathData &PD); + +float ViterbiSimpleBand(const byte *A, unsigned LA, const byte *B, unsigned LB, + const AlnParams &AP, unsigned DiagLo, unsigned DiagHi, PathData &PD); + +float ViterbiFast(const byte *A, unsigned LA, const byte *B, unsigned LB, 
+ const AlnParams &AP, PathData &PD); + +float ViterbiFastBand(const byte *A, unsigned LA, const byte *B, unsigned LB, + unsigned DiagLo, unsigned DiagHi, const AlnParams &AP, PathData &PD); + +float ViterbiFastMainDiag(const byte *A, unsigned LA, const byte *B, unsigned LB, + unsigned BandRadius, const AlnParams &AP, PathData &PD); + +float XDropFwdSimple(const byte *A, unsigned LA, const byte *B, unsigned LB, + const AlnParams &AP, float XDrop, unsigned &Leni, unsigned &Lenj, PathData &PD); + +float XDropBwdSimple(const byte *A, unsigned LA, const byte *B, unsigned LB, + const AlnParams &AP, float XDrop, unsigned &Leni, unsigned &Lenj, PathData &PD); + +float XDropFwdFast(const byte *A, unsigned LA, const byte *B, unsigned LB, + const AlnParams &AP, float XDrop, unsigned &Leni, unsigned &Lenj, PathData &PD); + +float XDropBwdFast(const byte *A, unsigned LA, const byte *B, unsigned LB, + const AlnParams &AP, float XDrop, unsigned &Leni, unsigned &Lenj, PathData &PD); + +void XDropAlign(const byte *A, unsigned LA, const byte *B, unsigned LB, + unsigned AncLoi, unsigned AncLoj, unsigned AncLen, const AlnParams &AP, + float XDrop, HSPData &HSP, PathData &PD); + +float SWSimple(const byte *A, unsigned LA, const byte *B, unsigned LB, + const AlnParams &AP, unsigned &Loi, unsigned &Leni, unsigned &Lenj, + unsigned &Hij, PathData &PD); + +float SWFast(const byte *A, unsigned LA, const byte *B, unsigned LB, + const AlnParams &AP, unsigned &Loi, unsigned &Leni, unsigned &Lenj, + unsigned &Hij, PathData &PD); + +void SWFast2(const SeqData &SA, const SeqData &SB, const AlnParams &AP, + HSPData &HSP, PathData &PD); + +void SWSimple2(const SeqData &SA, const SeqData &SB, const AlnParams &AP, + HSPData &HSP, PathData &PD); + +float SWUngapped(const byte *A, unsigned LA, const byte *B, unsigned LB, + const float * const *SubstMx, unsigned &LoA, unsigned &LoB, unsigned &Len); + +void SWUngapped2(const SeqData &SA, const SeqData &SB, const AlnParams &AP, + HSPData &HSP); + +float 
SWFastNTB(const byte *A, unsigned LA, const byte *B, unsigned LB, + const AlnParams &AP); + +void GlobalAlignBand(const byte *A, unsigned LA, const byte *B, unsigned LB, + const AlnParams &AP, unsigned BandRadius, PathData &PD); + +bool GlobalAlign(const SeqData &Query, const SeqData &Target, const AlnParams &AP, + const AlnHeuristics &AH, HSPFinder &HF, float MinFractId, float &HSPFractId, + PathData &PD); + +bool GlobalAlign(const SeqData &Query, const SeqData &Target, string &Path); + +void GetBruteMxs(Mx **M, Mx **D, Mx **I); +void GetSimpleDPMxs(Mx **M, Mx **D, Mx **I); +void GetSimpleBandMxs(Mx **M, Mx **D, Mx **I); +void GetXDropFwdSimpleDPMxs(Mx **M, Mx **D, Mx **I); +#if SAVE_FAST +void GetFastMxs(Mx **M, Mx **D, Mx **I); +void GetFastBandMxs(Mx **M, Mx **D, Mx **I); +#endif + +// private +void TraceBackBit(unsigned LA, unsigned LB, char State, PathData &PD); +void TraceBackBitSW(unsigned LA, unsigned LB, unsigned Besti, unsigned Bestj, + unsigned &Leni, unsigned &Lenj, PathData &PD); +void EnumPaths(unsigned L1, unsigned L2, bool SubPaths, OnPathFn OnPath); +void AllocBit(unsigned LA, unsigned LB); + +const byte TRACEBITS_DM = 0x01; +const byte TRACEBITS_IM = 0x02; +const byte TRACEBITS_MD = 0x04; +const byte TRACEBITS_MI = 0x08; +const byte TRACEBITS_SM = 0x10; +const byte TRACEBITS_UNINIT = ~0x1f; + +extern Mx g_Mx_TBBit; +extern float *g_DPRow1; +extern float *g_DPRow2; +extern byte **g_TBBit; + +static inline void Max_xM(float &Score, float MM, float DM, float IM, byte &State) + { + Score = MM; + State = 'M'; + + if (DM > Score) + { + Score = DM; + State = 'D'; + } + if (IM > Score) + { + Score = IM; + State = 'I'; + } + } + +static inline void Max_xD(float &Score, float MD, float DD, byte &State) + { + if (MD >= DD) + { + Score = MD; + State = 'M'; + } + else + { + Score = DD; + State = 'D'; + } + } + +static inline void Max_xI(float &Score, float MI, float II, byte &State) + { + if (MI >= II) + { + Score = MI; + State = 'M'; + } + else + { + Score = 
II; + State = 'I'; + } + } + +#endif // dp_h diff --git a/evalue.h b/evalue.h new file mode 100644 index 0000000..c9308db --- /dev/null +++ b/evalue.h @@ -0,0 +1,25 @@ +#ifndef evalue_h +#define evalue_h + +#include + +void SetKarlin(double GappedLambda, double UngappedLambda, + double GappedK, double UngappedK, double DBLength);\ + +double GetKarlinDBLength(); +void SetKarlinDBLength(double DBLength); +void LogKarlin(); +void SetKarlinAmino(double DBLength); +void SetKarlinNucleo(double DBLength); +void SetKarlin(double DBLength, bool Nucleo); +double ComputeBitScoreGapped(double Score); +double ComputeBitScoreUngapped(double Score); +double ComputeEvalueGapped(double Score, unsigned QueryLength); +double ComputeEvalueUngapped(double Score, unsigned QueryLength); +double ComputeMinScoreGivenEvalueAGapped(double Evalue, unsigned Area); +double ComputeMinScoreGivenEvalueAUngapped(double Evalue, unsigned Area); +double ComputeMinScoreGivenEvalueQGapped(double Evalue, unsigned QueryLength); +double ComputeMinScoreGivenEvalueQUngapped(double Evalue, unsigned QueryLength); +double ComputeEvalueGappedFromBitScore(double BitScore, unsigned QueryLength); + +#endif // evalue_h diff --git a/fractid.cpp b/fractid.cpp new file mode 100644 index 0000000..f298877 --- /dev/null +++ b/fractid.cpp @@ -0,0 +1,449 @@ +#include "myutils.h" +#include "alpha.h" + +//unsigned g_MaxL = 0; + +static bool *g_IsChar = g_IsAminoChar; + +// Term gaps allowed in query (A) only +static double GetFractIdGivenPathDerep(const byte *A, const byte *B, const char *Path, + char *ptrDesc) + { + if (*Path == 'D') + { + if (ptrDesc != 0) + sprintf(ptrDesc, "(term gap in Query)"); + return 0; + } + + const char *LastM = 0; + for (const char *p = Path; *p; ++p) + if (*p == 'M') + LastM = p; + + unsigned PosA = 0; + unsigned PosB = 0; + unsigned Ids = 0; + unsigned Diffs = 0; + unsigned Cols = 0; + for (const char *p = Path; *p && p != LastM; ++p) + { + ++Cols; + char c = *p; + if (c == 'M') + { + byte a = 
toupper(A[PosA]); + byte b = toupper(B[PosB]); + if (g_IsChar[a] && g_IsChar[b]) + { + if (a == b) + ++Ids; + else + ++Diffs; + } + else + --Cols; + } + if (c == 'D' || c == 'I') + ++Diffs; + if (c == 'M' || c == 'D') + ++PosA; + if (c == 'M' || c == 'I') + ++PosB; + } + + double FractId = (Cols == 0 ? 0.0 : 1.0 - double(Diffs)/double(Cols)); + if (ptrDesc != 0) + sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols); + return FractId; + } + +static double GetFractIdGivenPathAllDiffs(const byte *A, const byte *B, const char *Path, + char *ptrDesc) + { + unsigned PosA = 0; + unsigned PosB = 0; + unsigned Ids = 0; + unsigned Diffs = 0; + unsigned Cols = 0; + for (const char *p = Path; *p; ++p) + { + ++Cols; + char c = *p; + if (c == 'M') + { + byte a = toupper(A[PosA]); + byte b = toupper(B[PosB]); + if (g_IsChar[a] && g_IsChar[b]) + { + if (a == b) + ++Ids; + else + ++Diffs; + } + else + --Cols; + } + if (c == 'D' || c == 'I') + ++Diffs; + if (c == 'M' || c == 'D') + ++PosA; + if (c == 'M' || c == 'I') + ++PosB; + } + + double FractId = (Cols == 0 ? 
0.0 : 1.0 - double(Diffs)/double(Cols)); + if (ptrDesc != 0) + sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols); + return FractId; + } + +static double GetFractIdGivenPathInternalDiffs(const byte *A, const byte *B, + const char *Path, char *ptrDesc) + { + unsigned i = 0; + unsigned FirstM = UINT_MAX; + unsigned LastM = UINT_MAX; + for (const char *p = Path; *p; ++p) + { + if (*p == 'M') + { + if (FirstM == UINT_MAX) + FirstM = i; + LastM = i; + } + ++i; + } + if (FirstM == UINT_MAX) + { + if (ptrDesc != 0) + strcpy(ptrDesc, "(no matches)"); + return 0.0; + } + + unsigned PosA = 0; + unsigned PosB = 0; + unsigned Ids = 0; + unsigned Diffs = 0; + unsigned Cols = 0; + for (unsigned i = 0; i < FirstM; ++i) + { + char c = Path[i]; + if (c == 'M' || c == 'D') + ++PosA; + if (c == 'M' || c == 'I') + ++PosB; + } + + for (unsigned i = FirstM; i <= LastM; ++i) + { + ++Cols; + char c = Path[i]; + if (c == 'M') + { + byte a = toupper(A[PosA]); + byte b = toupper(B[PosB]); + if (g_IsChar[a] && g_IsChar[b]) + { + if (a == b) + ++Ids; + else + ++Diffs; + } + else + --Cols; + } + if (c == 'D' || c == 'I') + ++Diffs; + if (c == 'M' || c == 'D') + ++PosA; + if (c == 'M' || c == 'I') + ++PosB; + } + + double FractId = (Cols == 0 ? 0.0 : 1.0 - double(Diffs)/double(Cols)); + if (ptrDesc != 0) + sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols); + return FractId; + } + +static double GetFractIdGivenPathMBL(const byte *A, const byte *B, const char *Path, + char *ptrDesc) + { + unsigned PosA = 0; + unsigned PosB = 0; + unsigned Mismatches = 0; + unsigned Gaps = 0; + for (const char *p = Path; *p; ++p) + { + char c = *p; + if (c == 'M' && toupper(A[PosA]) != toupper(B[PosB])) + ++Mismatches; + if (c == 'D' || c == 'I' && (p == Path || p[-1] == 'M')) + ++Gaps; + if (c == 'M' || c == 'D') + ++PosA; + if (c == 'M' || c == 'I') + ++PosB; + } + unsigned Diffs = Gaps + Mismatches; + double FractDiffs = (PosB == 0 ? 
0.0 : double(Diffs)/double(PosB)); + if (ptrDesc != 0) + sprintf(ptrDesc, "Gap opens %u, Id=1 - [(diffs=%u)/(target_length=%u)]", + Gaps, Diffs, PosB); + double FractId = 1.0 - FractDiffs; + if (FractId < 0.0) + return 0.0; + return FractId; + } + +static double GetFractIdGivenPathBLAST(const byte *A, const byte *B, const char *Path, + char *ptrDesc) + { + unsigned PosA = 0; + unsigned PosB = 0; + unsigned Ids = 0; + unsigned Wilds = 0; + unsigned Cols = 0; + for (const char *p = Path; *p; ++p) + { + ++Cols; + char c = *p; + if (c == 'M') + { + byte a = toupper(A[PosA]); + byte b = toupper(B[PosB]); + if (g_IsChar[a] && g_IsChar[b]) + { + if (a == b) + ++Ids; + } + else + ++Wilds; + } + if (c == 'M' || c == 'D') + ++PosA; + if (c == 'M' || c == 'I') + ++PosB; + } + asserta(Cols >= Wilds); + Cols -= Wilds; + double FractId = Cols == 0 ? 0.0f : float(Ids)/float(Cols); + if (ptrDesc != 0) + sprintf(ptrDesc, "(ids=%u/cols=%u)", Ids, Cols); + return FractId; + } + +static double GetFractIdGivenPathDefault(const byte *A, const byte *B, const char *Path, + char *ptrDesc) + { + unsigned PosA = 0; + unsigned PosB = 0; + unsigned Ids = 0; + unsigned Wilds = 0; + for (const char *p = Path; *p; ++p) + { + char c = *p; + if (c == 'M') + { + byte a = toupper(A[PosA]); + byte b = toupper(B[PosB]); + if (g_IsChar[a] && g_IsChar[b]) + { + if (a == b) + ++Ids; + } + else + ++Wilds; + } + if (c == 'M' || c == 'D') + ++PosA; + if (c == 'M' || c == 'I') + ++PosB; + } + unsigned MinLen = min(PosA, PosB) - Wilds; + double FractId = (MinLen == 0 ? 
0.0 : double(Ids)/double(MinLen)); + if (ptrDesc != 0) + sprintf(ptrDesc, "(ids=%u/shorter_length=%u)", Ids, MinLen); + return FractId; + } + +double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, + bool Nucleo, char *ptrDesc, unsigned IdDef) + { + if (Nucleo) + g_IsChar = g_IsACGTU; + else + g_IsChar = g_IsAminoChar; + + if (Path == 0) + { + if (ptrDesc != 0) + strcpy(ptrDesc, "(NULL path)"); + return 0.0; + } + + unsigned ColCount = (unsigned) strlen(Path); + if (ColCount == 0) + return 0.0; + + if (opt_leftjust) + { + if (Path[0] != 'M' || Path[ColCount-1] == 'D') + { + if (ptrDesc != 0) + strcpy(ptrDesc, "(leftjust)"); + return 0.0; + } + } + + if (opt_rightjust) + { + if (Path[0] == 'D' || Path[ColCount-1] != 'M') + { + if (ptrDesc != 0) + strcpy(ptrDesc, "(rightjust)"); + return 0.0; + } + } + + double FractId = 0.0; + //if (opt_idprefix > 0) + // { + // for (unsigned i = 0; i < opt_idprefix; ++i) + // { + // char c = Path[i]; + // if (c != 'M' || toupper(A[i]) != toupper(B[i])) + // { + // if (ptrDesc != 0) + // sprintf(ptrDesc, "Prefix ids %u < idprefix(%u)", + // i, opt_idprefix); + // return 0.0; + // } + // } + // } + + //if (opt_idsuffix > 0) + // { + // unsigned Cols = strlen(Path); + // for (unsigned i = 0; i < opt_idsuffix && i > Cols; ++i) + // { + // unsigned k = Cols - 1 - i; + // char c = Path[k]; + // if (c != 'M' || toupper(A[k]) != toupper(B[k])) + // { + // if (ptrDesc != 0) + // sprintf(ptrDesc, "Suffix ids %u < idsuffix(%u)", + // i, opt_idsuffix); + // return 0.0; + // } + // } + // } + + if (opt_maxqgap > 0 || opt_maxtgap > 0) + { + unsigned L = 0; + const char *LastM = 0; + for (const char *p = Path; *p; ++p) + if (*p == 'M') + LastM = p; + +// g_MaxL = 0; + for (const char *p = Path; *p && p != LastM; ++p) + { + char c = *p; + switch (c) + { + case 'M': + if (L > 0) + { + if (p[-1] == 'D') + { + if (L > opt_maxtgap) + { + if (ptrDesc != 0) + sprintf(ptrDesc, "(maxtgap)"); + return 0.0; + } + } + else if (p[-1] == 
'I') + { + if (L > opt_maxqgap) + { + if (ptrDesc != 0) + sprintf(ptrDesc, "(maxqgap)"); + return 0.0; + } + } + else + asserta(false); + } + L = 0; + break; + + case 'D': + case 'I': + ++L; + //if (L > g_MaxL) + // g_MaxL = L; + break; + + default: + asserta(false); + } + } + } + + switch (IdDef) + { + case 0: + FractId = GetFractIdGivenPathDefault(A, B, Path, ptrDesc); + break; + + case 1: + FractId = GetFractIdGivenPathAllDiffs(A, B, Path, ptrDesc); + break; + + case 2: + FractId = GetFractIdGivenPathInternalDiffs(A, B, Path, ptrDesc); + break; + + case 3: + FractId = GetFractIdGivenPathMBL(A, B, Path, ptrDesc); + break; + + case 4: + FractId = GetFractIdGivenPathBLAST(A, B, Path, ptrDesc); + break; + + case 5: + FractId = GetFractIdGivenPathDerep(A, B, Path, ptrDesc); + break; + + default: + Die("--iddef %u invalid", opt_iddef); + } + + return FractId; + } + +double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, + bool Nucleo, char *ptrDesc) + { + return GetFractIdGivenPath(A, B, Path, Nucleo, ptrDesc, opt_iddef); + } + +double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, bool Nucleo) + { + return GetFractIdGivenPath(A, B, Path, Nucleo, (char *) 0); + } + +double GetFractIdGivenPath(const byte *A, const byte *B, const string &Path) + { + return GetFractIdGivenPath(A, B, Path.c_str(), true); + } + +double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path) + { + return GetFractIdGivenPath(A, B, Path, true); + } diff --git a/getparents.cpp b/getparents.cpp new file mode 100644 index 0000000..d82f902 --- /dev/null +++ b/getparents.cpp @@ -0,0 +1,89 @@ +#include "myutils.h" +#include "chime.h" +#include "ultra.h" +#include + +void AddTargets(Ultra &U, const SeqData &Query, set &TargetIndexes); + +void GetChunkInfo(unsigned L, unsigned &Length, vector &Los) + { + Los.clear(); + + if (L <= opt_minchunk) + { + Length = L; + Los.push_back(0); + return; + } + + Length = (L - 1)/opt_chunks + 1; + if (Length < 
opt_minchunk) + Length = opt_minchunk; + + unsigned Lo = 0; + for (;;) + { + if (Lo + Length >= L) + { + Lo = L - Length - 1; + Los.push_back(Lo); + return; + } + Los.push_back(Lo); + Lo += Length; + } + } + +void GetCandidateParents(Ultra &U, const SeqData &QSD, float AbQ, + vector &Parents) + { + Parents.clear(); + + set TargetIndexes; + + unsigned QL = QSD.L; + + SeqData QuerySD = QSD; + + unsigned ChunkLength; + vector ChunkLos; + GetChunkInfo(QL, ChunkLength, ChunkLos); + unsigned ChunkCount = SIZE(ChunkLos); + for (unsigned ChunkIndex = 0; ChunkIndex < ChunkCount; ++ChunkIndex) + { + unsigned Lo = ChunkLos[ChunkIndex]; + asserta(Lo + ChunkLength <= QL); + + const byte *Chunk = QSD.Seq + Lo; + + // THIS MESSES UP --self!! + //char Prefix[32]; + //sprintf(Prefix, "%u|", Lo); + //string ChunkLabel = string(Prefix) + string(QSD.Label); + + //QuerySD.Label = ChunkLabel.c_str(); + QuerySD.Seq = Chunk; + QuerySD.L = ChunkLength; + + AddTargets(U, QuerySD, TargetIndexes); + + Lo += ChunkLength; + } + + for (set::const_iterator p = TargetIndexes.begin(); + p != TargetIndexes.end(); ++p) + { + unsigned TargetIndex = *p; + bool Accept = true; + if (AbQ > 0.0f) + { + const char *TargetLabel = U.GetSeedLabel(TargetIndex); + float AbT = GetAbFromLabel(string(TargetLabel)); + if (AbT > 0.0f && AbT < opt_abskew*AbQ) + Accept = false; + } + + if (Accept) + Parents.push_back(TargetIndex); + } + } diff --git a/globalalign2.cpp b/globalalign2.cpp new file mode 100644 index 0000000..6bb35a9 --- /dev/null +++ b/globalalign2.cpp @@ -0,0 +1,45 @@ +//#if UCHIMES + +#include "dp.h" +#include "seq.h" + +static AlnParams g_AP; +static bool g_APInitDone = false; + +bool GlobalAlign(const SeqData &Query, const SeqData &Target, PathData &PD) + { + if (!g_APInitDone) + { + g_AP.InitFromCmdLine(true); + g_APInitDone = true; + } + + ViterbiFast(Query.Seq, Query.L, Target.Seq, Target.L, g_AP, PD); + return true; + } + +bool GlobalAlign(const SeqData &Query, const SeqData &Target, string &Path) 
+ { + PathData PD; + GlobalAlign(Query, Target, PD); + Path = string(PD.Start); + return true; + } + +bool GlobalAlign(const SeqData &Query, const SeqData &Target, const AlnParams &/*AP*/, + const AlnHeuristics &AH, HSPFinder &/*HF*/, float /*MinFractId*/, float &/*HSPId*/, PathData &PD) + { + PD.Clear(); + string Path; + bool Found = GlobalAlign(Query, Target, Path); + if (!Found) + return false; + unsigned n = SIZE(Path); + PD.Alloc(n+1); + memcpy(PD.Front, Path.c_str(), n); + PD.Start = PD.Front; + PD.Start[n] = 0; + return true; + } + +//#endif // UCHIMES diff --git a/help.h b/help.h new file mode 100644 index 0000000..9d7a89f --- /dev/null +++ b/help.h @@ -0,0 +1,127 @@ +"\n" +"Usage\n" +"-----\n" +"\n" +"uchime --input query.fasta [--db db.fasta] [--uchimeout results.uchime]\n" +" [--uchimealns results.alns]\n" +"\n" +"Options\n" +"-------\n" +"\n" +"--input filename\n" +" Query sequences in FASTA format.\n" +" If the --db option is not specificed, uchime uses de novo\n" +" detection. In de novo mode, relative abundance must be given\n" +" by a string /ab=xxx/ somewhere in the label, where xxx is a\n" +" floating-point number, e.g. >F00QGH67HG/ab=1.2/.\n" +"\n" +"--db filename\n" +" Reference database in FASTA format.\n" +" Optional, if not specified uchime uses de novo mode.\n" +"\n" +" ***WARNING*** The database is searched ONLY on the plus strand.\n" +" You MUST include reverse-complemented sequences in the database\n" +" if you want both strands to be searched.\n" +"\n" +"--abskew x\n" +" Minimum abundance skew. Default 1.9. De novo mode only.\n" +" Abundance skew is:\n" +" min [ abund(parent1), abund(parent2) ] / abund(query).\n" +"\n" +"--uchimeout filename\n" +" Output in tabbed format with one record per query sequence.\n" +" First field is score (h), second field is query label.\n" +" For details, see manual.\n" +"\n" +"--uchimealns filename\n" +" Multiple alignments of query sequences to parents in human-\n" +" readable format. 
Alignments show columns with differences\n" +" that support or contradict a chimeric model.\n" +"\n" +"--minh h\n" +" Mininum score to report chimera. Default 0.3. Values from 0.1\n" +" to 5 might be reasonable. Lower values increase sensitivity\n" +" but may report more false positives. If you decrease --xn,\n" +" you may need to increase --minh, and vice versa.\n" +"\n" +"--mindiv div\n" +" Minimum divergence ratio, default 0.5. Div ratio is 100%% - \n" +" %%identity between query sequence and the closest candidate for\n" +" being a parent. If you don't care about very close chimeras,\n" +" then you could increase --mindiv to, say, 1.0 or 2.0, and\n" +" also decrease --min h, say to 0.1, to increase sensitivity.\n" +" How well this works will depend on your data. Best is to\n" +" tune parameters on a good benchmark.\n" +"\n" +"--xn beta\n" +" Weight of a no vote, also called the beta parameter. Default 8.0.\n" +" Decreasing this weight to around 3 or 4 may give better\n" +" performance on denoised data.\n" +"\n" +"--dn n\n" +" Pseudo-count prior on number of no votes. Default 1.4. Probably\n" +" no good reason to change this unless you can retune to a good\n" +" benchmark for your data. Reasonable values are probably in the\n" +" range from 0.2 to 2.\n" +"\n" +"--xa w\n" +" Weight of an abstain vote. Default 1. So far, results do not\n" +" seem to be very sensitive to this parameter, but if you have\n" +" a good training set might be worth trying. Reasonable values\n" +" might range from 0.1 to 2.\n" +"\n" +"--chunks n\n" +" Number of chunks to extract from the query sequence when searching\n" +" for parents. Default 4.\n" +"\n" +"--[no]ovchunks\n" +" [Do not] use overlapping chunks. Default do not.\n" +"\n" +"--minchunk n\n" +" Minimum length of a chunk. Default 64.\n" +"\n" +"--idsmoothwindow w\n" +" Length of id smoothing window. 
Default 32.\n" +"\n" +"--minsmoothid f\n" +" Minimum factional identity over smoothed window of candidate parent.\n" +" Default 0.95.\n" +"\n" +"--maxp n\n" +" Maximum number of candidate parents to consider. Default 2. In tests so\n" +" far, increasing --maxp gives only a very small improvement in sensivity\n" +" but tends to increase the error rate quite a bit.\n" +"\n" +"--[no]skipgaps\n" +"--[no]skipgaps2\n" +" These options control how gapped columns affect counting of diffs.\n" +" If --skipgaps is specified, columns containing gaps do not found as diffs.\n" +" If --skipgaps2 is specified, if column is immediately adjacent to\n" +" a column containing a gap, it is not counted as a diff.\n" +" Default is --skipgaps --skipgaps2.\n" +"\n" +"--minlen L\n" +"--maxlen L\n" +" Minimum and maximum sequence length. Defaults 10, 10000.\n" +" Applies to both query and reference sequences.\n" +"\n" +"--ucl\n" +" Use local-X alignments. Default is global-X. On tests so far, global-X\n" +" is always better; this option is retained because it just might work\n" +" well on some future type of data.\n" +"\n" +"--queryfract f\n" +" Minimum fraction of the query sequence that must be covered by a local-X\n" +" alignment. Default 0.5. Applies only when --ucl is specified.\n" +"\n" +"--quiet\n" +" Do not display progress messages on stderr.\n" +"\n" +"--log filename\n" +" Write miscellaneous information to the log file. Mostly of interest\n" +" to me (the algorithm developer). Use --verbose to get more info.\n" +"\n" +"--self\n" +" In reference database mode, exclude a reference sequence if it has\n" +" the same label as the query. 
This is useful for benchmarking by using\n" +" the ref db as a query to test for false positives.\n" diff --git a/hsp.h b/hsp.h new file mode 100644 index 0000000..339256f --- /dev/null +++ b/hsp.h @@ -0,0 +1,114 @@ +#ifndef hsp_h +#define hsp_h 1 + +struct HSPData + { + unsigned Loi; + unsigned Loj; + unsigned Leni; + unsigned Lenj; + float Score; + unsigned User; + + unsigned GetLength() const + { + if (Leni != Lenj) + Die("HSP::GetLength(): Leni %u, Lenj %u, Loi %u, Loj %u, Score %.1f", + Leni, Lenj, Loi, Loj, Score); + + return Leni; + } + + unsigned GetHii() const + { + assert(Leni > 0); + return Loi + Leni - 1; + } + + unsigned GetHij() const + { + assert(Lenj > 0); + return Loj + Lenj - 1; + } + + bool LeftA() const + { + return Loi == 0; + } + + bool LeftB() const + { + return Loj == 0; + } + + bool RightA(unsigned LA) const + { + return Loi + Leni == LA; + } + + bool RightB(unsigned LB) const + { + return Loj + Lenj == LB; + } + + unsigned GetIdCount(const byte *A, const byte *B) const + { + unsigned Count = 0; + unsigned K = GetLength(); + for (unsigned k = 0; k < K; ++k) + { + byte a = A[Loi+k]; + byte b = B[Loj+k]; + if (toupper(a) == toupper(b)) + Count++; + } + return Count; + } + + double OverlapFract(const HSPData &HSP) const + { + if (Leni == 0 || Lenj == 0) + return 0.0; + + unsigned MaxLoi = max(Loi, HSP.Loi); + unsigned MaxLoj = max(Loj, HSP.Loj); + unsigned MinHii = min(GetHii(), HSP.GetHii()); + unsigned MinHij = min(GetHij(), HSP.GetHij()); + + unsigned Ovi = (MinHii < MaxLoi) ? 0 : MinHii - MaxLoi; + unsigned Ovj = (MinHij < MaxLoj) ? 
0 : MinHij - MaxLoj; + + asserta(Ovi <= Leni && Ovj <= Lenj); + return double(Ovi*Ovj)/double(Leni*Lenj); + } + + bool operator<(const HSPData &rhs) const + { + return Loi < rhs.Loi; + } + + void LogMe() const + { + Log("Loi=%u Loj=%u Li=%u Lj=%u Score=%.1f\n", Loi, Loj, Leni, Lenj, Score); + } + + void LogMe2() const + { + Log("(%u-%u,%u-%u/%.1f)", Loi, GetHii(), Loj, GetHij(), Score); + } + }; + +// Bendpoint +struct BPData + { + unsigned Pos; + bool IsLo; + unsigned Index; + + void LogMe() const + { + Log("BP%s Pos %u Ix %u", (IsLo ? "lo" : "hi"), Pos, Index); + } + }; + +#endif // hsp_h diff --git a/hspfinder.h b/hspfinder.h new file mode 100644 index 0000000..2b8e9d8 --- /dev/null +++ b/hspfinder.h @@ -0,0 +1,13 @@ +#ifndef hspfinder_h +#define hspfinder_h + +#include "seq.h" + +class HSPFinder + { +public: + void SetA(const SeqData &/*SD*/) {} + void SetB(const SeqData &/*SD*/) {} + }; + +#endif // hspfinder_h diff --git a/make3way.cpp b/make3way.cpp new file mode 100644 index 0000000..ce88f86 --- /dev/null +++ b/make3way.cpp @@ -0,0 +1,173 @@ +#include "myutils.h" +#include "sfasta.h" +#include "path.h" +#include "dp.h" + +void Make3Way(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD, + const string &PathQA, const string &PathQB, + string &Q3, string &A3, string &B3) + { + Q3.clear(); + A3.clear(); + B3.clear(); + +#if DEBUG + { + unsigned QLen = 0; + unsigned ALen = 0; + for (unsigned i = 0; i < SIZE(PathQA); ++i) + { + char c = PathQA[i]; + if (c == 'M' || c == 'D') + ++QLen; + if (c == 'M' || c == 'I') + ++ALen; + } + asserta(QLen == QSD.L); + asserta(ALen == ASD.L); + } + { + unsigned QLen = 0; + unsigned BLen = 0; + for (unsigned i = 0; i < SIZE(PathQB); ++i) + { + char c = PathQB[i]; + if (c == 'M' || c == 'D') + ++QLen; + if (c == 'M' || c == 'I') + ++BLen; + } + asserta(QLen == QSD.L); + asserta(BLen == BSD.L); + } +#endif + + const byte *Q = QSD.Seq; + const byte *A = ASD.Seq; + const byte *B = BSD.Seq; + + unsigned LQ = QSD.L; + unsigned 
LA = ASD.L; + unsigned LB = BSD.L; + + vector InsertCountsA(LQ+1, 0); + unsigned QPos = 0; + for (unsigned i = 0; i < SIZE(PathQA); ++i) + { + char c = PathQA[i]; + if (c == 'M' || c == 'D') + ++QPos; + else + { + asserta(c == 'I'); + asserta(QPos <= LQ); + ++(InsertCountsA[QPos]); + } + } + + vector InsertCountsB(LQ+1, 0); + QPos = 0; + for (unsigned i = 0; i < SIZE(PathQB); ++i) + { + char c = PathQB[i]; + if (c == 'M' || c == 'D') + ++QPos; + else + { + asserta(c == 'I'); + asserta(QPos <= LQ); + ++(InsertCountsB[QPos]); + } + } + + vector InsertCounts; + for (unsigned i = 0; i <= LQ; ++i) + { + unsigned is = max(InsertCountsA[i], InsertCountsB[i]); + InsertCounts.push_back(is); + } + + for (unsigned i = 0; i < LQ; ++i) + { + for (unsigned k = 0; k < InsertCounts[i]; ++k) + Q3.push_back('-'); + asserta(i < LQ); + Q3.push_back(toupper(Q[i])); + } + for (unsigned k = 0; k < InsertCounts[LQ]; ++k) + Q3.push_back('-'); + +// A + QPos = 0; + unsigned APos = 0; + unsigned is = 0; + for (unsigned i = 0; i < SIZE(PathQA); ++i) + { + char c = PathQA[i]; + if (c == 'M' || c == 'D') + { + unsigned isq = InsertCounts[QPos]; + asserta(is <= isq); + for (unsigned i = 0; i < InsertCounts[QPos]-is; ++i) + A3.push_back('-'); + is = 0; + ++QPos; + } + if (c == 'M') + { + asserta(APos < LA); + A3.push_back(toupper(A[APos++])); + } + else if (c == 'D') + A3.push_back('-'); + else if (c == 'I') + { + ++is; + asserta(APos < LA); + A3.push_back(toupper(A[APos++])); + } + } + asserta(is <= InsertCounts[LQ]); + for (unsigned k = 0; k < InsertCounts[LQ]-is; ++k) + A3.push_back('-'); + asserta(QPos == LQ); + asserta(APos == LA); + +// B + QPos = 0; + unsigned BPos = 0; + is = 0; + for (unsigned i = 0; i < SIZE(PathQB); ++i) + { + char c = PathQB[i]; + if (c == 'M' || c == 'D') + { + asserta(is <= InsertCounts[QPos]); + for (unsigned i = 0; i < InsertCounts[QPos]-is; ++i) + B3.push_back('-'); + is = 0; + ++QPos; + } + if (c == 'M') + { + asserta(BPos < LB); + 
B3.push_back(toupper(B[BPos++])); + } + else if (c == 'D') + B3.push_back('-'); + else if (c == 'I') + { + ++is; + asserta(BPos < LB); + B3.push_back(toupper(B[BPos++])); + } + } + asserta(is <= InsertCounts[LQ]); + for (unsigned k = 0; k < InsertCounts[LQ]-is; ++k) + B3.push_back('-'); + asserta(APos == LA); + asserta(BPos == LB); + + asserta(SIZE(Q3) == SIZE(A3)); + asserta(SIZE(Q3) == SIZE(B3)); + } diff --git a/mothurout.cpp b/mothurout.cpp index c2ef698..3205ae7 100644 --- a/mothurout.cpp +++ b/mothurout.cpp @@ -1154,7 +1154,43 @@ vector MothurOut::divideFile(string filename, int& proc) { exit(1); } } - +/**************************************************************************************************/ +int MothurOut::divideFile(string filename, int& proc, vector& files) { + try{ + + vector filePos = divideFile(filename, proc); + + for (int i = 0; i < (filePos.size()-1); i++) { + + //read file chunk + ifstream in; + openInputFile(filename, in); + in.seekg(filePos[i]); + unsigned long int size = filePos[(i+1)] - filePos[i]; + char* chunk = new char[size]; + in.read(chunk, size); + in.close(); + + //open new file + string fileChunkName = filename + "." 
+ toString(i) + ".tmp"; + ofstream out; + openOutputFile(fileChunkName, out); + + out << chunk << endl; + out.close(); + delete[] chunk; + + //save name + files.push_back(fileChunkName); + } + + return 0; + } + catch(exception& e) { + errorOut(e, "MothurOut", "divideFile"); + exit(1); + } +} /***********************************************************************/ bool MothurOut::isTrue(string f){ diff --git a/mothurout.h b/mothurout.h index f6f505d..8ac8400 100644 --- a/mothurout.h +++ b/mothurout.h @@ -44,6 +44,7 @@ class MothurOut { //functions from mothur.h //file operations vector divideFile(string, int&); + int divideFile(string, int&, vector&); vector setFilePosEachLine(string, int&); vector setFilePosFasta(string, int&); string sortFile(string, string); diff --git a/mx.cpp b/mx.cpp new file mode 100644 index 0000000..48c347e --- /dev/null +++ b/mx.cpp @@ -0,0 +1,294 @@ +#include "myutils.h" +#include "mx.h" +#include "seqdb.h" +#include "seq.h" + +char ProbToChar(float p); + +list *MxBase::m_Matrices = 0; +unsigned MxBase::m_AllocCount; +unsigned MxBase::m_ZeroAllocCount; +unsigned MxBase::m_GrowAllocCount; +double MxBase::m_TotalBytes; +double MxBase::m_MaxBytes; + +static const char *LogizeStr(const char *s) + { + double d = atof(s); + d = log(d); + return TypeToStr(float(d)); + } + +static const char *ExpizeStr(const char *s) + { + double d = atof(s); + d = exp(d); + return TypeToStr(float(d)); + } + +void MxBase::OnCtor(MxBase *Mx) + { + if (m_Matrices == 0) + m_Matrices = new list; + asserta(m_Matrices != 0); + m_Matrices->push_front(Mx); + } + +void MxBase::OnDtor(MxBase *Mx) + { + if (m_Matrices == 0) + { + Warning("MxBase::OnDtor, m_Matrices = 0"); + return; + } + for (list::iterator p = m_Matrices->begin(); + p != m_Matrices->end(); ++p) + { + if (*p == Mx) + { + m_Matrices->erase(p); + if (m_Matrices->empty()) + delete m_Matrices; + return; + } + } + Warning("MxBase::OnDtor, not found"); + } + +//float **MxBase::Getf(const string &Name) +// { +// 
Mx *m = (Mx *) Get(Name); +// asserta(m->GetTypeSize() == sizeof(float)); +// return m->GetData(); +// } +// +//double **MxBase::Getd(const string &Name) +// { +// Mx *m = (Mx *) Get(Name); +// asserta(m->GetTypeSize() == sizeof(double)); +// return m->GetData(); +// } +// +//char **MxBase::Getc(const string &Name) +// { +// Mx *m = (Mx *) Get(Name); +// asserta(m->GetTypeSize() == sizeof(char)); +// return m->GetData(); +// } + +void MxBase::Alloc(const char *Name, unsigned RowCount, unsigned ColCount, + const SeqDB *DB, unsigned IdA, unsigned IdB) + { + Alloc(Name, RowCount, ColCount, DB, IdA, IdB, 0, 0); + } + +void MxBase::Alloc(const char *Name, unsigned RowCount, unsigned ColCount, + const SeqData *SA, const SeqData *SB) + { + Alloc(Name, RowCount, ColCount, 0, UINT_MAX, UINT_MAX, SA, SB); + } + +void MxBase::Alloc(const char *Name, unsigned RowCount, unsigned ColCount, + const SeqDB *DB, unsigned IdA, unsigned IdB, const SeqData *SA, const SeqData *SB) + { + StartTimer(MxBase_Alloc); + + ++m_AllocCount; + if (m_AllocatedRowCount == 0) + ++m_ZeroAllocCount; + + if (DB != 0) + { + asserta(IdA != UINT_MAX); + asserta(IdB != UINT_MAX); + asserta(RowCount >= DB->GetSeqLength(IdA) + 1); + asserta(ColCount >= DB->GetSeqLength(IdB) + 1); + } + if (RowCount > m_AllocatedRowCount || ColCount > m_AllocatedColCount) + { + if (m_AllocatedRowCount > 0) + { + if (opt_logmemgrows) + Log("MxBase::Alloc grow %s %u x %u -> %u x %u, %s bytes\n", + Name, m_AllocatedRowCount, m_AllocatedColCount, + RowCount, ColCount, + IntToStr(GetBytes())); + ++m_GrowAllocCount; + } + + m_TotalBytes -= GetBytes(); + + PauseTimer(MxBase_Alloc); + StartTimer(MxBase_FreeData); + FreeData(); + EndTimer(MxBase_FreeData); + StartTimer(MxBase_Alloc); + + unsigned N = max(RowCount + 16, m_AllocatedRowCount); + unsigned M = max(ColCount + 16, m_AllocatedColCount); + N = max(N, M); + + PauseTimer(MxBase_Alloc); + StartTimer(MxBase_AllocData); + AllocData(N, N); + EndTimer(MxBase_AllocData); + 
StartTimer(MxBase_Alloc); + + m_TotalBytes += GetBytes(); + if (m_TotalBytes > m_MaxBytes) + m_MaxBytes = m_TotalBytes; + } + + unsigned n = sizeof(m_Name)-1; + strncpy(m_Name, Name, n); + m_Name[n] = 0; + m_RowCount = RowCount; + m_ColCount = ColCount; + m_SeqDB = DB; + m_IdA = IdA; + m_IdB = IdB; + m_SA = SA; + m_SB = SB; + + EndTimer(MxBase_Alloc); + } + +void MxBase::LogMe(bool WithData, int Opts) const + { + Log("\n"); + if (Opts & OPT_EXP) + Log("Exp "); + else if (Opts & OPT_LOG) + Log("Log "); + bool ZeroBased = ((Opts & OPT_ZERO_BASED) != 0); + Log("%s(%p) Rows %u/%u, Cols %u/%u", + m_Name, this, + m_RowCount, m_AllocatedRowCount, + m_ColCount, m_AllocatedColCount); + if (m_SeqDB != 0 && m_IdA != UINT_MAX) + Log(", A=%s", m_SeqDB->GetLabel(m_IdA)); + else if (m_SA != 0) + Log(", A=%s", m_SA->Label); + if (m_SeqDB != 0 && m_IdB != UINT_MAX) + Log(", B=%s", m_SeqDB->GetLabel(m_IdB)); + else if (m_SB != 0) + Log(", B=%s", m_SB->Label); + Log("\n"); + if (!WithData || m_RowCount == 0 || m_ColCount == 0) + return; + + const char *z = GetAsStr(0, 0); + unsigned Width = strlen(z); + unsigned Mod = 1; + for (unsigned i = 0; i < Width; ++i) + Mod *= 10; + + if (m_Alpha[0] != 0) + { + Log("// Alphabet=%s\n", m_Alpha); + Log("// "); + unsigned n = strlen(m_Alpha); + for (unsigned j = 0; j < n; ++j) + Log(" %*c", Width, m_Alpha[j]); + Log("\n"); + for (unsigned i = 0; i < n; ++i) + { + Log("/* %c */ {", m_Alpha[i]); + unsigned ci = m_Alpha[i]; + for (unsigned j = 0; j < n; ++j) + { + unsigned cj = m_Alpha[j]; + Log("%s,", GetAsStr(ci, cj)); + } + Log("}, // %c\n", m_Alpha[i]); + } + return; + } + else if (m_Alpha2[0] != 0) + { + unsigned n = strlen(m_Alpha2); + Log("// Alphabet=%s\n", m_Alpha2); + Log("// "); + for (unsigned j = 0; j < n; ++j) + Log(" %*c", Width, m_Alpha2[j]); + Log("\n"); + for (unsigned i = 0; i < n; ++i) + { + Log("/* %c */ {", m_Alpha2[i]); + unsigned ci = m_Alpha2[i]; + for (unsigned j = 0; j < n; ++j) + Log("%s,", GetAsStr(i, j)); + Log("}, // 
%c\n", m_Alpha2[i]); + } + return; + } + + const byte *A = 0; + const byte *B = 0; + if (m_SeqDB != 0 && m_IdA != UINT_MAX) + A = m_SeqDB->GetSeq(m_IdA); + else if (m_SA != 0) + A = m_SA->Seq; + if (m_SeqDB != 0 && m_IdB != UINT_MAX) + B = m_SeqDB->GetSeq(m_IdB); + else if (m_SB != 0) + B = m_SB->Seq; + + if (B != 0) + { + if (A != 0) + Log(" "); + Log("%5.5s", ""); + if (ZeroBased) + for (unsigned j = 0; j < m_ColCount; ++j) + Log("%*c", Width, B[j]); + else + for (unsigned j = 0; j < m_ColCount; ++j) + Log("%*c", Width, j == 0 ? ' ' : B[j-1]); + Log("\n"); + } + + if (A != 0) + Log(" "); + Log("%5.5s", ""); + for (unsigned j = 0; j < m_ColCount; ++j) + Log("%*u", Width, j%Mod); + Log("\n"); + + for (unsigned i = 0; i < m_RowCount; ++i) + { + if (A != 0) + { + if (ZeroBased) + Log("%c ", A[i]); + else + Log("%c ", i == 0 ? ' ' : A[i-1]); + } + Log("%4u ", i); + + for (unsigned j = 0; j < m_ColCount; ++j) + { + const char *s = GetAsStr(i, j); + if (Opts & OPT_LOG) + s = LogizeStr(s); + else if (Opts & OPT_EXP) + s = ExpizeStr(s); + Log("%s", s); + } + Log("\n"); + } + } +static unsigned g_MatrixFileCount; + +void MxBase::LogCounts() + { + Log("\n"); + Log("MxBase::LogCounts()\n"); + Log(" What N\n"); + Log("---------- ----------\n"); + Log(" Allocs %10u\n", m_AllocCount); + Log("ZeroAllocs %10u\n", m_ZeroAllocCount); + Log(" Grows %10u\n", m_GrowAllocCount); + Log(" Bytes %10.10s\n", MemBytesToStr(m_TotalBytes)); + Log(" Max bytes %10.10s\n", MemBytesToStr(m_MaxBytes)); + } diff --git a/mx.h b/mx.h new file mode 100644 index 0000000..1438900 --- /dev/null +++ b/mx.h @@ -0,0 +1,454 @@ +#ifndef mx_h +#define mx_h + +#include +#include +#include +#include "timing.h" +#include "myutils.h" + +const int OPT_LOG = 0x01; +const int OPT_EXP = 0x02; +const int OPT_ZERO_BASED = 0x04; +const float MINUS_INFINITY = -9e9f; +const float UNINIT = -8e8f; + +struct SeqData; + +template const char *TypeToStr(T t) + { + Die("Unspecialised TypeToStr() called"); + ureturn(0); + } + 
+template<> inline const char *TypeToStr(unsigned short f) + { + static char s[16]; + + sprintf(s, "%12u", f); + return s; + } + +template<> inline const char *TypeToStr(short f) + { + static char s[16]; + + sprintf(s, "%12d", f); + return s; + } + +template<> inline const char *TypeToStr(int f) + { + static char s[16]; + + sprintf(s, "%5d", f); + return s; + } + +template<> inline const char *TypeToStr(float f) + { + static char s[16]; + + if (f == UNINIT) + sprintf(s, "%12.12s", "?"); + else if (f < MINUS_INFINITY/2) + sprintf(s, "%12.12s", "*"); + else if (f == 0.0f) + sprintf(s, "%12.12s", "."); + else if (f >= -1e5 && f <= 1e5) + sprintf(s, "%12.5f", f); + else + sprintf(s, "%12.4g", f); + return s; + } + +template<> inline const char *TypeToStr(double f) + { + static char s[16]; + + if (f < -1e9) + sprintf(s, "%12.12s", "*"); + else if (f == 0.0f) + sprintf(s, "%12.12s", "."); + else if (f >= -1e-5 && f <= 1e5) + sprintf(s, "%12.5f", f); + else + sprintf(s, "%12.4g", f); + return s; + } + +static inline const char *FloatToStr(float f, string &s) + { + s = TypeToStr(f); + return s.c_str(); + } + +template<> inline const char *TypeToStr(char c) + { + static char s[2]; + s[0] = c; + return s; + } + +template<> inline const char *TypeToStr(byte c) + { + static char s[2]; + s[0] = c; + return s; + } + +template<> inline const char *TypeToStr(bool tof) + { + static char s[2]; + s[0] = tof ? 
'T' : 'F'; + return s; + } + +struct SeqDB; + +struct MxBase + { +private: + MxBase(const MxBase &rhs); + MxBase &operator=(const MxBase &rhs); + +public: + char m_Name[32]; + char m_Alpha[32]; + char m_Alpha2[32]; + unsigned m_RowCount; + unsigned m_ColCount; + unsigned m_AllocatedRowCount; + unsigned m_AllocatedColCount; + const SeqDB *m_SeqDB; + unsigned m_IdA; + unsigned m_IdB; + const SeqData *m_SA; + const SeqData *m_SB; + + static list *m_Matrices; + //static MxBase *Get(const string &Name); + //static float **Getf(const string &Name); + //static double **Getd(const string &Name); + //static char **Getc(const string &Name); + + static unsigned m_AllocCount; + static unsigned m_ZeroAllocCount; + static unsigned m_GrowAllocCount; + static double m_TotalBytes; + static double m_MaxBytes; + + static void OnCtor(MxBase *Mx); + static void OnDtor(MxBase *Mx); + + MxBase() + { + m_AllocatedRowCount = 0; + m_AllocatedColCount = 0; + m_RowCount = 0; + m_ColCount = 0; + m_IdA = UINT_MAX; + m_IdB = UINT_MAX; + m_SeqDB = 0; + OnCtor(this); + } + virtual ~MxBase() + { + OnDtor(this); + } + + virtual unsigned GetTypeSize() const = 0; + virtual unsigned GetBytes() const = 0; + + void Clear() + { + FreeData(); + m_AllocatedRowCount = 0; + m_AllocatedColCount = 0; + m_RowCount = 0; + m_ColCount = 0; + m_IdA = UINT_MAX; + m_IdB = UINT_MAX; + m_SA = 0; + m_SB = 0; + } + + bool Empty() const + { + return m_RowCount == 0; + } + + virtual void AllocData(unsigned RowCount, unsigned ColCount) = 0; + virtual void FreeData() = 0; + virtual const char *GetAsStr(unsigned i, unsigned j) const = 0; + + void SetAlpha(const char *Alpha) + { + unsigned n = sizeof(m_Alpha); + strncpy(m_Alpha, Alpha, n); + m_Alpha[n] = 0; + } + + void Alloc(const char *Name, unsigned RowCount, unsigned ColCount, + const SeqDB *DB, unsigned IdA, unsigned IdB, + const SeqData *SA, const SeqData *SB); + + void Alloc(const char *Name, unsigned RowCount, unsigned ColCount, + const SeqDB *DB = 0, unsigned IdA = 
UINT_MAX, unsigned IdB = UINT_MAX); + + void Alloc(const char *Name, unsigned RowCount, unsigned ColCount, + const SeqData *SA, const SeqData *SB); + + static void LogAll() + { + Log("\n"); + if (m_Matrices == 0) + { + Log("MxBase::m_Matrices=0\n"); + return; + } + Log("\n"); + Log("AllRows AllCols Sz MB Name\n"); + Log("------- ------- ---- -------- ----\n"); + double TotalMB = 0; + for (list::const_iterator p = m_Matrices->begin(); + p != m_Matrices->end(); ++p) + { + const MxBase *Mx = *p; + if (Mx == 0) + continue; + //if (Mx->m_RowCount != 0 || ShowEmpty) + // Mx->LogMe(WithData); + unsigned ar = Mx->m_AllocatedRowCount; + if (ar == 0) + continue; + unsigned ac = Mx->m_AllocatedColCount; + unsigned sz = Mx->GetTypeSize(); + double MB = (double) ar*(double) ac*(double) sz/1e6; + TotalMB += MB; + Log("%7u %7u %4u %8.2f %s\n", ar, ac, sz, MB, Mx->m_Name); + } + Log(" --------\n"); + Log("%7.7s %7.7s %4.4s %8.2f\n", "", "", "", TotalMB); + } + + void LogMe(bool WithData = true, int Opts = 0) const; + static void LogCounts(); + }; + +template struct Mx : public MxBase + { +// Disable unimplemented stuff +private: + Mx(Mx &rhs); + Mx &operator=(Mx &rhs); + // const Mx &operator=(const Mx &rhs) const; + +public: + T **m_Data; + + Mx() + { + m_Data = 0; + } + + ~Mx() + { + FreeData(); + } + + virtual void AllocData(unsigned RowCount, unsigned ColCount) + { + if (opt_logmemgrows) + Log("MxBase::AllocData(%u,%u) %s bytes, Name=%s\n", + RowCount, ColCount, IntToStr(GetBytes()), m_Name); + // m_Data = myalloc(RowCount); + m_Data = MYALLOC(T *, RowCount, Mx); + for (unsigned i = 0; i < RowCount; ++i) + // m_Data[i] = myalloc(ColCount); + m_Data[i] = MYALLOC(T, ColCount, Mx); + AddBytes("Mx_AllocData", RowCount*sizeof(T *) + RowCount*ColCount*sizeof(T)); + + m_AllocatedRowCount = RowCount; + m_AllocatedColCount = ColCount; + } + + virtual void FreeData() + { + for (unsigned i = 0; i < m_AllocatedRowCount; ++i) + MYFREE(m_Data[i], m_AllocatedColCount, Mx); + MYFREE(m_Data, 
m_AllocatedRowCount, Mx); + SubBytes("Mx_AllocData", + m_AllocatedRowCount*sizeof(T *) + m_AllocatedRowCount*m_AllocatedColCount*sizeof(T)); + + m_Data = 0; + m_RowCount = 0; + m_ColCount = 0; + m_AllocatedRowCount = 0; + m_AllocatedColCount = 0; + } + + T **GetData() + { + return (T **) m_Data; + } + + T Get(unsigned i, unsigned j) const + { + assert(i < m_RowCount); + assert(j < m_ColCount); + return m_Data[i][j]; + } + + void Put(unsigned i, unsigned j, T x) const + { + assert(i < m_RowCount); + assert(j < m_ColCount); + m_Data[i][j] = x; + } + + T GetOffDiagAvgs(vector &Avgs) const + { + if (m_RowCount != m_ColCount) + Die("GetOffDiagAvgs, not symmetrical"); + Avgs.clear(); + T Total = T(0); + for (unsigned i = 0; i < m_RowCount; ++i) + { + T Sum = T(0); + for (unsigned j = 0; j < m_ColCount; ++j) + { + if (j == i) + continue; + Sum += m_Data[i][j]; + } + T Avg = Sum/(m_RowCount-1); + Total += Avg; + Avgs.push_back(Avg); + } + return m_RowCount == 0 ? T(0) : Total/m_RowCount; + } + + unsigned GetTypeSize() const + { + return sizeof(T); + } + + virtual unsigned GetBytes() const + { + return m_AllocatedRowCount*m_AllocatedColCount*GetTypeSize() + + m_AllocatedRowCount*sizeof(T *); + } + + const char *GetAsStr(unsigned i, unsigned j) const + { + return TypeToStr(Get(i, j)); + } + + const T *const *const GetData() const + { + return (const T *const *) m_Data; + } + + void Copy(const Mx &rhs) + { + Alloc("Copy", rhs.m_RowCount, rhs.m_ColCount, rhs.m_SeqDB, rhs.m_IdA, rhs.m_IdB); + const T * const *Data = rhs.GetData(); + for (unsigned i = 0; i < m_RowCount; ++i) + for (unsigned j = 0; j < m_ColCount; ++j) + m_Data[i][j] = Data[i][j]; + } + + void Assign(T v) + { + for (unsigned i = 0; i < m_RowCount; ++i) + for (unsigned j = 0; j < m_ColCount; ++j) + m_Data[i][j] = v; + } + + bool Eq(const Mx &rhs, bool Bwd = false) const + { + if (rhs.m_ColCount != m_ColCount) + return false; + if (rhs.m_RowCount != m_RowCount) + return false; + const T * const*d = rhs.GetData(); + 
int i1 = Bwd ? m_RowCount : 0; + int j1 = Bwd ? m_ColCount : 0; + int i2 = Bwd ? -1 : m_RowCount; + int j2 = Bwd ? -1 : m_ColCount; + for (int i = i1; i != i2; Bwd ? --i : ++i) + for (int j = j1; j != j2; Bwd ? --j : ++j) + { + float x = m_Data[i][j]; + float y = d[i][j]; + if (x < -1e10 && y < -1e10) + continue; + if (!feq(x, y)) + { + Warning("%s[%d][%d] = %g, %s = %g", + m_Name, i, j, x, rhs.m_Name, y); + return false; + } + } + return true; + } + + bool EqMask(const Mx &rhs, const Mx &Mask) const + { + if (rhs.m_ColCount != m_ColCount) + return false; + if (rhs.m_RowCount != m_RowCount) + return false; + + if (Mask.m_ColCount != m_ColCount) + return false; + if (Mask.m_RowCount != m_RowCount) + return false; + + const T * const*d = rhs.GetData(); + bool Bwd = false; + int i1 = Bwd ? m_RowCount : 0; + int j1 = Bwd ? m_ColCount : 0; + int i2 = Bwd ? -1 : m_RowCount; + int j2 = Bwd ? -1 : m_ColCount; + for (int i = i1; i != i2; Bwd ? --i : ++i) + for (int j = j1; j != j2; Bwd ? --j : ++j) + { + if (!Mask.m_Data[i][j]) + continue; + float x = m_Data[i][j]; + float y = d[i][j]; + if (x < -1e10 && y < -1e10) + continue; + if (!feq(x, y)) + { + Warning("%s[%d][%d] = %g, %s = %g", + m_Name, i, j, x, rhs.m_Name, y); + return false; + } + } + return true; + } + + void Init(T v) + { + for (unsigned i = 0; i < m_RowCount; ++i) + for (unsigned j = 0; j < m_ColCount; ++j) + m_Data[i][j] = v; + } + }; + +void WriteMx(const string &Name, Mx &Mxf); + +template void ReserveMx(Mx &Mxf, unsigned N = UINT_MAX) + { + if (Mxf.m_AllocatedRowCount > 0) + return; + extern unsigned g_MaxInputSeqLength; + if (N == UINT_MAX) + N = g_MaxInputSeqLength+1; + Mxf.Alloc("(Reserved)", N, N); + } + +#endif // mx_h diff --git a/myopts.h b/myopts.h new file mode 100644 index 0000000..ba901ea --- /dev/null +++ b/myopts.h @@ -0,0 +1,190 @@ +#ifndef MY_VERSION +#define MY_VERSION "4.2" +#endif + +STR_OPT( input, 0) +STR_OPT( query, 0) +STR_OPT( db, 0) +STR_OPT( sort, 0) +STR_OPT( output, 0) +STR_OPT( 
uc, 0) +STR_OPT( clstr2uc, 0) +STR_OPT( uc2clstr, 0) +STR_OPT( uc2fasta, 0) +STR_OPT( uc2fastax, 0) +STR_OPT( mergesort, 0) +STR_OPT( tmpdir, ".") +STR_OPT( staralign, 0) +STR_OPT( sortuc, 0) +STR_OPT( blastout, 0) +STR_OPT( blast6out, 0) +STR_OPT( fastapairs, 0) +STR_OPT( idchar, "|") +STR_OPT( diffchar, " ") +STR_OPT( uchime, 0) +STR_OPT( gapopen, 0) +STR_OPT( gapext, 0) +STR_OPT( uhire, 0) +STR_OPT( ids, "99,98,95,90,85,80,70,50,35") +STR_OPT( seeds, 0) +STR_OPT( clump, 0) +STR_OPT( clumpout, 0) +STR_OPT( clump2fasta, 0) +STR_OPT( clumpfasta, 0) +STR_OPT( hireout, 0) +STR_OPT( mergeclumps, 0) +STR_OPT( alpha, 0) +STR_OPT( hspalpha, 0) +STR_OPT( probmx, 0) +STR_OPT( matrix, 0) +STR_OPT( tracestate, 0) +STR_OPT( chainout, 0) +STR_OPT( cluster, 0) +STR_OPT( computekl, 0) +STR_OPT( userout, 0) +STR_OPT( userfields, 0) +STR_OPT( seedsout, 0) +STR_OPT( chainhits, 0) +STR_OPT( findorfs, 0) +STR_OPT( strand, 0) +STR_OPT( getseqs, 0) +STR_OPT( labels, 0) +STR_OPT( doug, 0) +STR_OPT( makeindex, 0) +STR_OPT( indexstats, 0) +STR_OPT( uchimeout, 0) +STR_OPT( uchimealns, 0) +STR_OPT( xframe, 0) +STR_OPT( mkctest, 0) +STR_OPT( allpairs, 0) +STR_OPT( fastq2fasta, 0) +STR_OPT( otusort, 0) +STR_OPT( sparsedist, 0) +STR_OPT( sparsedistparams, 0) +STR_OPT( mcc, 0) +STR_OPT( utax, 0) +STR_OPT( simcl, 0) +STR_OPT( absort, 0) +STR_OPT( cc, 0) +STR_OPT( uslink, 0) + +UNS_OPT( band, 16, 0, UINT_MAX) +UNS_OPT( minlen, 10, 1, UINT_MAX) +UNS_OPT( maxlen, 10000, 1, UINT_MAX) +UNS_OPT( w, 0, 1, UINT_MAX) +UNS_OPT( k, 0, 1, UINT_MAX) +UNS_OPT( stepwords, 8, 0, UINT_MAX) +UNS_OPT( maxaccepts, 1, 0, UINT_MAX) +UNS_OPT( maxrejects, 8, 0, UINT_MAX) +UNS_OPT( maxtargets, 0, 0, UINT_MAX) +UNS_OPT( minhsp, 32, 1, UINT_MAX) +UNS_OPT( bump, 50, 0, 100) +UNS_OPT( rowlen, 64, 8, UINT_MAX) +UNS_OPT( idprefix, 0, 0, UINT_MAX) +UNS_OPT( idsuffix, 0, 0, UINT_MAX) +UNS_OPT( chunks, 4, 2, UINT_MAX) +UNS_OPT( minchunk, 64, 2, UINT_MAX) +UNS_OPT( maxclump, 1000, 1, UINT_MAX) +UNS_OPT( iddef, 0, 0, UINT_MAX) 
+UNS_OPT( mincodons, 20, 1, UINT_MAX) +UNS_OPT( maxovd, 8, 0, UINT_MAX) +UNS_OPT( max2, 40, 0, UINT_MAX) +UNS_OPT( querylen, 500, 0, UINT_MAX) +UNS_OPT( targetlen, 500, 0, UINT_MAX) +UNS_OPT( orfstyle, (1+2+4), 0, UINT_MAX) +UNS_OPT( dbstep, 1, 1, UINT_MAX) +UNS_OPT( randseed, 1, 0, UINT_MAX) +UNS_OPT( maxp, 2, 2, UINT_MAX) +UNS_OPT( idsmoothwindow, 32, 1, UINT_MAX) +UNS_OPT( mindiffs, 3, 1, UINT_MAX) +UNS_OPT( maxspan1, 24, 1, UINT_MAX) +UNS_OPT( maxspan2, 24, 1, UINT_MAX) +UNS_OPT( minorfcov, 16, 1, UINT_MAX) +UNS_OPT( hashsize, 4195879, 1, UINT_MAX) +UNS_OPT( maxpoly, 0, 0, UINT_MAX) +UNS_OPT( droppct, 50, 0, 100) +UNS_OPT( secs, 10, 0, UINT_MAX) +UNS_OPT( maxqgap, 0, 0, UINT_MAX) +UNS_OPT( maxtgap, 0, 0, UINT_MAX) + +INT_OPT( frame, 0, -3, +3) + +TOG_OPT( trace, false) +TOG_OPT( logmemgrows, false) +TOG_OPT( trunclabels, false) +TOG_OPT( verbose, false) +TOG_OPT( wordcountreject, true) +TOG_OPT( rev, false) +TOG_OPT( output_rejects, false) +TOG_OPT( blast_termgaps, false) +TOG_OPT( fastalign, true) +TOG_OPT( flushuc, false) +TOG_OPT( stable_sort, false) +TOG_OPT( minus_frames, true) +TOG_OPT( usort, true) +TOG_OPT( nb, false) +TOG_OPT( twohit, true) +TOG_OPT( ssort, false) +TOG_OPT( log_query, false) +TOG_OPT( log_hothits, false) +TOG_OPT( logwordstats, false) +TOG_OPT( ucl, false) +TOG_OPT( skipgaps2, true) +TOG_OPT( skipgaps, true) +TOG_OPT( denovo, false) +TOG_OPT( cartoon_orfs, false) +TOG_OPT( label_ab, false) +TOG_OPT( wordweight, false) +TOG_OPT( isort, false) +TOG_OPT( selfid, false) +TOG_OPT( leftjust, false) +TOG_OPT( rightjust, false) + +FLT_OPT( id, 0.0, 0.0, 1.0) +FLT_OPT( weak_id, 0.0, 0.0, 1.0) +FLT_OPT( match, 1.0, 0.0, FLT_MAX) +FLT_OPT( mismatch, -2.0, 0.0, FLT_MAX) +FLT_OPT( split, 1000.0, 1.0, FLT_MAX) +FLT_OPT( evalue, 10.0, 0.0, FLT_MAX) +FLT_OPT( weak_evalue, 10.0, 0.0, FLT_MAX) +FLT_OPT( evalue_g, 10.0, 0.0, FLT_MAX) +FLT_OPT( chain_evalue, 10.0, 0.0, FLT_MAX) +FLT_OPT( xdrop_u, 16.0, 0.0, FLT_MAX) +FLT_OPT( xdrop_g, 32.0, 0.0, FLT_MAX) 
+FLT_OPT( xdrop_ug, 16.0, 0.0, FLT_MAX) +FLT_OPT( xdrop_nw, 16.0, 0.0, FLT_MAX) +FLT_OPT( ka_gapped_lambda, 0.0, 0.0, FLT_MAX) +FLT_OPT( ka_ungapped_lambda, 0.0, 0.0, FLT_MAX) +FLT_OPT( ka_gapped_k, 0.0, 0.0, FLT_MAX) +FLT_OPT( ka_ungapped_k, 0.0, 0.0, FLT_MAX) +FLT_OPT( ka_dbsize, 0.0, 0.0, FLT_MAX) +FLT_OPT( chain_targetfract, 0.0, 0.0, 1.0) +FLT_OPT( targetfract, 0.0, 0.0, 1.0) +FLT_OPT( queryfract, 0.0, 0.0, 1.0) +FLT_OPT( fspenalty, 16.0, 0.0, FLT_MAX) +FLT_OPT( sspenalty, 20.0, 0.0, FLT_MAX) +FLT_OPT( seedt1, 13.0, 0.0, FLT_MAX) +FLT_OPT( seedt2, 11.0, 0.0, FLT_MAX) +FLT_OPT( lopen, 11.0, 0.0, FLT_MAX) +FLT_OPT( lext, 1.0, 0.0, FLT_MAX) +FLT_OPT( minh, 0.3, 0.0, FLT_MAX) +FLT_OPT( xn, 8.0, 0.0, FLT_MAX) +FLT_OPT( dn, 1.4, 0.0, FLT_MAX) +FLT_OPT( xa, 1.0, 0.0, FLT_MAX) +FLT_OPT( mindiv, 0.5, 0.0, 100.0) +FLT_OPT( abskew, 2, 0.0, 100.0) +FLT_OPT( abx, 8.0, 0.0, 100.0) +FLT_OPT( minspanratio1, 0.7, 0.0, 1.0) +FLT_OPT( minspanratio2, 0.7, 0.0, 1.0) + +FLAG_OPT( usersort) +FLAG_OPT( exact) +FLAG_OPT( optimal) +FLAG_OPT( self) +FLAG_OPT( ungapped) +FLAG_OPT( global) +FLAG_OPT( local) +FLAG_OPT( xlat) +FLAG_OPT( realign) +FLAG_OPT( hash) +FLAG_OPT( derep) diff --git a/myutils.cpp b/myutils.cpp new file mode 100755 index 0000000..ea983eb --- /dev/null +++ b/myutils.cpp @@ -0,0 +1,1844 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _MSC_VER +#include +#include +#include +#include +#include +#else +#include +#include +#include +#include +#include +#include +#endif + +#include "myutils.h" + +const char *SVN_VERSION = +#include "svnversion.h" +; + +#define TEST_UTILS 0 + +using namespace std; + +const unsigned MY_IO_BUFSIZ = 32000; +const unsigned MAX_FORMATTED_STRING_LENGTH = 64000; + +static char *g_IOBuffers[256]; +static time_t g_StartTime = time(0); +static vector g_Argv; +static double g_PeakMemUseBytes; + +#if TEST_UTILS +void TestUtils() + { + const int C = 100000000; + for (int 
i = 0; i < C; ++i) + ProgressStep(i, C, "something or other"); + + Progress("\n"); + Progress("Longer message\r"); + Sleep(1000); + Progress("Short\r"); + Sleep(1000); + Progress("And longer again\r"); + Sleep(1000); + Progress("Shrt\n"); + Sleep(1000); + const unsigned N = 10; + unsigned M = 10; + for (unsigned i = 0; i < N; ++i) + { + ProgressStep(i, N, "Allocating 1MB blocks"); + for (unsigned j = 0; j < M; ++j) + { + ProgressStep(j, M, "Inner loop"); + malloc(100000); + Sleep(500); + } + } + } +#endif // TEST_UTILS + +static void AllocBuffer(FILE *f) + { + int fd = fileno(f); + if (fd < 0 || fd >= 256) + return; + if (g_IOBuffers[fd] == 0) + g_IOBuffers[fd] = myalloc(char, MY_IO_BUFSIZ); + setvbuf(f, g_IOBuffers[fd], _IOFBF, MY_IO_BUFSIZ); + } + +static void FreeBuffer(FILE *f) + { + int fd = fileno(f); + if (fd < 0 || fd >= 256) + return; + if (g_IOBuffers[fd] == 0) + return; + myfree(g_IOBuffers[fd]); + g_IOBuffers[fd] = 0; + } + +unsigned GetElapsedSecs() + { + return (unsigned) (time(0) - g_StartTime); + } + +static unsigned g_NewCalls; +static unsigned g_FreeCalls; +static double g_InitialMemUseBytes; +static double g_TotalAllocBytes; +static double g_TotalFreeBytes; +static double g_NetBytes; +static double g_MaxNetBytes; + +void LogAllocStats() + { + Log("\n"); + Log(" Allocs %u\n", g_NewCalls); + Log(" Frees %u\n", g_FreeCalls); + Log("Initial alloc %s\n", MemBytesToStr(g_InitialMemUseBytes)); + Log(" Total alloc %s\n", MemBytesToStr(g_TotalAllocBytes)); + Log(" Total free %s\n", MemBytesToStr(g_TotalFreeBytes)); + Log(" Net bytes %s\n", MemBytesToStr(g_NetBytes)); + Log("Max net bytes %s\n", MemBytesToStr(g_MaxNetBytes)); + Log(" Peak total %s\n", MemBytesToStr(g_MaxNetBytes + g_InitialMemUseBytes)); + } + +bool StdioFileExists(const string &FileName) + { + struct stat SD; + int i = stat(FileName.c_str(), &SD); + return i == 0; + } + +void myassertfail(const char *Exp, const char *File, unsigned Line) + { + Die("%s(%u) assert failed: %s", File, Line, 
Exp); + } + +bool myisatty(int fd) + { + return isatty(fd) != 0; + } + +#ifdef _MSC_VER +#include +int fseeko(FILE *stream, off_t offset, int whence) + { + off_t FilePos = _fseeki64(stream, offset, whence); + return (FilePos == -1L) ? -1 : 0; + } +#define ftello(fm) (off_t) _ftelli64(fm) +#endif + +void LogStdioFileState(FILE *f) + { + unsigned long tellpos = (unsigned long) ftello(f); + long fseek_pos = fseek(f, 0, SEEK_CUR); + int fd = fileno(f); + Log("FILE * %p\n", f); + Log("fileno %d\n", fd); + Log("feof %d\n", feof(f)); + Log("ferror %d\n", ferror(f)); + Log("ftell %ld\n", tellpos); + Log("fseek %ld\n", fseek_pos); +#if !defined(_GNU_SOURCE) && !defined(__APPLE_CC__) + fpos_t fpos; + int fgetpos_retval = fgetpos(f, &fpos); + Log("fpos %ld (retval %d)\n", (long) fpos, fgetpos_retval); +// Log("eof %d\n", _eof(fd)); +#endif +#ifdef _MSC_VER + __int64 pos64 = _ftelli64(f); + Log("_ftelli64 %lld\n", pos64); +#endif + } + +FILE *OpenStdioFile(const string &FileName) + { + const char *Mode = "rb"; + FILE *f = fopen(FileName.c_str(), Mode); + if (f == 0) + { + if (errno == EFBIG) + { + if (sizeof(off_t) == 4) + Die("File too big, off_t is 32 bits, recompile needed"); + else + Die("Cannot open '%s', file too big (off_t=%u bits)", + FileName.c_str(), sizeof(off_t)*8); + } + Die("Cannot open %s, errno=%d %s", + FileName.c_str(), errno, strerror(errno)); + } + AllocBuffer(f); + return f; + } + +FILE *CreateStdioFile(const string &FileName) + { + FILE *f = fopen(FileName.c_str(), "wb+"); + if (0 == f) + Die("Cannot create %s, errno=%d %s", + FileName.c_str(), errno, strerror(errno)); + AllocBuffer(f); + return f; + } + +void SetStdioFilePos(FILE *f, off_t Pos) + { + if (0 == f) + Die("SetStdioFilePos failed, f=NULL"); + int Ok = fseeko(f, Pos, SEEK_SET); + off_t NewPos = ftello(f); + if (Ok != 0 || Pos != NewPos) + { + LogStdioFileState(f); + Die("SetStdioFilePos(%d) failed, Ok=%d NewPos=%d", + (int) Pos, Ok, (int) NewPos); + } + } + +void ReadStdioFile(FILE *f, off_t 
Pos, void *Buffer, unsigned Bytes) + { + if (0 == f) + Die("ReadStdioFile failed, f=NULL"); + SetStdioFilePos(f, Pos); + unsigned BytesRead = fread(Buffer, 1, Bytes, f); + if (BytesRead != Bytes) + { + LogStdioFileState(f); + Die("ReadStdioFile failed, attempted %d bytes, read %d bytes, errno=%d", + (int) Bytes, (int) BytesRead, errno); + } + } + +void ReadStdioFile(FILE *f, void *Buffer, unsigned Bytes) + { + if (0 == f) + Die("ReadStdioFile failed, f=NULL"); + unsigned BytesRead = fread(Buffer, 1, Bytes, f); + if (BytesRead != Bytes) + { + LogStdioFileState(f); + Die("ReadStdioFile failed, attempted %d bytes, read %d bytes, errno=%d", + (int) Bytes, (int) BytesRead, errno); + } + } + +// Return values from functions like lseek, ftell, fgetpos are +// "undefined" for files that cannot seek. Attempt to detect +// whether a file can seek by checking for error returns. +bool CanSetStdioFilePos(FILE *f) + { +// Common special cases + if (f == stdin || f == stdout || f == stderr) + return false; + + fpos_t CurrPos; + int ok1 = fgetpos(f, &CurrPos); + if (ok1 < 0) + return false; + int ok2 = fseek(f, 0, SEEK_END); + if (ok2 < 0) + return false; + fpos_t EndPos; + int ok3 = fgetpos(f, &EndPos); + int ok4 = fsetpos(f, &CurrPos); + if (!ok3 || !ok4) + return false; + return true; + } + +byte *ReadAllStdioFile(FILE *f, unsigned &FileSize) + { + const unsigned BUFF_SIZE = 1024*1024; + + if (CanSetStdioFilePos(f)) + { + off_t Pos = GetStdioFilePos(f); + off_t FileSize = GetStdioFileSize(f); + if (FileSize > UINT_MAX) + Die("ReadAllStdioFile: file size > UINT_MAX"); + SetStdioFilePos(f, 0); + byte *Buffer = myalloc(byte, unsigned(FileSize)); + ReadStdioFile(f, Buffer, unsigned(FileSize)); + SetStdioFilePos(f, Pos); + FileSize = unsigned(FileSize); + return Buffer; + } + +// Can't seek, read one buffer at a time. + FileSize = 0; + +// Just to initialize so that first call to realloc works. 
+ byte *Buffer = (byte *) malloc(4); + if (Buffer == 0) + Die("ReadAllStdioFile, out of memory"); + for (;;) + { + Buffer = (byte *) realloc(Buffer, FileSize + BUFF_SIZE); + unsigned BytesRead = fread(Buffer + FileSize, 1, BUFF_SIZE, f); + FileSize += BytesRead; + if (BytesRead < BUFF_SIZE) + { + Buffer = (byte *) realloc(Buffer, FileSize); + return Buffer; + } + } + } + +byte *ReadAllStdioFile(const std::string &FileName, off_t &FileSize) + { +#if WIN32 + FILE *f = OpenStdioFile(FileName); + FileSize = GetStdioFileSize(f); + CloseStdioFile(f); + + HANDLE h = CreateFile(FileName.c_str(), GENERIC_READ, FILE_SHARE_READ, + NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); + if (h == INVALID_HANDLE_VALUE) + Die("ReadAllStdioFile:Open(%s) failed", FileName.c_str()); + + unsigned uFileSize = (unsigned) FileSize; + if ((off_t) uFileSize != FileSize) + Die("File too big (%.1f Gb): %s", double(FileSize)/1e9, FileName.c_str()); + + byte *Buffer = myalloc(byte, uFileSize); + DWORD BytesRead; + ReadFile(h, Buffer, uFileSize, &BytesRead, NULL); + if (FileSize != BytesRead) + Die("ReadAllStdioFile:Error reading %s, attempted %u got %u", + FileName.c_str(), FileSize, (unsigned) BytesRead); + + CloseHandle(h); + return Buffer; +#else + int h = open(FileName.c_str(), O_RDONLY); + if (h < 0) + Die("ReadAllStdioFile:Cannot open %s", FileName.c_str()); + FileSize = lseek(h, 0, SEEK_END); + if (FileSize == (off_t) (-1)) + Die("ReadAllStdioFile:Error seeking %s", FileName.c_str()); + // byte *Buffer = myalloc(FileSize); + size_t stBytes = (size_t) FileSize; + if ((off_t) stBytes != FileSize) + Die("ReadAllStdioFile: off_t overflow"); + byte *Buffer = (byte *) malloc(stBytes); + if (Buffer == 0) + Die("ReadAllStdioFile: failed to allocate %s", MemBytesToStr(stBytes)); + lseek(h, 0, SEEK_SET); + size_t n = read(h, Buffer, stBytes); + if (n != FileSize) + Die("ReadAllStdioFile, Error reading %s, attempted %g got %g", + FileName.c_str(), (double) FileSize, (double) n); + close(h); + return 
Buffer; +#endif + } + +void WriteStdioFile(FILE *f, off_t Pos, const void *Buffer, unsigned Bytes) + { + if (0 == f) + Die("WriteStdioFile failed, f=NULL"); + SetStdioFilePos(f, Pos); + unsigned BytesWritten = fwrite(Buffer, 1, Bytes, f); + if (BytesWritten != Bytes) + { + LogStdioFileState(f); + Die("WriteStdioFile failed, attempted %d bytes, wrote %d bytes, errno=%d", + (int) Bytes, (int) BytesWritten, errno); + } + } + +void WriteStdioFile(FILE *f, const void *Buffer, unsigned Bytes) + { + if (0 == f) + Die("WriteStdioFile failed, f=NULL"); + unsigned BytesWritten = fwrite(Buffer, 1, Bytes, f); + if (BytesWritten != Bytes) + { + LogStdioFileState(f); + Die("WriteStdioFile failed, attempted %d bytes, wrote %d bytes, errno=%d", + (int) Bytes, (int) BytesWritten, errno); + } + } + +// Return false on EOF, true if line successfully read. +bool ReadLineStdioFile(FILE *f, char *Line, unsigned Bytes) + { + if (feof(f)) + return false; + if ((int) Bytes < 0) + Die("ReadLineStdioFile: Bytes < 0"); + char *RetVal = fgets(Line, (int) Bytes, f); + if (NULL == RetVal) + { + if (feof(f)) + return false; + if (ferror(f)) + Die("ReadLineStdioFile: errno=%d", errno); + Die("ReadLineStdioFile: fgets=0, feof=0, ferror=0"); + } + + if (RetVal != Line) + Die("ReadLineStdioFile: fgets != Buffer"); + unsigned n = strlen(Line); + if (n < 1 || Line[n-1] != '\n') + Die("ReadLineStdioFile: line too long or missing end-of-line"); + if (n > 0 && (Line[n-1] == '\r' || Line[n-1] == '\n')) + Line[n-1] = 0; + if (n > 1 && (Line[n-2] == '\r' || Line[n-2] == '\n')) + Line[n-2] = 0; + return true; + } + +// Return false on EOF, true if line successfully read. 
+bool ReadLineStdioFile(FILE *f, string &Line) + { + Line.clear(); + for (;;) + { + int c = fgetc(f); + if (c == -1) + { + if (feof(f)) + { + if (!Line.empty()) + return true; + return false; + } + Die("ReadLineStdioFile, errno=%d", errno); + } + if (c == '\r') + continue; + if (c == '\n') + return true; + Line.push_back((char) c); + } + } + +// Copies all of fFrom regardless of current +// file position, appends to fTo. +void AppendStdioFileToFile(FILE *fFrom, FILE *fTo) + { + off_t SavedFromPos = GetStdioFilePos(fFrom); + off_t FileSize = GetStdioFileSize(fFrom); + const off_t BUFF_SIZE = 1024*1024; + char *Buffer = myalloc(char, BUFF_SIZE); + SetStdioFilePos(fFrom, 0); + off_t BytesRemaining = FileSize; + while (BytesRemaining > 0) + { + off_t BytesToRead = BytesRemaining; + if (BytesToRead > BUFF_SIZE) + BytesToRead = BUFF_SIZE; + ReadStdioFile(fFrom, Buffer, (unsigned) BytesToRead); + WriteStdioFile(fTo, Buffer, (unsigned) BytesToRead); + BytesRemaining -= BytesToRead; + } + SetStdioFilePos(fFrom, SavedFromPos); + } + +void RenameStdioFile(const string &FileNameFrom, const string &FileNameTo) + { + int Ok = rename(FileNameFrom.c_str(), FileNameTo.c_str()); + if (Ok != 0) + Die("RenameStdioFile(%s,%s) failed, errno=%d %s", + FileNameFrom.c_str(), FileNameTo.c_str(), errno, strerror(errno)); + } + +void FlushStdioFile(FILE *f) + { + int Ok = fflush(f); + if (Ok != 0) + Die("fflush(%p)=%d,", f, Ok); + } + +void CloseStdioFile(FILE *f) + { + if (f == 0) + return; + int Ok = fclose(f); + if (Ok != 0) + Die("fclose(%p)=%d", f, Ok); + FreeBuffer(f); + } + +off_t GetStdioFilePos(FILE *f) + { + off_t FilePos = ftello(f); + if (FilePos < 0) + Die("ftello=%d", (int) FilePos); + return FilePos; + } + +off_t GetStdioFileSize(FILE *f) + { + off_t CurrentPos = GetStdioFilePos(f); + int Ok = fseeko(f, 0, SEEK_END); + if (Ok < 0) + Die("fseek in GetFileSize"); + + off_t Length = ftello(f); + if (Length < 0) + Die("ftello in GetFileSize"); + SetStdioFilePos(f, CurrentPos); + 
return Length; + } + +void DeleteStdioFile(const string &FileName) + { + int Ok = remove(FileName.c_str()); + if (Ok != 0) + Die("remove(%s) failed, errno=%d %s", FileName.c_str(), errno, strerror(errno)); + } + +void myvstrprintf(string &Str, const char *Format, va_list ArgList) + { + static char szStr[MAX_FORMATTED_STRING_LENGTH]; + vsnprintf(szStr, MAX_FORMATTED_STRING_LENGTH-1, Format, ArgList); + szStr[MAX_FORMATTED_STRING_LENGTH - 1] = '\0'; + Str.assign(szStr); + } + +void myvstrprintf(string &Str, const char *Format, ...) + { + va_list ArgList; + va_start(ArgList, Format); + myvstrprintf(Str, Format, ArgList); + va_end(ArgList); + } + +FILE *g_fLog = 0; + +void SetLogFileName(const string &FileName) + { + if (g_fLog != 0) + CloseStdioFile(g_fLog); + g_fLog = 0; + if (FileName.empty()) + return; + g_fLog = CreateStdioFile(FileName); + } + +void Log(const char *Format, ...) + { + if (g_fLog == 0) + return; + + static bool InLog = false; + if (InLog) + return; + + InLog = true; + va_list ArgList; + va_start(ArgList, Format); + vfprintf(g_fLog, Format, ArgList); + va_end(ArgList); + fflush(g_fLog); + InLog = false; + } + +void Die(const char *Format, ...) + { + static bool InDie = false; + if (InDie) + exit(1); + InDie = true; + string Msg; + + if (g_fLog != 0) + setbuf(g_fLog, 0); + va_list ArgList; + va_start(ArgList, Format); + myvstrprintf(Msg, Format, ArgList); + va_end(ArgList); + + fprintf(stderr, "\n\n"); + Log("\n"); + time_t t = time(0); + Log("%s", asctime(localtime(&t))); + for (unsigned i = 0; i < g_Argv.size(); i++) + { + fprintf(stderr, (i == 0) ? "%s" : " %s", g_Argv[i].c_str()); + Log((i == 0) ? 
"%s" : " %s", g_Argv[i].c_str()); + } + fprintf(stderr, "\n"); + Log("\n"); + + time_t CurrentTime = time(0); + unsigned ElapsedSeconds = unsigned(CurrentTime - g_StartTime); + const char *sstr = SecsToStr(ElapsedSeconds); + Log("Elapsed time: %s\n", sstr); + + const char *szStr = Msg.c_str(); + fprintf(stderr, "\n---Fatal error---\n%s\n", szStr); + Log("\n---Fatal error---\n%s\n", szStr); + +#ifdef _MSC_VER + if (IsDebuggerPresent()) + __debugbreak(); + _CrtSetDbgFlag(0); +#endif + + exit(1); + } + +void Warning(const char *Format, ...) + { + string Msg; + + va_list ArgList; + va_start(ArgList, Format); + myvstrprintf(Msg, Format, ArgList); + va_end(ArgList); + + const char *szStr = Msg.c_str(); + + fprintf(stderr, "\nWARNING: %s\n", szStr); + if (g_fLog != stdout) + { + Log("\nWARNING: %s\n", szStr); + fflush(g_fLog); + } + } + +#ifdef _MSC_VER +double GetMemUseBytes() + { + HANDLE hProc = GetCurrentProcess(); + PROCESS_MEMORY_COUNTERS PMC; + BOOL bOk = GetProcessMemoryInfo(hProc, &PMC, sizeof(PMC)); + if (!bOk) + return 1000000; + double Bytes = (double) PMC.WorkingSetSize; + if (Bytes > g_PeakMemUseBytes) + g_PeakMemUseBytes = Bytes; + return Bytes; + } +#elif linux || __linux__ +double GetMemUseBytes() + { + static char statm[64]; + static int PageSize = 1; + if (0 == statm[0]) + { + PageSize = sysconf(_SC_PAGESIZE); + pid_t pid = getpid(); + sprintf(statm, "/proc/%d/statm", (int) pid); + } + + int fd = open(statm, O_RDONLY); + if (-1 == fd) + return 1000000; + char Buffer[64]; + int n = read(fd, Buffer, sizeof(Buffer) - 1); + close(fd); + fd = -1; + + if (n <= 0) + return 1000000; + + Buffer[n] = 0; + double Pages = atof(Buffer); + + double Bytes = Pages*PageSize; + if (Bytes > g_PeakMemUseBytes) + g_PeakMemUseBytes = Bytes; + return Bytes; + } +#elif defined(__MACH__) +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define 
DEFAULT_MEM_USE 100000000.0 + +double GetMemUseBytes() + { + task_t mytask = mach_task_self(); + struct task_basic_info ti; + memset((void *) &ti, 0, sizeof(ti)); + mach_msg_type_number_t count = TASK_BASIC_INFO_COUNT; + kern_return_t ok = task_info(mytask, TASK_BASIC_INFO, (task_info_t) &ti, &count); + if (ok == KERN_INVALID_ARGUMENT) + return DEFAULT_MEM_USE; + + if (ok != KERN_SUCCESS) + return DEFAULT_MEM_USE; + + double Bytes = (double ) ti.resident_size; + if (Bytes > g_PeakMemUseBytes) + g_PeakMemUseBytes = Bytes; + return Bytes; + } +#else +double GetMemUseBytes() + { + return 0; + } +#endif + +double GetPeakMemUseBytes() + { + return g_PeakMemUseBytes; + } + +const char *SecsToHHMMSS(int Secs) + { + int HH = Secs/3600; + int MM = (Secs - HH*3600)/60; + int SS = Secs%60; + static char Str[16]; + if (HH == 0) + sprintf(Str, "%02d:%02d", MM, SS); + else + sprintf(Str, "%02d:%02d:%02d", HH, MM, SS); + return Str; + } + +const char *SecsToStr(double Secs) + { + if (Secs >= 10.0) + return SecsToHHMMSS((int) Secs); + + static char Str[16]; + if (Secs < 1e-6) + sprintf(Str, "%.2gs", Secs); + else if (Secs < 1e-3) + sprintf(Str, "%.2fms", Secs*1e3); + else + sprintf(Str, "%.3fs", Secs); + return Str; + } + +const char *MemBytesToStr(double Bytes) + { + static char Str[32]; + + if (Bytes < 1e6) + sprintf(Str, "%.1fkb", Bytes/1e3); + else if (Bytes < 10e6) + sprintf(Str, "%.1fMb", Bytes/1e6); + else if (Bytes < 1e9) + sprintf(Str, "%.0fMb", Bytes/1e6); + else if (Bytes < 10e9) + sprintf(Str, "%.1fGb", Bytes/1e9); + else if (Bytes < 100e9) + sprintf(Str, "%.0fGb", Bytes/1e9); + else + sprintf(Str, "%.3gb", Bytes); + return Str; + } + +const char *IntToStr(unsigned i) + { + static char Str[32]; + + double d = (double) i; + if (i < 10000) + sprintf(Str, "%u", i); + else if (i < 1e6) + sprintf(Str, "%.1fk", d/1e3); + else if (i < 10e6) + sprintf(Str, "%.1fM", d/1e6); + else if (i < 1e9) + sprintf(Str, "%.0fM", d/1e6); + else if (i < 10e9) + sprintf(Str, "%.1fG", d/1e9); 
+ else if (i < 100e9) + sprintf(Str, "%.0fG", d/1e9); + else + sprintf(Str, "%.3g", d); + return Str; + } + +const char *FloatToStr(double d) + { + static char Str[32]; + + double a = fabs(d); + if (a < 0.01) + sprintf(Str, "%.3g", a); + else if (a >= 0.01 && a < 1) + sprintf(Str, "%.3f", a); + else if (a <= 10 && a >= 1) + { + double intpart; + if (modf(a, &intpart) < 0.05) + sprintf(Str, "%.0f", d); + else + sprintf(Str, "%.1f", d); + } + else if (a > 10 && a < 10000) + sprintf(Str, "%.0f", d); + else if (a < 1e6) + sprintf(Str, "%.1fk", d/1e3); + else if (a < 10e6) + sprintf(Str, "%.1fM", d/1e6); + else if (a < 1e9) + sprintf(Str, "%.0fM", d/1e6); + else if (a < 10e9) + sprintf(Str, "%.1fG", d/1e9); + else if (a < 100e9) + sprintf(Str, "%.0fG", d/1e9); + else + sprintf(Str, "%.3g", d); + return Str; + } + +bool opt_quiet = false; +bool opt_version = false; +bool opt_logopts = false; +bool opt_compilerinfo = false; +bool opt_help = false; +string opt_log = ""; + +bool optset_quiet = false; +bool optset_version = false; +bool optset_logopts = false; +bool optset_compilerinfo = false; +bool optset_help = false; +bool optset_log = false; + +static string g_CurrentProgressLine; +static string g_ProgressDesc; +static unsigned g_ProgressIndex; +static unsigned g_ProgressCount; + +static unsigned g_CurrProgressLineLength; +static unsigned g_LastProgressLineLength; +static unsigned g_CountsInterval; +static unsigned g_StepCalls; +static time_t g_TimeLastOutputStep; + +static string &GetProgressPrefixStr(string &s) + { + double Bytes = GetMemUseBytes(); + unsigned Secs = GetElapsedSecs(); + s = string(SecsToHHMMSS(Secs)); + if (Bytes > 0) + { + s.push_back(' '); + char Str[32]; + sprintf(Str, "%5.5s", MemBytesToStr(Bytes)); + s += string(Str); + } + s.push_back(' '); + return s; + } + +void ProgressLog(const char *Format, ...) 
+ { + string Str; + va_list ArgList; + va_start(ArgList, Format); + myvstrprintf(Str, Format, ArgList); + va_end(ArgList); + + Log("%s", Str.c_str()); + Progress("%s", Str.c_str()); + } + +void Progress(const char *Format, ...) + { + if (opt_quiet) + return; + + string Str; + va_list ArgList; + va_start(ArgList, Format); + myvstrprintf(Str, Format, ArgList); + va_end(ArgList); + +#if 0 + Log("Progress("); + for (unsigned i = 0; i < Str.size(); ++i) + { + char c = Str[i]; + if (c == '\r') + Log("\\r"); + else if (c == '\n') + Log("\\n"); + else + Log("%c", c); + } + Log(")\n"); +#endif //0 + + for (unsigned i = 0; i < Str.size(); ++i) + { + if (g_CurrProgressLineLength == 0) + { + string s; + GetProgressPrefixStr(s); + for (unsigned j = 0; j < s.size(); ++j) + { + fputc(s[j], stderr); + ++g_CurrProgressLineLength; + } + } + + char c = Str[i]; + if (c == '\n' || c == '\r') + { + for (unsigned j = g_CurrProgressLineLength; j < g_LastProgressLineLength; ++j) + fputc(' ', stderr); + if (c == '\n') + g_LastProgressLineLength = 0; + else + g_LastProgressLineLength = g_CurrProgressLineLength; + g_CurrProgressLineLength = 0; + fputc(c, stderr); + } + else + { + fputc(c, stderr); + ++g_CurrProgressLineLength; + } + } + } + +void ProgressExit() + { + time_t Now = time(0); + struct tm *t = localtime(&Now); + const char *s = asctime(t); + unsigned Secs = GetElapsedSecs(); + + Log("\n"); + Log("Finished %s", s); // there is a newline in s + Log("Elapsed time %s\n", SecsToHHMMSS((int) Secs)); + Log("Max memory %s\n", MemBytesToStr(g_PeakMemUseBytes)); +#if WIN32 && DEBUG +// Skip exit(), which can be very slow in DEBUG build +// VERY DANGEROUS practice, because it skips global destructors. +// But if you know the rules, you can break 'em, right? 
+ ExitProcess(0); +#endif + } + +const char *PctStr(double x, double y) + { + if (y == 0) + { + if (x == 0) + return "100%"; + else + return "inf%"; + } + static char Str[16]; + double p = x*100.0/y; + sprintf(Str, "%5.1f%%", p); + return Str; + } + +string &GetProgressLevelStr(string &s) + { + unsigned Index = g_ProgressIndex; + unsigned Count = g_ProgressCount; + if (Count == UINT_MAX) + { + if (Index == UINT_MAX) + s = "100%"; + else + { + char Tmp[16]; + sprintf(Tmp, "%u", Index); + s = Tmp; + } + } + else + s = string(PctStr(Index+1, Count)); + s += string(" ") + g_ProgressDesc; + return s; + } + +void ProgressStep(unsigned i, unsigned N, const char *Format, ...) + { + if (opt_quiet) + return; + + if (i == 0) + { + string Str; + va_list ArgList; + va_start(ArgList, Format); + myvstrprintf(Str, Format, ArgList); + va_end(ArgList); + g_ProgressDesc = Str; + g_ProgressIndex = 0; + g_ProgressCount = N; + g_CountsInterval = 1; + g_StepCalls = 0; + g_TimeLastOutputStep = 0; + if (g_CurrProgressLineLength > 0) + Progress("\n"); + } + + if (i >= N && i != UINT_MAX) + Die("ProgressStep(%u,%u)", i, N); + bool IsLastStep = (i == UINT_MAX || i + 1 == N); + if (!IsLastStep) + { + ++g_StepCalls; + if (g_StepCalls%g_CountsInterval != 0) + return; + + time_t Now = time(0); + if (Now == g_TimeLastOutputStep) + { + if (g_CountsInterval < 128) + g_CountsInterval = (g_CountsInterval*3)/2; + else + g_CountsInterval += 64; + return; + } + else + { + time_t Secs = Now - g_TimeLastOutputStep; + if (Secs > 1) + g_CountsInterval = unsigned(g_CountsInterval/(Secs*8)); + } + + if (g_CountsInterval < 1) + g_CountsInterval = 1; + + g_TimeLastOutputStep = Now; + } + + g_ProgressIndex = i; + + if (i > 0) + { + va_list ArgList; + va_start(ArgList, Format); + myvstrprintf(g_ProgressDesc, Format, ArgList); + } + + string LevelStr; + GetProgressLevelStr(LevelStr); + Progress(" %s\r", LevelStr.c_str()); + + if (IsLastStep) + { + g_CountsInterval = 1; + fputc('\n', stderr); + } + } + +enum OptType 
+ { + OT_Flag, + OT_Tog, + OT_Int, + OT_Uns, + OT_Str, + OT_Float, + OT_Enum + }; + +struct OptInfo + { + void *Value; + bool *OptSet; + string LongName; + OptType Type; + int iMin; + int iMax; + unsigned uMin; + unsigned uMax; + double dMin; + double dMax; + map EnumValues; + + bool bDefault; + int iDefault; + unsigned uDefault; + double dDefault; + string strDefault; + + string Help; + + bool operator<(const OptInfo &rhs) const + { + return LongName < rhs.LongName; + } + }; + +static set g_Opts; + +void Help() + { + printf("\n"); + + void Usage(); + Usage(); + + for (set::const_iterator p = g_Opts.begin(); p != g_Opts.end(); ++p) + { + const OptInfo &Opt = *p; + + printf("\n"); + string LongName = Opt.LongName.c_str(); + if (Opt.Type == OT_Tog) + LongName = string("[no]") + LongName; + printf(" --%s ", LongName.c_str()); + + switch (Opt.Type) + { + case OT_Flag: + break; + case OT_Tog: + break; + case OT_Int: + printf(""); + break; + case OT_Uns: + printf(""); + break; + case OT_Str: + printf(""); + break; + case OT_Float: + printf(""); + break; + case OT_Enum: + printf(""); + break; + default: + printf("??type"); + break; + } + + printf(" "); + const string &s = Opt.Help; + for (string::const_iterator q = s.begin(); q != s.end(); ++q) + { + char c = *q; + if (c == '\n') + printf("\n "); + else + printf("%c", c); + } + printf("\n"); + } + printf("\n"); + exit(0); + } + +void CmdLineErr(const char *Format, ...) 
+ { + va_list ArgList; + va_start(ArgList, Format); + string Str; + myvstrprintf(Str, Format, ArgList); + va_end(ArgList); + fprintf(stderr, "\n"); + fprintf(stderr, "Invalid command line\n"); + fprintf(stderr, "%s\n", Str.c_str()); + fprintf(stderr, "For list of command-line options use --help.\n"); + fprintf(stderr, "\n"); + exit(1); + } + +static set::iterator GetOptInfo(const string &LongName, + bool ErrIfNotFound) + { + for (set::iterator p = g_Opts.begin(); + p != g_Opts.end(); ++p) + { + const OptInfo &Opt = *p; + if (Opt.LongName == LongName) + return p; + if (Opt.Type == OT_Tog && "no" + Opt.LongName == LongName) + return p; + } + if (ErrIfNotFound) + CmdLineErr("Option --%s is invalid", LongName.c_str()); + return g_Opts.end(); + } + +static void AddOpt(const OptInfo &Opt) + { + if (GetOptInfo(Opt.LongName, false) != g_Opts.end()) + Die("Option --%s defined twice", Opt.LongName.c_str()); + g_Opts.insert(Opt); + } + +#ifdef _MSC_VER +#pragma warning(disable: 4505) // unreferenced local function +#endif + +static void DefineFlagOpt(const string &LongName, const string &Help, + void *Value, bool *OptSet) + { + *(bool *) Value = false; + + OptInfo Opt; + Opt.Value = Value; + Opt.OptSet = OptSet; + Opt.LongName = LongName; + Opt.bDefault = false; + Opt.Help = Help; + Opt.Type = OT_Flag; + AddOpt(Opt); + } + +static void DefineTogOpt(const string &LongName, bool Default, const string &Help, + void *Value, bool *OptSet) + { + *(bool *) Value = Default; + + OptInfo Opt; + Opt.Value = Value; + Opt.OptSet = OptSet; + Opt.LongName = LongName; + Opt.bDefault = Default; + Opt.Help = Help; + Opt.Type = OT_Tog; + AddOpt(Opt); + } + +static void DefineIntOpt(const string &LongName, int Default, int Min, int Max, + const string &Help, void *Value, bool *OptSet) + { + *(int *) Value = Default; + + OptInfo Opt; + Opt.Value = Value; + Opt.OptSet = OptSet; + Opt.LongName = LongName; + Opt.iDefault = Default; + Opt.iMin = Min; + Opt.iMax = Max; + Opt.Help = Help; + Opt.Type = 
OT_Int; + AddOpt(Opt); + } + +static void DefineUnsOpt(const string &LongName, unsigned Default, unsigned Min, + unsigned Max, const string &Help, void *Value, bool *OptSet) + { + *(unsigned *) Value = Default; + + OptInfo Opt; + Opt.Value = Value; + Opt.OptSet = OptSet; + Opt.LongName = LongName; + Opt.uDefault = Default; + Opt.uMin = Min; + Opt.uMax = Max; + Opt.Help = Help; + Opt.Type = OT_Uns; + AddOpt(Opt); + } + +static void DefineFloatOpt(const string &LongName, double Default, double Min, + double Max, const string &Help, void *Value, bool *OptSet) + { + *(double *) Value = Default; + + OptInfo Opt; + Opt.Value = Value; + Opt.OptSet = OptSet; + Opt.LongName = LongName; + Opt.dDefault = Default; + Opt.dMin = Min; + Opt.dMax = Max; + Opt.Help = Help; + Opt.Type = OT_Float; + AddOpt(Opt); + } + +static void DefineStrOpt(const string &LongName, const char *Default, + const string &Help, void *Value, bool *OptSet) + { + *(string *) Value = (Default == 0 ? "" : string(Default)); + + OptInfo Opt; + Opt.Value = Value; + Opt.OptSet = OptSet; + Opt.LongName = LongName; + Opt.strDefault = (Default == 0 ? "" : string(Default)); + Opt.Help = Help; + Opt.Type = OT_Str; + AddOpt(Opt); + } + +static void ParseEnumValues(const string &Values, map &EnumValues) + { + EnumValues.clear(); + + string Name; + string Value; + bool Eq = false; + for (string::const_iterator p = Values.begin(); ; ++p) + { + char c = (p == Values.end() ? 
'|' : *p); + if (isspace(c)) + ; + else if (c == '|') + { + if (EnumValues.find(Name) != EnumValues.end()) + Die("Invalid enum values, '%s' defined twice: '%s'", + Name.c_str(), Values.c_str()); + if (Name.empty() || Value.empty()) + Die("Invalid enum values, empty name or value: '%s'", + Values.c_str()); + + EnumValues[Name] = atoi(Value.c_str()); + Name.clear(); + Value.clear(); + Eq = false; + } + else if (c == '=') + Eq = true; + else if (Eq) + Value.push_back(c); + else + Name.push_back(c); + if (p == Values.end()) + return; + } + } + +static void DefineEnumOpt(const string &LongName, const string &ShortName, + int Default, const string &Values, const string &Help, void *Value) + { + *(int *) Value = Default; + + OptInfo Opt; + Opt.Value = Value; + Opt.LongName = LongName; + Opt.iDefault = Default; + Opt.Help = Help; + Opt.Type = OT_Enum; + ParseEnumValues(Values, Opt.EnumValues); + AddOpt(Opt); + } +#undef FLAG_OPT +#undef TOG_OPT +#undef INT_OPT +#undef UNS_OPT +#undef FLT_OPT +#undef STR_OPT +#undef ENUM_OPT +#define FLAG_OPT(LongName) bool opt_##LongName; bool optset_##LongName; +#define TOG_OPT(LongName, Default) bool opt_##LongName; bool optset_##LongName; +#define INT_OPT(LongName, Default, Min, Max) int opt_##LongName; bool optset_##LongName; +#define UNS_OPT(LongName, Default, Min, Max) unsigned opt_##LongName; bool optset_##LongName; +#define FLT_OPT(LongName, Default, Min, Max) double opt_##LongName; bool optset_##LongName; +#define STR_OPT(LongName, Default) string opt_##LongName; bool optset_##LongName; +#define ENUM_OPT(LongName, Values, Default) int opt_##LongName; bool optset_##LongName; +#include "myopts.h" + +static int EnumStrToInt(const OptInfo &Opt, const string &Value) + { + const map &e = Opt.EnumValues; + string s; + for (map::const_iterator p = e.begin(); p != e.end(); ++p) + { + if (Value == p->first) + return p->second; + s += " " + p->first; + } + CmdLineErr("--%s %s not recognized, valid are: %s", + Opt.LongName.c_str(), 
Value.c_str(), s.c_str()); + ureturn(-1); + } + +static void SetOpt(OptInfo &Opt, const string &Value) + { + *Opt.OptSet = true; + switch (Opt.Type) + { + case OT_Int: + { + *(int *) Opt.Value = atoi(Value.c_str()); + break; + } + case OT_Uns: + { + unsigned uValue = 0; + int n = sscanf(Value.c_str(), "%u", &uValue); + if (n != 1) + CmdLineErr("Invalid value '%s' for --%s", + Value.c_str(), Opt.LongName.c_str()); + *(unsigned *) Opt.Value = uValue; + break; + } + case OT_Float: + { + *(double *) Opt.Value = atof(Value.c_str()); + break; + } + case OT_Str: + { + *(string *) Opt.Value = Value; + break; + } + case OT_Enum: + { + *(int *) Opt.Value = EnumStrToInt(Opt, Value); + break; + } + default: + asserta(false); + } + } + +void LogOpts() + { + for (set::const_iterator p = g_Opts.begin(); p != g_Opts.end(); ++p) + { + const OptInfo &Opt = *p; + Log("%s = ", Opt.LongName.c_str()); + switch (Opt.Type) + { + case OT_Flag: + Log("%s", (*(bool *) Opt.Value) ? "yes" : "no"); + break; + case OT_Tog: + Log("%s", (*(bool *) Opt.Value) ? 
"on" : "off"); + break; + case OT_Int: + Log("%d", *(int *) Opt.Value); + break; + case OT_Uns: + Log("%u", *(unsigned *) Opt.Value); + break; + case OT_Float: + { + double Value = *(double *) Opt.Value; + if (Value == FLT_MAX) + Log("*"); + else + Log("%g", Value); + break; + } + case OT_Str: + Log("%s", (*(string *) Opt.Value).c_str()); + break; + case OT_Enum: + Log("%d", *(int *) Opt.Value); + break; + default: + asserta(false); + } + Log("\n"); + } + } + +static void CompilerInfo() + { +#ifdef _FILE_OFFSET_BITS + printf("_FILE_OFFSET_BITS=%d\n", _FILE_OFFSET_BITS); +#else + printf("_FILE_OFFSET_BITS not defined\n"); +#endif + +#define x(t) printf("sizeof(" #t ") = %d\n", (int) sizeof(t)); + x(int) + x(long) + x(float) + x(double) + x(void *) + x(off_t) +#undef x + exit(0); + } + +void Split(const string &Str, vector &Fields, char Sep) + { + Fields.clear(); + const unsigned Length = (unsigned) Str.size(); + string s; + for (unsigned i = 0; i < Length; ++i) + { + char c = Str[i]; + if ((Sep == 0 && isspace(c)) || c == Sep) + { + if (!s.empty() || Sep != 0) + Fields.push_back(s); + s.clear(); + } + else + s.push_back(c); + } + if (!s.empty()) + Fields.push_back(s); + } + +static void GetArgsFromFile(const string &FileName, vector &Args) + { + Args.clear(); + + FILE *f = OpenStdioFile(FileName); + string Line; + while (ReadLineStdioFile(f, Line)) + { + size_t n = Line.find('#'); + if (n != string::npos) + Line = Line.substr(0, n); + vector Fields; + Split(Line, Fields); + Args.insert(Args.end(), Fields.begin(), Fields.end()); + } + CloseStdioFile(f); + } + +void MyCmdLine(int argc, char **argv) + { + static unsigned RecurseDepth = 0; + ++RecurseDepth; + + DefineFlagOpt("compilerinfo", "Write info about compiler types and #defines to stdout.", + (void *) &opt_compilerinfo, &optset_compilerinfo); + DefineFlagOpt("quiet", "Turn off progress messages.", (void *) &opt_quiet, &optset_quiet); + DefineFlagOpt("version", "Show version and exit.", (void *) &opt_version, 
&optset_version); + DefineFlagOpt("logopts", "Log options.", (void *) &opt_logopts, &optset_logopts); + DefineFlagOpt("help", "Display command-line options.", (void *) &opt_help, &optset_help); + DefineStrOpt("log", "", "Log file name.", (void *) &opt_log, &optset_log); + +#undef FLAG_OPT +#undef TOG_OPT +#undef INT_OPT +#undef UNS_OPT +#undef FLT_OPT +#undef STR_OPT +#undef ENUM_OPT +#define FLAG_OPT(LongName) DefineFlagOpt(#LongName, "help", (void *) &opt_##LongName, &optset_##LongName); +#define TOG_OPT(LongName, Default) DefineTogOpt(#LongName, Default, "help", (void *) &opt_##LongName, &optset_##LongName); +#define INT_OPT(LongName, Default, Min, Max) DefineIntOpt(#LongName, Default, Min, Max, "help", (void *) &opt_##LongName, &optset_##LongName); +#define UNS_OPT(LongName, Default, Min, Max) DefineUnsOpt(#LongName, Default, Min, Max, "help", (void *) &opt_##LongName, &optset_##LongName); +#define FLT_OPT(LongName, Default, Min, Max) DefineFloatOpt(#LongName, Default, Min, Max, "help", (void *) &opt_##LongName, &optset_##LongName); +#define STR_OPT(LongName, Default) DefineStrOpt(#LongName, Default, "help", (void *) &opt_##LongName, &optset_##LongName); +#define ENUM_OPT(LongName, Values, Default) DefineEnumOpt(#LongName, Values, Default, "help", (void *) &opt_##LongName, &optset_##LongName); +#include "myopts.h" + + if (RecurseDepth == 0) + g_Argv.clear(); + + for (int i = 0; i < argc; ++i) + g_Argv.push_back(string(argv[i])); + + + int i = 1; + for (;;) + { + if (i >= argc) + break; + const string &Arg = g_Argv[i]; + + if (Arg.empty()) + continue; + else if (Arg == "file:" && i + 1 < argc) + { + const string &FileName = g_Argv[i+1]; + vector Args; + GetArgsFromFile(FileName, Args); + for (vector::const_iterator p = Args.begin(); + p != Args.end(); ++p) + { + g_Argv.push_back(*p); + ++argc; + } + i += 2; + continue; + } + else if (Arg.size() > 1 && Arg[0] == '-') + { + string LongName = (Arg.size() > 2 && Arg[1] == '-' ? 
Arg.substr(2) : Arg.substr(1)); + OptInfo Opt = *GetOptInfo(LongName, true); + *Opt.OptSet = true; + if (Opt.Type == OT_Flag) + { + g_Opts.erase(Opt); + *(bool *) Opt.Value = true; + g_Opts.insert(Opt); + ++i; + continue; + } + else if (Opt.Type == OT_Tog) + { + g_Opts.erase(Opt); + if (string("no") + Opt.LongName == LongName) + *(bool *) Opt.Value = false; + else + { + asserta(Opt.LongName == LongName); + *(bool *) Opt.Value = true; + } + g_Opts.insert(Opt); + ++i; + continue; + } + + ++i; + if (i >= argc) + CmdLineErr("Missing value for option --%s", LongName.c_str()); + + string Value = g_Argv[i]; + SetOpt(Opt, Value); + + ++i; + continue; + } + else + CmdLineErr("Expected -option_name or --option_name, got '%s'", Arg.c_str()); + } + + --RecurseDepth; + if (RecurseDepth > 0) + return; + + if (opt_help) + Help(); + + if (opt_compilerinfo) + CompilerInfo(); + + SetLogFileName(opt_log); + + if (opt_log != "") + { + for (int i = 0; i < argc; ++i) + Log("%s%s", i == 0 ? "" : " ", g_Argv[i].c_str()); + Log("\n"); + time_t Now = time(0); + struct tm *t = localtime(&Now); + const char *s = asctime(t); + Log("Started %s", s); // there is a newline in s + Log("Version " MY_VERSION ".%s\n", SVN_VERSION); + Log("\n"); + } + + if (opt_logopts) + LogOpts(); + } + +double Pct(double x, double y) + { + if (y == 0.0f) + return 0.0f; + return (x*100.0f)/y; + } + +void GetCmdLine(string &s) + { + s.clear(); + for (unsigned i = 0; i < SIZE(g_Argv); ++i) + { + if (i > 0) + s += " "; + s += g_Argv[i]; + } + } + +char *mystrsave(const char *s) + { + unsigned n = unsigned(strlen(s)); + char *t = myalloc(char, n+1); + memcpy(t, s, n+1); + return t; + } + +void Logu(unsigned u, unsigned w, unsigned prefixspaces) + { + for (unsigned i = 0; i < prefixspaces; ++i) + Log(" "); + if (u == UINT_MAX) + Log("%*.*s", w, w, "*"); + else + Log("%*u", w, u); + } + +void Logf(float x, unsigned w, unsigned prefixspaces) + { + for (unsigned i = 0; i < prefixspaces; ++i) + Log(" "); + if (x == FLT_MAX) 
+ Log("%*.*s", w, w, "*"); + else + Log("%*.2f", w, x); + } + +static uint32 g_SLCG_state = 1; + +// Numerical values used by Microsoft C, according to wikipedia: +// http://en.wikipedia.org/wiki/Linear_congruential_generator +static uint32 g_SLCG_a = 214013; +static uint32 g_SLCG_c = 2531011; + +// Simple Linear Congruential Generator +// Bad properties; used just to initialize the better generator. +static uint32 SLCG_rand() + { + g_SLCG_state = g_SLCG_state*g_SLCG_a + g_SLCG_c; + return g_SLCG_state; + } + +static void SLCG_srand(uint32 Seed) + { + g_SLCG_state = Seed; + for (int i = 0; i < 10; ++i) + SLCG_rand(); + } + +/*** +A multiply-with-carry random number generator, see: +http://en.wikipedia.org/wiki/Multiply-with-carry + +The particular multipliers used here were found on +the web where they are attributed to George Marsaglia. +***/ + +static bool g_InitRandDone = false; +static uint32 g_X[5]; + +uint32 RandInt32() + { + InitRand(); + + uint64 Sum = 2111111111*(uint64) g_X[3] + 1492*(uint64) g_X[2] + + 1776*(uint64) g_X[1] + 5115*(uint64) g_X[0] + g_X[4]; + g_X[3] = g_X[2]; + g_X[2] = g_X[1]; + g_X[1] = g_X[0]; + g_X[4] = (uint32) (Sum >> 32); + g_X[0] = (uint32) Sum; + return g_X[0]; + } + +unsigned randu32() + { + return (unsigned) RandInt32(); + } + +void InitRand() + { + if (g_InitRandDone) + return; +// Do this first to avoid recursion + g_InitRandDone = true; + + unsigned Seed = (optset_randseed ? 
opt_randseed : (unsigned) (time(0)*getpid())); + Log("RandSeed=%u\n", Seed); + SLCG_srand(Seed); + + for (unsigned i = 0; i < 5; i++) + g_X[i] = SLCG_rand(); + + for (unsigned i = 0; i < 100; i++) + RandInt32(); + } + +// MUST COME AT END BECAUSE OF #undef +#if RCE_MALLOC +#undef mymalloc +#undef myfree +#undef myfree2 +void *mymalloc(unsigned bytes, const char *FileName, int Line) + { + void *rce_malloc(unsigned bytes, const char *FileName, int Line); + return rce_malloc(bytes, FileName, Line); + } + +void myfree(void *p, const char *FileName, int Line) + { + void rce_free(void *p, const char *FileName, int Line); + rce_free(p, FileName, Line); + } + +void myfree2(void *p, unsigned bytes, const char *FileName, int Line) + { + void rce_free(void *p, const char *FileName, int Line); + rce_free(p, FileName, Line); + } + +#else // RCE_MALLOC +void *mymalloc(unsigned bytes) + { + ++g_NewCalls; + if (g_InitialMemUseBytes == 0) + g_InitialMemUseBytes = GetMemUseBytes(); + + g_TotalAllocBytes += bytes; + g_NetBytes += bytes; + if (g_NetBytes > g_MaxNetBytes) + { + if (g_NetBytes > g_MaxNetBytes + 10000000) + GetMemUseBytes();//to force update of peak + g_MaxNetBytes = g_NetBytes; + } + void *p = malloc(bytes); + //void *p = _malloc_dbg(bytes, _NORMAL_BLOCK, __FILE__, __LINE__); + if (0 == p) + { + double b = GetMemUseBytes(); + fprintf(stderr, "\nOut of memory mymalloc(%u), curr %.3g bytes", + (unsigned) bytes, b); + void LogAllocs(); + LogAllocs(); +#if DEBUG && defined(_MSC_VER) + asserta(_CrtCheckMemory()); +#endif + Die("Out of memory, mymalloc(%u), curr %.3g bytes\n", + (unsigned) bytes, b); + } + return p; + } + +void myfree(void *p) + { + if (p == 0) + return; + free(p); + //_free_dbg(p, _NORMAL_BLOCK); + } + +void myfree2(void *p, unsigned bytes) + { + ++g_FreeCalls; + g_TotalFreeBytes += bytes; + g_NetBytes -= bytes; + + if (p == 0) + return; + free(p); + } +#endif diff --git a/myutils.h b/myutils.h new file mode 100644 index 0000000..b63ad3c --- /dev/null +++ 
b/myutils.h @@ -0,0 +1,274 @@ +#ifndef myutils_h +#define myutils_h + +#define RCE_MALLOC 0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef _MSC_VER +#include +#endif + +using namespace std; + +#ifdef _MSC_VER +#include +#pragma warning(disable: 4996) // deprecated functions +#define _CRT_SECURE_NO_DEPRECATE 1 +#endif + +#if defined(_DEBUG) && !defined(DEBUG) +#define DEBUG 1 +#endif + +#if defined(DEBUG) && !defined(_DEBUG) +#define _DEBUG 1 +#endif + +#ifndef NDEBUG +#define DEBUG 1 +#define _DEBUG 1 +#endif + +typedef unsigned char byte; +typedef unsigned short uint16; +typedef unsigned uint32; +typedef int int32; +typedef double float32; +typedef signed char int8; +typedef unsigned char uint8; + +#ifdef _MSC_VER + +typedef __int64 int64; +typedef unsigned __int64 uint64; + +#define INT64_PRINTF "lld" +#define UINT64_PRINTF "llu" + +#define SIZE_T_PRINTF "u" +#define OFF64_T_PRINTF "lld" + +#define INT64_PRINTFX "llx" +#define UINT64_PRINTFX "llx" + +#define SIZE_T_PRINTFX "x" +#define OFF64_T_PRINTFX "llx" + +#elif defined(__x86_64__) + +typedef long int64; +typedef unsigned long uint64; + +#define INT64_PRINTF "ld" +#define UINT64_PRINTF "lu" + +#define SIZE_T_PRINTF "lu" +#define OFF64_T_PRINTF "ld" + +#define INT64_PRINTFX "lx" +#define UINT64_PRINTFX "lx" + +#define SIZE_T_PRINTFX "lx" +#define OFF64_T_PRINTFX "lx" + +#else + +typedef long long int64; +typedef unsigned long long uint64; + +#define INT64_PRINTF "lld" +#define UINT64_PRINTF "llu" + +#define SIZE_T_PRINTF "u" +#define OFF64_T_PRINTF "lld" + +#define INT64_PRINTFX "llx" +#define UINT64_PRINTFX "llx" + +#define SIZE_T_PRINTFX "x" +#define OFF64_T_PRINTFX "llx" +#endif + +#define d64 INT64_PRINTF +#define u64 UINT64_PRINTF +#define x64 UINT64_PRINTFX + +// const uint64 UINT64_MAX = (~((uint64) 0)); + +void myassertfail(const char *Exp, const char *File, unsigned Line); +#undef assert +#ifdef NDEBUG +#define assert(exp) ((void)0) 
+#define myassert(exp) ((void)0) +#else +#define assert(exp) (void)( (exp) || (myassertfail(#exp, __FILE__, __LINE__), 0) ) +#define myassert(exp) (void)( (exp) || (myassertfail(#exp, __FILE__, __LINE__), 0) ) +#endif +#define asserta(exp) (void)( (exp) || (myassertfail(#exp, __FILE__, __LINE__), 0) ) + +#define ureturn(x) return (x) + +#define NotUsed(v) ((void *) &v) + +// pom=plus or minus, tof=true or false +static inline char pom(bool Plus) { return Plus ? '+' : '-'; } +static inline char tof(bool x) { return x ? 'T' : 'F'; } +static inline char yon(bool x) { return x ? 'Y' : 'N'; } +unsigned GetElapsedSecs(); + +#if RCE_MALLOC + +void *rce_malloc(unsigned bytes, const char *FileName, int Line); +void rce_free(void *p, const char *FileName, int LineNr); +void rce_chkmem(); + +void rce_dumpmem_(const char *FileName, int LineNr); +#define rce_dumpmem() rce_dumpmem_(__FILE__, __LINE__) + +void rce_assertvalidptr_(void *p, const char *FileName, int LineNr); +#define rce_assertvalidptr(p) rce_assertvalidptr_(p, __FILE__, __LINE__) + +void rce_dumpptr_(void *p, const char *FileName, int LineNr); +#define rce_dumpptr(p) rce_dumpptr_(p, __FILE__, __LINE__) + +#define mymalloc(n) rce_malloc((n), __FILE__, __LINE__) +#define myfree(p) rce_free(p, __FILE__, __LINE__) +#define myfree2(p,n) rce_free(p, __FILE__, __LINE__) +#define myalloc(t, n) (t *) rce_malloc((n)*sizeof(t), __FILE__, __LINE__) + +#else // RCE_MALLOC +void *mymalloc(unsigned bytes); +void myfree2(void *p, unsigned Bytes); +void myfree(void *p); +#define rce_chkmem() /* empty */ +#define myalloc(t, n) (t *) mymalloc((n)*sizeof(t)) +#endif // RCE_MALLOC + +#define SIZE(c) unsigned((c).size()) + +bool myisatty(int fd); + +#ifdef _MSC_VER +#define off_t __int64 +#endif + +FILE *OpenStdioFile(const string &FileName); +FILE *CreateStdioFile(const string &FileName); +bool CanSetStdioFilePos(FILE *f); +void CloseStdioFile(FILE *f); +void SetStdioFilePos(FILE *f, off_t Pos); +void ReadStdioFile(FILE *f, off_t Pos, 
void *Buffer, unsigned Bytes); +void ReadStdioFile(FILE *f, void *Buffer, unsigned Bytes); +void WriteStdioFile(FILE *f, off_t Pos, const void *Buffer, unsigned Bytes); +void WriteStdioFile(FILE *f, const void *Buffer, unsigned Bytes); +bool ReadLineStdioFile(FILE *f, char *Line, unsigned Bytes); +bool ReadLineStdioFile(FILE *f, string &Line); +byte *ReadAllStdioFile(FILE *f, off_t &FileSize); +byte *ReadAllStdioFile(const string &FileName, off_t &FileSize); +void AppendStdioFileToFile(FILE *fFrom, FILE *fTo); +void FlushStdioFile(FILE *f); +bool StdioFileExists(const string &FileName); +off_t GetStdioFilePos(FILE *f); +off_t GetStdioFileSize(FILE *f); +void LogStdioFileState(FILE *f); +void RenameStdioFile(const string &FileNameFrom, const string &FileNameTo); +void DeleteStdioFile(const string &FileName); + +void myvstrprintf(string &Str, const char *szFormat, va_list ArgList); +void myvstrprintf(string &Str, const char *szFormat, ...); + +void SetLogFileName(const string &FileName); +void Log(const char *szFormat, ...); + +void Die(const char *szFormat, ...); +void Warning(const char *szFormat, ...); + +void ProgressStep(unsigned i, unsigned N, const char *Format, ...); +void Progress(const char *szFormat, ...); +void Progress(const string &Str); +void ProgressLog(const char *szFormat, ...); +void ProgressExit(); + +char *mystrsave(const char *s); + +double GetPeakMemUseBytes(); + +// Are two floats equal to within epsilon? 
+const double epsilon = 0.01; +inline bool feq(double x, double y, double epsilon) + { + if (fabs(x) > 10000) + epsilon = fabs(x)/10000; + if (fabs(x - y) > epsilon) + return false; + return true; + } + +inline bool feq(double x, double y) + { + if (x < -1e6 && y < -1e6) + return true; + double e = epsilon; + if (fabs(x) > 10000) + e = fabs(x)/10000; + if (fabs(x - y) > e) + return false; + return true; + } + +#define asserteq(x, y) assert(feq(x, y)) +#define assertaeq(x, y) asserta(feq(x, y)) + +#define zero(a, n) memset(a, 0, n*sizeof(a[0])) + +void InitRand(); +unsigned randu32(); +void Split(const string &Str, vector &Fields, char Sep = 0); +double Pct(double x, double y); +double GetMemUseBytes(); +const char *MemBytesToStr(double Bytes); +const char *IntToStr(unsigned i); +const char *FloatToStr(double d); +const char *SecsToStr(double Secs); +void Logu(unsigned u, unsigned w, unsigned prefixspaces = 2); +void Logf(float x, unsigned w, unsigned prefixspaces = 2); +const char *SecsToHHMMSS(int Secs); + +void MyCmdLine(int argc, char **argv); +void CmdLineErr(const char *Format, ...); +void Help(); +void GetCmdLine(string &s); + +#define FLAG_OPT(LongName) extern bool opt_##LongName; extern bool optset_##LongName; +#define TOG_OPT(LongName, Default) extern bool opt_##LongName; extern bool optset_##LongName; +#define INT_OPT(LongName, Default, Min, Max) extern int opt_##LongName; extern bool optset_##LongName; +#define UNS_OPT(LongName, Default, Min, Max) extern unsigned opt_##LongName; extern bool optset_##LongName; +#define FLT_OPT(LongName, Default, Min, Max) extern double opt_##LongName; extern bool optset_##LongName; +#define STR_OPT(LongName, Default) extern string opt_##LongName; extern bool optset_##LongName; +#define ENUM_OPT(LongName, Default, Values) extern int opt_##LongName; extern bool optset_##LongName; +#include "myopts.h" +#undef FLAG_OPT +#undef TOG_OPT +#undef INT_OPT +#undef UNS_OPT +#undef FLT_OPT +#undef STR_OPT +#undef ENUM_OPT + +extern 
const char *SVN_VERSION; +extern const char *SVN_MODS; +extern bool opt_quiet; +extern bool opt_version; +extern FILE *g_fLog; + +#endif // myutils_h diff --git a/orf.h b/orf.h new file mode 100644 index 0000000..90b29d1 --- /dev/null +++ b/orf.h @@ -0,0 +1,37 @@ +#ifndef orf_h +#define orf_h + +#include "alpha.h" + +struct ORFData + { + const byte *NucSeq; + const byte *AminoSeq; + int Frame; + unsigned NucL; + unsigned AminoL; + unsigned NucLo; + unsigned NucHi; + ORFData *Next; + + unsigned GetNucPosFirstBase() const; + unsigned GetAAPos(unsigned NucPos) const; + unsigned GetCodex(unsigned NucPos) const; + unsigned GetNucLo(unsigned AALo, unsigned AAHi) const; + unsigned GetNucHi(unsigned AALo, unsigned AAHi) const; + unsigned GetAALo(unsigned NucLo, unsigned NucHi) const; + unsigned GetAAHi(unsigned NucLo, unsigned NucHi) const; + unsigned GetNucPosFirstBaseInCodon(unsigned AAPos) const; + unsigned GetNucPosLastBaseInCodon(unsigned AAPos) const; + unsigned RoundToCodonLo(unsigned NucPos) const; + unsigned RoundToCodonHi(unsigned NucPos) const; + void LogMe() const; + void LogMe2() const; + }; + +const byte ORFEND = '.'; + +void GetORFs(const byte *NucSeq, unsigned NucL, vector &ORFs, + unsigned ORFStyle, int FindFrame, int Sign); + +#endif // orf_h diff --git a/out.h b/out.h new file mode 100644 index 0000000..4ca50c7 --- /dev/null +++ b/out.h @@ -0,0 +1,134 @@ +#ifndef out_h +#define out_h + +#include "seq.h" +#include "hsp.h" +#include "orf.h" +#include "path.h" +#include + +struct AlnData + { +/*** +SA.Seq and SB.Seq align. +Reverse strand stuff for nucleotides is handled like this: + SA.RevComp must be false. + If SB.RevComp is true, then SA.Seq is r.c.'d relative to the sequence in + the input file (query or db). If so, coordinates in HSP refer to SB.Seq + so are also r.c.'d relative to the original sequence. 
+***/ + SeqData SA; + SeqData SB; + HSPData HSP; + const char *Path; + char IdDesc[256]; + + float FractId; + float RawScore; + float BitScore; + float Evalue; + + void LogMe() const + { + Log("AD: "); + HSP.LogMe(); + Log(" %s,%s\n", SA.Label, SB.Label); + } + }; + +bool OnDerepHit(const SeqData &SA, const SeqData &SB); + +bool OnLocalUngappedHit(const SeqData &SA, const SeqData &SB, + const HSPData &HSP, float &Evalue, float &FractId); + +bool OnLocalGappedHit(const SeqData &SA, const SeqData &SB, + const HSPData &HSP, const PathData &PD, float &Evalue, float &FractId); + +bool OnGlobalHit(const SeqData &SA, const SeqData &SB, const PathData &PD, + float &FractId); + +void OnReject(const SeqData &SA, const SeqData &SB, double FractId, + const char *Path); + +void OnNotMatched(const char *Label, unsigned L); +void OnNewCluster(unsigned ClusterIndex, const char *Label, unsigned L); +void OnNewLibCluster(unsigned ClusterIndex, const char *Label, unsigned L); +void OnLibCluster(unsigned ClusterIndex, unsigned Size, double AvgId, + const char *Label); +void OnNewCluster(unsigned ClusterIndex, unsigned Size, double AvgId, + const char *Label); +void OnChainCov(const SeqData &NucleoSD, const SeqData &TargetSD, + float Score, float ChainCov); + +void SetUserFieldIndexes(const string &s); + +void BlastOut(FILE *f, const AlnData &AD); +void Blast6Out(FILE *f, const AlnData &AD); +void FastaPairOut(FILE *f, const AlnData &AD); +void UserOut(FILE *f, const AlnData &AD); + +void BlastOutORF(FILE *f, const AlnData &AD); + +void OpenOutputFiles(); +void CloseOutputFiles(); +void SetLibSeedCount(unsigned DBSeqCount); +const char *UserFieldIndexToStr(unsigned i); + +extern float **g_SubstMx; + +static char g_IdChar = '|'; +static char g_DiffChar = ' '; + +static inline char GetSymN(byte Letter1, byte Letter2) + { + Letter1 = toupper(Letter1); + Letter2 = toupper(Letter2); + if (Letter1 == Letter2) + return g_IdChar; + return g_DiffChar; + } + +static inline char GetSymA(byte 
Letter1, byte Letter2) + { + Letter1 = toupper(Letter1); + Letter2 = toupper(Letter2); + if (Letter1 == Letter2) + return '|'; + + float Score = g_SubstMx[Letter1][Letter2]; + if (Score >= 2.0f) + return ':'; + if (Score > 0.0f) + return '.'; + return ' '; + } + +static inline char GetSym(byte Letter1, byte Letter2, bool Nucleo) + { + if (Nucleo) + return GetSymN(Letter1, Letter2); + else + return GetSymA(Letter1, Letter2); + } + +static unsigned GetNDig(unsigned n) + { + if (n < 10) + return 1; + if (n < 100) + return 2; + if (n < 1000) + return 3; + if (n < 10000) + return 4; + if (n < 100000) + return 5; + if (n < 1000000) + return 6; + return 10; + } + +extern unsigned *g_UserFieldIndexes; +extern unsigned g_UserFieldCount; + +#endif // out_h diff --git a/path.cpp b/path.cpp new file mode 100644 index 0000000..9340344 --- /dev/null +++ b/path.cpp @@ -0,0 +1,151 @@ +#include "myutils.h" +#include "path.h" +#include "timing.h" + +#define TRACE 0 + +const unsigned PathMagic = 0x9A783A16; + +struct PathBuffer + { + unsigned Magic; + char *Buffer; + unsigned Size; + bool InUse; + }; + +static PathBuffer **g_PathBuffers; +static unsigned g_PathBufferSize; + +static char *AllocBuffer(unsigned Size) + { + if (Size == 0) + return 0; + +// Is a free buffer that is big enough? 
+ for (unsigned i = 0; i < g_PathBufferSize; ++i) + { + PathBuffer *PB = g_PathBuffers[i]; + asserta(PB->Magic == PathMagic); + if (!PB->InUse) + { + if (PB->Size >= Size) + { + PB->InUse = true; + return PB->Buffer; + } + if (PB->Buffer == 0) + { + unsigned Size2 = Size + 1024; + PB->Buffer = MYALLOC(char, Size2, Path); + PB->Size = Size2; + PB->InUse = true; + return PB->Buffer; + } + } + } + +// No available buffer, must expand g_PathBuffers[] + unsigned NewPathBufferSize = g_PathBufferSize + 1024; + PathBuffer **NewPathBuffers = MYALLOC(PathBuffer *, NewPathBufferSize, Path); + + for (unsigned i = 0; i < g_PathBufferSize; ++i) + NewPathBuffers[i] = g_PathBuffers[i]; + + for (unsigned i = g_PathBufferSize; i < NewPathBufferSize; ++i) + { + PathBuffer *PB = MYALLOC(PathBuffer, 1, Path); + PB->Magic = PathMagic; + PB->Buffer = 0; + PB->Size = 0; + PB->InUse = false; + NewPathBuffers[i] = PB; + } + + PathBuffer *PB = NewPathBuffers[g_PathBufferSize]; + + MYFREE(g_PathBuffers, g_PathBufferSize, Path); + g_PathBuffers = NewPathBuffers; + g_PathBufferSize = NewPathBufferSize; + + asserta(!PB->InUse && PB->Buffer == 0); + + unsigned Size2 = Size + 1024; + PB->Buffer = MYALLOC(char, Size2, Path); + PB->Size = Size2; + PB->InUse = true; + return PB->Buffer; + } + +static void FreeBuffer(char *Buffer) + { + if (Buffer == 0) + return; + + for (unsigned i = 0; i < g_PathBufferSize; ++i) + { + PathBuffer *PB = g_PathBuffers[i]; + if (PB->Buffer == Buffer) + { + asserta(PB->InUse); + PB->InUse = false; + return; + } + } + + Die("FreeBuffer, not found"); + } + +void PathData::Alloc(unsigned MaxLen) + { + if (MaxLen < Bytes) + return; + + StartTimer(PathAlloc); + if (Bytes > 0) + { + FreeBuffer(Front); + } + + Bytes = MaxLen + 1; + Front = AllocBuffer(Bytes); + Back = Front + Bytes - 1; + Start = 0; + EndTimer(PathAlloc); + } + +void PathData::Free() + { + FreeBuffer(Front); + Front = 0; + Start = 0; + Back = 0; + } + +void PathData::Copy(const PathData &rhs) + { + 
Alloc(rhs.Bytes); + strcpy(Front, rhs.Front); + Start = Front + (rhs.Start - rhs.Front); + } + +void PathData::FromStr(const char *PathStr) + { + asserta(PathStr != 0); + unsigned NeededBytes = (unsigned) strlen(PathStr) + 1; + Alloc(NeededBytes); + strcpy(Front, PathStr); + Start = Front; + } + +void LogPathStats() + { + Log("\n"); + unsigned Bytes = 0; + for (unsigned i = 0; i < g_PathBufferSize; ++i) + { + const PathBuffer *PB = g_PathBuffers[i]; + Bytes += PB->Size; + } + Log("%u paths allocated, total memory %u bytes\n", g_PathBufferSize, Bytes); + } diff --git a/path.h b/path.h new file mode 100644 index 0000000..f63be7e --- /dev/null +++ b/path.h @@ -0,0 +1,63 @@ +#ifndef path_h +#define path_h + +struct PathData + { +private: + PathData(PathData &); + PathData &operator=(PathData &); + +public: + char *Start; + char *Front; + char *Back; + unsigned Bytes; + +public: + PathData() + { + Clear(true); + } + ~PathData() + { + Free(); + } + void Free(); + void Alloc(unsigned MaxLen); + void Clear(bool ctor = false) + { + Start = 0; + if (ctor) + { + Front = 0; + Back = 0; + Bytes = 0; + } + else + Free(); + } + void Copy(const PathData &rhs); + void FromStr(const char *PathStr); + void Reverse() + { + asserta(Start != 0); + unsigned L = (unsigned) strlen(Start); + for (unsigned k = 0; k < L/2; ++k) + { + char c = Start[k]; + Start[k] = Start[L-k-1]; + Start[L-k-1] = c; + } + } + void SetEmpty() + { + Start = 0; + } + + bool IsEmpty() const + { + return Start == 0; + } + }; + +#endif // path_h diff --git a/searchchime.cpp b/searchchime.cpp new file mode 100644 index 0000000..c00a9c4 --- /dev/null +++ b/searchchime.cpp @@ -0,0 +1,304 @@ +#include "myutils.h" +#include "ultra.h" +#include "chime.h" +#include "uc.h" +#include "dp.h" +#include +#include + +#define TRACE 0 + +extern FILE *g_fUChime; + +void GetCandidateParents(Ultra &U, const SeqData &QSD, float AbQ, + vector &Parents); + +void AlignChime(const SeqData &QSD, const SeqData &ASD, const SeqData &BSD, + 
const string &PathQA, const string &PathQB, ChimeHit2 &Hit); + +double GetFractIdGivenPath(const byte *A, const byte *B, const char *Path, bool Nucleo); + +static void GetSmoothedIdVec(const SeqData &QSD, const SeqData &PSD, const string &Path, + vector &IdVec, unsigned d) + { + IdVec.clear(); + const unsigned ColCount = SIZE(Path); + + const byte *Q = QSD.Seq; + const byte *P = PSD.Seq; + + const unsigned QL = QSD.L; + const unsigned PL = PSD.L; + + if (QL <= d) + { + IdVec.resize(QSD.L, 0); + return; + } + + unsigned QPos = 0; + unsigned PPos = 0; + + vector SameVec; + SameVec.reserve(QL); + for (unsigned Col = 0; Col < ColCount; ++Col) + { + char c = Path[Col]; + + bool Same = false; + if (c == 'M') + { + byte q = Q[QPos]; + byte p = P[PPos]; + Same = (toupper(q) == toupper(p)); + } + + if (c == 'M' || c == 'D') + { + ++QPos; + SameVec.push_back(Same); + } + + if (c == 'M' || c == 'I') + ++PPos; + } + + asserta(SIZE(SameVec) == QL); + + unsigned n = 0; + for (unsigned QPos = 0; QPos < d; ++QPos) + { + if (SameVec[QPos]) + ++n; + IdVec.push_back(n); + } + + for (unsigned QPos = d; QPos < QL; ++QPos) + { + if (SameVec[QPos]) + ++n; + IdVec.push_back(n); + if (SameVec[QPos-d]) + --n; + } + asserta(SIZE(IdVec) == QL); + +#if TRACE + { + Log("\n"); + Log("GetSmoothedIdVec\n"); + unsigned QPos = 0; + unsigned PPos = 0; + Log("Q P Same Id\n"); + Log("- - ---- -------\n"); + for (unsigned Col = 0; Col < ColCount; ++Col) + { + char c = Path[Col]; + + bool Same = false; + if (c == 'M') + { + byte q = Q[QPos]; + byte p = P[PPos]; + Same = (toupper(q) == toupper(p)); + Log("%c %c %4c %7d\n", q, p, tof(Same), IdVec[QPos]); + } + + if (c == 'M' || c == 'D') + ++QPos; + if (c == 'M' || c == 'I') + ++PPos; + } + } +#endif + } + +bool SearchChime(Ultra &U, const SeqData &QSD, float QAb, + const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF, + float MinFractId, ChimeHit2 &Hit) + { + Hit.Clear(); + Hit.QLabel = QSD.Label; + + if (opt_verbose) + { + Log("\n"); + 
Log("SearchChime()\n"); + Log("Query>%s\n", QSD.Label); + } + + vector Parents; + GetCandidateParents(U, QSD, QAb, Parents); + + unsigned ParentCount = SIZE(Parents); + if (ParentCount <= 1) + { + if (opt_verbose) + Log("%u candidate parents, done.\n", ParentCount); + return false; + } + + if (opt_fastalign) + HF.SetA(QSD); + HSPFinder *ptrHF = (opt_fastalign ? &HF : 0); + + unsigned ChunkLength; + vector ChunkLos; + GetChunkInfo(QSD.L, ChunkLength, ChunkLos); + const unsigned ChunkCount = SIZE(ChunkLos); + + vector ChunkIndexToBestId(ChunkCount, 0); + vector ChunkIndexToBestParentIndex(ChunkCount, UINT_MAX); + + vector PSDs; + vector Paths; + double TopPctId = 0.0; + unsigned TopParentIndex = UINT_MAX; + unsigned QL = QSD.L; + vector MaxIdVec(QL, 0); + for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex) + { + unsigned ParentSeqIndex = Parents[ParentIndex]; + + SeqData PSD; + //PSD.Label = U.GetSeedLabel(ParentSeqIndex); + //PSD.Seq = U.GetSeedSeq(ParentSeqIndex); + //PSD.L = U.GetSeedLength(ParentSeqIndex); + //PSD.Index = ParentSeqIndex; + U.GetSeqData(ParentSeqIndex, PSD); + PSDs.push_back(PSD); + + if (opt_fastalign) + HF.SetB(PSD); + + PathData PD; + + float HSPId; + bool Found = GlobalAlign(QSD, PSD, AP, AH, *ptrHF, MinFractId, HSPId, PD); + if (!Found) + { + Paths.push_back(""); + continue; + } + + double PctId = 100.0*GetFractIdGivenPath(QSD.Seq, PSD.Seq, PD.Start, true); + if (opt_selfid && PctId == 100.0) + { + Paths.push_back(""); + continue; + } + + if (PctId > TopPctId) + { + TopParentIndex = ParentIndex; + TopPctId = PctId; + if (TopPctId >= 100.0 - opt_mindiv) + { + if (opt_verbose) + { + Log(" %.1f%% >%s\n", TopPctId, PSD.Label); + Log(" Top hit exceeds ctl threshold, done.\n"); + return false; + } + } + } + + string Path = PD.Start; + Paths.push_back(Path); + + vector IdVec; + GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow); + + for (unsigned QPos = 0; QPos < QL; ++QPos) + if (IdVec[QPos] > MaxIdVec[QPos]) + 
MaxIdVec[QPos] = IdVec[QPos]; + } + + vector BestParents; + for (unsigned k = 0; k < opt_maxp; ++k) + { + unsigned BestParent = UINT_MAX; + unsigned BestCov = 0; + for (unsigned ParentIndex = 0; ParentIndex < ParentCount; ++ParentIndex) + { + const SeqData &PSD = PSDs[ParentIndex]; + const string &Path = Paths[ParentIndex]; + if (Path == "") + continue; + + vector IdVec; + GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow); + + unsigned Cov = 0; + for (unsigned QPos = 0; QPos < QL; ++QPos) + if (IdVec[QPos] == MaxIdVec[QPos]) + ++Cov; + + if (Cov > BestCov) + { + BestParent = ParentIndex; + BestCov = Cov; + } + } + + if (BestParent == UINT_MAX) + break; + + BestParents.push_back(BestParent); + vector IdVec; + + const SeqData &PSD = PSDs[BestParent]; + const string &Path = Paths[BestParent]; + GetSmoothedIdVec(QSD, PSD, Path, IdVec, opt_idsmoothwindow); + for (unsigned QPos = 0; QPos < QL; ++QPos) + if (IdVec[QPos] == MaxIdVec[QPos]) + MaxIdVec[QPos] = UINT_MAX; + } + + unsigned BestParentCount = SIZE(BestParents); + + if (opt_verbose) + { + Log("%u/%u best parents\n", BestParentCount, ParentCount); + for (unsigned k = 0; k < BestParentCount; ++k) + { + unsigned i = BestParents[k]; + Log(" %s\n", PSDs[i].Label); + } + } + + bool Found = false; + for (unsigned k1 = 0; k1 < BestParentCount; ++k1) + { + unsigned i1 = BestParents[k1]; + asserta(i1 < ParentCount); + + const SeqData &PSD1 = PSDs[i1]; + const string &Path1 = Paths[i1]; + + for (unsigned k2 = k1 + 1; k2 < BestParentCount; ++k2) + { + unsigned i2 = BestParents[k2]; + asserta(i2 < ParentCount); + asserta(i2 != i1); + + const SeqData &PSD2 = PSDs[i2]; + const string &Path2 = Paths[i2]; + + ChimeHit2 Hit2; + AlignChime(QSD, PSD1, PSD2, Path1, Path2, Hit2); + Hit2.PctIdQT = TopPctId; + + if (Hit2.Accept()) + Found = true; + + if (Hit2.Score > Hit.Score) + Hit = Hit2; + + if (opt_verbose) + Hit2.LogMe(); + } + } + + return Found; + } diff --git a/seq.h b/seq.h new file mode 100644 index 0000000..9014641 
--- /dev/null +++ b/seq.h @@ -0,0 +1,38 @@ +#ifndef seq_h +#define seq_h + +struct ORFData; + +struct SeqData + { + const char *Label; + const byte *Seq; + unsigned L; + unsigned Index; + +// RevComp means that SeqData.Seq is reverse-complemented relative +// to the sequence in the input file (query or db). Coordinates in +// a hit (e.g., AlnData) will be relative to SeqData.Seq, so both +// the sequence and the coordinates should be r.c.'d for output. + bool RevComp; + bool Nucleo; + const ORFData *ORFParent; + + SeqData() + { + Clear(); + } + + void Clear() + { + Label = 0; + Seq = 0; + L = 0; + Index = UINT_MAX; + RevComp = false; + Nucleo = false; + ORFParent = 0; + } + }; + +#endif // seq_h diff --git a/seqdb.cpp b/seqdb.cpp new file mode 100644 index 0000000..03de189 --- /dev/null +++ b/seqdb.cpp @@ -0,0 +1,289 @@ +#include "myutils.h" +#include "seqdb.h" +#include "alpha.h" +#include "timing.h" +#include "sfasta.h" +#include "seq.h" + +void SeqToFasta(FILE *f, const char *Label, const byte *Seq, unsigned L) + { + const unsigned ROWLEN = 80; + if (Label != 0) + fprintf(f, ">%s\n", Label); + unsigned BlockCount = (L + ROWLEN - 1)/ROWLEN; + for (unsigned BlockIndex = 0; BlockIndex < BlockCount; ++BlockIndex) + { + unsigned From = BlockIndex*ROWLEN; + unsigned To = From + ROWLEN; + if (To >= L) + To = L; + for (unsigned Pos = From; Pos < To; ++Pos) + fputc(Seq[Pos], f); + fputc('\n', f); + } + } + +SeqDB::~SeqDB() + { + Clear(); + } + +SeqDB::SeqDB() + { + Clear(true); + } + +void SeqDB::Clear(bool ctor) + { + if (!ctor) + { + for (unsigned i = 0; i < m_SeqCount; ++i) + { + unsigned n = strlen(m_Labels[i]); + MYFREE(m_Labels[i], n, SeqDB); + MYFREE(m_Seqs[i], m_SeqLengths[i], SeqDB); + } + MYFREE(m_Labels, m_Size, SeqDB); + MYFREE(m_Seqs, m_Size, SeqDB); + MYFREE(m_SeqLengths, m_Size, SeqDB); + } + + m_FileName.clear(); + m_SeqCount = 0; + m_Size = 0; + + m_Labels = 0; + m_Seqs = 0; + m_SeqLengths = 0; + + m_Aligned = false; + m_IsNucleo = false; + m_IsNucleoSet 
= false; + } + +void SeqDB::InitEmpty(bool Nucleo) + { + Clear(); + m_IsNucleo = Nucleo; + m_IsNucleoSet = true; + } + +void SeqDB::FromFasta(const string &FileName, bool AllowGaps) + { + Clear(); + m_FileName = FileName; + SFasta SF; + + SF.Open(FileName); + SF.m_AllowGaps = AllowGaps; + + ProgressStep(0, 1000, "Reading %s", FileName.c_str()); + for (;;) + { + unsigned QueryPctDoneX10 = SF.GetPctDoneX10(); + ProgressStep(QueryPctDoneX10, 1000, "Reading %s", FileName.c_str()); + const byte *Seq = SF.GetNextSeq(); + if (Seq == 0) + break; + + const char *Label = SF.GetLabel(); + unsigned L = SF.GetSeqLength(); + AddSeq(Label, Seq, L); + } + ProgressStep(999, 1000, "Reading %s", FileName.c_str()); + + SetIsNucleo(); + + Progress("%s sequences\n", IntToStr(GetSeqCount())); + } + +void SeqDB::ToFasta(const string &FileName) const + { + FILE *f = CreateStdioFile(FileName); + for (unsigned SeqIndex = 0; SeqIndex < GetSeqCount(); ++SeqIndex) + ToFasta(f, SeqIndex); + CloseStdioFile(f); + } + +void SeqDB::SeqToFasta(FILE *f, unsigned SeqIndex, bool WithLabel) const + { + if (WithLabel) + fprintf(f, ">%s\n", GetLabel(SeqIndex)); + + const unsigned ROWLEN = 80; + + unsigned L = GetSeqLength(SeqIndex); + const byte *Seq = GetSeq(SeqIndex); + unsigned BlockCount = (L + ROWLEN - 1)/ROWLEN; + for (unsigned BlockIndex = 0; BlockIndex < BlockCount; ++BlockIndex) + { + unsigned From = BlockIndex*ROWLEN; + unsigned To = From + ROWLEN; + if (To >= L) + To = L; + for (unsigned Pos = From; Pos < To; ++Pos) + fputc(Seq[Pos], f); + fputc('\n', f); + } + } + +void SeqDB::ToFasta(FILE *f, unsigned SeqIndex) const + { + asserta(SeqIndex < m_SeqCount); + fprintf(f, ">%s\n", GetLabel(SeqIndex)); + SeqToFasta(f, SeqIndex); + } + +unsigned SeqDB::GetMaxLabelLength() const + { + const unsigned SeqCount = GetSeqCount(); + unsigned MaxL = 0; + for (unsigned Index = 0; Index < SeqCount; ++Index) + { + unsigned L = (unsigned) strlen(m_Labels[Index]); + if (L > MaxL) + MaxL = L; + } + return MaxL; + 
} + +unsigned SeqDB::GetMaxSeqLength() const + { + const unsigned SeqCount = GetSeqCount(); + unsigned MaxL = 0; + for (unsigned Index = 0; Index < SeqCount; ++Index) + { + unsigned L = m_SeqLengths[Index]; + if (L > MaxL) + MaxL = L; + } + return MaxL; + } + +void SeqDB::LogMe() const + { + Log("\n"); + const unsigned SeqCount = GetSeqCount(); + Log("SeqDB %u seqs, aligned=%c\n", SeqCount, tof(m_Aligned)); + if (SeqCount == 0) + return; + + Log("Index Label Length Seq\n"); + Log("----- ---------------- ------ ---\n"); + for (unsigned Index = 0; Index < SeqCount; ++Index) + { + Log("%5u", Index); + Log(" %16.16s", m_Labels[Index]); + unsigned L = m_SeqLengths[Index]; + Log(" %6u", L); + Log(" %*.*s", L, L, m_Seqs[Index]); + Log("\n"); + } + } + +void SeqDB::GetSeqData(unsigned Id, SeqData &Buffer) const + { + asserta(Id < m_SeqCount); + Buffer.Seq = m_Seqs[Id]; + Buffer.Label = m_Labels[Id]; + Buffer.L = m_SeqLengths[Id]; + Buffer.Index = Id; + Buffer.ORFParent = 0; + Buffer.RevComp = false; + Buffer.Nucleo = IsNucleo(); + } + +void SeqDB::SetIsNucleo() + { + const unsigned SeqCount = GetSeqCount(); + unsigned N = 0; + for (unsigned i = 0; i < 100; ++i) + { + unsigned SeqIndex = unsigned(rand()%SeqCount); + const byte *Seq = GetSeq(SeqIndex); + unsigned L = GetSeqLength(SeqIndex); + const unsigned Pos = unsigned(rand()%L); + byte c = Seq[Pos]; + + if (g_IsNucleoChar[c]) + ++N; + } + m_IsNucleo = (N > 80); + m_IsNucleoSet = true; + } + +unsigned SeqDB::GetTotalLength() const + { + const unsigned SeqCount = GetSeqCount(); + unsigned TotalLength = 0; + for (unsigned Id = 0; Id < SeqCount; ++Id) + TotalLength += GetSeqLength(Id); + return TotalLength; + } + +unsigned SeqDB::AddSeq(const char *Label, const byte *Seq, unsigned L) + { + StartTimer(AddSeq); + if (m_SeqCount >= m_Size) + { + unsigned NewSize = unsigned(m_Size*1.5) + 1024; + char **NewLabels = MYALLOC(char *, NewSize, SeqDB); + byte **NewSeqs = MYALLOC(byte *, NewSize, SeqDB); + unsigned *NewSeqLengths = 
MYALLOC(unsigned, NewSize, SeqDB); + + for (unsigned i = 0; i < m_SeqCount; ++i) + { + NewLabels[i] = m_Labels[i]; + NewSeqs[i] = m_Seqs[i]; + NewSeqLengths[i] = m_SeqLengths[i]; + } + + MYFREE(m_Labels, m_SeqCount, SeqDB); + MYFREE(m_Seqs, m_SeqCount, SeqDB); + MYFREE(m_SeqLengths, m_SeqCount, SeqDB); + + m_Labels = NewLabels; + m_Seqs = NewSeqs; + m_SeqLengths = NewSeqLengths; + m_Size = NewSize; + } + + unsigned Index = m_SeqCount++; + m_Seqs[Index] = MYALLOC(byte, L, SeqDB); + memcpy(m_Seqs[Index], Seq, L); + + unsigned n = strlen(Label) + 1; + m_Labels[Index] = MYALLOC(char, n, SeqDB); + memcpy(m_Labels[Index], Label, n); + + if (Index == 0) + m_Aligned = true; + else + m_Aligned = (m_Aligned && L == m_SeqLengths[0]); + + m_SeqLengths[Index] = L; + + EndTimer(AddSeq); + return Index; + } + +unsigned SeqDB::GetIndex(const char *Label) const + { + for (unsigned i = 0; i < m_SeqCount; ++i) + if (strcmp(Label, m_Labels[i]) == 0) + return i; + Die("SeqDB::GetIndex(%s), not found", Label); + return UINT_MAX; + } + +void SeqDB::MakeLabelToIndex(map &LabelToIndex) + { + LabelToIndex.clear(); + for (unsigned i = 0; i < m_SeqCount; ++i) + { + const string &Label = string(GetLabel(i)); + if (LabelToIndex.find(Label) != LabelToIndex.end()) + Die("Duplicate label: %s", Label.c_str()); + LabelToIndex[Label] = i; + } + } diff --git a/seqdb.h b/seqdb.h new file mode 100644 index 0000000..fafbdd9 --- /dev/null +++ b/seqdb.h @@ -0,0 +1,109 @@ +#ifndef seqdb_h +#define seqdb_h + +#include +#include +#include "myutils.h" + +struct SeqData; + +using namespace std; + +struct SeqDB + { +private: + SeqDB(const SeqDB &rhs); + SeqDB &operator=(const SeqDB &rhs); + +public: + string m_FileName; + char **m_Labels; + byte **m_Seqs; + unsigned *m_SeqLengths; + unsigned m_SeqCount; + unsigned m_Size; + + bool m_Aligned; + bool m_IsNucleo; + bool m_IsNucleoSet; + +public: + SeqDB(); + ~SeqDB(); + void Clear(bool ctor = false); + void InitEmpty(bool Nucleo); + + unsigned AddSeq(const char 
*Label, const byte *Seq, unsigned L); + + byte *GetSeq(unsigned SeqIndex) const + { + asserta(SeqIndex < m_SeqCount); + return m_Seqs[SeqIndex]; + } + + const char *GetLabel(unsigned SeqIndex) const + { + asserta(SeqIndex < m_SeqCount); + return m_Labels[SeqIndex]; + } + + unsigned GetSeqLength(unsigned SeqIndex) const + { + asserta(SeqIndex < m_SeqCount); + return m_SeqLengths[SeqIndex]; + } + + unsigned GetSeqCount() const + { + return m_SeqCount; + } + + unsigned GetPairCount() const + { + unsigned SeqCount = GetSeqCount(); + return (SeqCount*(SeqCount - 1))/2; + } + + unsigned GetPairIndex(unsigned SeqIndex1, unsigned SeqIndex2) const + { + if (SeqIndex1 > SeqIndex2) + return (SeqIndex1*(SeqIndex1 - 1))/2 + SeqIndex2; + return (SeqIndex2*(SeqIndex2 - 1))/2 + SeqIndex1; + } + + unsigned GetColCount() const + { + if (!m_Aligned) + Die("SeqDB::GetColCount, not aligned"); + if (m_SeqCount == 0) + Die("SeqDB::GetColCount, empty"); + return m_SeqLengths[0]; + } + + bool IsNucleo() const + { + asserta(m_IsNucleoSet); + return m_IsNucleo; + } + + void GetSeqData(unsigned Id, SeqData &Buffer) const; + + unsigned GetMaxLabelLength() const; + unsigned GetMaxSeqLength() const; + void SetIsNucleo(); + unsigned GetIndex(const char *Label) const; + void MakeLabelToIndex(map &LabelToIndex); + + void LogMe() const; + void FromFasta(const string &FileName, bool AllowGaps = false); + + void ToFasta(const string &FileName) const; + void ToFasta(FILE *f, unsigned SeqIndex) const; + void SeqToFasta(FILE *f, unsigned SeqIndex, bool WithLabel = false) const; + + unsigned GetTotalLength() const; + }; + +bool isgap(byte c); + +#endif diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index 1ca79b1..620de95 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -15,6 +15,7 @@ vector SeqSummaryCommand::setParameters(){ try { CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta); CommandParameter 
pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname); + CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir); diff --git a/setnucmx.cpp b/setnucmx.cpp new file mode 100644 index 0000000..030ff5a --- /dev/null +++ b/setnucmx.cpp @@ -0,0 +1,77 @@ +#include "myutils.h" +#include "mx.h" + +Mx g_SubstMxf; +float **g_SubstMx; + +static const char Alphabet[] = "ACGTU"; + +void SetNucSubstMx(double Match, double Mismatch) + { + static bool Done = false; + if (Done) + return; + Done = true; + + if (Match <= 0.0) + Die("Match score should be +ve"); + if (Mismatch >= 0.0) + Die("Mismatch score should be -ve"); + + unsigned N = unsigned(strlen(Alphabet)); + + g_SubstMxf.Alloc("NUCMX", 256, 256); + strcpy(g_SubstMxf.m_Alpha, "ACGT"); + g_SubstMxf.Init(0); + g_SubstMx = g_SubstMxf.GetData(); + for (unsigned i = 0; i < N; ++i) + { + for (unsigned j = 0; j < N; ++j) + { + float v = float(i == j ? 
Match : Mismatch); + + byte ui = (byte) toupper(Alphabet[i]); + byte uj = (byte) toupper(Alphabet[j]); + byte li = (byte) tolower(ui); + byte lj = (byte) tolower(uj); + ui = (byte) toupper(ui); + uj = (byte) toupper(uj); + + g_SubstMx[ui][uj] = v; + g_SubstMx[uj][ui] = v; + + g_SubstMx[ui][lj] = v; + g_SubstMx[uj][li] = v; + + g_SubstMx[li][uj] = v; + g_SubstMx[lj][ui] = v; + + g_SubstMx[li][lj] = v; + g_SubstMx[lj][li] = v; + } + } + + for (unsigned j = 0; j < N; ++j) + { + float v = 0.0f; + + byte ui = (byte) 'N'; + byte uj = (byte) toupper(Alphabet[j]); + byte li = (byte) 'n'; + byte lj = (byte) tolower(uj); + ui = (byte) toupper(ui); + uj = (byte) toupper(uj); + + g_SubstMx[ui][uj] = v; + g_SubstMx[uj][ui] = v; + + g_SubstMx[ui][lj] = v; + g_SubstMx[uj][li] = v; + + g_SubstMx[li][uj] = v; + g_SubstMx[lj][ui] = v; + + g_SubstMx[li][lj] = v; + g_SubstMx[lj][li] = v; + } + } diff --git a/sfasta.cpp b/sfasta.cpp new file mode 100644 index 0000000..5e794c6 --- /dev/null +++ b/sfasta.cpp @@ -0,0 +1,467 @@ +#include "sfasta.h" +#include "orf.h" +#include "alpha.h" +#include "timing.h" + +static inline bool isgap(byte c) + { + return c == '-' || c == '.'; + } + +const unsigned BufferSize = 16*1024*1024; +// Length of the longest run of one repeated byte in Seq[0..L); yields 1 when L is 0 or 1. +static unsigned GetMaxPoly(const byte *Seq, unsigned L) + { + byte CurrChar = (L > 0 ? Seq[0] : 0); // do not read past an empty sequence + unsigned Start = 0; + unsigned MaxLen = 1; + for (unsigned i = 1; i <= L; ++i) // i == L is a sentinel step that closes the final run + { + byte c = (i < L ? Seq[i] : 0); // compare as byte; a plain char may be signed and then never matches a byte >= 0x80 (assuming byte is unsigned char) + if (c != CurrChar || i == L) + { + unsigned Len = i - Start; + if (Len > MaxLen) + MaxLen = Len; + CurrChar = c; + Start = i; + } + } + return MaxLen; + } + +SFasta::SFasta() + { + m_FileName = ""; + m_File = 0; + m_Buffer = 0; + m_BufferSize = 0; + m_BufferOffset = 0; + m_BufferBytes = 0; + m_FilePos = 0; + m_FileSize = 0; + m_Label = 0; + m_SeqLength = 0; + m_TooShortCount = 0; + m_TooLongCount = 0; + m_ShortestLength = 0; + m_LongestLength = 0; + m_IsNucleo = false; + m_IsNucleoSet = false; + } + +SFasta::~SFasta() + { + Clear(); + } + +void SFasta::Clear() + { +
MYFREE(m_Buffer, m_BufferSize, SFasta); + if (m_File != 0) + CloseStdioFile(m_File); + + m_FileName = ""; + m_File = 0; + m_Buffer = 0; + m_BufferSize = 0; + m_BufferOffset = 0; + m_BufferBytes = 0; + m_FilePos = 0; + m_FileSize = 0; + m_Label = 0; + m_SeqLength = 0; + m_SeqIndex = UINT_MAX; + m_AllowGaps = false; + m_IsNucleo = false; + m_IsNucleoSet = false; + m_TooShortCount = 0; + m_TooLongCount = 0; + m_ShortestLength = 0; + m_LongestLength = 0; + m_TooPolyCount = 0; + } + +void SFasta::LogMe() const + { + Log("\n"); + Log("SFasta::LogMe()\n"); + Log("FileName=%s\n", m_FileName.c_str()); + Log("FileSize=%u\n", (unsigned) m_FileSize); + Log("FilePos=%u\n", (unsigned) m_FilePos); + Log("BufferSize=%u\n", m_BufferSize); + Log("BufferPos=%u\n", m_BufferOffset); + Log("BufferBytes=%u\n", m_BufferBytes); + if (m_Label == 0) + Log("Label=NULL\n"); + else + Log("Label=%s\n", m_Label); + Log("SeqLength=%u\n", m_SeqLength); + } + +const byte *SFasta::GetNextSeq() + { + for (;;) + { + const byte *Seq = GetNextSeqLo(); + if (Seq == 0) + { + if (m_TooShortCount > 0) + Warning("%u short sequences (--minlen %u, shortest %u) discarded from %s", + m_TooShortCount, opt_minlen, m_ShortestLength, m_FileName.c_str()); + if (m_TooLongCount > 0) + Warning("%u long sequences (--maxlen %u, longest %u) discarded from %s", + m_TooLongCount, opt_maxlen, m_LongestLength, m_FileName.c_str()); + if (m_TooPolyCount > 0) + Warning("%u sequences with long homopolymers discarded (--maxpoly %u)", + m_TooPolyCount, opt_maxpoly); + return 0; + } + if (m_SeqLength < opt_minlen) + { + ++m_TooShortCount; + if (m_ShortestLength == 0 || m_SeqLength < m_ShortestLength) + m_ShortestLength = m_SeqLength; + continue; + } + if (m_SeqLength > opt_maxlen && opt_maxlen != 0) + { + if (m_LongestLength == 0 || m_SeqLength > m_LongestLength) + m_LongestLength = m_SeqLength; + ++m_TooLongCount; + continue; + } + return Seq; + } + } + +const byte *SFasta::GetNextSeqLo() + { +// End of cache? 
+ if (m_BufferOffset == m_BufferBytes) + { + // End of file? + if (m_FilePos == m_FileSize) + return 0; + FillCache(); + } + + StartTimer(SF_GetNextSeq); + asserta(m_Buffer[m_BufferOffset] == '>'); + m_Label = (char *) (m_Buffer + m_BufferOffset + 1); + +//// Scan to end-of-line. +//// Use dubious library function strchr() in the hope +//// that it uses fast machine code. +// byte *ptr = (byte *) strchr(m_Label, '\n'); +// asserta(ptr != 0); +// *ptr = 0; + + byte *ptr = 0; + for (unsigned i = m_BufferOffset; i < m_BufferSize; ++i) + { + char c = m_Buffer[i]; + if (c == '\n' || c == '\r') + { + ptr = m_Buffer + i; + break; + } + } + asserta(ptr != 0); + + if (opt_trunclabels) + { + for (char *p = m_Label; *p; ++p) + if (isspace(*p)) + { + *p = 0; + break; + } + } + else + { + for (char *p = m_Label; *p; ++p) + { + if (*p == '\t') + *p = ' '; + else if (*p == '\r' || *p == '\n') + { + *p = 0; + char NextChar = *(p+1); + if (NextChar == '\r' || NextChar == '\n') + ++p; + break; + } + } + } + +// ptr points to end-of-line. +// Move to start of sequence data. + byte *Seq = ++ptr; + +// Delete white space in-place + byte *To = ptr; + m_BufferOffset = (unsigned) (ptr - m_Buffer); + while (m_BufferOffset < m_BufferBytes) + { + byte c = m_Buffer[m_BufferOffset]; + if (c == '>') + { + char prevc = '\n'; + if (m_BufferOffset > 0) + prevc = m_Buffer[m_BufferOffset-1]; + if (prevc == '\n' || prevc == '\r') + break; + } + ++m_BufferOffset; + if (isalpha(c) || (isgap(c) && m_AllowGaps)) + *To++ = c; + else if (c == '\n' || c == '\r') + continue; + else + { + const char *Label = (m_Label == 0 ? 
"" : m_Label); + static bool WarningDone = false; + if (!WarningDone) + { + if (isgap(c)) + Warning("Ignoring gaps in FASTA file '%s'", + m_FileName.c_str()); + else if (isprint(c)) + Warning("Invalid FASTA file '%s', non-letter '%c' in sequence >%s", + m_FileName.c_str(), c, Label); + else + Warning("Invalid FASTA file '%s', non-printing byte (hex %02x) in sequence >%s", + m_FileName.c_str(), c, Label); + WarningDone = true; + } + continue; + } + } + m_SeqLength = unsigned(To - Seq); + + if (m_SeqIndex == UINT_MAX) + m_SeqIndex = 0; + else + ++m_SeqIndex; + + EndTimer(SF_GetNextSeq); + return Seq; + } + +void SFasta::Open(const string &FileName) + { + Clear(); + m_FileName = FileName; + m_File = OpenStdioFile(FileName); + m_BufferSize = BufferSize; + //m_Buffer = myalloc(m_BufferSize); + m_Buffer = MYALLOC(byte, m_BufferSize, SFasta); + m_FileSize = GetStdioFileSize(m_File); + } + +void SFasta::Rewind() + { + m_BufferOffset = 0; + m_BufferBytes = 0; + m_FilePos = 0; + } + +bool SFasta::SetIsNucleo() + { + if (m_FilePos != 0) + Die("SFasta::IsNucleo, not at BOF"); + + unsigned LetterCount = 0; + unsigned NucleoLetterCount = 0; + for (;;) + { + const byte *Seq = GetNextSeq(); + if (Seq == 0) + break; + unsigned L = GetSeqLength(); + for (unsigned i = 0; i < L; ++i) + if (g_IsNucleoChar[Seq[i]]) + ++NucleoLetterCount; + LetterCount += L; + if (LetterCount > 256) + break; + } + Rewind(); + if (LetterCount == 0) + { + m_IsNucleoSet = true; + m_IsNucleo = true; + return true; + } + +// Nucleo if more than 90% nucleo letters AGCTUN + m_IsNucleo = double(NucleoLetterCount)/LetterCount > 0.9; + m_IsNucleoSet = true; + return m_IsNucleo; + } + +void SFasta::FillCache() + { + StartTimer(SF_FillCache); + asserta(m_FilePos < m_FileSize); + +// off_t may be larger type than unsigned, e.g. 64- vs. 32-bit. 
+ off_t otBytesToRead = m_FileSize - m_FilePos; + + bool FinalBuffer = true; + if (otBytesToRead > (off_t) m_BufferSize) + { + FinalBuffer = false; + otBytesToRead = m_BufferSize; + } + + unsigned BytesToRead = unsigned(otBytesToRead); + asserta(BytesToRead > 0); + asserta(BytesToRead <= m_BufferSize); + + SetStdioFilePos(m_File, m_FilePos); + ReadStdioFile(m_File, m_Buffer, BytesToRead); + if (m_Buffer[0] != '>') + { + if (m_FilePos == 0) + Die("Input is not FASTA file"); + else + Die("SFasta::FillCache() failed, expected '>'"); + } + + m_BufferOffset = 0; + +// If last buffer in file, done + if (FinalBuffer) + { + m_BufferBytes = BytesToRead; + m_FilePos += BytesToRead; + EndTimer(SF_FillCache); + return; + } + +// If not last buffer, truncate any partial sequence +// at end of buffer. Search backwards to find last '>'. + byte *ptr = m_Buffer + BytesToRead - 1; + while (ptr > m_Buffer) + { + if (ptr[0] == '>' && (ptr[-1] == '\n' || ptr[-1] == '\r')) + break; + --ptr; + } + + if (ptr == m_Buffer) + { + LogMe(); + if (*ptr != '>') + { + // No '>' found. + // This might technically be legal FASTA if the entire + // buffer is white space, but strange if not the last buffer + // in the file, so quit anyway. + Die("Failed to find '>' (pos=%u, bytes=%u)", + (unsigned) m_FilePos, BytesToRead); + } + else + { + // Entire buffer is one sequence which may be truncated.
+ Die("Sequence too long (pos=%u, bytes=%u)", + (unsigned) m_FilePos, BytesToRead); + } + } + + asserta(*ptr == '>'); + + m_BufferBytes = unsigned(ptr - m_Buffer); + m_FilePos += m_BufferBytes; + + EndTimer(SF_FillCache); + } + +unsigned SFasta::GetPctDoneX10() const + { + if (m_FilePos == 0 || m_FileSize == 0) + return 0; + + assert(m_FilePos >= (off_t) m_BufferBytes); + off_t BufferStart = m_FilePos - m_BufferBytes; + off_t BufferPos = BufferStart + m_BufferOffset; + + unsigned iPctX10 = unsigned(10.0*double(BufferPos)*100.0/double(m_FileSize)); + if (iPctX10 == 0) + return 1; + if (iPctX10 >= 999) + return 998; + return iPctX10; + } + +double SFasta::GetPctDone() const + { + if (m_FilePos == 0 || m_FileSize == 0) + return 0; + + assert(m_FilePos >= (off_t) m_BufferBytes); + off_t BufferStart = m_FilePos - m_BufferBytes; + off_t BufferPos = BufferStart + m_BufferOffset; + + return double(BufferPos)*100.0/double(m_FileSize); + } + +bool SFasta::GetNextSD(SeqData &SD) + { + SD.Seq = GetNextSeq(); + if (SD.Seq == 0) + return false; + + SD.Label = GetLabel(); + SD.L = GetSeqLength(); + SD.Index = GetSeqIndex(); + SD.ORFParent = 0; + SD.Nucleo = GetIsNucleo(); + SD.RevComp = false; + + return true; + } + +#if TEST +void TestSFasta() + { + SFasta SF; + SF.Open(opt_input); + + if (opt_verbose) + { + Log(" Index Length Label\n"); + Log("------- ------- -----\n"); + } + + unsigned Index = 0; + unsigned SeqCount = 0; + double LetterCount = 0.0; + ProgressStep(0, 1000, "Reading"); + for (;;) + { + const byte *Seq = SF.GetNextSeq(); + if (Seq == 0) + break; + ProgressStep(SF.GetPctDoneX10(), 1000, "Reading"); + const char *Label = SF.GetLabel(); + unsigned L = SF.GetSeqLength(); + ++SeqCount; + LetterCount += L; + + if (opt_verbose) + { + Log(">%7u %7u '%s'\n", Index, L, Label); + Log("+%7.7s %7.7s \"%*.*s\"\n", "", "", L, L, Seq); + } + + ++Index; + } + ProgressStep(999, 1000, "Reading"); + + Progress("%u seqs, %s letters\n", SeqCount, FloatToStr(LetterCount)); + Log("%u 
seqs, %s letters\n", SeqCount, FloatToStr(LetterCount)); + } +#endif // TEST diff --git a/sfasta.h b/sfasta.h new file mode 100644 index 0000000..ed2f2ff --- /dev/null +++ b/sfasta.h @@ -0,0 +1,93 @@ +#ifndef sfasta_h +#define sfasta_h + +#include "myutils.h" +#include "seq.h" + +typedef void (*ON_START_XSEQ)(const SeqData &SD); +typedef void (*ON_END_XSEQ)(const SeqData &SD); + +// Sequential reader for FASTA file format. +// Serves sequences in file order to save memory. +// Caches biggish chunks to compromise memory vs. speed. +class SFasta + { +public: + string m_FileName; + FILE *m_File; + bool m_AllowGaps; + + off_t m_FileSize; + +// Position to start next read + off_t m_FilePos; + +// Cached data. + byte *m_Buffer; + +// Bytes allocated to m_Buffer + unsigned m_BufferSize; + +// Current position in buffer, normally points to '>' + unsigned m_BufferOffset; + +// File data in buffer <= m_BufferSize + unsigned m_BufferBytes; + +// Current label +// Points into m_Buffer, not a separate buffer. + char *m_Label; + +// Current sequence length + unsigned m_SeqLength; + +// Current seq index + unsigned m_SeqIndex; + + unsigned m_ShortestLength; + unsigned m_LongestLength; + unsigned m_TooShortCount; + unsigned m_TooLongCount; + unsigned m_TooPolyCount; + +private: + bool m_IsNucleoSet; + bool m_IsNucleo; + +public: + SFasta(); + ~SFasta(); + + void Clear(); + void Open(const string &FileName); + void Rewind(); + bool SetIsNucleo(); + bool GetIsNucleo() const { asserta(m_IsNucleoSet); return m_IsNucleo; }; + +// Get next sequence. +// Returns zero on end-of-file + const byte *GetNextSeq(); + +// Get next sequence as SeqData object, return false on end-of-file. + bool GetNextSD(SeqData &SD); + +// Length of most recent sequence returned by GetNextSeq(). + unsigned GetSeqLength() const { return m_SeqLength; } + +// Label of most recent sequence returned by GetNextSeq(). 
+ const char *GetLabel() const { return m_Label; } + +// Index of most recent sequence returned by GetNextSeq(). + unsigned GetSeqIndex() const { return m_SeqIndex; } + + unsigned GetPctDoneX10() const; + double GetPctDone() const; + + void LogMe() const; + +private: + void FillCache(); + const byte *GetNextSeqLo(); + }; + +#endif // sfasta_h diff --git a/summarysharedcommand.cpp b/summarysharedcommand.cpp index 8e162ee..30761e1 100644 --- a/summarysharedcommand.cpp +++ b/summarysharedcommand.cpp @@ -56,7 +56,7 @@ vector SummarySharedCommand::setParameters(){ CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared); CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel); CommandParameter pdistance("distance", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pdistance); - CommandParameter pcalc("calc", "Multiple", "sharedchao-sharedsobs-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-whittaker-sharednseqs-ochiai-anderberg-skulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-structchi2-hamming-gower-memchi2-memchord-memeuclidean-mempearson", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan", "", "", "",true,false); parameters.push_back(pcalc); + CommandParameter pcalc("calc", "Multiple", "sharedchao-sharedsobs-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-whittaker-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-structchi2-hamming-gower-memchi2-memchord-memeuclidean-mempearson", 
"sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan", "", "", "",true,false); parameters.push_back(pcalc); CommandParameter pall("all", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pall); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors); CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups); diff --git a/svnmods.h b/svnmods.h new file mode 100644 index 0000000..c68513e --- /dev/null +++ b/svnmods.h @@ -0,0 +1,15 @@ +"Path: .\n" +"URL: file:///public/svn/usearch\n" +"Repository Root: file:///public/svn/usearch\n" +"Repository UUID: 58640331-1837-4c17-bc3e-636dc59aced1\n" +"Revision: 34\n" +"Node Kind: directory\n" +"Schedule: normal\n" +"Last Changed Author: bob\n" +"Last Changed Rev: 34\n" +"Last Changed Date: 2011-05-01 08:29:04 -0700 (Sun, 01 May 2011)\n" +"\n" +"? mk\n" +"! svnmods.h\n" +"M ungappedblastid.cpp\n" +"M chaindisjointhits.cpp\n" diff --git a/svnversion.h b/svnversion.h new file mode 100644 index 0000000..2a64d50 --- /dev/null +++ b/svnversion.h @@ -0,0 +1 @@ +"40" diff --git a/timers.h b/timers.h new file mode 100644 index 0000000..81cf7d1 --- /dev/null +++ b/timers.h @@ -0,0 +1,173 @@ +T(MxBase_Alloc) +T(MxBase_FreeData) +T(MxBase_AllocData) +T(SortSeqIndexes) +T(Alloc_Vectors) +T(MainLoop_NotNW) +T(WriteOutput) +T(NWB) +T(ReadAllStdioFile) +T(Windex_Init) +T(Windex_SetSeqIndex) +T(SeqToWords) +T(SeqToWordsStep) +T(SeqToShortWords) +T(SeqToShortWordsA) +T(SeqToShortWordsB) +T(GetFractIdB) +T(Windex_UniqueWordsAlloc) +T(Windex_UniqueWords) +T(GetPctId) +T(Windex_Reset) +T(GetSig) +T(NWEditDist) +T(EditDist_Myers) +T(EditDist_BlockTarget) +T(NWBand) +T(WordCounting) +T(NWAff) +T(NWAffBand) +T(NWSimple) +T(NWSimpleB) +T(BandWrap) +T(IncIdCounts) +T(GetBestDiagB) +T(GetBestDiagB1) +T(GetBestDiagB2) +T(ClusterInit) +T(ClusterPrep) +T(HotSort1) +T(HotSort2) +T(SortA) 
+T(SortB) +T(CountSort) +T(AddWords) +T(ClusterWindex) +T(MainInit) +T(Output) +T(WindexTail) +T(WindexExit) +T(Sort) +T(U_AllocSeqLength) +T(U_AllocSeedCount) +T(U_AddSeed) +T(AddSeq) +T(U_SetWordCounts) +T(U_SetWordCountsHash) +T(U_SetWordScores) +T(U_SetHotHits) +T(U_SetHotHitsHash) +T(U_SetHotHitsScores) +T(U_Search) +T(U_SearchExact) +T(WF_SeqToWords) +T(WF_SeqToWordsA) +T(WF_SeqToWordsB) +T(WF_AllocLA) +T(WF_AllocLB) +T(WF_AllocDiags) +T(WF_SetA) +T(WF_SetA_Nb) +T(WF_SetAZero) +T(WF_SetA2) +T(WF_SetB) +T(WF_GetCommonWordCount) +T(WF_GetBestDiag) +T(GetFractIdGivenPath) +T(WX_GetUniqueWords) +T(CompressPath) +T(GetHSPs1) +T(GetHSPs2) +T(AlignHSPs) +T(WF_ResolveHSPs) +T(WX_SetExcludes) +T(ViterbiFast) +T(ViterbiFastBand) +T(ViterbiFastBand0) +T(ViterbiFastBand1) +T(ViterbiFastBand2) +T(ViterbiFastBand3) +T(ViterbiFastBand4) +T(TraceBackBit) +T(TraceBackBitSW) +T(SF_GetNextSeq) +T(SF_FillCache) +T(OnGlobalAccept) +T(UngappedBlast) +T(UngappedBlastId) +T(UngappedBlast2Hit) +T(LogHSPs) +T(BlastOutput) +T(BlastLeft) +T(BlastRight) +T(Blast1) +T(Blast2) +T(Blast3) +T(Blast4) +T(GetBestSeg) +T(SWLinearDP) +T(SWLinearTB) +T(SWLinearDP2) +T(SWLinearTB2) +T(Chain) +T(XlatSeq) +T(XlatSeqToLetters) +T(XDropFwdSimple) +T(XDropFwdFast) +T(XDropFwdFastTB) +T(XDropBwd) +T(SWSimple) +T(PathAlloc) +T(SubPath) +T(SWUngapped) +T(SWFast) +T(SWFastNTB) +T(SWAT_CacheQuery) +T(SWAT_AlignTarget) +T(SWAT_CacheQueryNW) +T(SWAT_AlignTargetNW) +T(SeqDB_FromFasta) +T(LocalUngappedHitToAD) +T(LocalGappedHitToAD) +T(GlobalHitToAD) +T(ResolveOverlaps) +T(GetORFs) +T(ChainCov_AddHit) +T(ChainCov_EndQuery) +T(ChainCov_DoTarget) +T(BuildNb) +T(MakeIntSubstMx) +T(UngappedExtendLeft) +T(UngappedExtendRight) +T(AlignSP) +T(AlignHSP) + +// Background +T(Bg_SearchLoop) +T(Bg_MainInit) +T(Bg_MainTerm) +T(Bg_Other) +T(Bg_1) +T(Bg_2) +T(Bg_3) +T(Bg_4) +T(Bg_5) +T(Bg_6) +T(Bg_7) +T(Bg_8) +T(Bg_9) +T(Bg_XFrame2) +T(Bg_Usearch1) +T(Bg_Usearch2) +T(Bg_Usearch3) +T(Bg_Usearch4) +T(Bg_Hot) + +// For Timer2 
+T(Search_2) +T(Search_Loop_2) +T(Search_InnerLoop_2) +T(OnHit_2) +T(UngappedBlast_2) +T(MainInit_2) +T(MainTerm_2) diff --git a/timing.h b/timing.h new file mode 100644 index 0000000..b566e1b --- /dev/null +++ b/timing.h @@ -0,0 +1,238 @@ +#define TIMING 0 +#ifndef timing_h +#define timing_h + +#define BG_TIMING 0 + +#if !TIMING +#undef BG_TIMING +#define BG_TIMING 0 +#endif + +//#if UCHIMES +#undef TIMING +#define TIMING 0 +//#endif + +#if TIMING + +enum TIMER + { + TIMER_None, +#define T(x) TIMER_##x, +#include "timers.h" +#undef T + }; + +const unsigned TimerCount = + 1 // TIMER_None +#define T(x) +1 +#include "timers.h" +#undef T + ; + +enum COUNTER + { +#define C(x) COUNTER_##x, +#include "counters.h" +#undef C + }; + +enum ALLOCER + { +#define A(x) ALLOCER_##x, +#include "allocs.h" +#undef A + }; + +const unsigned CounterCount = +#define C(x) +1 +#include "counters.h" +#undef C + ; + +const unsigned AllocerCount = +#define A(x) +1 +#include "allocs.h" +#undef A + ; + +#ifdef _MSC_VER + +typedef unsigned __int64 TICKS; + +#pragma warning(disable:4035) +inline TICKS GetClockTicks() + { + _asm + { + _emit 0x0f + _emit 0x31 + } + } + +#else // ifdef _MSC_VER + +typedef uint64_t TICKS; +__inline__ uint64_t GetClockTicks() + { + uint32_t lo, hi; + /* We cannot use "=A", since this would use %rax on x86_64 */ + __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); + return (uint64_t)hi << 32 | lo; + } + +#endif // ifdef _MSC_VER + +//void AddTicks(const string &Name, TICKS Ticks1, TICKS Ticks2); +//void AddBytes(const string &Name, double Bytes); +//#define SubBytes(Name, Bytes) AddBytes(Name, -double(Bytes)) + +const char *TimerToStr(TIMER t); + +extern TICKS g_BeginTicks[TimerCount]; +extern double g_TotalTicks[TimerCount]; +extern double g_TotalCounts[TimerCount]; +extern double g_Counters[CounterCount]; +extern unsigned g_AllocNewCount[AllocerCount]; +extern unsigned g_AllocFreeCount[AllocerCount]; +extern double g_AllocNewBytes[AllocerCount]; +extern double 
g_AllocFreeBytes[AllocerCount]; +extern double g_AllocNetBytes[AllocerCount]; +extern double g_AllocPeakBytes[AllocerCount]; +extern bool g_Timer2[TimerCount]; +extern TIMER g_CurrTimer; +#if BG_TIMING +extern TIMER g_BackgroundTimer; +#endif + +#define MYALLOC(Type, N, Name) (Type *) MyAlloc_((N)*sizeof(Type), ALLOCER_##Name, __FILE__, __LINE__) +#define MYFREE(Array, N, Name) MyFree_(Array, N*sizeof(Array[0]), ALLOCER_##Name, __FILE__, __LINE__) + +inline void *MyAlloc_(unsigned Bytes, unsigned a, const char *FileName, int Line) + { + ++g_AllocNewCount[a]; + g_AllocNewBytes[a] += Bytes; + g_AllocNetBytes[a] += Bytes; + if (g_AllocNetBytes[a] > g_AllocPeakBytes[a]) + g_AllocPeakBytes[a] = g_AllocNetBytes[a]; + return mymalloc(Bytes); + } + +inline void MyFree_(void *p, unsigned Bytes, unsigned a, const char *FileName, int Line) + { + ++g_AllocFreeCount[a]; + g_AllocFreeBytes[a] += Bytes; + g_AllocNetBytes[a] -= Bytes; + myfree2(p, Bytes); + } + +#if BG_TIMING +inline void SetBackgroundTimer_(TIMER Timer) + { + TICKS Now = GetClockTicks(); + if (g_BeginTicks[g_BackgroundTimer] != 0) + { + ++g_TotalCounts[g_BackgroundTimer]; + g_TotalTicks[g_BackgroundTimer] += double(Now - g_BeginTicks[g_BackgroundTimer]); + } + g_BackgroundTimer = Timer; + g_BeginTicks[Timer] = Now; + } +#else +#define SetBackgroundTimer_(Timer) /* empty */ +#endif + +inline void StartTimer_(TIMER Timer) + { + if (g_CurrTimer != TIMER_None) + Die("StartTimer(%s), curr=%s", TimerToStr(Timer), TimerToStr(g_CurrTimer)); + + TICKS Now = GetClockTicks(); +#if BG_TIMING + if (g_BeginTicks[g_BackgroundTimer] != 0) + { + ++g_TotalCounts[g_BackgroundTimer]; + g_TotalTicks[g_BackgroundTimer] += double(Now - g_BeginTicks[g_BackgroundTimer]); + } +#endif + g_BeginTicks[Timer] = Now; + g_CurrTimer = Timer; + } + +inline void PauseTimer_(TIMER Timer) + { + if (Timer != g_CurrTimer) + Die("PauseTimer(%s), curr=%s", TimerToStr(Timer), TimerToStr(g_CurrTimer)); + + TICKS Now = GetClockTicks(); + 
g_TotalTicks[Timer] += double(Now - g_BeginTicks[Timer]); + g_BeginTicks[Timer] = Now; + g_CurrTimer = TIMER_None; + } + +inline void EndTimer_(TIMER Timer) + { + if (Timer != g_CurrTimer) + Die("EndTimer(%s), curr=%s", TimerToStr(Timer), TimerToStr(g_CurrTimer)); + + TICKS Now = GetClockTicks(); +#if BG_TIMING + g_BeginTicks[g_BackgroundTimer] = Now; +#endif + g_TotalTicks[Timer] += double(Now - g_BeginTicks[Timer]); + ++g_TotalCounts[Timer]; + g_CurrTimer = TIMER_None; + } + +inline void StartTimer2_(TIMER Timer) + { + g_Timer2[Timer] = true; + g_BeginTicks[Timer] = GetClockTicks(); + } + +inline void EndTimer2_(TIMER Timer) + { + g_TotalTicks[Timer] += double(GetClockTicks() - g_BeginTicks[Timer]); + ++g_TotalCounts[Timer]; + } + +#define AddCounter(x, N) g_Counters[COUNTER_##x] += N +#define IncCounter(x) ++(g_Counters[COUNTER_##x]) +#define StartTimer(x) StartTimer_(TIMER_##x) +#define PauseTimer(x) PauseTimer_(TIMER_##x) +#define EndTimer(x) EndTimer_(TIMER_##x) +#define StartTimer2(x) StartTimer2_(TIMER_##x) +#define EndTimer2(x) EndTimer2_(TIMER_##x) + +#if BG_TIMING +#define SetBackgroundTimer(x) SetBackgroundTimer_(TIMER_##x) +#else +#define SetBackgroundTimer(x) /* empty */ +#endif + +#else // if TIMING + +#define AddCounter(x, N) /* empty */ +#define IncCounter(x) /* empty */ +#define StartTimer(x) /* empty */ +#define PauseTimer(x) /* empty */ +#define EndTimer(x) /* empty */ +#define StartTimer2(x) /* empty */ +#define PauseTimer2(x) /* empty */ +#define EndTimer2(x) /* empty */ +#define SetBackgroundTimer(x) /* empty */ +#define MYALLOC(Type, N, Name) myalloc(Type, N) +#define MYFREE(Array, N, Name) myfree(Array) + +#endif // if TIMING + +void LogMemStats(); +void LogTickStats(); +void LogStats(); +void LogAllocs(); + +#define AddBytes(x, n) /* empty */ +#define SubBytes(x, n) /* empty */ + +#endif // if timing_h diff --git a/tracebackbit.cpp b/tracebackbit.cpp new file mode 100644 index 0000000..94159cd --- /dev/null +++ b/tracebackbit.cpp @@ -0,0 
+1,180 @@
+#include "dp.h"
+
+#define TRACE 0
+
+Mx g_Mx_TBBit;	// NOTE(review): template argument list looks stripped in this listing; GetData() is assigned to a byte ** below
+byte **g_TBBit;
+float *g_DPRow1;
+float *g_DPRow2;
+static float *g_DPBuffer1;
+static float *g_DPBuffer2;
+
+static unsigned g_CacheLB;	// LB capacity the two DP row buffers were last sized for
+// AllocBit: sizes the shared traceback matrix to (LA+1) x (LB+1) and regrows both DP row buffers when LB exceeds g_CacheLB.
+void AllocBit(unsigned LA, unsigned LB)
+	{
+	g_Mx_TBBit.Alloc("TBBit", LA+1, LB+1);
+	g_TBBit = g_Mx_TBBit.GetData();
+	if (LB > g_CacheLB)
+		{
+		MYFREE(g_DPBuffer1, g_CacheLB, AllocBit);
+		MYFREE(g_DPBuffer2, g_CacheLB, AllocBit);
+
+		g_CacheLB = LB + 128;	// 128 entries of headroom beyond the requested LB
+
+		// Allow use of [-1]
+		//g_DPBuffer1 = myalloc(g_CacheLB+3);
+		//g_DPBuffer2 = myalloc(g_CacheLB+3);
+		g_DPBuffer1 = MYALLOC(float, g_CacheLB+3, AllocBit);
+		g_DPBuffer2 = MYALLOC(float, g_CacheLB+3, AllocBit);
+		g_DPRow1 = g_DPBuffer1 + 1;	// row pointers start one element in, so row[-1] is inside the buffer
+		g_DPRow2 = g_DPBuffer2 + 1;
+		}
+	}
+// TraceBackBit: walks g_TBBit from cell (LA, LB) back to (0, 0) starting in State, writing one state char ('M', 'D' or 'I') per step backwards from PD.Back; PD.Start is left on the first char.
+void TraceBackBit(unsigned LA, unsigned LB, char State, PathData &PD)
+	{
+	PD.Alloc(LA+LB);
+
+	StartTimer(TraceBackBit);
+	char *PathPtr = PD.Back;
+	*PathPtr = 0;	// NUL terminator; the path itself is filled in right-to-left
+
+	byte **TB = g_TBBit;
+
+#if TRACE
+	Log("\n");
+	Log("TraceBackBit\n");
+#endif
+
+	size_t i = LA;
+	size_t j = LB;
+	for (;;)
+		{
+#if TRACE
+		Log("i=%3d j=%3d state=%c\n", (int) i, (int) j, State);
+#endif
+		if (i == 0 && j == 0)
+			break;
+
+		--PathPtr;
+		*PathPtr = State;
+
+		byte t;
+		switch (State)
+			{
+		case 'M':
+			asserta(i > 0 && j > 0);
+			t = TB[i-1][j-1];	// bits stored for the cell diagonally before (i, j)
+			if (t & TRACEBITS_DM)
+				State = 'D';
+			else if (t & TRACEBITS_IM)
+				State = 'I';
+			else
+				State = 'M';
+			--i;
+			--j;
+			break;
+		case 'D':
+			asserta(i > 0);
+			t = TB[i-1][j];
+			if (t & TRACEBITS_MD)
+				State = 'M';
+			else
+				State = 'D';
+			--i;
+			break;
+
+		case 'I':
+			asserta(j > 0);
+			t = TB[i][j-1];
+			if (t & TRACEBITS_MI)
+				State = 'M';
+			else
+				State = 'I';
+			--j;
+			break;
+
+		default:
+			Die("TraceBackBit, invalid state %c", State);
+			}
+		}
+	PD.Start = PathPtr;
+	EndTimer(TraceBackBit);
+	}
+// TraceBackBitSW: traceback that starts in state 'M' at (Besti, Bestj) and returns at the first 'M' cell flagged TRACEBITS_SM, storing the row/column extents covered in Leni/Lenj.
+void TraceBackBitSW(unsigned LA, unsigned LB, unsigned Besti, unsigned Bestj,
+  unsigned &Leni, unsigned &Lenj, PathData &PD)
+	{
+	PD.Alloc(LA+LB);
+
+	StartTimer(TraceBackBitSW);
+	char *PathPtr = PD.Back;
+	*PathPtr = 0;
+
+	byte **TB = g_TBBit;
+
+#if TRACE
+	Log("\n");
+	Log("TraceBackBitSW\n");
+#endif
+
+	unsigned i = Besti;
+	unsigned j = Bestj;
+	char State = 'M';
+	for (;;)	// the only exit is the TRACEBITS_SM branch below (or a failed asserta/Die)
+		{
+#if TRACE
+		Log("i=%3d j=%3d state=%c\n", (int) i, (int) j, State);
+#endif
+		--PathPtr;
+		*PathPtr = State;
+
+		byte t;
+		switch (State)
+			{
+		case 'M':
+			asserta(i > 0 && j > 0);
+			t = TB[i-1][j-1];
+			if (t & TRACEBITS_DM)
+				State = 'D';
+			else if (t & TRACEBITS_IM)
+				State = 'I';
+			else if (t & TRACEBITS_SM)
+				{
+				Leni = Besti - i + 1;	// i and j have not been decremented for this cell yet, hence the +1
+				Lenj = Bestj - j + 1;
+				PD.Start = PathPtr;
+				EndTimer(TraceBackBitSW);
+				return;
+				}
+			else
+				State = 'M';
+			--i;
+			--j;
+			break;
+		case 'D':
+			asserta(i > 0);
+			t = TB[i-1][j];
+			if (t & TRACEBITS_MD)
+				State = 'M';
+			else
+				State = 'D';
+			--i;
+			break;
+
+		case 'I':
+			asserta(j > 0);
+			t = TB[i][j-1];
+			if (t & TRACEBITS_MI)
+				State = 'M';
+			else
+				State = 'I';
+			--j;
+			break;
+
+		default:
+			Die("TraceBackBitSW, invalid state %c", State);
+			}
+		}
+	}
diff --git a/uc.h b/uc.h
new file mode 100644
index 0000000..631ea36
--- /dev/null
+++ b/uc.h
@@ -0,0 +1,65 @@
+#ifndef uc_h
+#define uc_h
+
+#include "seqdb.h"
+#include "seq.h"
+#include "path.h"
+
+struct AlnData;
+
+int uchime_main(int, char**);
+
+class UCFile
+	{
+public:
+	FILE *m_File;
+	byte *m_Data;
+	vector m_RecTypes;
+	vector m_PctIds;
+	vector m_Labels;
+	vector m_SeedLabels;
+	vector m_SeedIndexes;
+	vector m_CompressedPaths;
+	vector m_SeqLengths;
+	vector m_SortOrder;
+	vector m_Strands;
+	vector m_Los;
+	vector m_SeedLos;
+
+public:
+	/* some function prototypes */
+
+
+	UCFile();
+	void Clear(bool ctor = false);
+	void Close();
+	void FromFile(const string &FileName);
+	void FromClstr(const string &FileName);
+	void ToFile(const string &FileName);
+	unsigned GetRecordCount() const;
+	void LogMe() const;
+	void ToClstr(const string &FileName);
+	void ToFasta(const string &FileName, const SeqDB &Input, bool Reformat);
+	void Create(const string &FileName);
+	void Sort();
+
void Flush() const; + + void WriteNotMatched(unsigned L, const char *Label) const; + void WriteLibSeed(unsigned SeedIndex, unsigned L, const char *Label) const; + void WriteNewSeed(unsigned SeedIndex, unsigned L, const char *Label) const; + void WriteHit(const SeqData &SA, const SeqData &SB, double FractId, + const PathData &PD) const; + void WriteReject(const SeqData &SA, const SeqData &SB, double FractId, + const char *Path) const; + void WriteHit(unsigned SeedIndex, unsigned L, double PctId, + const char *CompressedPath, char Strand, unsigned Lo, unsigned SeedLo, + const char *Label, const char *SeedLabel) const; + void WriteHit(const AlnData &AD); + void WriteLibCluster(unsigned SeedIndex, unsigned Size, double AvgId, + const char *Label) const; + void WriteNewCluster(unsigned SeedIndex, unsigned Size, double AvgId, + const char *Label) const; + void WriteSeqX(FILE *f, const byte *Seq, unsigned L, const char *CompressedPath) const; + }; + +#endif // uc_h diff --git a/uchime_main.cpp b/uchime_main.cpp new file mode 100644 index 0000000..40e7f44 --- /dev/null +++ b/uchime_main.cpp @@ -0,0 +1,219 @@ +#include "myutils.h" +#include "chime.h" +#include "seqdb.h" +#include "dp.h" +#include "ultra.h" +#include "hspfinder.h" +#include +#include +#include "mothurout.h" + +bool SearchChime(Ultra &U, const SeqData &QSD, float QAb, + const AlnParams &AP, const AlnHeuristics &AH, HSPFinder &HF, + float MinFractId, ChimeHit2 &Hit); + +FILE *g_fUChime; +FILE *g_fUChimeAlns; +const vector *g_SortVecFloat; +bool g_UchimeDeNovo = false; + +void Usage() + { + //printf("\n"); + //printf("UCHIME %s by Robert C. 
Edgar\n", MY_VERSION); + //printf("http://www.drive5.com/uchime\n"); + //printf("\n"); + //printf("This software is donated to the public domain\n"); + //printf("\n"); + + //printf( +//#include "help.h" + //); + } + +void SetBLOSUM62() + { + Die("SetBLOSUM62 not implemented"); + } + +void ReadSubstMx(const string &/*FileName*/, Mx &/*Mxf*/) + { + Die("ReadSubstMx not implemented"); + } + +void LogAllocs() + { + /*empty*/ + } + +static bool CmpDescVecFloat(unsigned i, unsigned j) + { + return (*g_SortVecFloat)[i] > (*g_SortVecFloat)[j]; + } + +void Range(vector &v, unsigned N) + { + v.clear(); + v.reserve(N); + for (unsigned i = 0; i < N; ++i) + v.push_back(i); + } + +void SortDescending(const vector &Values, vector &Order) + { + StartTimer(Sort); + const unsigned N = SIZE(Values); + Range(Order, N); + g_SortVecFloat = &Values; + sort(Order.begin(), Order.end(), CmpDescVecFloat); + EndTimer(Sort); + } + +float GetAbFromLabel(const string &Label) + { + vector Fields; + Split(Label, Fields, '/'); + const unsigned N = SIZE(Fields); + for (unsigned i = 0; i < N; ++i) + { + const string &Field = Fields[i]; + if (Field.substr(0, 3) == "ab=") + { + string a = Field.substr(3, string::npos); + return (float) atof(a.c_str()); + } + } + if (g_UchimeDeNovo) + Die("Missing abundance /ab=xx/ in label >%s", Label.c_str()); + return 0.0; + } + +int uchime_main(int argc, char *argv[]) + { + MothurOut* m; + m = MothurOut::getInstance(); + + MyCmdLine(argc, argv); + + if (argc < 2) + { + Usage(); + return 0; + } + + if (opt_version) + { + printf("uchime v" MY_VERSION ".%s\n", SVN_VERSION); + return 0; + } + + //printf("uchime v" MY_VERSION ".%s\n", SVN_VERSION); + //printf("by Robert C. 
Edgar\n"); + //printf("http://drive5.com/uchime\n"); + //printf("This code is donated to the public domain.\n"); + //printf("\n"); + if (!optset_w) + opt_w = 8; + + float MinFractId = 0.95f; + if (optset_id) + MinFractId = (float) opt_id; + + Log("%8.2f minh\n", opt_minh); + Log("%8.2f xn\n", opt_xn); + Log("%8.2f dn\n", opt_dn); + Log("%8.2f xa\n", opt_xa); + Log("%8.2f mindiv\n", opt_mindiv); + Log("%8u maxp\n", opt_maxp); + + if (opt_input == "" && opt_uchime != "") + opt_input = opt_uchime; + + if (opt_input == "") + Die("Missing --input"); + + g_UchimeDeNovo = (opt_db == ""); + + if (opt_uchimeout != "") + g_fUChime = CreateStdioFile(opt_uchimeout); + + if (opt_uchimealns != "") + g_fUChimeAlns = CreateStdioFile(opt_uchimealns); + + SeqDB Input; + SeqDB DB; + + Input.FromFasta(opt_input); + if (!Input.IsNucleo()) + Die("Input contains amino acid sequences"); + + const unsigned QuerySeqCount = Input.GetSeqCount(); + vector Order; + for (unsigned i = 0; i < QuerySeqCount; ++i) + Order.push_back(i); + + if (g_UchimeDeNovo) + { + vector Abs; + for (unsigned i = 0; i < QuerySeqCount; ++i) + { + const char *Label = Input.GetLabel(i); + float Ab = GetAbFromLabel(Label); + Abs.push_back(Ab); + } + SortDescending(Abs, Order); + DB.m_IsNucleoSet = true; + DB.m_IsNucleo = true; + } + else + { + DB.FromFasta(opt_db); + if (!DB.IsNucleo()) + Die("Database contains amino acid sequences"); + } + + vector Hits; + unsigned HitCount = 0; + for (unsigned i = 0; i < QuerySeqCount; ++i) + { + + if (m->control_pressed) { break; } + + unsigned QuerySeqIndex = Order[i]; + + SeqData QSD; + Input.GetSeqData(QuerySeqIndex, QSD); + + float QAb = -1.0; + if (g_UchimeDeNovo) + QAb = GetAbFromLabel(QSD.Label); + + ChimeHit2 Hit; + AlnParams &AP = *(AlnParams *) 0; + AlnHeuristics &AH = *(AlnHeuristics *) 0; + HSPFinder &HF = *(HSPFinder *) 0; + bool Found = SearchChime(DB, QSD, QAb, AP, AH, HF, MinFractId, Hit); + if (Found) + ++HitCount; + else + { + if (g_UchimeDeNovo) + 
DB.AddSeq(QSD.Label, QSD.Seq, QSD.L); + } + + WriteChimeHit(g_fUChime, Hit); + + ProgressStep(i, QuerySeqCount, "%u/%u chimeras found (%.1f%%)", HitCount, i, Pct(HitCount, i+1)); + + } + + Log("\n"); + Log("%s: %u/%u chimeras found (%.1f%%)\n", + opt_input.c_str(), HitCount, QuerySeqCount, Pct(HitCount, QuerySeqCount)); + + CloseStdioFile(g_fUChime); + CloseStdioFile(g_fUChimeAlns); + + ProgressExit(); + return 0; + } diff --git a/ultra.h b/ultra.h new file mode 100644 index 0000000..e0a432f --- /dev/null +++ b/ultra.h @@ -0,0 +1,8 @@ +#ifndef ultra_h +#define ultra_h + +#include "seqdb.h" +#define Ultra SeqDB +#define GetSeedLabel GetLabel + +#endif // ultra_h diff --git a/usort.cpp b/usort.cpp new file mode 100644 index 0000000..7afbf42 --- /dev/null +++ b/usort.cpp @@ -0,0 +1,86 @@ +//#if UCHIMES + +#include "myutils.h" +#include "seqdb.h" +#include "seq.h" +#include "alpha.h" + +void SortDescending(const vector &Values, vector &Order); + +static byte *g_QueryHasWord; +static unsigned g_WordCount; + +unsigned GetWord(const byte *Seq) + { + unsigned Word = 0; + const byte *Front = Seq; + for (unsigned i = 0; i < opt_w; ++i) + { + unsigned Letter = g_CharToLetterNucleo[*Front++]; + Word = (Word*4) + Letter; + } + return Word; + } + +static void SetQuery(const SeqData &Query) + { + if (g_QueryHasWord == 0) + { + g_WordCount = 4; + for (unsigned i = 1; i < opt_w; ++i) + g_WordCount *= 4; + + g_QueryHasWord = myalloc(byte, g_WordCount); + } + + memset(g_QueryHasWord, 0, g_WordCount); + + if (Query.L <= opt_w) + return; + + const unsigned L = Query.L - opt_w + 1; + const byte *Seq = Query.Seq; + for (unsigned i = 0; i < L; ++i) + { + unsigned Word = GetWord(Seq++); + g_QueryHasWord[Word] = 1; + } + } + +static unsigned GetUniqueWordsInCommon(const SeqData &Target) + { + if (Target.L <= opt_w) + return 0; + + unsigned Count = 0; + const unsigned L = Target.L - opt_w + 1; + const byte *Seq = Target.Seq; + for (unsigned i = 0; i < L; ++i) + { + unsigned Word = 
GetWord(Seq++); + if (g_QueryHasWord[Word]) + ++Count; + } + return Count; + } + +void USort(const SeqData &Query, const SeqDB &DB, vector &WordCounts, + vector &Order) + { + WordCounts.clear(); + Order.clear(); + + SetQuery(Query); + + const unsigned SeqCount = DB.GetSeqCount(); + for (unsigned SeqIndex = 0; SeqIndex < SeqCount; ++SeqIndex) + { + SeqData Target; + DB.GetSeqData(SeqIndex, Target); + float WordCount = (float) GetUniqueWordsInCommon(Target); + WordCounts.push_back(WordCount); + } + SortDescending(WordCounts, Order); + } + +//#endif // UCHIMES diff --git a/validcalculator.cpp b/validcalculator.cpp index a22ece2..7df7c4e 100644 --- a/validcalculator.cpp +++ b/validcalculator.cpp @@ -177,7 +177,7 @@ void ValidCalculators::printCitations(vector Estimators) { }else if (Estimators[i] == "ochiai") { Calculator* temp = new Ochiai(); m->mothurOut(temp->getName() + ": "); temp->citation(); delete temp; }else if (Estimators[i] == "anderberg") { Calculator* temp = new Anderberg(); m->mothurOut(temp->getName() + ": "); temp->citation(); delete temp; - }else if (Estimators[i] == "skulczynski") { Calculator* temp = new Kulczynski(); m->mothurOut(temp->getName() + ": "); temp->citation(); delete temp; + }else if (Estimators[i] == "kulczynski") { Calculator* temp = new Kulczynski(); m->mothurOut(temp->getName() + ": "); temp->citation(); delete temp; }else if (Estimators[i] == "kulczynskicody") { Calculator* temp = new KulczynskiCody(); m->mothurOut(temp->getName() + ": "); temp->citation(); delete temp; }else if (Estimators[i] == "lennon") { Calculator* temp = new Lennon(); m->mothurOut(temp->getName() + ": "); temp->citation(); delete temp; }else if (Estimators[i] == "morisitahorn") { Calculator* temp = new MorHorn(); m->mothurOut(temp->getName() + ": "); temp->citation(); delete temp; diff --git a/viterbifast.cpp b/viterbifast.cpp new file mode 100644 index 0000000..2b20174 --- /dev/null +++ b/viterbifast.cpp @@ -0,0 +1,378 @@ +#include "dp.h" +#include "out.h" 
+#include "evalue.h" + +#define CMP_SIMPLE 0 + +#if SAVE_FAST +static Mx g_MxDPM; +static Mx g_MxDPD; +static Mx g_MxDPI; + +static Mx g_MxTBM; +static Mx g_MxTBD; +static Mx g_MxTBI; + +static float **g_DPM; +static float **g_DPD; +static float **g_DPI; + +static char **g_TBM; +static char **g_TBD; +static char **g_TBI; + +#if CMP_SIMPLE +static Mx *g_DPMSimpleMx; +static Mx *g_DPDSimpleMx; +static Mx *g_DPISimpleMx; +static float **g_DPMSimple; +static float **g_DPDSimple; +static float **g_DPISimple; + +#define cmpm(i, j, x) { if (!feq(x, g_DPMSimple[i][j])) \ + { \ + Die("%s:%d %.1f != DPMSimple[%u][%u] = %.1f", \ + __FILE__, __LINE__, x, i, j, g_DPMSimple[i][j]); \ + } \ + } + +#define cmpd(i, j, x) { if (!feq(x, g_DPDSimple[i][j])) \ + { \ + Die("%s:%d %.1f != DPMSimple[%u][%u] = %.1f", \ + __FILE__, __LINE__, x, i, j, g_DPDSimple[i][j]); \ + } \ + } + +#define cmpi(i, j, x) { if (!feq(x, g_DPISimple[i][j])) \ + { \ + Die("%s:%d %.1f != DPMSimple[%u][%u] = %.1f", \ + __FILE__, __LINE__, x, i, j, g_DPISimple[i][j]); \ + } \ + } + +#else + +#define cmpm(i, j, x) /* empty */ +#define cmpd(i, j, x) /* empty */ +#define cmpi(i, j, x) /* empty */ + +#endif + +static void AllocSave(unsigned LA, unsigned LB) + { +#if CMP_SIMPLE + GetSimpleDPMxs(&g_DPMSimpleMx, &g_DPDSimpleMx, &g_DPISimpleMx); + g_DPMSimple = g_DPMSimpleMx->GetData(); + g_DPDSimple = g_DPDSimpleMx->GetData(); + g_DPISimple = g_DPISimpleMx->GetData(); +#endif + g_MxDPM.Alloc("FastM", LA+1, LB+1); + g_MxDPD.Alloc("FastD", LA+1, LB+1); + g_MxDPI.Alloc("FastI", LA+1, LB+1); + + g_MxTBM.Alloc("FastTBM", LA+1, LB+1); + g_MxTBD.Alloc("FastTBD", LA+1, LB+1); + g_MxTBI.Alloc("FastTBI", LA+1, LB+1); + + g_DPM = g_MxDPM.GetData(); + g_DPD = g_MxDPD.GetData(); + g_DPI = g_MxDPI.GetData(); + + g_TBM = g_MxTBM.GetData(); + g_TBD = g_MxTBD.GetData(); + g_TBI = g_MxTBI.GetData(); + } + +static void SAVE_DPM(unsigned i, unsigned j, float x) + { + g_DPM[i][j] = x; +#if CMP_SIMPLE + if (i > 0 && j > 0) + asserta(feq(x, 
g_DPMSimple[i][j])); +#endif + } + +static void SAVE_DPD(unsigned i, unsigned j, float x) + { + g_DPD[i][j] = x; +#if CMP_SIMPLE + if (i > 0 && j > 0) + asserta(feq(x, g_DPDSimple[i][j])); +#endif + } + +static void SAVE_DPI(unsigned i, unsigned j, float x) + { + g_DPI[i][j] = x; +#if CMP_SIMPLE + if (i > 0 && j > 0) + asserta(feq(x, g_DPISimple[i][j])); +#endif + } + +static void SAVE_TBM(unsigned i, unsigned j, char x) + { + g_TBM[i][j] = x; + } + +static void SAVE_TBD(unsigned i, unsigned j, char x) + { + g_TBD[i][j] = x; + } + +static void SAVE_TBI(unsigned i, unsigned j, char x) + { + g_TBI[i][j] = x; + } + +void GetFastMxs(Mx **M, Mx **D, Mx **I) + { + *M = &g_MxDPM; + *D = &g_MxDPD; + *I = &g_MxDPI; + } + +#else // SAVE_FAST + +#define SAVE_DPM(i, j, x) /* empty */ +#define SAVE_DPD(i, j, x) /* empty */ +#define SAVE_DPI(i, j, x) /* empty */ + +#define SAVE_TBM(i, j, x) /* empty */ +#define SAVE_TBD(i, j, x) /* empty */ +#define SAVE_TBI(i, j, x) /* empty */ + +#define AllocSave(LA, LB) /* empty */ + +#define cmpm(i, j, x) /* empty */ +#define cmpd(i, j, x) /* empty */ +#define cmpi(i, j, x) /* empty */ + +#endif // SAVE_FAST + +float ViterbiFast(const byte *A, unsigned LA, const byte *B, unsigned LB, + const AlnParams &AP, PathData &PD) + { + if (LA*LB > 100*1000*1000) + Die("ViterbiFast, too long LA=%u, LB=%u", LA, LB); + + AllocBit(LA, LB); + AllocSave(LA, LB); + + StartTimer(ViterbiFast); + + const float * const *Mx = AP.SubstMx; + float OpenA = AP.LOpenA; + float ExtA = AP.LExtA; + + byte **TB = g_TBBit; + float *Mrow = g_DPRow1; + float *Drow = g_DPRow2; + +// Use Mrow[-1], so... 
+ Mrow[-1] = MINUS_INFINITY; + for (unsigned j = 0; j <= LB; ++j) + { + Mrow[j] = MINUS_INFINITY; + SAVE_DPM(0, j, MINUS_INFINITY); + SAVE_TBM(0, j, '?'); + + Drow[j] = MINUS_INFINITY; + SAVE_DPD(0, j, MINUS_INFINITY); + SAVE_TBD(0, j, '?'); + } + +// Main loop + float M0 = float (0); + SAVE_DPM(0, 0, 0); + for (unsigned i = 0; i < LA; ++i) + { + byte a = A[i]; + const float *MxRow = Mx[a]; + float OpenB = AP.LOpenB; + float ExtB = AP.LExtB; + float I0 = MINUS_INFINITY; + + SAVE_TBM(i, 0, '?'); + + SAVE_DPI(i, 0, MINUS_INFINITY); + SAVE_DPI(i, 1, MINUS_INFINITY); + + SAVE_TBI(i, 0, '?'); + SAVE_TBI(i, 1, '?'); + + byte *TBrow = TB[i]; + for (unsigned j = 0; j < LB; ++j) + { + byte b = B[j]; + byte TraceBits = 0; + float SavedM0 = M0; + + // MATCH + { + // M0 = DPM[i][j] + // I0 = DPI[i][j] + // Drow[j] = DPD[i][j] + cmpm(i, j, M0); + cmpd(i, j, Drow[j]); + cmpi(i, j, I0); + + float xM = M0; + SAVE_TBM(i+1, j+1, 'M'); + if (Drow[j] > xM) + { + xM = Drow[j]; + TraceBits = TRACEBITS_DM; + SAVE_TBM(i+1, j+1, 'D'); + } + if (I0 > xM) + { + xM = I0; + TraceBits = TRACEBITS_IM; + SAVE_TBM(i+1, j+1, 'I'); + } + M0 = Mrow[j]; + cmpm(i, j+1, M0); + + Mrow[j] = xM + MxRow[b]; + // Mrow[j] = DPM[i+1][j+1]) + SAVE_DPM(i+1, j+1, Mrow[j]); + } + + // DELETE + { + // SavedM0 = DPM[i][j] + // Drow[j] = DPD[i][j] + cmpm(i, j, SavedM0); + cmpd(i, j, Drow[j]); + + float md = SavedM0 + OpenB; + Drow[j] += ExtB; + SAVE_TBD(i+1, j, 'D'); + if (md >= Drow[j]) + { + Drow[j] = md; + TraceBits |= TRACEBITS_MD; + SAVE_TBD(i+1, j, 'M'); + } + // Drow[j] = DPD[i+1][j] + SAVE_DPD(i+1, j, Drow[j]); + } + + // INSERT + { + // SavedM0 = DPM[i][j] + // I0 = DPI[i][j] + cmpm(i, j, SavedM0); + cmpi(i, j, I0); + + float mi = SavedM0 + OpenA; + I0 += ExtA; + SAVE_TBI(i, j+1, 'I'); + if (mi >= I0) + { + I0 = mi; + TraceBits |= TRACEBITS_MI; + SAVE_TBI(i, j+1, 'M'); + } + // I0 = DPI[i][j+1] + SAVE_DPI(i, j+1, I0); + } + + OpenB = AP.OpenB; + ExtB = AP.ExtB; + + TBrow[j] = TraceBits; + } + + // Special 
case for end of Drow[] + { + // M0 = DPM[i][LB] + // Drow[LB] = DPD[i][LB] + + TBrow[LB] = 0; + float md = M0 + AP.ROpenB; + Drow[LB] += AP.RExtB; + SAVE_TBD(i+1, LB, 'D'); + if (md >= Drow[LB]) + { + Drow[LB] = md; + TBrow[LB] = TRACEBITS_MD; + SAVE_TBD(i+1, LB, 'M'); + } + // Drow[LB] = DPD[i+1][LB] + SAVE_DPD(i+1, LB, Drow[LB]); + } + + SAVE_DPM(i+1, 0, MINUS_INFINITY); + M0 = MINUS_INFINITY; + + OpenA = AP.OpenA; + ExtA = AP.ExtA; + } + + SAVE_TBM(LA, 0, '?'); + +// Special case for last row of DPI + byte *TBrow = TB[LA]; + float I1 = MINUS_INFINITY; + + SAVE_DPI(LA, 0, MINUS_INFINITY); + SAVE_TBI(LA, 0, '?'); + + SAVE_DPI(LA, 1, MINUS_INFINITY); + SAVE_TBI(LA, 1, '?'); + + for (unsigned j = 1; j < LB; ++j) + { + // Mrow[j-1] = DPM[LA][j] + // I1 = DPI[LA][j] + + TBrow[j] = 0; + float mi = Mrow[int(j)-1] + AP.ROpenA; + I1 += AP.RExtA; + SAVE_TBI(LA, j+1, 'I'); + if (mi > I1) + { + I1 = mi; + TBrow[j] = TRACEBITS_MI; + SAVE_TBI(LA, j+1, 'M'); + } + SAVE_DPI(LA, j+1, I1); + } + + float FinalM = Mrow[LB-1]; + float FinalD = Drow[LB]; + float FinalI = I1; +// FinalM = DPM[LA][LB] +// FinalD = DPD[LA][LB] +// FinalI = DPI[LA][LB] + + float Score = FinalM; + byte State = 'M'; + if (FinalD > Score) + { + Score = FinalD; + State = 'D'; + } + if (FinalI > Score) + { + Score = FinalI; + State = 'I'; + } + + EndTimer(ViterbiFast); + TraceBackBit(LA, LB, State, PD); + +#if SAVE_FAST + g_MxDPM.LogMe(); + g_MxDPD.LogMe(); + g_MxDPI.LogMe(); + + g_MxTBM.LogMe(); + g_MxTBD.LogMe(); + g_MxTBI.LogMe(); +#endif + + return Score; + } diff --git a/windex.h b/windex.h new file mode 100644 index 0000000..0b324ca --- /dev/null +++ b/windex.h @@ -0,0 +1,71 @@ +#ifndef windex_h +#define windex_h + +class SFasta; +struct SeqDB; + +typedef uint32 word_t; +typedef uint16 wordcount_t; +typedef uint32 arrsize_t; +typedef uint16 seqcountperword_t; +typedef uint32 seqindex_t; +typedef uint16 commonwordcount_t; + +const uint32 WindexFileHdr_Magic1 = 0x312DE41; +const uint32 WindexFileHdr_Magic2 
= 0x312DE42; +const uint32 WindexFileHdr_Magic3 = 0x312DE43; +const uint32 WindexFileHdr_Magic4 = 0x312DE44; + +struct WindexFileHdr + { + uint32 Magic1; + uint32 IsNucleo; + uint32 WordLength; + uint32 Magic2; + }; + +class Windex + { +public: + bool m_Nucleo; + bool m_RedAlpha; + unsigned m_WordLength; + unsigned m_AlphaSize; + unsigned m_WordCount; + unsigned m_Hi; + unsigned m_CapacityInc; + arrsize_t *m_Capacities; + arrsize_t *m_Sizes; + float *m_WordScores; + seqindex_t **m_SeedIndexes; + byte *m_UniqueCounts; + unsigned m_CharToLetter[256]; + +public: + Windex(); + void ToFile(const string &FileName) const; + void FromFile(const string &FileName); + void FromSFasta(SFasta &SF); + void FromSeqDB(const SeqDB &DB); + void Clear(bool ctor = false); + void AddWords(unsigned SeqIndex, const word_t *Words, unsigned N); + void Init(bool Nucleo, unsigned WordLength); + void Init2(bool Nucleo, unsigned TableSize); + void InitRed(unsigned WordLength); + void InitWordScores(const float *const *SubstMx); + void Reset(); + void LogMe() const; + unsigned LogMemSize() const; + void LogWordStats(unsigned TopWords = 10) const; + const char *WordToStr(word_t Word) const; + word_t SeqToWord(const byte *Seq) const; + unsigned SeqToWords(const byte *Seq, unsigned L, word_t *Words) const; + unsigned SeqToWordsStep(unsigned Step, const byte *Seq, unsigned L, word_t *Words) const; + unsigned WordsToCounts(const word_t *Words, unsigned N, + word_t *UniqueWords, seqcountperword_t *Counts) const; + unsigned GetUniqueWords(const word_t *Words, unsigned N, + word_t *UniqueWords) const; + void LogSizeHisto() const; + }; + +#endif // windex_h diff --git a/writechhit.cpp b/writechhit.cpp new file mode 100644 index 0000000..ea67061 --- /dev/null +++ b/writechhit.cpp @@ -0,0 +1,329 @@ +#include "myutils.h" +#include "chime.h" + +void WriteChimeFileHdr(FILE *f) + { + if (f == 0) + return; + + fprintf(f, + "\tQuery" // 1 + "\tA" // 2 + "\tB" // 3 + "\tIdQM" // 4 + "\tIdQA" // 5 + "\tIdQB" // 6 
+ "\tIdAB" // 7 + "\tIdQT" // 8 + "\tLY" // 9 + "\tLN" // 10 + "\tLA" // 11 + "\tRY" // 12 + "\tRN" // 13 + "\tRA" // 14 + "\tDiv" // 15 + "\tY" // 16 + "\n" + ); + } + +void WriteChimeHit(FILE *f, const ChimeHit2 &Hit) + { + if (f == 0) + return; + + if (Hit.Div <= 0.0) + { + fprintf(f, "0.0000"); // 0 + + fprintf(f, + "\t%s", Hit.QLabel.c_str()); // 1 + + fprintf(f, + "\t*" // 2 + "\t*" // 3 + "\t*" // 4 + "\t*" // 5 + "\t*" // 6 + "\t*" // 7 + "\t*" // 8 + "\t*" // 9 + "\t*" // 10 + "\t*" // 11 + "\t*" // 12 + "\t*" // 13 + "\t*" // 14 + "\t*" // 15 + "\tN" // 16 + "\n" + ); + return; + } + + fprintf(f, "%.4f", Hit.Score); // 0 + + fputc('\t', f); + fputs(Hit.QLabel.c_str(), f); // 1 + + fputc('\t', f); + fputs(Hit.ALabel.c_str(), f); // 2 + + fputc('\t', f); + fputs(Hit.BLabel.c_str(), f); // 3 + + fprintf(f, "\t%.1f", Hit.PctIdQM); // 4 + fprintf(f, "\t%.1f", Hit.PctIdQA); // 5 + fprintf(f, "\t%.1f", Hit.PctIdQB); // 6 + fprintf(f, "\t%.1f", Hit.PctIdAB); // 7 + fprintf(f, "\t%.1f", Hit.PctIdQT); // 8 + + fprintf(f, "\t%u", Hit.CS_LY); // 9 + fprintf(f, "\t%u", Hit.CS_LN); // 10 + fprintf(f, "\t%u", Hit.CS_LA); // 11 + + fprintf(f, "\t%u", Hit.CS_RY); // 12 + fprintf(f, "\t%u", Hit.CS_RN); // 13 + fprintf(f, "\t%u", Hit.CS_RA); // 14 + + fprintf(f, "\t%.2f", Hit.Div); // 15 + + fprintf(f, "\t%c", yon(Hit.Accept())); // 16 + fputc('\n', f); + } + +unsigned GetUngappedLength(const byte *Seq, unsigned L) + { + unsigned UL = 0; + for (unsigned i = 0; i < L; ++i) + if (!isgap(Seq[i])) + ++UL; + return UL; + } + +void WriteChimeHitX(FILE *f, const ChimeHit2 &Hit) + { + if (f == 0) + return; + + if (Hit.Div <= 0.0) + return; + + const string &Q3 = Hit.Q3; + const string &A3 = Hit.A3; + const string &B3 = Hit.B3; + + const byte *Q3Seq = (const byte *) Q3.c_str(); + const byte *A3Seq = (const byte *) A3.c_str(); + const byte *B3Seq = (const byte *) B3.c_str(); + +// Aligned + unsigned ColCount = SIZE(Q3); + asserta(SIZE(A3) == ColCount && SIZE(B3) == ColCount); + + 
unsigned LQ = GetUngappedLength(Q3Seq, ColCount); + unsigned LA = GetUngappedLength(A3Seq, ColCount); + unsigned LB = GetUngappedLength(B3Seq, ColCount); + + fprintf(f, "\n"); + fprintf(f, "------------------------------------------------------------------------\n"); + fprintf(f, "Query (%5u nt) %s\n", LQ, Hit.QLabel.c_str()); + fprintf(f, "ParentA (%5u nt) %s\n", LA, Hit.ALabel.c_str()); + fprintf(f, "ParentB (%5u nt) %s\n", LB, Hit.BLabel.c_str()); + +// Strip terminal gaps in query + unsigned FromCol = UINT_MAX; + unsigned ToCol = UINT_MAX; + for (unsigned Col = 0; Col < ColCount; ++Col) + { + if (!isgap(Q3Seq[Col])) + { + if (FromCol == UINT_MAX) + FromCol = Col; + ToCol = Col; + } + } + + unsigned QPos = 0; + unsigned APos = 0; + unsigned BPos = 0; + for (unsigned Col = 0; Col < FromCol; ++Col) + { + if (!isgap(A3Seq[Col])) + ++APos; + if (!isgap(B3Seq[Col])) + ++BPos; + } + + unsigned Range = ToCol - FromCol + 1; + unsigned RowCount = (Range + 79)/80; + unsigned RowFromCol = FromCol; + for (unsigned RowIndex = 0; RowIndex < RowCount; ++RowIndex) + { + fprintf(f, "\n"); + unsigned RowToCol = RowFromCol + 79; + if (RowToCol > ToCol) + RowToCol = ToCol; + + // A row + fprintf(f, "A %5u ", APos + 1); + for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col) + { + char q = Q3Seq[Col]; + char a = A3Seq[Col]; + if (a != q) + a = tolower(a); + fprintf(f, "%c", a); + if (!isgap(a)) + ++APos; + } + fprintf(f, " %u\n", APos); + + // Q row + fprintf(f, "Q %5u ", QPos + 1); + for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col) + { + char q = Q3Seq[Col]; + fprintf(f, "%c", q); + if (!isgap(q)) + ++QPos; + } + fprintf(f, " %u\n", QPos); + + // B row + fprintf(f, "B %5u ", BPos + 1); + for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col) + { + char q = Q3Seq[Col]; + char b = B3Seq[Col]; + if (b != q) + b = tolower(b); + fprintf(f, "%c", b); + if (!isgap(b)) + ++BPos; + } + fprintf(f, " %u\n", BPos); + + // Diffs + fprintf(f, "Diffs "); + for (unsigned Col = 
RowFromCol; Col <= RowToCol; ++Col) + { + char q = Q3Seq[Col]; + char a = A3Seq[Col]; + char b = B3Seq[Col]; + + char c = ' '; + if (isgap(q) || isgap(a) || isgap(b)) + c = ' '; + else if (Col < Hit.ColXLo) + { + if (q == a && q == b) + c = ' '; + else if (q == a && q != b) + c = 'A'; + else if (q == b && q != a) + c = 'b'; + else if (a == b && q != a) + c = 'N'; + else + c = '?'; + } + else if (Col > Hit.ColXHi) + { + if (q == a && q == b) + c = ' '; + else if (q == b && q != a) + c = 'B'; + else if (q == a && q != b) + c = 'a'; + else if (a == b && q != a) + c = 'N'; + else + c = '?'; + } + + fprintf(f, "%c", c); + } + fprintf(f, "\n"); + + // SNPs + fprintf(f, "Votes "); + for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col) + { + char q = Q3Seq[Col]; + char a = A3Seq[Col]; + char b = B3Seq[Col]; + + bool PrevGap = Col > 0 && (isgap(Q3Seq[Col-1]) || isgap(A3Seq[Col-1]) || isgap(B3Seq[Col-1])); + bool NextGap = Col+1 < ColCount && (isgap(Q3Seq[Col+1]) || isgap(A3Seq[Col+1]) || isgap(B3Seq[Col+1])); + + char c = ' '; + if (isgap(q) || isgap(a) || isgap(b) || PrevGap || NextGap) + c = ' '; + else if (Col < Hit.ColXLo) + { + if (q == a && q == b) + c = ' '; + else if (q == a && q != b) + c = '+'; + else if (q == b && q != a) + c = '!'; + else + c = '0'; + } + else if (Col > Hit.ColXHi) + { + if (q == a && q == b) + c = ' '; + else if (q == b && q != a) + c = '+'; + else if (q == a && q != b) + c = '!'; + else + c = '0'; + } + + fprintf(f, "%c", c); + } + fprintf(f, "\n"); + + // LR row + fprintf(f, "Model "); + for (unsigned Col = RowFromCol; Col <= RowToCol; ++Col) + { + if (Col < Hit.ColXLo) + fprintf(f, "A"); + else if (Col >= Hit.ColXLo && Col <= Hit.ColXHi) + fprintf(f, "x"); + else + fprintf(f, "B"); + } + + fprintf(f, "\n"); + + RowFromCol += 80; + } + fprintf(f, "\n"); + + double PctIdBestP = max(Hit.PctIdQA, Hit.PctIdQB); + double Div = (Hit.PctIdQM - PctIdBestP)*100.0/PctIdBestP; + + unsigned LTot = Hit.CS_LY + Hit.CS_LN + Hit.CS_LA; + unsigned RTot = 
Hit.CS_RY + Hit.CS_RN + Hit.CS_RA; + + double PctL = Pct(Hit.CS_LY, LTot); + double PctR = Pct(Hit.CS_RY, RTot); + + fprintf(f, + "Ids. QA %.1f%%, QB %.1f%%, AB %.1f%%, QModel %.1f%%, Div. %+.1f%%\n", + Hit.PctIdQA, + Hit.PctIdQB, + Hit.PctIdAB, + Hit.PctIdQM, + Div); + + fprintf(f, + "Diffs Left %u: N %u, A %u, Y %u (%.1f%%); Right %u: N %u, A %u, Y %u (%.1f%%), Score %.4f\n", + LTot, Hit.CS_LN, Hit.CS_LA, Hit.CS_LY, PctL, + RTot, Hit.CS_RN, Hit.CS_RA, Hit.CS_RY, PctR, + Hit.Score); + }