From a2cde58c1e72199498a2142983ef040dce36da10 Mon Sep 17 00:00:00 2001 From: Sarah Westcott Date: Thu, 9 Jan 2014 09:44:50 -0500 Subject: [PATCH] added Jensen-Shannon calc. working on get.communitytype command. fixed bug in get.oturep command with count file. fixed bug in trim.flows that caused issue when you read back in scrap flow file. added log transform parameter to trims.seqs command and allow non paired barcodes and primers to use reorient. fixed bug in lefse when kruskal returned nan. --- Mothur.xcodeproj/project.pbxproj | 42 ++- collectsharedcommand.cpp | 9 +- communitytype.cpp | 512 ++++++++++++++++++++++++++++++ communitytype.h | 75 +++++ flowdata.cpp | 13 +- getmetacommunitycommand.cpp | 474 +++++++++++++++++++++------ getmetacommunitycommand.h | 161 +++------- getoturepcommand.cpp | 6 +- kmeans.cpp | 237 ++++++++++++++ kmeans.h | 32 ++ linearalgebra.cpp | 12 +- linearalgebra.h | 2 +- matrixoutputcommand.cpp | 7 +- matrixoutputcommand.h | 4 + metastatscommand.cpp | 16 +- pam.cpp | 373 ++++++++++++++++++++++ pam.h | 40 +++ qFinderDMM.cpp | 528 +++---------------------------- qFinderDMM.h | 45 +-- qualityscores.cpp | 41 ++- qualityscores.h | 8 +- seqsummarycommand.cpp | 4 +- shannonrange.cpp | 35 ++ shannonrange.h | 29 ++ sharedjsd.cpp | 48 +++ sharedjsd.h | 30 ++ summarysharedcommand.cpp | 4 +- summarysharedcommand.h | 3 + trimflowscommand.cpp | 3 +- trimseqscommand.cpp | 35 +- trimseqscommand.h | 26 +- validcalculator.cpp | 5 + 32 files changed, 2085 insertions(+), 774 deletions(-) create mode 100644 communitytype.cpp create mode 100644 communitytype.h create mode 100644 kmeans.cpp create mode 100644 kmeans.h create mode 100644 pam.cpp create mode 100644 pam.h create mode 100644 shannonrange.cpp create mode 100644 shannonrange.h create mode 100644 sharedjsd.cpp create mode 100644 sharedjsd.h diff --git a/Mothur.xcodeproj/project.pbxproj b/Mothur.xcodeproj/project.pbxproj index f3b6a09..26456a3 100644 --- a/Mothur.xcodeproj/project.pbxproj +++ b/Mothur.xcodeproj/project.pbxproj @@ -18,6 +18,7 @@ A70056EB156AB6E500924A2D /* removeotulabelscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A70056EA156AB6E500924A2D /* removeotulabelscommand.cpp */; }; A70332B712D3A13400761E33 /* makefile in Sources */ = {isa = PBXBuildFile; fileRef = A70332B512D3A13400761E33 /* makefile */; }; A7128B1D16B7002A00723BE4 /* getdistscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7128B1C16B7002600723BE4 /* getdistscommand.cpp */; }; + A7132EB3184E792700AAA402 /* communitytype.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7132EB2184E792700AAA402 /* communitytype.cpp */; }; A713EBAC12DC7613000092AC /* readphylipvector.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A713EBAB12DC7613000092AC /* readphylipvector.cpp */; }; A713EBED12DC7C5E000092AC /* nmdscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A713EBEC12DC7C5E000092AC /* nmdscommand.cpp */; }; A7190B221768E0DF00A9AFA6 /* lefsecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7190B201768E0DF00A9AFA6 /* lefsecommand.cpp */; }; @@ -29,6 +30,7 @@ A721AB71161C572A009860A1 /* kmernode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB6D161C572A009860A1 /* kmernode.cpp */; }; A721AB72161C572A009860A1 /* kmertree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB6F161C572A009860A1 /* kmertree.cpp */; }; A721AB77161C573B009860A1 /* taxonomynode.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A721AB73161C573B009860A1 /* taxonomynode.cpp */; }; + A7222D731856277C0055A993 /* sharedjsd.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7222D721856277C0055A993 /* sharedjsd.cpp */; }; A724D2B7153C8628000A826F /* makebiomcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A724D2B6153C8628000A826F /* makebiomcommand.cpp */; }; A727864412E9E28C00F86ABA /* removerarecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A727864312E9E28C00F86ABA /* removerarecommand.cpp */; }; A7386C251619E52300651424 /* abstractdecisiontree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7386C241619E52200651424 /* abstractdecisiontree.cpp */; }; @@ -70,16 +72,19 @@ A79EEF8616971D4A0006DEC1 /* filtersharedcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A79EEF8516971D4A0006DEC1 /* filtersharedcommand.cpp */; }; A7A0671A1562946F0095C8C5 /* listotulabelscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A067191562946F0095C8C5 /* listotulabelscommand.cpp */; }; A7A0671F1562AC3E0095C8C5 /* makecontigscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A0671E1562AC3E0095C8C5 /* makecontigscommand.cpp */; }; + A7A09B1018773C0E00FAA081 /* shannonrange.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A09B0F18773C0E00FAA081 /* shannonrange.cpp */; }; A7A32DAA14DC43B00001D2E5 /* sortseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A32DA914DC43B00001D2E5 /* sortseqscommand.cpp */; }; A7A3C8C914D041AD00B1BFBE /* otuassociationcommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A3C8C714D041AD00B1BFBE /* otuassociationcommand.cpp */; }; A7A61F2D130062E000E05B6B /* amovacommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7A61F2C130062E000E05B6B /* amovacommand.cpp */; }; A7B0231516B8244C006BA09E /* removedistscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B0231416B8244B006BA09E /* removedistscommand.cpp */; }; + A7B093C018579F0400843CD1 /* pam.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7B093BF18579F0400843CD1 /* pam.cpp */; }; A7BF221414587886000AD524 /* myPerseus.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7BF221214587886000AD524 /* myPerseus.cpp */; }; A7BF2232145879B2000AD524 /* chimeraperseuscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7BF2231145879B2000AD524 /* chimeraperseuscommand.cpp */; }; A7C3DC0B14FE457500FE1924 /* cooccurrencecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0914FE457500FE1924 /* cooccurrencecommand.cpp */; }; A7C3DC0F14FE469500FE1924 /* trialSwap2.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C3DC0D14FE469500FE1924 /* trialSwap2.cpp */; }; A7C7DAB915DA758B0059B0CF /* sffmultiplecommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7C7DAB815DA758B0059B0CF /* sffmultiplecommand.cpp */; }; A7CFA4311755401800D9ED4D /* renameseqscommand.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7CFA4301755401800D9ED4D /* renameseqscommand.cpp */; }; + A7D395C4184FA3A200A350D7 /* kmeans.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D395C3184FA3A200A350D7 /* kmeans.cpp */; }; A7D755DA1535F679009BF21A /* treereader.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D755D91535F679009BF21A /* treereader.cpp */; }; A7D9378A17B146B5001E90B0 /* wilcox.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7D9378917B146B5001E90B0 /* wilcox.cpp */; }; A7E0243D15B4520A00A5F046 /* sparsedistancematrix.cpp in Sources */ = {isa = PBXBuildFile; fileRef = A7E0243C15B4520A00A5F046 /* sparsedistancematrix.cpp */; }; @@ -418,6 +423,8 @@ A70332B512D3A13400761E33 /* makefile */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.make; path = makefile; sourceTree = ""; }; A7128B1A16B7001200723BE4 /* getdistscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = getdistscommand.h; sourceTree = ""; }; A7128B1C16B7002600723BE4 /* getdistscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = getdistscommand.cpp; sourceTree = ""; }; + A7132EAE184E76EB00AAA402 /* communitytype.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = communitytype.h; sourceTree = ""; }; + A7132EB2184E792700AAA402 /* communitytype.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = communitytype.cpp; sourceTree = ""; }; A713EBAA12DC7613000092AC /* readphylipvector.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = readphylipvector.h; sourceTree = ""; }; A713EBAB12DC7613000092AC /* readphylipvector.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = readphylipvector.cpp; sourceTree = ""; }; A713EBEB12DC7C5E000092AC /* nmdscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = nmdscommand.h; sourceTree = ""; }; @@ -440,6 +447,8 @@ A721AB70161C572A009860A1 /* kmertree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = kmertree.h; sourceTree = ""; }; A721AB73161C573B009860A1 /* taxonomynode.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = taxonomynode.cpp; sourceTree = ""; }; A721AB74161C573B009860A1 /* taxonomynode.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = taxonomynode.h; sourceTree = ""; }; + A7222D711856276C0055A993 /* sharedjsd.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = sharedjsd.h; sourceTree = ""; }; + A7222D721856277C0055A993 /* sharedjsd.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sharedjsd.cpp; sourceTree = ""; }; A724D2B4153C8600000A826F /* makebiomcommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makebiomcommand.h; sourceTree = ""; }; A724D2B6153C8628000A826F /* makebiomcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makebiomcommand.cpp; sourceTree = ""; }; A727864212E9E28C00F86ABA /* removerarecommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removerarecommand.h; sourceTree = ""; }; @@ -523,6 +532,8 @@ A7A0671C156294810095C8C5 /* listotulabelscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = listotulabelscommand.h; sourceTree = ""; }; A7A0671D1562AC230095C8C5 /* makecontigscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = makecontigscommand.h; sourceTree = ""; }; A7A0671E1562AC3E0095C8C5 /* makecontigscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = makecontigscommand.cpp; sourceTree = ""; }; + A7A09B0E18773BF700FAA081 /* shannonrange.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = shannonrange.h; sourceTree = ""; }; + A7A09B0F18773C0E00FAA081 /* shannonrange.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = shannonrange.cpp; sourceTree = ""; }; A7A32DA914DC43B00001D2E5 /* sortseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sortseqscommand.cpp; sourceTree = ""; }; A7A32DAC14DC43D10001D2E5 /* sortseqscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = sortseqscommand.h; sourceTree = ""; }; A7A3C8C714D041AD00B1BFBE /* otuassociationcommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = otuassociationcommand.cpp; sourceTree = ""; }; @@ -533,6 +544,8 @@ A7AACFBA132FE008003D6C4D /* currentfile.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = currentfile.h; sourceTree = ""; }; A7B0231416B8244B006BA09E /* removedistscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = removedistscommand.cpp; sourceTree = ""; }; A7B0231716B8245D006BA09E /* removedistscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = removedistscommand.h; sourceTree = ""; }; + A7B093BE18579EF600843CD1 /* pam.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = pam.h; sourceTree = ""; }; + A7B093BF18579F0400843CD1 /* pam.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = pam.cpp; sourceTree = ""; }; A7BF221214587886000AD524 /* myPerseus.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = myPerseus.cpp; sourceTree = ""; }; A7BF221314587886000AD524 /* myPerseus.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = myPerseus.h; sourceTree = ""; }; A7BF2230145879B2000AD524 /* chimeraperseuscommand.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = chimeraperseuscommand.h; sourceTree = ""; }; @@ -545,6 +558,8 @@ A7C7DAB815DA758B0059B0CF /* sffmultiplecommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = sffmultiplecommand.cpp; sourceTree = ""; }; A7CFA42F1755400500D9ED4D /* renameseqscommand.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = renameseqscommand.h; sourceTree = ""; }; A7CFA4301755401800D9ED4D /* renameseqscommand.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = renameseqscommand.cpp; sourceTree = ""; }; + A7D395C2184FA39300A350D7 /* kmeans.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = kmeans.h; sourceTree = ""; }; + A7D395C3184FA3A200A350D7 /* kmeans.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = kmeans.cpp; sourceTree = ""; }; A7D755D71535F665009BF21A /* treereader.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = treereader.h; sourceTree = ""; }; A7D755D91535F679009BF21A /* treereader.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = treereader.cpp; sourceTree = ""; }; A7D9378917B146B5001E90B0 /* wilcox.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = wilcox.cpp; sourceTree = ""; }; @@ -1178,6 +1193,7 @@ A7E9B6B112D37EC400DA6239 /* commandoptionparser.cpp */, A7E9B6B212D37EC400DA6239 /* commandoptionparser.hpp */, A7DAAFA3133A254E003956EB /* commandparameter.h */, + A7D395C1184FA34300A350D7 /* communitytype */, A7E9B6B412D37EC400DA6239 /* completelinkage.cpp */, A7E9BA4212D3960D00DA6239 /* containers */, A7E9B6B612D37EC400DA6239 /* consensus.h */, @@ -1229,8 +1245,6 @@ A7E9B77C12D37EC400DA6239 /* overlap.hpp */, A7E9B79B12D37EC400DA6239 /* progress.cpp */, A7E9B79C12D37EC400DA6239 /* progress.hpp */, - A7548FAF171440ED00B1F05A /* qFinderDMM.h */, - A7548FAE171440EC00B1F05A /* qFinderDMM.cpp */, A77B7187173D4041002163C2 /* randomnumber.h */, A77B7186173D4041002163C2 /* randomnumber.cpp */, A7E9B7A512D37EC400DA6239 /* rarecalc.cpp */, @@ -1318,6 +1332,21 @@ name = fortran; sourceTree = ""; }; + A7D395C1184FA34300A350D7 /* communitytype */ = { + isa = PBXGroup; + children = ( + A7132EAE184E76EB00AAA402 /* communitytype.h */, + A7132EB2184E792700AAA402 /* communitytype.cpp */, + A7D395C2184FA39300A350D7 /* kmeans.h */, + A7D395C3184FA3A200A350D7 /* kmeans.cpp */, + A7B093BE18579EF600843CD1 /* pam.h */, + A7B093BF18579F0400843CD1 /* pam.cpp */, + A7548FAF171440ED00B1F05A /* qFinderDMM.h */, + A7548FAE171440EC00B1F05A /* qFinderDMM.cpp */, + ); + name = communitytype; + sourceTree = ""; + }; A7E9BA3812D3956100DA6239 /* commands */ = { isa = PBXGroup; children = ( @@ -1683,6 +1712,8 @@ A7E9B7E612D37EC400DA6239 /* shannon.h */, A7E9B7E712D37EC400DA6239 /* shannoneven.cpp */, A7E9B7E812D37EC400DA6239 /* shannoneven.h */, + A7A09B0E18773BF700FAA081 /* shannonrange.h */, + A7A09B0F18773C0E00FAA081 /* shannonrange.cpp */, A7E9B7E912D37EC400DA6239 /* sharedace.cpp */, A7E9B7EA12D37EC400DA6239 /* sharedace.h */, A7E9B7EC12D37EC400DA6239 /* sharedanderbergs.cpp */, @@ -1699,6 +1730,8 @@ A7E9B7F912D37EC400DA6239 /* sharedjclass.h */, A7E9B7FA12D37EC400DA6239 /* sharedjest.cpp */, A7E9B7FB12D37EC400DA6239 /* sharedjest.h */, + A7222D711856276C0055A993 /* sharedjsd.h */, + A7222D721856277C0055A993 /* sharedjsd.cpp */, A7E9B7FC12D37EC400DA6239 /* sharedkstest.cpp */, A7E9B7FD12D37EC400DA6239 /* sharedkstest.h */, A7E9B7FE12D37EC400DA6239 /* sharedkulczynski.cpp */, @@ -2400,6 +2433,11 @@ A7D9378A17B146B5001E90B0 /* wilcox.cpp in Sources */, A7F24FC317EA36600021DC9A /* classifyrfsharedcommand.cpp in Sources */, A747EC71181EA0F900345732 /* sracommand.cpp in Sources */, + A7132EB3184E792700AAA402 /* communitytype.cpp in Sources */, + A7D395C4184FA3A200A350D7 /* kmeans.cpp in Sources */, + A7222D731856277C0055A993 /* sharedjsd.cpp in Sources */, + A7B093C018579F0400843CD1 /* pam.cpp in Sources */, + A7A09B1018773C0E00FAA081 /* shannonrange.cpp in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; diff --git a/collectsharedcommand.cpp b/collectsharedcommand.cpp index 40ee3d2..56e60d8 100644 --- a/collectsharedcommand.cpp +++ b/collectsharedcommand.cpp @@ -49,6 +49,7 @@ #include "memchord.h" #include "memeuclidean.h" #include "mempearson.h" +#include "sharedjsd.h" //********************************************************************************************************************** @@ -57,7 +58,7 @@ vector CollectSharedCommand::setParameters(){ CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pshared); CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel); CommandParameter pfreq("freq", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pfreq); - CommandParameter pcalc("calc", "Multiple", "sharedchao-sharedsobs-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-whittaker-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-structchi2-hamming-gower-memchi2-memchord-memeuclidean-mempearson", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan", "", "", "","",true,false,true); parameters.push_back(pcalc); + CommandParameter pcalc("calc", "Multiple", "sharedchao-sharedsobs-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-whittaker-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-structchi2-hamming-gower-memchi2-memchord-memeuclidean-mempearson-jsd", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan", "", "", "","",true,false,true); parameters.push_back(pcalc); CommandParameter pall("all", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pall); CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); @@ -141,6 +142,7 @@ string CollectSharedCommand::getOutputPattern(string type) { else if (type == "memchord") { pattern = "[filename],memchord"; } else if (type == "memeuclidean") { pattern = "[filename],memeuclidean"; } else if (type == "mempearson") { pattern = "[filename],mempearson"; } + else if (type == "jsd") { pattern = "[filename],jsd"; } else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; } return pattern; @@ -196,6 +198,7 @@ CollectSharedCommand::CollectSharedCommand(){ outputTypes["memchord"] = tempOutNames; outputTypes["memeuclidean"] = tempOutNames; outputTypes["mempearson"] = tempOutNames; + outputTypes["jsd"] = tempOutNames; } catch(exception& e) { @@ -268,6 +271,7 @@ CollectSharedCommand::CollectSharedCommand(string option) { outputTypes["memchord"] = tempOutNames; outputTypes["memeuclidean"] = tempOutNames; outputTypes["mempearson"] = tempOutNames; + outputTypes["jsd"] = tempOutNames; //if the user changes the input directory command factory will send this info to us in the output parameter @@ -460,6 +464,9 @@ CollectSharedCommand::CollectSharedCommand(string option) { }else if (Estimators[i] == "mempearson") { cDisplays.push_back(new CollectDisplay(new MemPearson(), new SharedOneColumnFile(getOutputFileName("mempearson", variables)))); outputNames.push_back(getOutputFileName("mempearson", variables)); outputTypes["mempearson"].push_back(getOutputFileName("mempearson", variables)); + }else if (Estimators[i] == "jsd") { + cDisplays.push_back(new CollectDisplay(new JSD(), new SharedOneColumnFile(getOutputFileName("jsd", variables)))); + outputNames.push_back(getOutputFileName("jsd", variables)); outputTypes["jsd"].push_back(getOutputFileName("jsd", variables)); } } diff --git a/communitytype.cpp b/communitytype.cpp new file mode 100644 index 0000000..f88932c --- /dev/null +++ b/communitytype.cpp @@ -0,0 +1,512 @@ +// +// communitytype.cpp +// Mothur +// +// Created by SarahsWork on 12/3/13. +// Copyright (c) 2013 Schloss Lab. All rights reserved. +// + +#include "communitytype.h" + +/**************************************************************************************************/ + +//can we get these psi/psi1 calculations into their own math class? +//psi calcualtions swiped from gsl library... + +static const double psi_cs[23] = { + -.038057080835217922, + .491415393029387130, + -.056815747821244730, + .008357821225914313, + -.001333232857994342, + .000220313287069308, + -.000037040238178456, + .000006283793654854, + -.000001071263908506, + .000000183128394654, + -.000000031353509361, + .000000005372808776, + -.000000000921168141, + .000000000157981265, + -.000000000027098646, + .000000000004648722, + -.000000000000797527, + .000000000000136827, + -.000000000000023475, + .000000000000004027, + -.000000000000000691, + .000000000000000118, + -.000000000000000020 +}; + +static double apsi_cs[16] = { + -.0204749044678185, + -.0101801271534859, + .0000559718725387, + -.0000012917176570, + .0000000572858606, + -.0000000038213539, + .0000000003397434, + -.0000000000374838, + .0000000000048990, + -.0000000000007344, + .0000000000001233, + -.0000000000000228, + .0000000000000045, + -.0000000000000009, + .0000000000000002, + -.0000000000000000 +}; + +/**************************************************************************************************/ +/* coefficients for Maclaurin summation in hzeta() + * B_{2j}/(2j)! + */ +static double hzeta_c[15] = { + 1.00000000000000000000000000000, + 0.083333333333333333333333333333, + -0.00138888888888888888888888888889, + 0.000033068783068783068783068783069, + -8.2671957671957671957671957672e-07, + 2.0876756987868098979210090321e-08, + -5.2841901386874931848476822022e-10, + 1.3382536530684678832826980975e-11, + -3.3896802963225828668301953912e-13, + 8.5860620562778445641359054504e-15, + -2.1748686985580618730415164239e-16, + 5.5090028283602295152026526089e-18, + -1.3954464685812523340707686264e-19, + 3.5347070396294674716932299778e-21, + -8.9535174270375468504026113181e-23 +}; + +/**************************************************************************************************/ +void CommunityTypeFinder::printSilData(ofstream& out, double chi, vector sils){ + try { + out << setprecision (6) << numPartitions << '\t' << chi << '\t'; + for (int i = 0; i < sils.size(); i++) { + out << sils[i] << '\t'; + } + out << endl; + + return; + } + catch(exception& e){ + m->errorOut(e, "CommunityTypeFinder", "printSilData"); + exit(1); + } +} +/**************************************************************************************************/ +void CommunityTypeFinder::printSilData(ostream& out, double chi, vector sils){ + try { + out << setprecision (6) << numPartitions << '\t' << chi << '\t'; + m->mothurOutJustToLog(toString(numPartitions) + '\t' + toString(chi) + '\t'); + for (int i = 0; i < sils.size(); i++) { + out << sils[i] << '\t'; + m->mothurOutJustToLog(toString(sils[i]) + '\t'); + } + out << endl; + m->mothurOutJustToLog("\n"); + + return; + } + catch(exception& e){ + m->errorOut(e, "CommunityTypeFinder", "printSilData"); + exit(1); + } +} +/**************************************************************************************************/ + +void CommunityTypeFinder::printZMatrix(string fileName, vector sampleName){ + try { + ofstream printMatrix; + m->openOutputFile(fileName, printMatrix); //(fileName.c_str()); + printMatrix.setf(ios::fixed, ios::floatfield); + printMatrix.setf(ios::showpoint); + + for(int i=0;ierrorOut(e, "CommunityTypeFinder", "printZMatrix"); + exit(1); + } +} + +/**************************************************************************************************/ + +void CommunityTypeFinder::printRelAbund(string fileName, vector otuNames){ + try { + ofstream printRA; + m->openOutputFile(fileName, printRA); //(fileName.c_str()); + printRA.setf(ios::fixed, ios::floatfield); + printRA.setf(ios::showpoint); + + vector totals(numPartitions, 0.0000); + for(int i=0;icontrol_pressed) { break; } + + printRA << otuNames[i]; + for(int j=0;j= 0.0000){ + double std = sqrt(error[j][i]); + printRA << '\t' << 100 * exp(lambdaMatrix[j][i]) / totals[j]; + printRA << '\t' << 100 * exp(lambdaMatrix[j][i] - 2.0 * std) / totals[j]; + printRA << '\t' << 100 * exp(lambdaMatrix[j][i] + 2.0 * std) / totals[j]; + } + else{ + printRA << '\t' << 100 * exp(lambdaMatrix[j][i]) / totals[j]; + printRA << '\t' << "NA"; + printRA << '\t' << "NA"; + } + } + printRA << endl; + } + + printRA.close(); + } + catch(exception& e) { + m->errorOut(e, "CommunityTypeFinder", "printRelAbund"); + exit(1); + } +} + +/**************************************************************************************************/ + +vector > CommunityTypeFinder::getHessian(){ + try { + vector alpha(numOTUs, 0.0000); + double alphaSum = 0.0000; + + vector pi = zMatrix[currentPartition]; + vector psi_ajk(numOTUs, 0.0000); + vector psi_cjk(numOTUs, 0.0000); + vector psi1_ajk(numOTUs, 0.0000); + vector psi1_cjk(numOTUs, 0.0000); + + for(int j=0;jcontrol_pressed) { break; } + + alpha[j] = exp(lambdaMatrix[currentPartition][j]); + alphaSum += alpha[j]; + + for(int i=0;icontrol_pressed) { break; } + weight += pi[i]; + double sum = 0.0000; + for(int j=0;j > hessian(numOTUs); + for(int i=0;icontrol_pressed) { break; } + double term1 = -alpha[i] * (- psi_ajk[i] + psi_Ak + psi_cjk[i] - psi_Ck); + double term2 = -alpha[i] * alpha[i] * (-psi1_ajk[i] + psi1_Ak + psi1_cjk[i] - psi1_Ck); + double term3 = 0.1 * alpha[i]; + + hessian[i][i] = term1 + term2 + term3; + + for(int j=0;jerrorOut(e, "CommunityTypeFinder", "getHessian"); + exit(1); + } +} +/**************************************************************************************************/ + +double CommunityTypeFinder::psi1(double xx){ + try { + + /* Euler-Maclaurin summation formula + * [Moshier, p. 400, with several typo corrections] + */ + + double s = 2.0000; + const int jmax = 12; + const int kmax = 10; + int j, k; + const double pmax = pow(kmax + xx, -s); + double scp = s; + double pcp = pmax / (kmax + xx); + double value = pmax*((kmax+xx)/(s-1.0) + 0.5); + + for(k=0; kcontrol_pressed) { return 0; } + value += pow(k + xx, -s); + } + + for(j=0; j<=jmax; j++) { + if (m->control_pressed) { return 0; } + double delta = hzeta_c[j+1] * scp * pcp; + value += delta; + + if(fabs(delta/value) < 0.5*EPSILON) break; + + scp *= (s+2*j+1)*(s+2*j+2); + pcp /= (kmax + xx)*(kmax + xx); + } + + return value; + } + catch(exception& e){ + m->errorOut(e, "CommunityTypeFinder", "psi1"); + exit(1); + } +} + +/**************************************************************************************************/ + +double CommunityTypeFinder::psi(double xx){ + try { + double psiX = 0.0000; + + if(xx < 1.0000){ + + double t1 = 1.0 / xx; + psiX = cheb_eval(psi_cs, 22, 2.0*xx-1.0); + psiX = -t1 + psiX; + + } + else if(xx < 2.0000){ + + const double v = xx - 1.0; + psiX = cheb_eval(psi_cs, 22, 2.0*v-1.0); + + } + else{ + const double t = 8.0/(xx*xx)-1.0; + psiX = cheb_eval(apsi_cs, 15, t); + psiX += log(xx) - 0.5/xx; + } + + return psiX; + } + catch(exception& e){ + m->errorOut(e, "CommunityTypeFinder", "psi"); + exit(1); + } +} +/**************************************************************************************************/ + +double CommunityTypeFinder::cheb_eval(const double seriesData[], int order, double xx){ + try { + double d = 0.0000; + double dd = 0.0000; + + double x2 = xx * 2.0000; + + for(int j=order;j>=1;j--){ + if (m->control_pressed) { return 0; } + double temp = d; + d = x2 * d - dd + seriesData[j]; + dd = temp; + } + + d = xx * d - dd + 0.5 * seriesData[0]; + return d; + } + catch(exception& e){ + m->errorOut(e, "CommunityTypeFinder", "cheb_eval"); + exit(1); + } +} +/**************************************************************************************************/ + +int CommunityTypeFinder::findkMeans(){ + try { + error.resize(numPartitions); for (int i = 0; i < numPartitions; i++) { error[i].resize(numOTUs, 0.0); } + vector > relativeAbundance(numSamples); + vector > alphaMatrix; + + alphaMatrix.resize(numPartitions); + lambdaMatrix.resize(numPartitions); + for(int i=0;icontrol_pressed) { return 0; } + int groupTotal = 0; + + relativeAbundance[i].assign(numOTUs, 0.0); + + for(int j=0;j 1e-6 && iteration < maxIters){ + + if (m->control_pressed) { return 0; } + //calcualte average relative abundance + maxChange = 0.0000; + for(int i=0;i averageRelativeAbundance(numOTUs, 0); + for(int j=0;j maxChange){ maxChange = normChange; } + } + + + //calcualte distance between each sample in partition and the average relative abundance + for(int i=0;icontrol_pressed) { return 0; } + + double normalizationFactor = 0; + vector totalDistToPartition(numPartitions, 0); + + for(int j=0;jcontrol_pressed) { return 0; } + for(int j=0;j 0){ + lambdaMatrix[j][i] = log(alphaMatrix[j][i]); + } + else{ + lambdaMatrix[j][i] = -10.0; + } + } + } + + return 0; + } + catch(exception& e){ + m->errorOut(e, "CommunityTypeFinder", "kMeans"); + exit(1); + } +} + + +/**************************************************************************************************/ + diff --git a/communitytype.h b/communitytype.h new file mode 100644 index 0000000..59291f5 --- /dev/null +++ b/communitytype.h @@ -0,0 +1,75 @@ +// +// communitytype.h +// Mothur +// +// Created by SarahsWork on 12/3/13. +// Copyright (c) 2013 Schloss Lab. All rights reserved. +// + +#ifndef Mothur_communitytype_h +#define Mothur_communitytype_h + +#define EPSILON numeric_limits::epsilon() + + +#include "mothurout.h" +#include "linearalgebra.h" +/**************************************************************************************************/ + +class CommunityTypeFinder { + +public: + CommunityTypeFinder(){ m = MothurOut::getInstance(); } + virtual ~CommunityTypeFinder(){}; + + virtual void printZMatrix(string, vector); + virtual void printRelAbund(string, vector); + virtual void printFitData(ofstream&) {} + virtual void printFitData(ostream&, double) {} + virtual void printSilData(ofstream&, double, vector); + virtual void printSilData(ostream&, double, vector); + + virtual double getNLL() { return currNLL; } + virtual double getAIC() { return aic; } + virtual double getBIC() { return bic; } + virtual double getLogDet() { return logDeterminant; } + virtual double getLaplace() { return laplace; } + + virtual double calcCHIndex(vector< vector< double> >) {return 0;} //Calinski-Harabasz + virtual vector calcSilhouettes(vector< vector< double> >) { vector s; return s; } //if none provided by child class + + +protected: + + int findkMeans(); + vector > getHessian(); + double psi1(double); + double psi(double); + double cheb_eval(const double[], int, double); + + + MothurOut* m; + vector > zMatrix; + vector > lambdaMatrix; + vector > error; + vector > countMatrix; + vector weights; + + + + int numPartitions; + int numSamples; + int numOTUs; + int currentPartition; + + double currNLL, aic, bic, logDeterminant, laplace; + + +}; + +/**************************************************************************************************/ + + + + +#endif diff --git a/flowdata.cpp b/flowdata.cpp index 7d61f8c..2891281 100644 --- a/flowdata.cpp +++ b/flowdata.cpp @@ -40,12 +40,19 @@ FlowData::FlowData(int numFlows, float signal, float noise, int maxHomoP, string //********************************************************************************************************************** bool FlowData::getNext(ifstream& flowFile){ - try { + seqName = getSequenceName(flowFile); - flowFile >> endFlow; + if (m->debug) { m->mothurOut("[DEBUG]: flow = " + seqName + " "); } + flowFile >> endFlow; + if (m->debug) { m->mothurOut(toString(endFlow) + " "); } if (!m->control_pressed) { - for(int i=0;i> flowData[i]; } + if (m->debug) { m->mothurOut(" "); } + for(int i=0;i> flowData[i]; + if (m->debug) { m->mothurOut(toString(flowData[i]) + " "); } + } + if (m->debug) { m->mothurOut("\n"); } updateEndFlow(); translateFlow(); m->gobble(flowFile); diff --git a/getmetacommunitycommand.cpp b/getmetacommunitycommand.cpp index 08cd35a..b316074 100644 --- a/getmetacommunitycommand.cpp +++ b/getmetacommunitycommand.cpp @@ -7,7 +7,10 @@ // #include "getmetacommunitycommand.h" - +#include "communitytype.h" +#include "kmeans.h" +#include "validcalculator.h" +#include "subsample.h" //********************************************************************************************************************** vector GetMetaCommunityCommand::setParameters(){ @@ -15,13 +18,17 @@ vector GetMetaCommunityCommand::setParameters(){ CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none","outputType",false,true); parameters.push_back(pshared); CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups); CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel); + CommandParameter pcalc("calc", "Multiple", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-whittaker-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-hamming-structchi2-gower-memchi2-memchord-memeuclidean-mempearson-jsd", "jsd", "", "", "","",false,false,true); parameters.push_back(pcalc); + CommandParameter psubsample("subsample", "String", "", "", "", "", "","",false,false); parameters.push_back(psubsample); + CommandParameter piters("iters", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(piters); CommandParameter pminpartitions("minpartitions", "Number", "", "5", "", "", "","",false,false,true); parameters.push_back(pminpartitions); CommandParameter pmaxpartitions("maxpartitions", "Number", "", "100", "", "", "","",false,false,true); parameters.push_back(pmaxpartitions); CommandParameter poptimizegap("optimizegap", "Number", "", "3", "", "", "","",false,false,true); parameters.push_back(poptimizegap); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir); CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir); - + CommandParameter pmethod("method", "Multiple", "dmm-kmeans-pam", "dmm", "", "", "","",false,false,true); parameters.push_back(pmethod); + vector myArray; for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); } return myArray; @@ -35,9 +42,13 @@ vector GetMetaCommunityCommand::setParameters(){ string GetMetaCommunityCommand::getHelpString(){ try { string helpString = ""; - helpString += "The get.communitytype command parameters are shared, label, groups, minpartitions, maxpartitions, optimizegap and processors. The shared file is required. \n"; + helpString += "The get.communitytype command parameters are shared, method, label, groups, minpartitions, maxpartitions, optimizegap and processors. The shared file is required. \n"; helpString += "The label parameter is used to analyze specific labels in your input. labels are separated by dashes.\n"; helpString += "The groups parameter allows you to specify which of the groups in your shared file you would like analyzed. Group names are separated by dashes.\n"; + helpString += "The method parameter allows to select the method you would like to use. Options are dmm, kmeans and pam. Default=dmm.\n"; + helpString += "The calc parameter allows to select the calculator you would like to use to calculate the distance matrix used by the pam method. By default the jsd calculator is used.\n"; + helpString += "The iters parameter allows you to choose the number of times you would like to run the subsample while calculating the distance matirx for the pam method.\n"; + helpString += "The subsample parameter allows you to enter the size pergroup of the sample or you can set subsample=T and mothur will use the size of your smallest group while calculating the distance matrix for the pam method.\n"; helpString += "The minpartitions parameter is used to .... Default=5.\n"; helpString += "The maxpartitions parameter is used to .... Default=10.\n"; helpString += "The optimizegap parameter is used to .... Default=3.\n"; @@ -171,6 +182,37 @@ GetMetaCommunityCommand::GetMetaCommunityCommand(string option) { if(label != "all") { m->splitAtDash(label, labels); allLines = 0; } else { allLines = 1; } } + + method = validParameter.validFile(parameters, "method", false); + if (method == "not found") { method = "dmm"; } + + if ((method == "dmm") || (method == "kmeans") || (method == "pam")) { } + else { m->mothurOut("[ERROR]: " + method + " is not a valid method. Valid algorithms are dmm, kmeans and pam."); m->mothurOutEndLine(); abort = true; } + + calc = validParameter.validFile(parameters, "calc", false); + if (calc == "not found") { calc = "jsd"; } + else { + if (calc == "default") { calc = "jsd"; } + } + m->splitAtDash(calc, Estimators); + if (m->inUsersGroups("citation", Estimators)) { + ValidCalculators validCalc; validCalc.printCitations(Estimators); + //remove citation from list of calcs + for (int i = 0; i < Estimators.size(); i++) { if (Estimators[i] == "citation") { Estimators.erase(Estimators.begin()+i); break; } } + } + if (Estimators.size() != 1) { abort = true; m->mothurOut("[ERROR]: only one calculator is allowed.\n"); } + + temp = validParameter.validFile(parameters, "iters", false); if (temp == "not found") { temp = "1000"; } + m->mothurConvert(temp, iters); + + temp = validParameter.validFile(parameters, "subsample", false); if (temp == "not found") { temp = "F"; } + if (m->isNumeric1(temp)) { m->mothurConvert(temp, subsampleSize); subsample = true; } + else { + if (m->isTrue(temp)) { subsample = true; subsampleSize = -1; } //we will set it to smallest group later + else { subsample = false; } + } + + if (subsample == false) { iters = 0; } } } @@ -194,6 +236,35 @@ int GetMetaCommunityCommand::execute(){ set processedLabels; set userLabels = labels; + if (subsample) { + if (subsampleSize == -1) { //user has not set size, set size = smallest samples size + subsampleSize = lookup[0]->getNumSeqs(); + for (int i = 1; i < lookup.size(); i++) { + int thisSize = lookup[i]->getNumSeqs(); + + if (thisSize < subsampleSize) { subsampleSize = thisSize; } + } + }else { + m->clearGroups(); + Groups.clear(); + vector temp; + for (int i = 0; i < lookup.size(); i++) { + if (lookup[i]->getNumSeqs() < subsampleSize) { + m->mothurOut(lookup[i]->getGroup() + " contains " + toString(lookup[i]->getNumSeqs()) + ". Eliminating."); m->mothurOutEndLine(); + delete lookup[i]; + }else { + Groups.push_back(lookup[i]->getGroup()); + temp.push_back(lookup[i]); + } + } + lookup = temp; + m->setGroups(Groups); + } + + if (lookup.size() < 2) { m->mothurOut("You have not provided enough valid groups. I cannot run the command."); m->mothurOutEndLine(); m->control_pressed = true; return 0; } + } + + //as long as you are not at the end of the file or done wih the lines you want while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) { @@ -363,7 +434,12 @@ int GetMetaCommunityCommand::createProcesses(vector& thislo } //do my part - m->mothurOut("K\tNLE\t\tlogDet\tBIC\t\tAIC\t\tLaplace\n"); + if (method == "dmm") { m->mothurOut("K\tNLE\t\tlogDet\tBIC\t\tAIC\t\tLaplace\n"); } + else { + m->mothurOut("K\tCH\t"); + for (int i = 0; i < thislookup.size(); i++) { m->mothurOut(thislookup[i]->getGroup() + '\t'); } + m->mothurOut("\n"); + } minPartition = processDriver(thislookup, dividedPartitions[0], outputFileName, rels[0], matrix[0], doneFlags, 0); //force parent to wait until all the processes are done @@ -437,56 +513,6 @@ int GetMetaCommunityCommand::createProcesses(vector& thislo } #else - /* - vector pDataArray; - DWORD dwThreadIdArray[processors-1]; - HANDLE hThreadArray[processors-1]; - - //Create processor worker threads. - for( int i=1; i newLookup; - for (int k = 0; k < thislookup.size(); k++) { - SharedRAbundVector* temp = new SharedRAbundVector(); - temp->setLabel(thislookup[k]->getLabel()); - temp->setGroup(thislookup[k]->getGroup()); - newLookup.push_back(temp); - } - - //for each bin - for (int k = 0; k < thislookup[0]->getNumBins(); k++) { - if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; } - for (int j = 0; j < thislookup.size(); j++) { newLookup[j]->push_back(thislookup[j]->getAbundance(k), thislookup[j]->getGroup()); } - } - - processIDS.push_back(i); - - // Allocate memory for thread data. - metaCommunityData* tempMeta = new metaCommunityData(newLookup, m, dividedPartitions[i], outputFileName + toString(i), rels[i], matrix[i], minpartitions, maxpartitions, optimizegap); - pDataArray.push_back(tempMeta); - - hThreadArray[i] = CreateThread(NULL, 0, MyMetaCommunityThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]); - } - - //do my part - minPartition = processDriver(thislookup, dividedPartitions[0], outputFileName, rels[0], matrix[0]); - - //Wait until all threads have terminated. - WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE); - - //Close all thread handles and free memory allocations. - for(int i=0; i < pDataArray.size(); i++){ - if (pDataArray[i]->minPartition < minPartition) { minPartition = pDataArray[i]->minPartition; } - for (int j = 0; j < pDataArray[i]->outputNames.size(); j++) { - outputNames.push_back(pDataArray[i]->outputNames[j]); - } - m->appendFilesWithoutHeaders(outputFileName + toString(processIDS[i]), outputFileName); - m->mothurRemove(outputFileName + toString(processIDS[i])); - CloseHandle(hThreadArray[i]); - delete pDataArray[i]; - } */ - //do my part m->mothurOut("K\tNLE\t\tlogDet\tBIC\t\tAIC\t\tLaplace\n"); minPartition = processDriver(thislookup, dividedPartitions[0], outputFileName, rels[0], matrix[0], doneFlags, 0); #endif @@ -501,7 +527,8 @@ int GetMetaCommunityCommand::createProcesses(vector& thislo //run generate Summary function for smallest minPartition variables["[tag]"] = toString(minPartition); - generateSummaryFile(minPartition, variables); + vector piValues = generateDesignFile(minPartition, variables); + if (method == "dmm") { generateSummaryFile(minPartition, variables, piValues); } //pam doesn't make a relabund file return 0; @@ -516,12 +543,25 @@ int GetMetaCommunityCommand::processDriver(vector& thislook try { double minLaplace = 1e10; - int minPartition = 0; + int minPartition = 1; + vector minSilhouettes; minSilhouettes.resize(thislookup.size(), 0); + + ofstream fitData, silData; + if (method == "dmm") { + m->openOutputFile(outputFileName, fitData); + fitData.setf(ios::fixed, ios::floatfield); + fitData.setf(ios::showpoint); + fitData << "K\tNLE\tlogDet\tBIC\tAIC\tLaplace" << endl; + }else if((method == "pam") || (method == "kmeans")) { //because ch is looking of maximal value + minLaplace = 0; + m->openOutputFile(outputFileName, silData); + silData.setf(ios::fixed, ios::floatfield); + silData.setf(ios::showpoint); + silData << "K\tCH\t"; + for (int i = 0; i < thislookup.size(); i++) { silData << thislookup[i]->getGroup() << '\t'; } + silData << endl; + } - ofstream fitData; - m->openOutputFile(outputFileName, fitData); - fitData.setf(ios::fixed, ios::floatfield); - fitData.setf(ios::showpoint); cout.setf(ios::fixed, ios::floatfield); cout.setf(ios::showpoint); @@ -529,7 +569,18 @@ int GetMetaCommunityCommand::processDriver(vector& thislook vector thisGroups; for (int i = 0; i < thislookup.size(); i++) { sharedMatrix.push_back(thislookup[i]->getAbundances()); thisGroups.push_back(thislookup[i]->getGroup()); } - fitData << "K\tNLE\tlogDet\tBIC\tAIC\tLaplace" << endl; + vector< vector > dists; //do we want to output this matrix?? + if ((method == "pam") || (method == "kmeans")) { dists = generateDistanceMatrix(thislookup); } + + if (m->debug) { + m->mothurOut("[DEBUG]: dists = \n"); + for (int i = 0; i < dists.size(); i++) { + if (m->control_pressed) { break; } + m->mothurOut("[DEBUG]: i = " + toString(i) + '\t'); + for (int j = 0; j < i; j++) { m->mothurOut(toString(dists[i][j]) +"\t"); } + m->mothurOut("\n"); + } + } for(int i=0;i& thislook } } - qFinderDMM findQ(sharedMatrix, numPartitions); - - double laplace = findQ.getLaplace(); - m->mothurOut(toString(numPartitions) + '\t'); - cout << setprecision (2) << findQ.getNLL() << '\t' << findQ.getLogDet() << '\t'; - m->mothurOutJustToLog(toString(findQ.getNLL()) + '\t' + toString(findQ.getLogDet()) + '\t'); - cout << findQ.getBIC() << '\t' << findQ.getAIC() << '\t' << laplace; - m->mothurOutJustToLog(toString(findQ.getBIC()) + '\t' + toString(findQ.getAIC()) + '\t' + toString(laplace)); - - fitData << numPartitions << '\t'; - fitData << setprecision (2) << findQ.getNLL() << '\t' << findQ.getLogDet() << '\t'; - fitData << findQ.getBIC() << '\t' << findQ.getAIC() << '\t' << laplace << endl; - - if(laplace < minLaplace){ - minPartition = numPartitions; - minLaplace = laplace; - m->mothurOut("***"); + CommunityTypeFinder* finder = NULL; + if (method == "dmm") { finder = new qFinderDMM(sharedMatrix, numPartitions); } + else if (method == "kmeans") { finder = new KMeans(sharedMatrix, numPartitions); } + else if (method == "pam") { finder = new Pam(sharedMatrix, dists, numPartitions); } + else { + if (i == 0) { m->mothurOut(method + " is not a valid method option. I will run the command using dmm.\n"); } + finder = new qFinderDMM(sharedMatrix, numPartitions); } - m->mothurOutEndLine(); + double chi; vector silhouettes; + if (method == "dmm") { + double laplace = finder->getLaplace(); + if(laplace < minLaplace){ + minPartition = numPartitions; + minLaplace = laplace; + } + }else { + chi = finder->calcCHIndex(dists); + silhouettes = finder->calcSilhouettes(dists); + if (chi > minLaplace) { //save partition with maximum ch index score + minPartition = numPartitions; + minLaplace = chi; + minSilhouettes = silhouettes; + } + } string relabund = relabunds[i]; - outputNames.push_back(relabund); outputTypes["relabund"].push_back(relabund); string matrixName = matrix[i]; outputNames.push_back(matrixName); outputTypes["matrix"].push_back(matrixName); - findQ.printZMatrix(matrixName, thisGroups); - findQ.printRelAbund(relabund, m->currentSharedBinLabels); + finder->printZMatrix(matrixName, thisGroups); + + if (method == "dmm") { + finder->printFitData(cout, minLaplace); + finder->printFitData(fitData); + finder->printRelAbund(relabund, m->currentSharedBinLabels); + outputNames.push_back(relabund); outputTypes["relabund"].push_back(relabund); + }else if ((method == "pam") || (method == "kmeans")) { //print silouettes and ch values + finder->printSilData(cout, chi, silhouettes); + finder->printSilData(silData, chi, silhouettes); + if (method == "kmeans") { + finder->printRelAbund(relabund, m->currentSharedBinLabels); + outputNames.push_back(relabund); outputTypes["relabund"].push_back(relabund); + } + } + delete finder; if(optimizegap != -1 && (numPartitions - minPartition) >= optimizegap && numPartitions >= minpartitions){ string tempDoneFile = m->getRootName(m->getSimpleName(sharedfile)) + toString(processID) + ".done.temp"; @@ -588,9 +657,7 @@ int GetMetaCommunityCommand::processDriver(vector& thislook break; } } - fitData.close(); - - //minPartition = 4; + if (method == "dmm") { fitData.close(); } if (m->control_pressed) { return 0; } @@ -672,7 +739,7 @@ vector GetMetaCommunityCommand::generateDesignFile(int numPartitions, ma inline bool summaryFunction(summaryData i, summaryData j){ return i.difference > j.difference; } /**************************************************************************************************/ -int GetMetaCommunityCommand::generateSummaryFile(int numPartitions, map v){ +int GetMetaCommunityCommand::generateSummaryFile(int numPartitions, map v, vector piValues){ try { vector summary; @@ -683,9 +750,6 @@ int GetMetaCommunityCommand::generateSummaryFile(int numPartitions, map piValues = generateDesignFile(numPartitions, v); - ifstream referenceFile; map variables; variables["[filename]"] = v["[filename]"]; @@ -808,4 +872,236 @@ int GetMetaCommunityCommand::generateSummaryFile(int numPartitions, map > GetMetaCommunityCommand::generateDistanceMatrix(vector& thisLookup){ + try { + vector > results; + + Calculator* matrixCalculator; + ValidCalculators validCalculator; + int i = 0; + + if (validCalculator.isValidCalculator("matrix", Estimators[i]) == true) { + if (Estimators[i] == "sharedsobs") { + matrixCalculator = new SharedSobsCS(); + }else if (Estimators[i] == "sharedchao") { + matrixCalculator = new SharedChao1(); + }else if (Estimators[i] == "sharedace") { + matrixCalculator = new SharedAce(); + }else if (Estimators[i] == "jabund") { + matrixCalculator = new JAbund(); + }else if (Estimators[i] == "sorabund") { + matrixCalculator = new SorAbund(); + }else if (Estimators[i] == "jclass") { + matrixCalculator = new Jclass(); + }else if (Estimators[i] == "sorclass") { + matrixCalculator = new SorClass(); + }else if (Estimators[i] == "jest") { + matrixCalculator = new Jest(); + }else if (Estimators[i] == "sorest") { + matrixCalculator = new SorEst(); + }else if (Estimators[i] == "thetayc") { + matrixCalculator = new ThetaYC(); + }else if (Estimators[i] == "thetan") { + matrixCalculator = new ThetaN(); + }else if (Estimators[i] == "kstest") { + matrixCalculator = new KSTest(); + }else if (Estimators[i] == "sharednseqs") { + matrixCalculator = new SharedNSeqs(); + }else if (Estimators[i] == "ochiai") { + matrixCalculator = new Ochiai(); + }else if (Estimators[i] == "anderberg") { + matrixCalculator = new Anderberg(); + }else if (Estimators[i] == "kulczynski") { + matrixCalculator = new Kulczynski(); + }else if (Estimators[i] == "kulczynskicody") { + matrixCalculator = new KulczynskiCody(); + }else if (Estimators[i] == "lennon") { + matrixCalculator = new Lennon(); + }else if (Estimators[i] == "morisitahorn") { + matrixCalculator = new MorHorn(); + }else if (Estimators[i] == "braycurtis") { + matrixCalculator = new BrayCurtis(); + }else if (Estimators[i] == "whittaker") { + matrixCalculator = new Whittaker(); + }else if (Estimators[i] == "odum") { + matrixCalculator = new Odum(); + }else if (Estimators[i] == "canberra") { + matrixCalculator = new Canberra(); + }else if (Estimators[i] == "structeuclidean") { + matrixCalculator = new StructEuclidean(); + }else if (Estimators[i] == "structchord") { + matrixCalculator = new StructChord(); + }else if (Estimators[i] == "hellinger") { + matrixCalculator = new Hellinger(); + }else if (Estimators[i] == "manhattan") { + matrixCalculator = new Manhattan(); + }else if (Estimators[i] == "structpearson") { + matrixCalculator = new StructPearson(); + }else if (Estimators[i] == "soergel") { + matrixCalculator = new Soergel(); + }else if (Estimators[i] == "spearman") { + matrixCalculator = new Spearman(); + }else if (Estimators[i] == "structkulczynski") { + matrixCalculator = new StructKulczynski(); + }else if (Estimators[i] == "speciesprofile") { + matrixCalculator = new SpeciesProfile(); + }else if (Estimators[i] == "hamming") { + matrixCalculator = new Hamming(); + }else if (Estimators[i] == "structchi2") { + matrixCalculator = new StructChi2(); + }else if (Estimators[i] == "gower") { + matrixCalculator = new Gower(); + }else if (Estimators[i] == "memchi2") { + matrixCalculator = new MemChi2(); + }else if (Estimators[i] == "memchord") { + matrixCalculator = new MemChord(); + }else if (Estimators[i] == "memeuclidean") { + matrixCalculator = new MemEuclidean(); + }else if (Estimators[i] == "mempearson") { + matrixCalculator = new MemPearson(); + }else if (Estimators[i] == "jsd") { + matrixCalculator = new JSD(); + }else { + m->mothurOut("[ERROR]: " + Estimators[i] + " is not a valid calculator, please correct.\n"); m->control_pressed = true; return results; + } + } + + //calc distances + vector< vector< vector > > calcDistsTotals; //each iter, then each groupCombos dists. this will be used to make .dist files + vector< vector > calcDists; calcDists.resize(1); + + for (int thisIter = 0; thisIter < iters+1; thisIter++) { + + vector thisItersLookup = thisLookup; + + if (subsample && (thisIter != 0)) { + SubSample sample; + vector tempLabels; //dont need since we arent printing the sampled sharedRabunds + + //make copy of lookup so we don't get access violations + vector newLookup; + for (int k = 0; k < thisItersLookup.size(); k++) { + SharedRAbundVector* temp = new SharedRAbundVector(); + temp->setLabel(thisItersLookup[k]->getLabel()); + temp->setGroup(thisItersLookup[k]->getGroup()); + newLookup.push_back(temp); + } + + //for each bin + for (int k = 0; k < thisItersLookup[0]->getNumBins(); k++) { + if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return results; } + for (int j = 0; j < thisItersLookup.size(); j++) { newLookup[j]->push_back(thisItersLookup[j]->getAbundance(k), thisItersLookup[j]->getGroup()); } + } + + tempLabels = sample.getSample(newLookup, subsampleSize); + thisItersLookup = newLookup; + } + + + driver(thisItersLookup, calcDists, matrixCalculator); + + if (subsample && (thisIter != 0)) { + if((thisIter) % 100 == 0){ m->mothurOutJustToScreen(toString(thisIter)+"\n"); } + calcDistsTotals.push_back(calcDists); + for (int i = 0; i < calcDists.size(); i++) { + for (int j = 0; j < calcDists[i].size(); j++) { + if (m->debug) { m->mothurOut("[DEBUG]: Results: iter = " + toString(thisIter) + ", " + thisLookup[calcDists[i][j].seq1]->getGroup() + " - " + thisLookup[calcDists[i][j].seq2]->getGroup() + " distance = " + toString(calcDists[i][j].dist) + ".\n"); } + } + } + //clean up memory + for (int i = 0; i < thisItersLookup.size(); i++) { delete thisItersLookup[i]; } + thisItersLookup.clear(); + }else { //print results for whole dataset + for (int i = 0; i < calcDists.size(); i++) { + if (m->control_pressed) { break; } + + //initialize matrix + results.resize(thisLookup.size()); + for (int k = 0; k < thisLookup.size(); k++) { results[k].resize(thisLookup.size(), 0.0); } + + for (int j = 0; j < calcDists[i].size(); j++) { + int row = calcDists[i][j].seq1; + int column = calcDists[i][j].seq2; + double dist = calcDists[i][j].dist; + + results[row][column] = dist; + results[column][row] = dist; + } + } + } + for (int i = 0; i < calcDists.size(); i++) { calcDists[i].clear(); } + } + + if (iters != 0) { + //we need to find the average distance and standard deviation for each groups distance + vector< vector > calcAverages = m->getAverages(calcDistsTotals, "average"); + + //print results + for (int i = 0; i < calcDists.size(); i++) { + results.resize(thisLookup.size()); + for (int k = 0; k < thisLookup.size(); k++) { results[k].resize(thisLookup.size(), 0.0); } + + for (int j = 0; j < calcAverages[i].size(); j++) { + int row = calcAverages[i][j].seq1; + int column = calcAverages[i][j].seq2; + float dist = calcAverages[i][j].dist; + + results[row][column] = dist; + results[column][row] = dist; + } + } + } + + + return results; + } + catch(exception& e) { + m->errorOut(e, "GetMetaCommunityCommand", "generateDistanceMatrix"); + exit(1); + } +} +/**************************************************************************************************/ +int GetMetaCommunityCommand::driver(vector thisLookup, vector< vector >& calcDists, Calculator* matrixCalculator) { + try { + vector subset; + + for (int k = 0; k < thisLookup.size(); k++) { // pass cdd each set of groups to compare + + for (int l = 0; l < k; l++) { + + if (k != l) { //we dont need to similiarity of a groups to itself + subset.clear(); //clear out old pair of sharedrabunds + //add new pair of sharedrabunds + subset.push_back(thisLookup[k]); subset.push_back(thisLookup[l]); + + + + //if this calc needs all groups to calculate the pair load all groups + if (matrixCalculator->getNeedsAll()) { + //load subset with rest of lookup for those calcs that need everyone to calc for a pair + for (int w = 0; w < thisLookup.size(); w++) { + if ((w != k) && (w != l)) { subset.push_back(thisLookup[w]); } + } + } + + vector tempdata = matrixCalculator->getValues(subset); //saves the calculator outputs + + if (m->control_pressed) { return 1; } + + seqDist temp(l, k, tempdata[0]); + //cout << l << '\t' << k << '\t' << tempdata[0] << endl; + calcDists[0].push_back(temp); + } + + } + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "MatrixOutputCommand", "driver"); + exit(1); + } +} +//********************************************************************************************************************** diff --git a/getmetacommunitycommand.h b/getmetacommunitycommand.h index 7697430..9c17e4d 100644 --- a/getmetacommunitycommand.h +++ b/getmetacommunitycommand.h @@ -12,6 +12,49 @@ #include "command.hpp" #include "inputdata.h" #include "qFinderDMM.h" +#include "pam.h" +#include "sharedsobscollectsummary.h" +#include "sharedchao1.h" +#include "sharedace.h" +#include "sharednseqs.h" +#include "sharedjabund.h" +#include "sharedsorabund.h" +#include "sharedjclass.h" +#include "sharedsorclass.h" +#include "sharedjest.h" +#include "sharedsorest.h" +#include "sharedthetayc.h" +#include "sharedthetan.h" +#include "sharedkstest.h" +#include "whittaker.h" +#include "sharedochiai.h" +#include "sharedanderbergs.h" +#include "sharedkulczynski.h" +#include "sharedkulczynskicody.h" +#include "sharedlennon.h" +#include "sharedmorisitahorn.h" +#include "sharedbraycurtis.h" +#include "sharedjackknife.h" +#include "whittaker.h" +#include "odum.h" +#include "canberra.h" +#include "structeuclidean.h" +#include "structchord.h" +#include "hellinger.h" +#include "manhattan.h" +#include "structpearson.h" +#include "soergel.h" +#include "spearman.h" +#include "structkulczynski.h" +#include "structchi2.h" +#include "speciesprofile.h" +#include "hamming.h" +#include "gower.h" +#include "memchi2.h" +#include "memchord.h" +#include "memeuclidean.h" +#include "mempearson.h" +#include "sharedjsd.h" /**************************************************************************************************/ @@ -40,18 +83,20 @@ private: unsigned long long end; linePair(unsigned long long i, unsigned long long j) : start(i), end(j) {} }; - bool abort, allLines; + bool abort, allLines, subsample; string outputDir; vector outputNames; - string sharedfile; - int minpartitions, maxpartitions, optimizegap, processors; - vector Groups; + string sharedfile, method, calc; + int minpartitions, maxpartitions, optimizegap, processors, iters, subsampleSize; + vector Groups, Estimators; set labels; + vector > generateDistanceMatrix(vector& lookup); + int driver(vector thisLookup, vector< vector >& calcDists, Calculator*); int processDriver(vector&, vector&, string, vector, vector, vector, int); int createProcesses(vector&); vector generateDesignFile(int, map); - int generateSummaryFile(int, map); + int generateSummaryFile(int, map, vector); }; @@ -63,111 +108,5 @@ struct summaryData { vector partMean, partLCI, partUCI; }; -/**************************************************************************************************/ - -struct metaCommunityData { - vector thislookup; - MothurOut* m; - string outputFileName; - vector relabunds, matrix, outputNames; - int minpartitions, maxpartitions, optimizegap; - vector parts; - int minPartition; - - metaCommunityData(){} - metaCommunityData(vector lu, MothurOut* mout, vector dp, string fit, vector rels, vector mat, int minp, int maxp, int opg) { - m = mout; - thislookup = lu; - parts = dp; - outputFileName = fit; - relabunds = rels; - matrix = mat; - minpartitions = minp; - maxpartitions = maxp; - optimizegap = opg; - minPartition = 0; - } -}; -/**************************************************************************************************/ -#if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix) -#else -static DWORD WINAPI MyMetaCommunityThreadFunction(LPVOID lpParam){ - metaCommunityData* pDataArray; - pDataArray = (metaCommunityData*)lpParam; - - try { - - double minLaplace = 1e10; - - ofstream fitData; - pDataArray->m->openOutputFile(pDataArray->outputFileName, fitData); - fitData.setf(ios::fixed, ios::floatfield); - fitData.setf(ios::showpoint); - cout.setf(ios::fixed, ios::floatfield); - cout.setf(ios::showpoint); - - vector< vector > sharedMatrix; - for (int i = 0; i < pDataArray->thislookup.size(); i++) { sharedMatrix.push_back(pDataArray->thislookup[i]->getAbundances()); } - - pDataArray->m->mothurOut("K\tNLE\t\tlogDet\tBIC\t\tAIC\t\tLaplace\n"); - fitData << "K\tNLE\tlogDet\tBIC\tAIC\tLaplace" << endl; - - for(int i=0;iparts.size();i++){ - - int numPartitions = pDataArray->parts[i]; - - if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: running partition " + toString(numPartitions) + "\n"); } - - if (pDataArray->m->control_pressed) { break; } - - qFinderDMM* findQ = new qFinderDMM(sharedMatrix, numPartitions); - - if (pDataArray->m->debug) { pDataArray->m->mothurOut("[DEBUG]: done finding Q " + toString(numPartitions) + "\n"); } - - double laplace = findQ->getLaplace(); - pDataArray->m->mothurOut(toString(numPartitions) + '\t'); - cout << setprecision (2) << findQ->getNLL() << '\t' << findQ->getLogDet() << '\t'; - pDataArray->m->mothurOutJustToLog(toString(findQ->getNLL()) + '\t' + toString(findQ->getLogDet()) + '\t'); - cout << findQ->getBIC() << '\t' << findQ->getAIC() << '\t' << laplace; - pDataArray->m->mothurOutJustToLog(toString(findQ->getBIC()) + '\t' + toString(findQ->getAIC()) + '\t' + toString(laplace)); - - fitData << numPartitions << '\t'; - fitData << setprecision (2) << findQ->getNLL() << '\t' << findQ->getLogDet() << '\t'; - fitData << findQ->getBIC() << '\t' << findQ->getAIC() << '\t' << laplace << endl; - - if(laplace < minLaplace){ - pDataArray->minPartition = numPartitions; - minLaplace = laplace; - pDataArray->m->mothurOut("***"); - } - pDataArray->m->mothurOutEndLine(); - - pDataArray->outputNames.push_back(pDataArray->relabunds[i]); - pDataArray->outputNames.push_back(pDataArray->matrix[i]); - - findQ->printZMatrix(pDataArray->matrix[i], pDataArray->m->getGroups()); - findQ->printRelAbund(pDataArray->relabunds[i], pDataArray->m->currentSharedBinLabels); - - if(pDataArray->optimizegap != -1 && (numPartitions - pDataArray->minPartition) >= pDataArray->optimizegap && numPartitions >= pDataArray->minpartitions){ break; } - - delete findQ; - } - fitData.close(); - - //minPartition = 4; - - if (pDataArray->m->control_pressed) { return 0; } - - return 0; - - } - catch(exception& e) { - pDataArray->m->errorOut(e, "GetMetaCommunityCommand", "MyMetaCommunityThreadFunction"); - exit(1); - } -} -#endif - - #endif diff --git a/getoturepcommand.cpp b/getoturepcommand.cpp index 6109237..c645493 100644 --- a/getoturepcommand.cpp +++ b/getoturepcommand.cpp @@ -760,6 +760,8 @@ string GetOTURepCommand::findRepAbund(vector names, string group) { try{ vector reps; string rep = "notFound"; + + if (m->debug) { m->mothurOut("[DEBUG]: group=" + group + " names.size() = " + toString(names.size()) + " " + names[0] + "\n"); } if ((names.size() == 1)) { return names[0]; @@ -773,7 +775,7 @@ string GetOTURepCommand::findRepAbund(vector names, string group) { if (countfile != "") { //if countfile is not blank then we can assume the list file contains only uniques, otherwise we assume list file contains everyone. int numRep = 0; if (group != "") { numRep = ct.getGroupCount(names[i], group); } - else { numRep = ct.getGroupCount(names[i]); } + else { numRep = ct.getNumSeqs(names[i]); } if (numRep > maxAbund) { reps.clear(); reps.push_back(names[i]); @@ -834,7 +836,7 @@ string GetOTURepCommand::findRep(vector names, string group) { if (countfile != "") { //if countfile is not blank then we can assume the list file contains only uniques, otherwise we assume list file contains everyone. int numRep = 0; if (group != "") { numRep = ct.getGroupCount(names[i], group); } - else { numRep = ct.getGroupCount(names[i]); } + else { numRep = ct.getNumSeqs(names[i]); } for (int j = 1; j < numRep; j++) { //don't add yourself again seqIndex.push_back(nameToIndex[names[i]]); } diff --git a/kmeans.cpp b/kmeans.cpp new file mode 100644 index 0000000..dab5c7d --- /dev/null +++ b/kmeans.cpp @@ -0,0 +1,237 @@ +// +// kmeans.cpp +// Mothur +// +// Created by SarahsWork on 12/4/13. +// Copyright (c) 2013 Schloss Lab. All rights reserved. +// + +#include "kmeans.h" + +/**************************************************************************************************/ + +KMeans::KMeans(vector > cm, int p) : CommunityTypeFinder() { + try { + countMatrix = cm; + numSamples = (int)countMatrix.size(); + numOTUs = (int)countMatrix[0].size(); + numPartitions = p; + + findkMeans(); + } + catch(exception& e) { + m->errorOut(e, "KMeans", "KMeans"); + exit(1); + } +} +/**************************************************************************************************/ +//The silhouette width S(i)of individual data points i is calculated using the following formula: +/* + s(i) = b(i) - a(i) + ----------- + max(b(i),a(i)) + where a(i) is the average dissimilarity (or distance) of sample i to all other samples in the same cluster, while b(i) is the average dissimilarity (or distance) to all objects in the closest other cluster. + + The formula implies -1 =< S(i) =< 1 . A sample which is much closer to its own cluster than to any other cluster has a high S(i) value, while S(i) close to 0 implies that the given sample lies somewhere between two clusters. Large negative S(i) values indicate that the sample was assigned to the wrong cluster. + */ + +vector KMeans::calcSilhouettes(vector > dists) { + try { + vector silhouettes; silhouettes.resize(numSamples, 0.0); + if (numPartitions < 2) { return silhouettes; } + + map clusterMap; //map sample to partition + for (int j = 0; j < numSamples; j++) { + double maxValue = 0.0; + for (int i = 0; i < numPartitions; i++) { + if (m->control_pressed) { return silhouettes; } + if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be + clusterMap[j] = i; + maxValue = zMatrix[i][j]; + } + } + } + + vector nextClosestPartition; + findSecondClosest(nextClosestPartition, dists, clusterMap); + + if (m->control_pressed) { return silhouettes; } + + vector a, b; a.resize(numSamples, 0.0); b.resize(numSamples, 0.0); + + //calc a - all a[i] are the same in the same partition + for (int k = 0; k < numPartitions; k++) { + if (m->control_pressed) { break; } + + int count = 0; + double totalA = 0.0; + for (int i = 0; i < numSamples; i++) { + for (int j = 0; j < numSamples; j++) { + if (m->control_pressed) { break; } + if ((clusterMap[i] == k) && (clusterMap[j] == k)){ //are both samples in the partition, if so add there distance + totalA += dists[j][i]; //distance from this sample to the other samples in the partition + count++; + } + } + } + totalA /= (double) count; + + //set a[i] to average for cluster + for (int i = 0; i < numSamples; i++) { + if (clusterMap[i] == k) { a[i] = totalA; } + } + } + + //calc b + for (int i = 0; i < numSamples; i++) { + if (m->control_pressed) { break; } + + int thisPartition = nextClosestPartition[i]; + int count = 0; + double totalB = 0.0; + for (int j = 0; j < numSamples; j++) { + if (clusterMap[j] == thisPartition) { //this sample is in this partition + totalB += dists[i][j]; + count++; + } + } + b[i] = totalB / (double) count; + } + + //calc silhouettes + for (int i = 0; i < numSamples; i++) { + if (m->control_pressed) { break; } + + double denom = a[i]; + if (b[i] > denom) { denom = b[i]; } //max(a[i],b[i]) + + silhouettes[i] = (b[i] - a[i]) / denom; + + //cout << "silhouettes " << i << '\t' << silhouettes[i] << endl; + } + + return silhouettes; + } + catch(exception& e) { + m->errorOut(e, "KMeans", "calcSilhouettes"); + exit(1); + } +} +/**************************************************************************************************/ +int KMeans::findSecondClosest(vector& nextClosestPartition, vector >& dists, map clusterMap) { + try { + vector minScores; minScores.resize(numSamples, 1e6); + nextClosestPartition.resize(numSamples, 0); + + + for (int i = 0; i < numSamples; i++) { + for (int j = 0; j < numPartitions; j++) { + if (m->control_pressed) { break; } + + //is this the one we are assigned to - ie the "best" cluster. We want second best. + //if numPartitions = 2, then special case?? + if (clusterMap[i] != j) { + double score = 1e6; + if (numPartitions == 2) { + score = 0.0; //choose other option, there are only 2. + }else{ score = calcScore(i, j, dists, clusterMap); } + + if (m->debug) { m->mothurOut("[DEBUG]: sample = " + toString(i) + " partition = " + toString(j) + " score = " + toString(score) + "\n"); } + + //is this better than our last find + if (score < minScores[i]) { + minScores[i] = score; + nextClosestPartition[i] = j; + } + }else {} //best case, ignore + } + } + + return 0; + } + catch(exception& e) { + m->errorOut(e, "KMeans", "findSecondClosest"); + exit(1); + } +} +/**************************************************************************************************/ +double KMeans::calcScore(int sample, int partition, vector >& dists, map clusterMap) { + try { + //square the distances and then for each pair of clusters, calculate the sum of the squraed distances between the clusters + //then with the sum of hte squared dsitances take the square root and divide by the number of distances in the sum + + double sum = 0.0; int count = 0; + for (int i = 0; i < numSamples; i++) { + if (m->control_pressed) { break; } + if (clusterMap[i] == partition) { //samples in this cluster + sum += (dists[sample][i] * dists[sample][i]); + count++; + } + } + + sum = sqrt(sum); + sum /= (double) count; + + return sum; + } + catch(exception& e) { + m->errorOut(e, "KMeans", "calcScore"); + exit(1); + } +} +/**************************************************************************************************/ +/*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as: + + CHk=Bk/(k−1)/Wk/(n−k) + + where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/ +double KMeans::calcCHIndex(vector< vector< double> > dists){ + try { + double CH = 0.0; + + if (numPartitions < 2) { return CH; } + + map clusterMap; //map sample to partition + for (int j = 0; j < numSamples; j++) { + double maxValue = 0.0; + for (int i = 0; i < numPartitions; i++) { + if (m->control_pressed) { return 0.0; } + if (zMatrix[i][j] > maxValue) { //for kmeans zmatrix contains values for each sample in each partition. partition with highest value for that sample is the partition where the sample should be + clusterMap[j] = i; + maxValue = zMatrix[i][j]; + } + } + } + + double sumBetweenCluster = 0.0; + double sumWithinClusters = 0.0; + + for (int i = 0; i < numSamples; i++) { //lt + for (int j = 0; j < i; j++) { + if (m->control_pressed) { return 0.0; } + int partitionI = clusterMap[i]; + int partitionJ = clusterMap[j]; + + if (partitionI == partitionJ) { //they are from the same cluster so this distance is added to Wk + sumWithinClusters += (dists[i][j] * dists[i][j]); + }else { //they are NOT from the same cluster so this distance is added to Bk + sumBetweenCluster += (dists[i][j] * dists[i][j]); + } + } + } + //cout << numPartitions << '\t' << sumWithinClusters << '\t' << sumBetweenCluster << '\t' << (numSamples - numPartitions) << endl; + + CH = (sumBetweenCluster / (double)(numPartitions - 1)) / (sumWithinClusters / (double) (numSamples - numPartitions)); + + return CH; + } + catch(exception& e){ + m->errorOut(e, "KMeans", "calcCHIndex"); + exit(1); + } +} +/**************************************************************************************************/ + + + + diff --git a/kmeans.h b/kmeans.h new file mode 100644 index 0000000..97dfb68 --- /dev/null +++ b/kmeans.h @@ -0,0 +1,32 @@ +// +// kmeans.h +// Mothur +// +// Created by SarahsWork on 12/4/13. +// Copyright (c) 2013 Schloss Lab. All rights reserved. +// + +#ifndef Mothur_kmeans_h +#define Mothur_kmeans_h + +#include "communitytype.h" + +/**************************************************************************************************/ + +class KMeans : public CommunityTypeFinder { + +public: + KMeans(vector >, int); + vector calcSilhouettes(vector< vector< double> >); + double calcCHIndex(vector< vector< double> >); + +private: + + int findSecondClosest(vector&, vector >&, map); + double calcScore(int sample, int partition, vector >&, map); + +}; + +/**************************************************************************************************/ + +#endif diff --git a/linearalgebra.cpp b/linearalgebra.cpp index b208aab..bacf2c1 100644 --- a/linearalgebra.cpp +++ b/linearalgebra.cpp @@ -114,7 +114,7 @@ double LinearAlgebra::gammp(const double a, const double x) { } /*********************************************************************************************************************************/ //Numerical Recipes pg. 223 -double LinearAlgebra::gammq(const double a, const double x) { +/*double LinearAlgebra::gammq(const double a, const double x) { try { double gamser,gammcf,gln; @@ -130,11 +130,11 @@ double LinearAlgebra::gammq(const double a, const double x) { return 0; } catch(exception& e) { - m->errorOut(e, "LinearAlgebra", "gammp"); + m->errorOut(e, "LinearAlgebra", "gammq"); exit(1); } } -/*********************************************************************************************************************************/ +*********************************************************************************************************************************/ //Numerical Recipes pg. 224 double LinearAlgebra::gcf(double& gammcf, const double a, const double x, double& gln){ try { @@ -161,7 +161,7 @@ double LinearAlgebra::gcf(double& gammcf, const double a, const double x, double h *= del; if (fabs(del-1.0) <= EPS) break; } - if (i > ITMAX) { m->mothurOut("[ERROR]: a too large, ITMAX too small in gcf\n"); m->control_pressed = true; } + if (i > ITMAX) { m->mothurOut("[ERROR]: " + toString(a) + " too large, ITMAX=100 too small in gcf\n"); m->control_pressed = true; } gammcf=exp(-x+a*log(x)-gln)*h; return 0.0; @@ -1302,7 +1302,7 @@ double LinearAlgebra::calcKruskalWallis(vector& values, double& pV double lastTerm = 3 * (values.size()+1); H = firstTerm * middleTerm - lastTerm; - + //adjust for ties if (TIES.size() != 0) { double sum = 0.0; @@ -1311,6 +1311,8 @@ double LinearAlgebra::calcKruskalWallis(vector& values, double& pV H /= result; } + if (isnan(H) || isinf(H)) { H = 0; } + //Numerical Recipes pg221 pValue = 1.0 - (gammp(((treatments.size()-1)/(double)2.0), H/2.0)); diff --git a/linearalgebra.h b/linearalgebra.h index 7453f4e..e4c8c15 100644 --- a/linearalgebra.h +++ b/linearalgebra.h @@ -57,7 +57,7 @@ private: double betacf(const double, const double, const double); double betai(const double, const double, const double); double gammln(const double); - double gammq(const double, const double); + //double gammq(const double, const double); double gser(double&, const double, const double, double&); double gcf(double&, const double, const double, double&); double erfcc(double); diff --git a/matrixoutputcommand.cpp b/matrixoutputcommand.cpp index a53a3dc..8a7656c 100644 --- a/matrixoutputcommand.cpp +++ b/matrixoutputcommand.cpp @@ -17,7 +17,7 @@ vector MatrixOutputCommand::setParameters(){ CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel); CommandParameter psubsample("subsample", "String", "", "", "", "", "","",false,false); parameters.push_back(psubsample); CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups); - CommandParameter pcalc("calc", "Multiple", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-whittaker-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-hamming-structchi2-gower-memchi2-memchord-memeuclidean-mempearson", "jclass-thetayc", "", "", "","",true,false,true); parameters.push_back(pcalc); + CommandParameter pcalc("calc", "Multiple", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-whittaker-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-hamming-structchi2-gower-memchi2-memchord-memeuclidean-mempearson-jsd", "jclass-thetayc", "", "", "","",true,false,true); parameters.push_back(pcalc); CommandParameter poutput("output", "Multiple", "lt-square-column", "lt", "", "", "","",false,false); parameters.push_back(poutput); CommandParameter pmode("mode", "Multiple", "average-median", "average", "", "", "","",false,false); parameters.push_back(pmode); CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); @@ -281,6 +281,9 @@ MatrixOutputCommand::MatrixOutputCommand(string option) { matrixCalculators.push_back(new MemEuclidean()); }else if (Estimators[i] == "mempearson") { matrixCalculators.push_back(new MemPearson()); + }else if (Estimators[i] == "jsd") { + matrixCalculators.push_back(new JSD()); + } } } @@ -791,7 +794,7 @@ int MatrixOutputCommand::driver(vector thisLookup, int star vector tempdata = matrixCalculators[i]->getValues(subset); //saves the calculator outputs if (m->control_pressed) { return 1; } - + seqDist temp(l, k, tempdata[0]); calcDists[i].push_back(temp); } diff --git a/matrixoutputcommand.h b/matrixoutputcommand.h index 90f1206..5ddfb2c 100644 --- a/matrixoutputcommand.h +++ b/matrixoutputcommand.h @@ -54,6 +54,7 @@ #include "memchord.h" #include "memeuclidean.h" #include "mempearson.h" +#include "sharedjsd.h" // aka. dist.shared() @@ -224,7 +225,10 @@ static DWORD WINAPI MyDistSharedThreadFunction(LPVOID lpParam){ matrixCalculators.push_back(new MemEuclidean()); }else if (pDataArray->Estimators[i] == "mempearson") { matrixCalculators.push_back(new MemPearson()); + }else if (pDataArray->Estimators[i] == "jsd") { + matrixCalculators.push_back(new JSD()); } + } } diff --git a/metastatscommand.cpp b/metastatscommand.cpp index 33b559f..de9e373 100644 --- a/metastatscommand.cpp +++ b/metastatscommand.cpp @@ -9,6 +9,7 @@ #include "metastatscommand.h" #include "sharedutilities.h" +#include "sharedrabundfloatvector.h" //********************************************************************************************************************** @@ -213,7 +214,7 @@ MetaStatsCommand::MetaStatsCommand(string option) { int MetaStatsCommand::execute(){ try { - if (abort == true) { if (calledHelp) { return 0; } return 2; } + if (abort == true) { if (calledHelp) { return 0; } return 2; } //just used to convert files to test metastats online /****************************************************/ @@ -221,7 +222,6 @@ int MetaStatsCommand::execute(){ convertSharedToInput = false; if (convertInputToShared) { convertToShared(sharedfile); return 0; } /****************************************************/ - designMap = new GroupMap(designfile); designMap->readDesignMap(); @@ -574,9 +574,11 @@ int MetaStatsCommand::convertToShared(string filename) { string header = m->getline(in); m->gobble(in); vector groups = m->splitWhiteSpace(header); - vector newLookup; + vector newLookup; + cout << groups.size() << endl; for (int i = 0; i < groups.size(); i++) { - SharedRAbundVector* temp = new SharedRAbundVector(); + cout << "creating group " << groups[i] << endl; + SharedRAbundFloatVector* temp = new SharedRAbundFloatVector(); temp->setLabel("0.03"); temp->setGroup(groups[i]); newLookup.push_back(temp); @@ -589,9 +591,9 @@ int MetaStatsCommand::convertToShared(string filename) { string otuname; in >> otuname; m->gobble(in); otuCount++; - + cout << otuname << endl; for (int i = 0; i < groups.size(); i++) { - int temp; + double temp; in >> temp; m->gobble(in); newLookup[i]->push_back(temp, groups[i]); } @@ -654,6 +656,8 @@ int MetaStatsCommand::convertToInput(vector& subset, string } out.close(); + cout << thisfilename+".matrix" << endl; + return 0; } catch(exception& e) { diff --git a/pam.cpp b/pam.cpp new file mode 100644 index 0000000..2c70f58 --- /dev/null +++ b/pam.cpp @@ -0,0 +1,373 @@ +// +// pam.cpp +// Mothur +// +// Created by SarahsWork on 12/10/13. +// Copyright (c) 2013 Schloss Lab. All rights reserved. +// + +#include "pam.h" +#define DBL_EPSILON 1e-9 + +/**************************************************************************************************/ +Pam::Pam(vector > c, vector > d, int p) : CommunityTypeFinder() { + try { + countMatrix = c; + numSamples = (int)d.size(); + numOTUs = (int)c[0].size(); + numPartitions = p; + dists = d; + + largestDist = 0; + for (int i = 0; i < dists.size(); i++) { + for (int j = i; j < dists.size(); j++) { + if (m->control_pressed) { break; } + if (dists[i][j] > largestDist) { largestDist = dists[i][j]; } + } + } + + buildPhase(); //choosing the medoids + swapPhase(); //optimize clusters + } + catch(exception& e) { + m->errorOut(e, "Pam", "Pam"); + exit(1); + } +} +/**************************************************************************************************/ +//sets Dp[0] does not set Dp[1]. chooses intial medoids. +int Pam::buildPhase() { + try { + + if (m->debug) { m->mothurOut("[DEBUG]: building medoids\n"); } + + vector gains; gains.resize(numSamples); + + largestDist *= 1.1 + 1; //make this distance larger than any distance in the matrix + Dp.resize(numSamples); + for (int i = 0; i < numSamples; i++) { Dp[i].push_back(largestDist); Dp[i].push_back(largestDist); } //2 smallest dists for this sample in this partition + + zMatrix.resize(numPartitions); + for(int i=0;icontrol_pressed) { break; } + + if (medoids.count(i) == 0) { //is this sample is NOT a medoid? + gains[i] = 0.0; + + for (int j = 0; j < numSamples; j++) { + //cout << i << '\t' << j << '\t' << Dp[i][0] << '\t' << dists[i][j] << '\t' << totalGain << endl; + totalGain = Dp[i][0] - dists[i][j]; + if (totalGain > 0.0) { gains[i] += totalGain; } + } + if (m->debug) { m->mothurOut("[DEBUG]: " + toString(i) + " totalGain = " + toString(totalGain) + "\n"); } + + if (clusterGain <= gains[i]) { + clusterGain = gains[i]; + medoid = i; + } + } + } + + //save medoid value + medoids.insert(medoid); + + if (m->debug) { m->mothurOut("[DEBUG]: new medoid " + toString(medoid) + "\n"); } + + //update dp values + for (int i = 0; i < numSamples; i++) { + if (Dp[i][0] > dists[i][medoid]) { Dp[i][0] = dists[i][medoid]; } + } + } + if (m->debug) { m->mothurOut("[DEBUG]: done building medoids\n"); } + return 0; + } + catch(exception& e) { + m->errorOut(e, "Pam", "buildPhase"); + exit(1); + } +} +/**************************************************************************************************/ +//goal to swap medoids with non-medoids to see if we can reduce the overall cost +int Pam::swapPhase() { + try { + if (m->debug) { m->mothurOut("[DEBUG]: swapping medoids\n"); } + //calculate cost of initial choice - average distance of samples to their closest medoid + double sky = 0.0; + double dzsky = 1.0; + for (int i = 0; i < numSamples; i++) { sky += Dp[i][0]; } sky /= (double) numSamples; + + bool done = false; + int hbest, nbest; hbest = -1; nbest = -1; + while (!done) { + if (m->control_pressed) { break; } + + updateDp(); + + dzsky = 1; + + for (int h = 0; h < numSamples; h++) { + if (m->control_pressed) { break; } + if (medoids.count(h) == 0) { //this is NOT a medoid + for (int i = 0; i < numSamples; i++) { + if (medoids.count(i) != 0) { //this is a medoid + + double dz = 0.0; //Tih sum of distances between objects and closest medoid caused by swapping i and h. Basically the change in cost. If this < 0 its a "good" swap. When all Tih are > 0, then we stop the algo, because we have the optimal medoids. + for (int j = 0; j < numSamples; j++) { + if (m->control_pressed) { break; } + if (dists[i][j] == Dp[j][0]) { + double small = 0.0; + if (Dp[j][1] > dists[h][j]) { small = dists[h][j]; } + else { small = Dp[j][1]; } + dz += (small - Dp[j][0]); + }else if (dists[h][j] < Dp[j][0]) { + dz += (dists[h][j] - Dp[j][0]); + } + } + if (dzsky > dz) { + dzsky = dz; + hbest = h; + nbest = i; + } + }//end if medoid + }//end for i + }//end if NOT medoid + }//end if h + + if (dzsky < -16 *DBL_EPSILON * fabs(sky)) { + medoids.insert(hbest); + medoids.erase(nbest); + if (m->debug) { m->mothurOut("[DEBUG]: swapping " + toString(hbest) + " " + toString(nbest) + "\n"); } + sky += dzsky; + }else { done = true; } //stop algo. + } + + + //fill zmatrix + int count = 0; + vector tempMedoids; + for (set::iterator it = medoids.begin(); it != medoids.end(); it++) { + medoid2Partition[*it] = count; + zMatrix[count][*it] = 1; count++; //set medoid in this partition. + tempMedoids.push_back(*it); + } + + //which partition do you belong to? + laplace = 0; + for (int i = 0; i < numSamples; i++) { + int partition = 0; + double dist = dists[i][tempMedoids[0]]; //assign to first medoid + for (int j = 1; j < tempMedoids.size(); j++) { + if (dists[i][tempMedoids[j]] < dist) { //is this medoid closer? + dist = dists[i][tempMedoids[j]]; + partition = j; + } + } + zMatrix[partition][i] = 1; + laplace += dist; + } + laplace /= (double) numSamples; + + if (m->debug) { + for(int i=0;imothurOut("[DEBUG]: partition 1: "); + for (int j = 0; j < numSamples; j++) { + m->mothurOut(toString(zMatrix[i][j]) + " "); + } + m->mothurOut("\n"); + } + m->mothurOut("[DEBUG]: medoids : "); + for (set::iterator it = medoids.begin(); it != medoids.end(); it++) { m->mothurOut(toString(*it) + " "); + } + m->mothurOut("\n"); + + m->mothurOut("[DEBUG]: laplace : " + toString(laplace)); m->mothurOut("\n"); + } + + if (m->debug) { m->mothurOut("[DEBUG]: done swapping medoids\n"); } + return 0; + } + catch(exception& e) { + m->errorOut(e, "Pam", "swapPhase"); + exit(1); + } +} + +/**************************************************************************************************/ +int Pam::updateDp() { + try { + for (int j = 0; j < numSamples; j++) { + if (m->control_pressed) { break; } + + //initialize dp and ep + Dp[j][0] = largestDist; Dp[j][1] = largestDist; + + for (int i = 0; i < numSamples; i++) { + if (medoids.count(i) != 0) { //is this a medoid? + if (Dp[j][0] > dists[j][i]) { + Dp[j][0] = Dp[j][1]; + Dp[j][0] = dists[j][i]; + }else if (Dp[j][1] > dists[j][i]) { + Dp[j][1] = dists[j][i]; + } + } + } + } + return 0; + } + catch(exception& e) { + m->errorOut(e, "Pam", "updateDp"); + exit(1); + } +} +/**************************************************************************************************/ +//The silhouette width S(i)of individual data points i is calculated using the following formula: +/* + s(i) = b(i) - a(i) + ----------- + max(b(i),a(i)) + where a(i) is the average dissimilarity (or distance) of sample i to all other samples in the same cluster, while b(i) is the average dissimilarity (or distance) to all objects in the closest other cluster. + + The formula implies -1 =< S(i) =< 1 . A sample which is much closer to its own cluster than to any other cluster has a high S(i) value, while S(i) close to 0 implies that the given sample lies somewhere between two clusters. Large negative S(i) values indicate that the sample was assigned to the wrong cluster. + */ + +vector Pam::calcSilhouettes(vector > dists) { + try { + vector silhouettes; silhouettes.resize(numSamples, 0.0); + if (numPartitions < 2) { return silhouettes; } + + vector whoDp; + for (int i = 0; i < numSamples; i++) { // assumes at least 2 partitions + vector tempMeds; + for (set::iterator it = medoids.begin(); it != medoids.end(); it++) { + if (m->control_pressed) { break; } + seqDist temp(*it, *it, dists[i][*it]); //medoid, medoid, distance from medoid to sample + tempMeds.push_back(temp); + } + sort(tempMeds.begin(), tempMeds.end(), compareSequenceDistance); //sort by distance + whoDp.push_back(tempMeds[1].seq1); //second closest medoid + } + + + vector a, b; a.resize(numSamples, 0.0); b.resize(numSamples, 0.0); + + //calc a - all a[i] are the same in the same partition + for (int k = 0; k < numPartitions; k++) { + if (m->control_pressed) { break; } + + int count = 0; + double totalA = 0.0; + for (int i = 0; i < numSamples; i++) { + for (int j = 0; j < numSamples; j++) { + if (m->control_pressed) { break; } + if ((zMatrix[k][i] != 0) && (zMatrix[k][j] != 0)){ //are both samples in the partition, if so add there distance + totalA += dists[j][i]; //distance from this sample to the other samples in the partition + count++; + } + } + } + totalA /= (double) count; + + //set a[i] to average for cluster + for (int i = 0; i < numSamples; i++) { + if (zMatrix[k][i] != 0) { a[i] = totalA; } + } + } + + //calc b + for (int i = 0; i < numSamples; i++) { + if (m->control_pressed) { break; } + + int nextClosestMedoid = whoDp[i]; + int thisPartition = medoid2Partition[nextClosestMedoid]; + int count = 0; + double totalB = 0.0; + for (int j = 0; j < numSamples; j++) { + if (zMatrix[thisPartition][j] != 0) { //this sample is in this partition + totalB += dists[i][j]; + count++; + } + } + b[i] = totalB / (double) count; + } + + //calc silhouettes + for (int i = 0; i < numSamples; i++) { + if (m->control_pressed) { break; } + + double denom = a[i]; + if (b[i] > denom) { denom = b[i]; } //max(a[i],b[i]) + + silhouettes[i] = (b[i] - a[i]) / denom; + + //cout << "silhouettes " << i << '\t' << silhouettes[i] << endl; + } + + return silhouettes; + } + catch(exception& e) { + m->errorOut(e, "Pam", "calcSilhouettes"); + exit(1); + } +} +/**************************************************************************************************/ +/*To assess the optimal number of clusters our dataset was most robustly partitioned into, we used the Calinski-Harabasz (CH) Index that has shown good performance in recovering the number of clusters. It is defined as: + + CHk=Bk/(k−1)/Wk/(n−k) + + where Bk is the between-cluster sum of squares (i.e. the squared distances between all points i and j, for which i and j are not in the same cluster) and Wk is the within-clusters sum of squares (i.e. the squared distances between all points i and j, for which i and j are in the same cluster). This measure implements the idea that the clustering is more robust when between-cluster distances are substantially larger than within-cluster distances. Consequently, we chose the number of clusters k such that CHk was maximal.*/ +double Pam::calcCHIndex(vector< vector > dists){ //countMatrix = [numSamples][numOtus] + try { + double CH = 0.0; + + if (numPartitions < 2) { return CH; } + + map clusterMap; //map sample to partition + for (int i = 0; i < numPartitions; i++) { + for (int j = 0; j < numSamples; j++) { + if (m->control_pressed) { return 0.0; } + if (zMatrix[i][j] != 0) { clusterMap[j] = i; } + } + } + + double sumBetweenCluster = 0.0; + double sumWithinClusters = 0.0; + + for (int i = 0; i < numSamples; i++) { //lt + for (int j = 0; j < i; j++) { + if (m->control_pressed) { return 0.0; } + int partitionI = clusterMap[i]; + int partitionJ = clusterMap[j]; + + if (partitionI == partitionJ) { //they are from the same cluster so this distance is added to Wk + sumWithinClusters += (dists[i][j] * dists[i][j]); + }else { //they are NOT from the same cluster so this distance is added to Bk + sumBetweenCluster += (dists[i][j] * dists[i][j]); + } + } + } + //cout << numPartitions << '\t' << sumWithinClusters << '\t' << sumBetweenCluster << '\t' << (numSamples - numPartitions) << endl; + + CH = (sumBetweenCluster / (double)(numPartitions - 1)) / (sumWithinClusters / (double) (numSamples - numPartitions)); + + return CH; + } + catch(exception& e){ + m->errorOut(e, "Pam", "calcCHIndex"); + exit(1); + } +} + +/**************************************************************************************************/ + + + diff --git a/pam.h b/pam.h new file mode 100644 index 0000000..9930176 --- /dev/null +++ b/pam.h @@ -0,0 +1,40 @@ +// +// pam.h +// Mothur +// +// Created by SarahsWork on 12/10/13. +// Copyright (c) 2013 Schloss Lab. All rights reserved. +// + +#ifndef Mothur_pam_h +#define Mothur_pam_h + +#include "communitytype.h" + +/**************************************************************************************************/ + +class Pam : public CommunityTypeFinder { + +public: + Pam(vector >, vector >, int); + vector calcSilhouettes(vector< vector< double> >); + double calcCHIndex(vector< vector< double> >); + +private: + set medoids; + map medoid2Partition; + double largestDist; + vector > dists; + vector > Dp; // [numSamples][2] - It contains Dp and Ep. Dp is in [numSamples][0] and Ep is in [numSamples][1]. Dp is the distance between p and the closest sample in S and Ep is the distance between p and the second closest object in S. Both are used in the build and swap phases. + + int buildPhase(); + int swapPhase(); + int updateDp(); + + + +/**************************************************************************************************/ +}; + + +#endif diff --git a/qFinderDMM.cpp b/qFinderDMM.cpp index c89644b..7ea5611 100644 --- a/qFinderDMM.cpp +++ b/qFinderDMM.cpp @@ -7,26 +7,27 @@ // #include "qFinderDMM.h" -#include "linearalgebra.h" -#define EPSILON numeric_limits::epsilon() + /**************************************************************************************************/ -qFinderDMM::qFinderDMM(vector > cm, int p): countMatrix(cm), numPartitions(p){ +qFinderDMM::qFinderDMM(vector > cm, int p) : CommunityTypeFinder() { try { - m = MothurOut::getInstance(); + //cout << "here" << endl; + numPartitions = p; + countMatrix = cm; numSamples = (int)countMatrix.size(); numOTUs = (int)countMatrix[0].size(); - - kMeans(); - // cout << "done kMeans" << endl; + //cout << "before kmeans" < > cm, int p): countMatrix(cm), numPart while(change > 1.0e-6 && iter < 100){ - // cout << "Calc_Z: "; + //cout << "Calc_Z: "; calculatePiK(); optimizeLambda(); - // printf("Iter:%d\t",iter); + //printf("Iter:%d\t",iter); for(int i=0;i > cm, int p): countMatrix(cm), numPart currNLL = nLL; - // printf("NLL=%.5f\tDelta=%.4e\n",currNLL, change); + // printf("NLL=%.5f\tDelta=%.4e\n",currNLL, change); iter++; } @@ -95,84 +96,35 @@ qFinderDMM::qFinderDMM(vector > cm, int p): countMatrix(cm), numPart exit(1); } } - /**************************************************************************************************/ - -void qFinderDMM::printZMatrix(string fileName, vector sampleName){ +void qFinderDMM::printFitData(ofstream& out){ try { - ofstream printMatrix; - m->openOutputFile(fileName, printMatrix); //(fileName.c_str()); - printMatrix.setf(ios::fixed, ios::floatfield); - printMatrix.setf(ios::showpoint); - - for(int i=0;ierrorOut(e, "CommunityTypeFinder", "printFitData"); + exit(1); } - catch(exception& e) { - m->errorOut(e, "qFinderDMM", "printZMatrix"); - exit(1); - } } - /**************************************************************************************************/ - -void qFinderDMM::printRelAbund(string fileName, vector otuNames){ +void qFinderDMM::printFitData(ostream& out, double minLaplace){ try { - ofstream printRA; - m->openOutputFile(fileName, printRA); //(fileName.c_str()); - printRA.setf(ios::fixed, ios::floatfield); - printRA.setf(ios::showpoint); - - vector totals(numPartitions, 0.0000); - for(int i=0;icontrol_pressed) { break; } - - printRA << otuNames[i]; - for(int j=0;j= 0.0000){ - double std = sqrt(error[j][i]); - printRA << '\t' << 100 * exp(lambdaMatrix[j][i]) / totals[j]; - printRA << '\t' << 100 * exp(lambdaMatrix[j][i] - 2.0 * std) / totals[j]; - printRA << '\t' << 100 * exp(lambdaMatrix[j][i] + 2.0 * std) / totals[j]; - } - else{ - printRA << '\t' << 100 * exp(lambdaMatrix[j][i]) / totals[j]; - printRA << '\t' << "NA"; - printRA << '\t' << "NA"; - } - } - printRA << endl; - } - - printRA.close(); + m->mothurOutJustToLog(toString(numPartitions) + '\t' + toString(getNLL()) + '\t' + toString(getLogDet()) + '\t'); + m->mothurOutJustToLog(toString(getBIC()) + '\t' + toString(getAIC()) + '\t' + toString(laplace)); + + return; + } + catch(exception& e){ + m->errorOut(e, "CommunityTypeFinder", "printFitData"); + exit(1); } - catch(exception& e) { - m->errorOut(e, "qFinderDMM", "printRelAbund"); - exit(1); - } } /**************************************************************************************************/ @@ -193,7 +145,7 @@ interp_quad (double f0, double fp0, double f1, double zl, double zh) double zmin = zl, fmin = fl; - if (fh < fmin) { zmin = zh; fmin = fh; } + if (fh < fmin) { zmin = zh; fmin = fh; } if (c > 0) /* positive curvature required for a minimum */ { @@ -216,7 +168,7 @@ interp_quad (double f0, double fp0, double f1, double zl, double zh) * * c(x) = f0 + fp0 * z + eta * z^2 + xi * z^3 * - * where eta=3*(f1-f0)-2*fp0-fp1, xi=fp0+fp1-2*(f1-f0). + * where eta=3*(f1-f0)-2*fp0-fp1, xi=fp0+fp1-2*(f1-f0). */ double cubic (double c0, double c1, double c2, double c3, double z){ @@ -231,7 +183,7 @@ void check_extremum (double c0, double c1, double c2, double c3, double z, double y = cubic (c0, c1, c2, c3, z); - if (y < *fmin) + if (y < *fmin) { *zmin = z; /* accepted new point*/ *fmin = y; @@ -240,7 +192,7 @@ void check_extremum (double c0, double c1, double c2, double c3, double z, /**************************************************************************************************/ -int gsl_poly_solve_quadratic (double a, double b, double c, +int gsl_poly_solve_quadratic (double a, double b, double c, double *x0, double *x1) { @@ -274,12 +226,12 @@ int gsl_poly_solve_quadratic (double a, double b, double c, double r1 = temp / a ; double r2 = c / temp ; - if (r1 < r2) + if (r1 < r2) { *x0 = r1 ; *x1 = r2 ; - } - else + } + else { *x0 = r2 ; *x1 = r1 ; @@ -287,7 +239,7 @@ int gsl_poly_solve_quadratic (double a, double b, double c, } return 2; } - else if (disc == 0) + else if (disc == 0) { *x0 = -0.5 * b / a ; *x1 = -0.5 * b / a ; @@ -297,7 +249,7 @@ int gsl_poly_solve_quadratic (double a, double b, double c, { return 0; } - + } /**************************************************************************************************/ @@ -317,14 +269,14 @@ double interp_cubic (double f0, double fp0, double f1, double fp1, double zl, do if (n == 2) /* found 2 roots */ { - if (z0 > zl && z0 < zh) + if (z0 > zl && z0 < zh) check_extremum (c0, c1, c2, c3, z0, &zmin, &fmin); - if (z1 > zl && z1 < zh) + if (z1 > zl && z1 < zh) check_extremum (c0, c1, c2, c3, z1, &zmin, &fmin); } else if (n == 1) /* found 1 root */ { - if (z0 > zl && z0 < zh) + if (z0 > zl && z0 < zh) check_extremum (c0, c1, c2, c3, z0, &zmin, &fmin); } } @@ -355,7 +307,7 @@ double interpolate (double a, double fa, double fpa, else{ z = interp_quad(fa, fpa * (b - a), fb, zmin, zmax); } - + alpha = a + z * (b - a); @@ -635,178 +587,6 @@ int qFinderDMM::bfgs2_Solver(vector& x){ } } -/**************************************************************************************************/ - -//can we get these psi/psi1 calculations into their own math class? -//psi calcualtions swiped from gsl library... - -static const double psi_cs[23] = { - -.038057080835217922, - .491415393029387130, - -.056815747821244730, - .008357821225914313, - -.001333232857994342, - .000220313287069308, - -.000037040238178456, - .000006283793654854, - -.000001071263908506, - .000000183128394654, - -.000000031353509361, - .000000005372808776, - -.000000000921168141, - .000000000157981265, - -.000000000027098646, - .000000000004648722, - -.000000000000797527, - .000000000000136827, - -.000000000000023475, - .000000000000004027, - -.000000000000000691, - .000000000000000118, - -.000000000000000020 -}; - -static double apsi_cs[16] = { - -.0204749044678185, - -.0101801271534859, - .0000559718725387, - -.0000012917176570, - .0000000572858606, - -.0000000038213539, - .0000000003397434, - -.0000000000374838, - .0000000000048990, - -.0000000000007344, - .0000000000001233, - -.0000000000000228, - .0000000000000045, - -.0000000000000009, - .0000000000000002, - -.0000000000000000 -}; - -/**************************************************************************************************/ - -double qFinderDMM::cheb_eval(const double seriesData[], int order, double xx){ - try { - double d = 0.0000; - double dd = 0.0000; - - double x2 = xx * 2.0000; - - for(int j=order;j>=1;j--){ - if (m->control_pressed) { return 0; } - double temp = d; - d = x2 * d - dd + seriesData[j]; - dd = temp; - } - - d = xx * d - dd + 0.5 * seriesData[0]; - return d; - } - catch(exception& e){ - m->errorOut(e, "qFinderDMM", "cheb_eval"); - exit(1); - } -} - -/**************************************************************************************************/ - -double qFinderDMM::psi(double xx){ - try { - double psiX = 0.0000; - - if(xx < 1.0000){ - - double t1 = 1.0 / xx; - psiX = cheb_eval(psi_cs, 22, 2.0*xx-1.0); - psiX = -t1 + psiX; - - } - else if(xx < 2.0000){ - - const double v = xx - 1.0; - psiX = cheb_eval(psi_cs, 22, 2.0*v-1.0); - - } - else{ - const double t = 8.0/(xx*xx)-1.0; - psiX = cheb_eval(apsi_cs, 15, t); - psiX += log(xx) - 0.5/xx; - } - - return psiX; - } - catch(exception& e){ - m->errorOut(e, "qFinderDMM", "psi"); - exit(1); - } -} - -/**************************************************************************************************/ - -/* coefficients for Maclaurin summation in hzeta() - * B_{2j}/(2j)! - */ -static double hzeta_c[15] = { - 1.00000000000000000000000000000, - 0.083333333333333333333333333333, - -0.00138888888888888888888888888889, - 0.000033068783068783068783068783069, - -8.2671957671957671957671957672e-07, - 2.0876756987868098979210090321e-08, - -5.2841901386874931848476822022e-10, - 1.3382536530684678832826980975e-11, - -3.3896802963225828668301953912e-13, - 8.5860620562778445641359054504e-15, - -2.1748686985580618730415164239e-16, - 5.5090028283602295152026526089e-18, - -1.3954464685812523340707686264e-19, - 3.5347070396294674716932299778e-21, - -8.9535174270375468504026113181e-23 -}; - -/**************************************************************************************************/ - -double qFinderDMM::psi1(double xx){ - try { - - /* Euler-Maclaurin summation formula - * [Moshier, p. 400, with several typo corrections] - */ - - double s = 2.0000; - const int jmax = 12; - const int kmax = 10; - int j, k; - const double pmax = pow(kmax + xx, -s); - double scp = s; - double pcp = pmax / (kmax + xx); - double value = pmax*((kmax+xx)/(s-1.0) + 0.5); - - for(k=0; kcontrol_pressed) { return 0; } - value += pow(k + xx, -s); - } - - for(j=0; j<=jmax; j++) { - if (m->control_pressed) { return 0; } - double delta = hzeta_c[j+1] * scp * pcp; - value += delta; - - if(fabs(delta/value) < 0.5*EPSILON) break; - - scp *= (s+2*j+1)*(s+2*j+2); - pcp /= (kmax + xx)*(kmax + xx); - } - - return value; - } - catch(exception& e){ - m->errorOut(e, "qFinderDMM", "psi1"); - exit(1); - } -} /**************************************************************************************************/ @@ -961,145 +741,6 @@ double qFinderDMM::getNegativeLogEvidence(vector& lambda, int group){ /**************************************************************************************************/ -void qFinderDMM::kMeans(){ - try { - - vector > relativeAbundance(numSamples); - vector > alphaMatrix; - - alphaMatrix.resize(numPartitions); - lambdaMatrix.resize(numPartitions); - for(int i=0;icontrol_pressed) { return; } - int groupTotal = 0; - - relativeAbundance[i].assign(numOTUs, 0.0); - - for(int j=0;j 1e-6 && iteration < maxIters){ - - if (m->control_pressed) { return; } - //calcualte average relative abundance - maxChange = 0.0000; - for(int i=0;i averageRelativeAbundance(numOTUs, 0); - for(int j=0;j maxChange){ maxChange = normChange; } - } - - - //calcualte distance between each sample in partition adn the average relative abundance - for(int i=0;icontrol_pressed) { return; } - - double normalizationFactor = 0; - vector totalDistToPartition(numPartitions, 0); - - for(int j=0;jcontrol_pressed) { return; } - for(int j=0;j 0){ - lambdaMatrix[j][i] = log(alphaMatrix[j][i]); - } - else{ - lambdaMatrix[j][i] = -10.0; - } - } - } - } - catch(exception& e){ - m->errorOut(e, "qFinderDMM", "kMeans"); - exit(1); - } -} - -/**************************************************************************************************/ - void qFinderDMM::optimizeLambda(){ try { for(currentPartition=0;currentPartition > qFinderDMM::getHessian(){ - try { - vector alpha(numOTUs, 0.0000); - double alphaSum = 0.0000; - - vector pi = zMatrix[currentPartition]; - vector psi_ajk(numOTUs, 0.0000); - vector psi_cjk(numOTUs, 0.0000); - vector psi1_ajk(numOTUs, 0.0000); - vector psi1_cjk(numOTUs, 0.0000); - - for(int j=0;jcontrol_pressed) { break; } - - alpha[j] = exp(lambdaMatrix[currentPartition][j]); - alphaSum += alpha[j]; - - for(int i=0;icontrol_pressed) { break; } - weight += pi[i]; - double sum = 0.0000; - for(int j=0;j > hessian(numOTUs); - for(int i=0;icontrol_pressed) { break; } - double term1 = -alpha[i] * (- psi_ajk[i] + psi_Ak + psi_cjk[i] - psi_Ck); - double term2 = -alpha[i] * alpha[i] * (-psi1_ajk[i] + psi1_Ak + psi1_cjk[i] - psi1_Ck); - double term3 = 0.1 * alpha[i]; - - hessian[i][i] = term1 + term2 + term3; - - for(int j=0;jerrorOut(e, "qFinderDMM", "getHessian"); - exit(1); - } -} - /**************************************************************************************************/ diff --git a/qFinderDMM.h b/qFinderDMM.h index d94e6cc..df7b443 100644 --- a/qFinderDMM.h +++ b/qFinderDMM.h @@ -9,27 +9,19 @@ #ifndef pds_dmm_qFinderDMM_h #define pds_dmm_qFinderDMM_h -/**************************************************************************************************/ - -#include "mothurout.h" +#include "communitytype.h" /**************************************************************************************************/ -class qFinderDMM { +class qFinderDMM : public CommunityTypeFinder { public: qFinderDMM(vector >, int); - double getNLL() { return currNLL; } - double getAIC() { return aic; } - double getBIC() { return bic; } - double getLogDet() { return logDeterminant; } - double getLaplace() { return laplace; } - void printZMatrix(string, vector); - void printRelAbund(string, vector); - + void printFitData(ofstream&); + void printFitData(ostream&, double); + private: - MothurOut* m; - void kMeans(); + void optimizeLambda(); void calculatePiK(); @@ -37,31 +29,14 @@ private: void negativeLogDerivEvidenceLambdaPi(vector&, vector&); double getNegativeLogEvidence(vector&, int); double getNegativeLogLikelihood(); - vector > getHessian(); + int lineMinimizeFletcher(vector&, vector&, double, double, double, double&, double&, vector&, vector&); int bfgs2_Solver(vector&);//, double, double); - double cheb_eval(const double[], int, double); - double psi(double); - double psi1(double); - - vector > countMatrix; - vector > zMatrix; - vector > lambdaMatrix; - vector weights; - vector > error; - - int numPartitions; - int numSamples; - int numOTUs; - int currentPartition; - - double currNLL; - double aic; - double bic; - double logDeterminant; - double laplace; + + + }; /**************************************************************************************************/ diff --git a/qualityscores.cpp b/qualityscores.cpp index 0c55650..3a1687b 100644 --- a/qualityscores.cpp +++ b/qualityscores.cpp @@ -130,7 +130,7 @@ string QualityScores::getName(){ void QualityScores::printQScores(ofstream& qFile){ try { - double aveQScore = calculateAverage(); + double aveQScore = calculateAverage(false); qFile << '>' << seqName << '\t' << aveQScore << endl; @@ -228,7 +228,7 @@ bool QualityScores::stripQualThreshold(Sequence& sequence, double qThreshold){ /**************************************************************************************************/ -bool QualityScores::stripQualRollingAverage(Sequence& sequence, double qThreshold){ +bool QualityScores::stripQualRollingAverage(Sequence& sequence, double qThreshold, bool logTransform){ try { string rawSequence = sequence.getUnaligned(); int seqLength = sequence.getNumBases(); @@ -240,12 +240,22 @@ bool QualityScores::stripQualRollingAverage(Sequence& sequence, double qThreshol int end = -1; double rollingSum = 0.0000; + double value = 0.0; for(int i=0;imothurOutEndLine(); } - double aveQScore = calculateAverage(); + double aveQScore = calculateAverage(logTransform); if(aveQScore >= qAverage) { success = 1; } else { success = 0; } diff --git a/qualityscores.h b/qualityscores.h index 500d3e9..a802636 100644 --- a/qualityscores.h +++ b/qualityscores.h @@ -30,9 +30,9 @@ public: void trimQScores(int, int); void flipQScores(); bool stripQualThreshold(Sequence&, double); - bool stripQualRollingAverage(Sequence&, double); - bool stripQualWindowAverage(Sequence&, int, int, double); - bool cullQualAverage(Sequence&, double); + bool stripQualRollingAverage(Sequence&, double, bool); + bool stripQualWindowAverage(Sequence&, int, int, double, bool); + bool cullQualAverage(Sequence&, double, bool); void updateQScoreErrorMap(map >&, string, int, int, int); void updateForwardMap(vector >&, int, int, int); void updateReverseMap(vector >&, int, int, int); @@ -42,7 +42,7 @@ public: private: - double calculateAverage(); + double calculateAverage(bool); MothurOut* m; vector qScores; diff --git a/seqsummarycommand.cpp b/seqsummarycommand.cpp index e941b3a..70aa55d 100644 --- a/seqsummarycommand.cpp +++ b/seqsummarycommand.cpp @@ -449,7 +449,8 @@ int SeqSummaryCommand::driverCreateSummary(vector& startPosition, vectorcontrol_pressed) { in.close(); outSummary.close(); return 1; } @@ -479,7 +480,6 @@ int SeqSummaryCommand::driverCreateSummary(vector& startPosition, vector shared) { + try { + data.resize(3,0); + + double commSize = 1e20; + + SAbundVector sabund1 = shared[0]->getSAbundVector(); + SAbundVector sabund2 = shared[1]->getSAbundVector(); + + double sampleSize = 0; + for (int i = 0; i < sabund1.getNumBins(); i++) { sampleSize += (sabund1.get(i) * sabund2.get(i)); } + int aux = ceil(pow((sampleSize+1), 0.33333)); + + if (isnan(data[0]) || isinf(data[0])) { data[0] = 0; } + + return data; + } + catch(exception& e) { + m->errorOut(e, "RangeShannon", "getValues"); + exit(1); + } +} +/***********************************************************************/ diff --git a/shannonrange.h b/shannonrange.h new file mode 100644 index 0000000..34b73f4 --- /dev/null +++ b/shannonrange.h @@ -0,0 +1,29 @@ +// +// shannonrange.h +// Mothur +// +// Created by SarahsWork on 1/3/14. +// Copyright (c) 2014 Schloss Lab. All rights reserved. +// + +#ifndef Mothur_shannonrange_h +#define Mothur_shannonrange_h + +#include "calculator.h" + +/***********************************************************************/ + +class RangeShannon : public Calculator { + +public: + RangeShannon() : Calculator("rangeshannon", 3, false) {}; + EstOutput getValues(SAbundVector*) {return data;}; + EstOutput getValues(vector); + string getCitation() { return "http://www.mothur.org/wiki/rangeshannon"; } +}; + +/***********************************************************************/ + + + +#endif diff --git a/sharedjsd.cpp b/sharedjsd.cpp new file mode 100644 index 0000000..46cb977 --- /dev/null +++ b/sharedjsd.cpp @@ -0,0 +1,48 @@ +// +// sharedjsd.cpp +// Mothur +// +// Created by SarahsWork on 12/9/13. +// Copyright (c) 2013 Schloss Lab. All rights reserved. +// + +#include "sharedjsd.h" + +/***********************************************************************/ +//KLD <- function(x,y) sum(x *log(x/y)) +//JSD<- function(x,y) sqrt(0.5 * KLD(x, (x+y)/2) + 0.5 * KLD(y, (x+y)/2)) +EstOutput JSD::getValues(vector shared) { + try { + + data.resize(1,0); + + double KLD1 = 0.0; + double KLD2 = 0.0; + + for (int i = 0; i < shared[0]->getNumBins(); i++) { + //store in temps to avoid multiple repetitive function calls + double tempA = shared[0]->getAbundance(i); + double tempB = shared[1]->getAbundance(i); + + if (tempA == 0) { tempA = 0.000001; } + if (tempB == 0) { tempB = 0.000001; } + + double denom = (tempA+tempB)/(double)2.0; + + if (tempA != 0) { KLD1 += tempA * log(tempA/denom); } //KLD(x,m) + if (tempB != 0) { KLD2 += tempB * log(tempB/denom); } //KLD(y,m) + } + + data[0] = sqrt((0.5*KLD1) + (0.5*KLD2)); + + if (isnan(data[0]) || isinf(data[0])) { data[0] = 0; } + + return data; + } + catch(exception& e) { + m->errorOut(e, "JSD", "getValues"); + exit(1); + } +} + +/***********************************************************************/ diff --git a/sharedjsd.h b/sharedjsd.h new file mode 100644 index 0000000..cec0faa --- /dev/null +++ b/sharedjsd.h @@ -0,0 +1,30 @@ +// +// sharedjsd.h +// Mothur +// +// Created by SarahsWork on 12/9/13. +// Copyright (c) 2013 Schloss Lab. All rights reserved. +// + +#ifndef Mothur_sharedjsd_h +#define Mothur_sharedjsd_h + +#include "calculator.h" + +/***********************************************************************/ +//Jensen-Shannon divergence (JSD) +class JSD : public Calculator { + +public: + JSD() : Calculator("jsd", 1, false) {}; + EstOutput getValues(SAbundVector*) {return data;}; + EstOutput getValues(vector); + string getCitation() { return "http://www.mothur.org/wiki/JSD"; } +private: + +}; + +/***********************************************************************/ + + +#endif diff --git a/summarysharedcommand.cpp b/summarysharedcommand.cpp index 2c9399b..ef5e365 100644 --- a/summarysharedcommand.cpp +++ b/summarysharedcommand.cpp @@ -17,7 +17,7 @@ vector SummarySharedCommand::setParameters(){ CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel); CommandParameter psubsample("subsample", "String", "", "", "", "", "","phylip",false,false); parameters.push_back(psubsample); CommandParameter pdistance("distance", "Boolean", "", "F", "", "", "","phylip",false,false); parameters.push_back(pdistance); - CommandParameter pcalc("calc", "Multiple", "sharedchao-sharedsobs-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-whittaker-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-structchi2-hamming-gower-memchi2-memchord-memeuclidean-mempearson", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan", "", "", "","",true,false,true); parameters.push_back(pcalc); + CommandParameter pcalc("calc", "Multiple", "sharedchao-sharedsobs-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan-kstest-whittaker-sharednseqs-ochiai-anderberg-kulczynski-kulczynskicody-lennon-morisitahorn-braycurtis-odum-canberra-structeuclidean-structchord-hellinger-manhattan-structpearson-soergel-spearman-structkulczynski-speciesprofile-structchi2-hamming-gower-memchi2-memchord-memeuclidean-mempearson-jsd", "sharedsobs-sharedchao-sharedace-jabund-sorabund-jclass-sorclass-jest-sorest-thetayc-thetan", "", "", "","",true,false,true); parameters.push_back(pcalc); CommandParameter poutput("output", "Multiple", "lt-square", "lt", "", "", "","",false,false); parameters.push_back(poutput); CommandParameter pall("all", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pall); CommandParameter piters("iters", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(piters); @@ -294,6 +294,8 @@ SummarySharedCommand::SummarySharedCommand(string option) { sumCalculators.push_back(new MemEuclidean()); }else if (Estimators[i] == "mempearson") { sumCalculators.push_back(new MemPearson()); + }else if (Estimators[i] == "jsd") { + sumCalculators.push_back(new JSD()); } } } diff --git a/summarysharedcommand.h b/summarysharedcommand.h index 8ba9b9f..1b3f164 100644 --- a/summarysharedcommand.h +++ b/summarysharedcommand.h @@ -56,6 +56,7 @@ #include "memchord.h" #include "memeuclidean.h" #include "mempearson.h" +#include "sharedjsd.h" class SummarySharedCommand : public Command { @@ -216,6 +217,8 @@ static DWORD WINAPI MySummarySharedThreadFunction(LPVOID lpParam){ sumCalculators.push_back(new MemEuclidean()); }else if (pDataArray->Estimators[i] == "mempearson") { sumCalculators.push_back(new MemPearson()); + }else if (pDataArray->Estimators[i] == "jsd") { + sumCalculators.push_back(new JSD()); } } } diff --git a/trimflowscommand.cpp b/trimflowscommand.cpp index fd55edf..33349de 100644 --- a/trimflowscommand.cpp +++ b/trimflowscommand.cpp @@ -395,7 +395,7 @@ int TrimFlowsCommand::driverCreateTrim(string flowFileName, string trimFlowFileN if(line->start == 0){ flowFile >> numFlows; m->gobble(flowFile); - scrapFlowFile << maxFlows << endl; + scrapFlowFile << numFlows << endl; trimFlowFile << maxFlows << endl; if(allFiles){ for(int i=0;i >& outFlowFileNames){ else if(type == "REVERSE"){ string oligoRC = reverseOligo(oligo); revPrimer.push_back(oligoRC); + if (m->debug) { m->mothurOut("[DEBUG]: reverse oligos = " + oligoRC + ".\n"); } } else if(type == "BARCODE"){ oligosFile >> group; diff --git a/trimseqscommand.cpp b/trimseqscommand.cpp index 0d55d7c..951ce65 100644 --- a/trimseqscommand.cpp +++ b/trimseqscommand.cpp @@ -34,6 +34,7 @@ vector TrimSeqsCommand::setParameters(){ CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors); CommandParameter pallfiles("allfiles", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pallfiles); CommandParameter pkeepforward("keepforward", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepforward); + CommandParameter plogtransform("logtransform", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plogtransform); CommandParameter pqtrim("qtrim", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pqtrim); CommandParameter pqthreshold("qthreshold", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqthreshold); CommandParameter pqaverage("qaverage", "Number", "", "0", "", "", "","",false,false); parameters.push_back(pqaverage); @@ -61,7 +62,7 @@ string TrimSeqsCommand::getHelpString(){ string helpString = ""; helpString += "The trim.seqs command reads a fastaFile and creates 2 new fasta files, .trim.fasta and scrap.fasta, as well as group files if you provide and oligos file.\n"; helpString += "The .trim.fasta contains sequences that meet your requirements, and the .scrap.fasta contains those which don't.\n"; - helpString += "The trim.seqs command parameters are fasta, name, count, flip, checkorient, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast and allfiles.\n"; + helpString += "The trim.seqs command parameters are fasta, name, count, flip, checkorient, oligos, maxambig, maxhomop, minlength, maxlength, qfile, qthreshold, qaverage, diffs, qtrim, keepfirst, removelast, logtransform and allfiles.\n"; helpString += "The fasta parameter is required.\n"; helpString += "The flip parameter will output the reverse compliment of your trimmed sequence. The default is false.\n"; helpString += "The checkorient parameter will check the reverse compliment of the sequence if the barcodes and primers cannot be found in the forward. The default is false.\n"; @@ -84,6 +85,7 @@ string TrimSeqsCommand::getHelpString(){ helpString += "The qwindowaverage parameter allows you to set a minimum average quality score allowed over a window. \n"; helpString += "The rollaverage parameter allows you to set a minimum rolling average quality score allowed over a window. \n"; helpString += "The qstepsize parameter allows you to set a number of bases to move the window over. Default=1.\n"; + helpString += "The logtransform parameter allows you to indicate you want the averages for the qwindowaverage, rollaverage and qaverage to be calculated using a logtransform. Default=F.\n"; helpString += "The allfiles parameter will create separate group and fasta file for each grouping. The default is F.\n"; helpString += "The keepforward parameter allows you to indicate whether you want the forward primer removed or not. The default is F, meaning remove the forward primer.\n"; helpString += "The qtrim parameter will trim sequence from the point that they fall below the qthreshold and put it in the .trim file if set to true. The default is T.\n"; @@ -329,6 +331,9 @@ TrimSeqsCommand::TrimSeqsCommand(string option) { temp = validParameter.validFile(parameters, "keepforward", false); if (temp == "not found") { temp = "F"; } keepforward = m->isTrue(temp); + temp = validParameter.validFile(parameters, "logtransform", false); if (temp == "not found") { temp = "F"; } + logtransform = m->isTrue(temp); + temp = validParameter.validFile(parameters, "checkorient", false); if (temp == "not found") { temp = "F"; } reorient = m->isTrue(temp); @@ -444,8 +449,6 @@ int TrimSeqsCommand::execute(){ } } - if (!pairedOligos) { if (reorient) { m->mothurOut("[WARNING]: You cannot use reorient without paired barcodes or primers, skipping."); m->mothurOutEndLine(); reorient = false; } } - if (m->control_pressed) { return 0; } //fills lines and qlines @@ -695,7 +698,21 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string rpairedBarcodes[it->first] = tempPair; //cout << reverseOligo((it->second).reverse) << '\t' << (reverseOligo((it->second).forward)) << '\t' << barcodeNameVector[it->first] << endl; } - rtrimOligos = new TrimOligos(pdiffs, bdiffs, 0, 0, rpairedPrimers, rpairedBarcodes); numBarcodes = rpairedBarcodes.size(); + int index = rpairedBarcodes.size(); + for (map::iterator it = barcodes.begin(); it != barcodes.end(); it++) { + oligosPair tempPair("", reverseOligo((it->first))); //reverseBarcode, rc ForwardBarcode + rpairedBarcodes[index] = tempPair; index++; + //cout << reverseOligo((it->second).reverse) << '\t' << (reverseOligo((it->second).forward)) << '\t' << barcodeNameVector[it->first] << endl; + } + + index = rpairedPrimers.size(); + for (map::iterator it = primers.begin(); it != primers.end(); it++) { + oligosPair tempPair("", reverseOligo((it->first))); //reverseBarcode, rc ForwardBarcode + rpairedPrimers[index] = tempPair; index++; + //cout << reverseOligo((it->second).reverse) << '\t' << (reverseOligo((it->second).forward)) << '\t' << primerNameVector[it->first] << endl; + } + + rtrimOligos = new TrimOligos(pdiffs, bdiffs, 0, 0, rpairedPrimers, rpairedBarcodes); numBarcodes = rpairedBarcodes.size(); } while (moreSeqs) { @@ -819,9 +836,9 @@ int TrimSeqsCommand::driverCreateTrim(string filename, string qFileName, string int origLength = currSeq.getNumBases(); if(qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, qThreshold); } - else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage); } - else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage); } - else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage); } + else if(qAverage != 0) { success = currQual.cullQualAverage(currSeq, qAverage, logtransform); } + else if(qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, qRollAverage, logtransform); } + else if(qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, qWindowStep, qWindowSize, qWindowAverage, logtransform); } else { success = 1; } //you don't want to trim, if it fails above then scrap it @@ -1174,7 +1191,7 @@ int TrimSeqsCommand::createProcessesCreateTrim(string filename, string qFileName lines[h].start, lines[h].end, qLines[h].start, qLines[h].end, m, pdiffs, bdiffs, ldiffs, sdiffs, tdiffs, primers, barcodes, revPrimer, linker, spacer, pairedBarcodes, pairedPrimers, pairedOligos, primerNameVector, barcodeNameVector, createGroup, allFiles, keepforward, keepFirst, removeLast, - qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage, + qWindowStep, qWindowSize, qWindowAverage, qtrim, qThreshold, qAverage, qRollAverage, logtransform, minLength, maxAmbig, maxHomoP, maxLength, flip, reorient, nameMap, nameCount); pDataArray.push_back(tempTrim); @@ -1646,7 +1663,7 @@ bool TrimSeqsCommand::getOligos(vector >& fastaFileNames, vector< if (hasPairedBarcodes || hasPrimer) { pairedOligos = true; if ((primers.size() != 0) || (barcodes.size() != 0) || (linker.size() != 0) || (spacer.size() != 0) || (revPrimer.size() != 0)) { m->control_pressed = true; m->mothurOut("[ERROR]: cannot mix paired primers and barcodes with non paired or linkers and spacers, quitting."); m->mothurOutEndLine(); return 0; } - }else if (reorient) { m->mothurOut("[Warning]: cannot use checkorient without paired barcodes or primers, ignoring.\n"); m->mothurOutEndLine(); reorient = false; } + } if(barcodeNameVector.size() == 0 && primerNameVector[0] == ""){ allFiles = 0; } diff --git a/trimseqscommand.h b/trimseqscommand.h index 53f04d3..14eed17 100644 --- a/trimseqscommand.h +++ b/trimseqscommand.h @@ -55,7 +55,7 @@ private: bool abort, createGroup; string fastaFile, oligoFile, qFileName, groupfile, nameFile, countfile, outputDir; - bool flip, allFiles, qtrim, keepforward, pairedOligos, reorient; + bool flip, allFiles, qtrim, keepforward, pairedOligos, reorient, logtransform; int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, processors, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs, comboStarts; int qWindowSize, qWindowStep, keepFirst, removeLast; double qRollAverage, qThreshold, qWindowAverage, qAverage; @@ -98,7 +98,7 @@ struct trimData { vector > qualFileNames; vector > nameFileNames; unsigned long long lineStart, lineEnd, qlineStart, qlineEnd; - bool flip, allFiles, qtrim, keepforward, createGroup, pairedOligos, reorient; + bool flip, allFiles, qtrim, keepforward, createGroup, pairedOligos, reorient, logtransform; int numFPrimers, numRPrimers, numLinkers, numSpacers, maxAmbig, maxHomoP, minLength, maxLength, tdiffs, bdiffs, pdiffs, ldiffs, sdiffs; int qWindowSize, qWindowStep, keepFirst, removeLast, count; double qRollAverage, qThreshold, qWindowAverage, qAverage; @@ -121,7 +121,7 @@ struct trimData { trimData(string fn, string qn, string nf, string cf, string tn, string sn, string tqn, string sqn, string tnn, string snn, string tcn, string scn,string gn, vector > ffn, vector > qfn, vector > nfn, unsigned long long lstart, unsigned long long lend, unsigned long long qstart, unsigned long long qend, MothurOut* mout, int pd, int bd, int ld, int sd, int td, map pri, map bar, vector revP, vector li, vector spa, map pbr, map ppr, bool po, vector priNameVector, vector barNameVector, bool cGroup, bool aFiles, bool keepF, int keepfi, int removeL, - int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage, + int WindowStep, int WindowSize, int WindowAverage, bool trim, double Threshold, double Average, double RollAverage, bool lt, int minL, int maxA, int maxH, int maxL, bool fli, bool reo, map nm, map ncount) { filename = fn; qFileName = qn; @@ -174,6 +174,7 @@ struct trimData { qThreshold = Threshold; qAverage = Average; qRollAverage = RollAverage; + logtransform = lt; minLength = minL; maxAmbig = maxA; maxHomoP = maxH; @@ -276,6 +277,19 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ oligosPair tempPair(trimOligos->reverseOligo((it->second).reverse), (trimOligos->reverseOligo((it->second).forward))); //reverseBarcode, rc ForwardBarcode rpairedBarcodes[it->first] = tempPair; } + + int index = rpairedBarcodes.size(); + for (map::iterator it = pDataArray->barcodes.begin(); it != pDataArray->barcodes.end(); it++) { + oligosPair tempPair("", trimOligos->reverseOligo((it->first))); //reverseBarcode, rc ForwardBarcode + rpairedBarcodes[index] = tempPair; index++; + } + + index = rpairedPrimers.size(); + for (map::iterator it = pDataArray->primers.begin(); it != pDataArray->primers.end(); it++) { + oligosPair tempPair("", trimOligos->reverseOligo((it->first))); //reverseBarcode, rc ForwardBarcode + rpairedPrimers[index] = tempPair; index++; + } + rtrimOligos = new TrimOligos(pDataArray->pdiffs, pDataArray->bdiffs, 0, 0, rpairedPrimers, rpairedBarcodes); numBarcodes = rpairedBarcodes.size(); } @@ -416,9 +430,9 @@ static DWORD WINAPI MyTrimThreadFunction(LPVOID lpParam){ int origLength = currSeq.getNumBases(); if(pDataArray->qThreshold != 0) { success = currQual.stripQualThreshold(currSeq, pDataArray->qThreshold); } - else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage); } - else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage); } - else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage); } + else if(pDataArray->qAverage != 0) { success = currQual.cullQualAverage(currSeq, pDataArray->qAverage, pDataArray->logtransform); } + else if(pDataArray->qRollAverage != 0) { success = currQual.stripQualRollingAverage(currSeq, pDataArray->qRollAverage, pDataArray->logtransform); } + else if(pDataArray->qWindowAverage != 0){ success = currQual.stripQualWindowAverage(currSeq, pDataArray->qWindowStep, pDataArray->qWindowSize, pDataArray->qWindowAverage, pDataArray->logtransform); } else { success = 1; } //you don't want to trim, if it fails above then scrap it diff --git a/validcalculator.cpp b/validcalculator.cpp index 7df7c4e..4718e2f 100644 --- a/validcalculator.cpp +++ b/validcalculator.cpp @@ -76,6 +76,7 @@ #include "mempearson.h" #include "sharedsobs.h" #include "sharednseqs.h" +#include "sharedjsd.h" /********************************************************************/ @@ -206,6 +207,7 @@ void ValidCalculators::printCitations(vector Estimators) { }else if (Estimators[i] == "mempearson") { Calculator* temp = new MemPearson(); m->mothurOut(temp->getName() + ": "); temp->citation(); delete temp; }else if (Estimators[i] == "sharedobserved") { Calculator* temp = new SharedSobs(); m->mothurOut(temp->getName() + ": "); temp->citation(); delete temp; }else if (Estimators[i] == "kulczynski") { Calculator* temp = new Kulczynski(); m->mothurOut(temp->getName() + ": "); temp->citation(); delete temp; + }else if (Estimators[i] == "jsd") { Calculator* temp = new JSD(); m->mothurOut(temp->getName() + ": "); temp->citation(); delete temp; }else { m->mothurOut("[ERROR]: Missing else if for " + Estimators[i] + " in printCitations."); m->mothurOutEndLine(); } }else { m->mothurOut(Estimators[i] + " is not a valid calculator, no citation will be given."); m->mothurOutEndLine(); } } @@ -456,6 +458,7 @@ void ValidCalculators::initialShared() { shared["memchord"] = "memchord"; shared["memeuclidean"] = "memeuclidean"; shared["mempearson"] = "mempearson"; + shared["jsd"] = "jsd"; shared["default"] = "default"; } catch(exception& e) { @@ -569,6 +572,7 @@ void ValidCalculators::initialSharedSummary() { sharedsummary["memchord"] = "memchord"; sharedsummary["memeuclidean"] = "memeuclidean"; sharedsummary["mempearson"] = "mempearson"; + sharedsummary["jsd"] = "jsd"; sharedsummary["default"] = "default"; } catch(exception& e) { @@ -733,6 +737,7 @@ void ValidCalculators::initialMatrix() { matrix["memchord"] = "memchord"; matrix["memeuclidean"] = "memeuclidean"; matrix["mempearson"] = "mempearson"; + matrix["jsd"] = "jsd"; } catch(exception& e) { -- 2.39.2